---
## Random Forest: Red Wine Quality

Data set: https://archive.ics.uci.edu/ml/datasets/Wine+Quality



In [1]:
import pandas as pd

#import the dataset (the file is read with ';' as the field separator)
WineQuality_Red = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')

#keep a local copy; index=False stops pandas from writing the row index as an
#extra unnamed column, which would come back as 'Unnamed: 0' when the file is re-read
WineQuality_Red.to_csv('WineQuality_Red.csv', index=False)

#list the column names so the feature/target selection below can be checked against them
print(WineQuality_Red.columns)
                              

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')


### Select Data Set Metrics (Input Features) and Classifier (Target Label)

In [12]:
#Column selections are kept in variables so the same notebook can be pointed at a different data set

#Input columns handed to the model
my_metrics = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

#Column the model is trained to predict
my_classifier = 'quality'

## Split the data into training and test portions

In [13]:
from sklearn.model_selection import train_test_split

#X holds the selected input columns, y the column to predict
X = WineQuality_Red[my_metrics]
y = WineQuality_Red[my_classifier]

#Hold out 25% of the rows for testing (training set size = 1-test_size).
#random_state pins the shuffle so a fresh Restart & Run All reproduces the same
#split, and therefore the same downstream scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [14]:
#preview the training inputs

X_train.head(n=5) #first 5 rows

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
776,6.9,0.765,0.18,2.4,0.243,5.5,48.0,0.99612,3.4,0.6,10.3
1244,5.9,0.29,0.25,13.4,0.067,72.0,160.0,0.99721,3.33,0.54,10.3
552,9.5,0.46,0.24,2.7,0.092,14.0,44.0,0.998,3.12,0.74,10.0
598,8.5,0.585,0.18,2.1,0.078,5.0,30.0,0.9967,3.2,0.48,9.8
1177,7.1,0.66,0.0,2.4,0.052,6.0,11.0,0.99318,3.35,0.66,12.7


In [15]:
#preview the quality scores that go with the training inputs

y_train.head(n=5) #first 5 rows

776     6
1244    6
552     6
598     6
1177    7
Name: quality, dtype: int64

## Training the Model

In [16]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a random forest classifier with 1000 trees.
#random_state fixes the forest's internal randomness so the accuracy and
#feature importances reported below are repeatable between runs.
clf = RandomForestClassifier(n_estimators=1000, random_state=42)

#Train the model using the training sets
clf.fit(X_train, y_train)

#Predict quality for the held-out test rows; compared against y_test in the evaluation step
y_pred = clf.predict(X_test)

## Evaluate the Model

In [17]:
#scikit-learn's metrics module provides the accuracy calculation
from sklearn import metrics

#Accuracy: fraction of test rows where the predicted quality equals the actual quality
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.695


## What Drives the Model's Decision Making Abilities?

In [18]:
import pandas as pd

#Label each importance score from the fitted model with its feature name...
importance_by_feature = pd.Series(data=clf.feature_importances_, index=my_metrics)
#...then rank the features from highest to lowest importance
feature_imp = importance_by_feature.sort_values(ascending=False)

feature_imp

alcohol                 0.145886
sulphates               0.111403
volatile acidity        0.107234
total sulfur dioxide    0.105525
density                 0.088132
chlorides               0.079917
pH                      0.077130
citric acid             0.074473
fixed acidity           0.073915
residual sugar          0.070868
free sulfur dioxide     0.065516
dtype: float64