---
## Random Forest: Red Wine Quality

Data set: http://archive.ics.uci.edu/ml/datasets/Wine+Quality



In [1]:
import pandas as pd

# Download the red-wine quality data; the file is read with ';' as the delimiter.
WineQuality_Red = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
    sep=';',
)

# Keep a local copy. index=False stops pandas from writing the row index as an
# extra unnamed first column, which would show up as "Unnamed: 0" when the
# saved file is read back with a plain pd.read_csv().
WineQuality_Red.to_csv('WineQuality_Red.csv', index=False)

# Show the available column names (11 measurements plus 'quality').
print(WineQuality_Red.columns)
                              

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')


### Select Data Set Metrics and Classifier

In [2]:
# The feature columns and the target column are held in variables so the rest
# of the notebook can be pointed at a different data set by editing this cell.
# Note: despite its name, my_classifier is the name of the target (label)
# column, not a model object.
my_classifier = 'quality'
my_metrics = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

## Split the data into training and test portions

In [3]:
from sklearn.model_selection import train_test_split


# Feature matrix (the columns listed in my_metrics) and target vector.
X = WineQuality_Red[my_metrics]
y = WineQuality_Red[my_classifier]

# Hold out half of the rows for testing (training set size = 1 - test_size).
# random_state fixes the shuffle so the split -- and every number derived from
# it further down -- is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

In [4]:
# Preview the training features: first 5 rows (head()'s default, spelled out).

X_train.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
267,7.9,0.35,0.46,3.6,0.078,15.0,37.0,0.9973,3.35,0.86,12.8
526,7.3,0.365,0.49,2.5,0.088,39.0,106.0,0.9966,3.36,0.78,11.0
478,9.6,0.68,0.24,2.2,0.087,5.0,28.0,0.9988,3.14,0.6,10.2
34,5.2,0.32,0.25,1.8,0.103,13.0,50.0,0.9957,3.38,0.55,9.2
853,9.3,0.36,0.39,1.5,0.08,41.0,55.0,0.99652,3.47,0.73,10.9


In [5]:
# Preview the matching quality scores: first 5 rows of the training target.

y_train.head(5)

267    8
526    5
478    5
34     5
853    6
Name: quality, dtype: int64

## Training the Model

In [6]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier made of 1000 decision trees.
# random_state seeds the randomness used while building the trees, so fitting
# on the same training data yields the same model (and the same accuracy and
# feature importances) on every run.
clf = RandomForestClassifier(n_estimators=1000, random_state=42)

# Train the model using the training set
clf.fit(X_train, y_train)

# Predict quality scores for the held-out test set
y_pred = clf.predict(X_test)

## Evaluate the Model

In [11]:
#Import Scikit-Learn metrics module for accuracy calculation
from sklearn import metrics

# Model accuracy: the fraction of test rows whose predicted quality matches the
# true quality, reported as a percentage rounded to two decimals.
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy_pct = round(accuracy * 100, 2)
print("Accuracy:", str(accuracy_pct) + '%')

Accuracy: 65.0%


## What Drives the Model's Decision Making Abilities?

In [12]:
import pandas as pd
# Rank the features by the importance the fitted forest assigns to them, then
# render each importance as a percentage string such as '13.73%'.
#
# The formatting is done with one .map() over the numeric Series instead of
# assigning strings element-by-element into a float64 Series while iterating
# it: that pattern relies on silent upcasting to object dtype, which recent
# pandas versions warn about and plan to reject.
feature_imp = (
    pd.Series(clf.feature_importances_, index=my_metrics)
    .sort_values(ascending=False)
    .map(lambda importance: str(round(importance * 100, 2)) + '%')
)

feature_imp
    

alcohol                 13.73%
volatile acidity        11.44%
sulphates               11.23%
total sulfur dioxide     9.95%
density                  8.97%
fixed acidity            8.23%
chlorides                7.99%
pH                       7.85%
citric acid              7.05%
free sulfur dioxide      6.78%
residual sugar           6.78%
dtype: object