# Random Forest on White Wine Data Set
For the purpose of this we will use Random Forest to predict the quality of White Wine 
Data Set is available at http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/

## Load Data and Prepare for cleaning

In [43]:
# Load Packages, Libraries and Modules
import numpy as np # Numeriacal computations
import pandas as pd # Data manipulation

from sklearn.model_selection import train_test_split # Sampling Helper
from sklearn import preprocessing # Preprocessing Module
from sklearn.ensemble import RandomForestRegressor # Random Forest Module
from sklearn.pipeline import Pipeline # Pipeline Module to clean up the modelled code
from sklearn.pipeline import make_pipeline # Pipeline Module to clean up the modelled code
from sklearn.model_selection import GridSearchCV # GridSearchCV Implements a Fit and Score Method
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score # Model Evaluation metrics
from sklearn.externals import joblib # Module used to save Scikit-learn models
from sklearn.compose import ColumnTransformer # Preprocessing of Columns
from sklearn.impute import SimpleImputer # Ensure that are missing data is filled
from sklearn.model_selection import cross_val_score # Module for cross validation

In [44]:
#Load Dataset
data_path = "Dataset/White_wine_quality.csv"
df = pd.read_csv(data_path)

In [45]:
df.head()

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6
1,6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9...
2,8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;1...
3,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...
4,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...


In [46]:
# Since the content of the dataSet is separated by semicolons we need to reload it into the dataframe
df = pd.read_csv(data_path, sep = ';')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [47]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [48]:
df.shape

(4898, 12)

In [49]:
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [50]:
# Find the Categorical Columns in the dataset
col = (df.dtypes == 'object')
cat_col = list(col[col].index)
print (cat_col)

[]


In [51]:
# Find the Numerical columns in the dataset
num_col = df.select_dtypes([np.number]).columns
print (num_col)

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')


In [52]:
# Since the data doesn't have any categorical columns we can proceed
# Separate target and Predictors
y = df.quality  # Same as y = df['diagnosis']
X = df.drop(['quality'], axis = 1) # Drop the diagnosis column and use all the other columns as predictors

## Preprocessing, Modelling, Prediction, Evaluataion

In [53]:
from sklearn.model_selection import train_test_split

# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X,y, train_size = 0.8, test_size = 0.2, random_state = 123)

 ### We are going to test 2 approaches to pipeline the data
 ### Approach 1

In [54]:
# Preprocessing and Standardization for numerical data
numerical_transformer = SimpleImputer(strategy = 'constant')

In [55]:
# Drop the "quality" column from the numerical columns list since it is out target variable
num_col_update = num_col.drop('quality')

In [56]:
# Bundle Preprossing for the Data
preprocessor = ColumnTransformer(transformers = [('num',numerical_transformer, num_col_update)])

In [57]:
# Define the model
model = RandomForestRegressor(n_estimators = 100, random_state = 123)

In [58]:
# Bundle the pPreprocessind and Modelling code in a pipeline
my_pipeline = Pipeline(steps = [('preprocessor',preprocessor),('model',model)])

# Preprocessing of Training Data, fit model
my_pipeline.fit(X_train,y_train)

# Preprocessing of Validation data, Get Predictions
preds = my_pipeline.predict(X_valid)

In [59]:
# Evaluate the model
print('Mean Absolute Error: ',mean_absolute_error(y_valid, preds))
print('\nMean Sqaured Error: ',mean_squared_error(y_valid, preds))
print('\nCoefficient of Determination: ', r2_score(y_valid, preds))


Mean Absolute Error:  0.4264795918367346

Mean Sqaured Error:  0.36683010204081634

Coefficient of Determination:  0.5227602541820698


In [60]:
# Tune Model using a Cross-Validation Pipeline
scores1 = -1 * cross_val_score(my_pipeline,X,y, cv = 10, scoring  = 'neg_mean_absolute_error')
scores2 = -1 * cross_val_score(my_pipeline,X,y, cv = 10, scoring  = 'neg_mean_squared_error')
scores3 = cross_val_score(my_pipeline,X,y, cv = 10, scoring  = 'r2')
print('Mean Absolute Error Scores: ', scores1)
print ('\nAverage of MAE: ', scores1.mean())
print('\nMean Absolute Squared Scores: ', scores2)
print ('\nAverage of MSE: ', scores2.mean())
print('\nCoefficient of Determination Scores: ', scores3)
print ('\nAverage of COD: ', scores3.mean())

Mean Absolute Error Scores:  [0.57716327 0.63134694 0.64283673 0.54077551 0.55220408 0.5374898
 0.58830612 0.52938776 0.48364008 0.51552147]

Average of MAE:  0.5598671758273861

Mean Absolute Squared Scores:  [0.54671245 0.67200122 0.64946959 0.48336163 0.51394653 0.46469041
 0.58742388 0.47007837 0.37721227 0.41316094]

Average of MSE:  0.51780572922666

Coefficient of Determination Scores:  [0.30562334 0.29153076 0.29027632 0.35626496 0.3026951  0.42094879
 0.24407912 0.40014766 0.44029081 0.2159464 ]

Average of COD:  0.3267803252344895


## Approach 2
### Perform Standardization of the Data
Standardization is the process of subtracting the means from each feature and then dividing by the feature standard deviation. The process involves the use of the Transformer API which follow the below steps 
1. Fit the transformer on the training set. (Saving the mean and standard deviation)
2. Apply the transformer to the training set ( scaling the training data)
3. Apply the transformer to the test set (using the same mean and standard deviation)

In [20]:
# Fitting the Tranformer API
# The scaler object has the value of mean and standard deviation from X_train 
scaler  = preprocessing.StandardScaler().fit(X_train)

# Apply the transformer to the training data
X_train_scaled  = scaler.transform(X_train)

# Transform Validation set using the same mean and standard deviation
X_valid_scaled = scaler.transform(X_valid)


In [21]:
# Set up a modelling pipeline for the data. Transform the data using standard scalar and fit the model using
# Random Forest Regressor
pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(n_estimators = 100))

### Declare the Hyperparameters to tune the data
The two types of parameter we worry about are 
1. Model Parameters - Parameters can be learned directly form the data 
2. Hyperparameters - Parameters express structural information and are set before training the data.

In [34]:
# Declare hyperparameters to tune
hyperparameters = {'randomforestregressor__max_features': ['auto','sqrt','log2'],
                  'randomforestregressor__max_depth': [None,5,3,1]}

In [35]:
# Cross-validation with pipelne
model = GridSearchCV(pipeline, hyperparameters, cv = 10)

In [36]:
# Fit and tune the model
model.fit(X_train,y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              ccp_alpha=0.0,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              max_samples=None,
                            

In [37]:
# Print the Best set of parameters used for the cross validation
print (model.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'sqrt'}


In [38]:
# Confirm is refitting is ON for GridSearchCV
print (model.refit)

True


### Evaluate model pipeline on test data

In [41]:
# Predict a new set of data
pred = model.predict(X_valid)

In [42]:
# Evaluate the model
print('Mean Absolute Error: ',mean_absolute_error(y_valid, preds))
print('\nMean Sqaured Error: ',mean_squared_error(y_valid, preds))
print('\nCoefficient of Determination: ', r2_score(y_valid, preds))

Mean Absolute Error:  0.4264795918367346

Mean Sqaured Error:  0.36683010204081634

Coefficient of Determination:  0.5227602541820698


### Save Model for future use

In [61]:
# Save model to a .pkl file
joblib.dump(model,'Random_Forest_Wine.pkl')

['Random_Forest_Wine.pkl']

In [62]:
# Load model from .pkl file 
model_current = joblib.load('Random_Forest_Wine.pkl')

In [65]:
# Predict data using the model
new_pred = model_current.predict(X_valid)