# Starting in applied ML
#### A biginner project
- [x] Import Libraries and Modules
- [x] Load red wine data
- [x] Split data into training and test sets
- [x] Declare data preprocessing steps
- [x] Declare hyperparameters to tune
- [x] Tune model using cross-validation pipeline
- [x] Refit on the entre training set
- [x] Evaluate model pipeline on test data
- [x] Save model for further use

# What to do next
- [x] Re-read and understand every bit of the code
- [x] Measuring the accuracy of Regression Model
- [x] Try other regression model families (e.g. regularized regression, boosted trees, etc)
- [x] Collect more data if possible (wasn't possible)

## Import libraries and modules

In [1]:
import numpy as np # numberical computation
import pandas as pd # dataframes
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor

# Import cross-validation pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# For evaluating out model later
from sklearn.metrics import mean_squared_error, r2_score

# For saving sklearn models, an alternative to Python's pickle package. More effecient for storing large np arrays
from sklearn.externals import joblib

## Load red wine data

In [2]:
# Load wine data from remote url
#dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv('winequality-red.csv')

In [3]:
# First 5 rows
data.head()

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
1,7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5
2,7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...
3,11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...
4,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5


In [10]:
# Seperate data using ;
data = pd.read_csv('winequality-red.csv', sep=';')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [6]:
data.shape # 1,599 samples and 12 features

(1599, 12)

In [7]:
# Summary statistics 
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


## Spliting data into training and test sets

In [11]:
# Seperate target from training features
y = data.quality
X = data.drop('quality', axis=1)

In [13]:
# Split data into train and test sets
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=.2,
                                                    random_state=123, # seed 
                                                    stratify=y) # makes sure training set look like test set y
                                                                # makes evaluation metrics more reliable

## Declare data preprocessing steps 

In [14]:
# Fitting the Transformer API
scaler = preprocessing.StandardScaler().fit(X_train) # has the mean and std for the features in the training set

In [18]:
# Confirm that worked
X_train_scaled = scaler.transform(X_train)

print(X_train_scaled.mean(axis=0))
print(X_train_scaled.std(axis=0))

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [19]:
# Applying the transformer to test data
X_test_scaled = scaler.transform(X_test)

print(X_test_scaled.mean(axis=0))
print(X_test_scaled.std(axis=0))

[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]
[1.02160495 1.00135689 0.97456598 0.91099054 0.86716698 0.94193125
 1.03673213 1.03145119 0.95734849 0.83829505 1.0286218 ]


### In practice we use pipeline
`pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(n_estimators=100))`

In [20]:
pipeline = make_pipeline(preprocessing.StandardScaler(),
                        RandomForestRegressor(n_estimators=100))

## Declare hypterparameters to tune

In [21]:
pipeline.get_params()

{'memory': None,
 'steps': [('standardscaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('randomforestregressor',
   RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
              oob_score=False, random_state=None, verbose=0, warm_start=False))],
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_s

In [22]:
# Hyperparameters to tune
hyperparameters = {'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth' : [None, 5, 3, 1]}

In [23]:
# Sklearn cross-validation with pipeline
clf = GridSearchCV(pipeline, hyperparameters, cv=10)

# Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decr...ors=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [24]:
# Best set of parameters found using CV
clf.best_params_

{'randomforestregressor__max_depth': None,
 'randomforestregressor__max_features': 'sqrt'}

## Refit on the entire training set 

In [25]:
# Confirm that the model will be retrained
clf.refit

True

## Evaluate model pipeline on test data


In [26]:
# Predict a new set of data
y_pred = clf.predict(X_test)

In [43]:
# Evaluate our model performance
print(r2_score(y_test, y_pred))
acc_rf = mean_squared_error(y_test, y_pred)
print(mean_squared_error(y_test, y_pred))

0.47591161692596406
0.3381803125


## Save model for future use

In [28]:
# Save model to .pkl file
joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']

In [29]:
# Load model from .pkl file
clf2 = joblib.load('rf_regressor.pkl')

# Predict data set using loaded model
clf2.predict(X_test)

array([6.39, 5.75, 4.99, 5.51, 6.37, 5.68, 5.03, 4.78, 5.  , 6.05, 5.33,
       5.84, 5.81, 5.04, 5.73, 5.73, 6.54, 5.71, 5.83, 6.96, 5.5 , 5.6 ,
       5.02, 6.  , 5.98, 5.04, 5.39, 5.12, 5.97, 5.94, 5.91, 6.49, 5.95,
       5.05, 4.93, 5.97, 5.04, 5.92, 4.99, 5.81, 5.02, 5.84, 6.62, 5.12,
       6.26, 5.34, 5.56, 5.48, 5.09, 6.48, 6.01, 5.4 , 5.91, 5.14, 5.47,
       5.73, 5.33, 5.31, 5.01, 5.24, 5.27, 5.17, 5.03, 5.82, 6.06, 5.33,
       6.34, 5.05, 5.15, 6.68, 5.72, 5.82, 5.08, 5.03, 5.35, 5.98, 5.29,
       5.11, 5.11, 5.35, 6.39, 5.73, 6.21, 6.3 , 5.08, 6.  , 6.4 , 6.43,
       5.61, 5.8 , 5.95, 5.32, 6.4 , 5.71, 5.74, 5.72, 6.63, 6.78, 5.48,
       6.81, 5.08, 5.55, 5.17, 6.51, 5.1 , 4.73, 5.69, 4.92, 5.69, 5.9 ,
       5.86, 5.47, 6.06, 5.4 , 5.09, 5.31, 5.91, 5.08, 5.02, 6.02, 5.86,
       5.15, 5.77, 6.15, 5.35, 5.17, 5.41, 5.88, 5.33, 5.46, 5.8 , 6.23,
       5.1 , 5.35, 5.06, 6.47, 5.  , 5.19, 6.64, 5.35, 5.17, 5.1 , 5.5 ,
       6.12, 5.39, 5.43, 5.1 , 6.49, 5.84, 5.09, 5.

## Try Other model trees 

In [36]:
# SVR
from sklearn.svm import SVR

svr_reg = SVR()

parameters_svr = {'kernel': ['rbf', 'linear', 'poly'],
                 'gamma': [.1, 'auto'],
                 'verbose': [True, False]}

grid_svr = GridSearchCV(svr_reg, parameters_svr)
grid_svr.fit(X_train_scaled, y_train)

svr_reg = grid_svr.best_estimator_

svr_reg.fit(X_train_scaled, y_train)
pred_svr = svr_reg.predict(X_test_scaled)
acc_svr = mean_squared_error(pred_svr, y_test)
print()
print('Best parameters: ' + str(grid_svr.best_params_))
print('SVR Accurarcy: ' + str(acc_svr))



[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]
Best parameters: {'gamma': 0.1, 'kernel': 'rbf', 'verbose': True}
SVR Accurarcy: 0.38351564119378534




In [35]:
# Stocastic Gradient Descent 
from sklearn.linear_model import SGDRegressor

sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3)

sgd_reg.fit(X_train_scaled, y_train)

pred_sgd = sgd_reg.predict(X_test_scaled)

acc_sgd = mean_squared_error(pred_sgd, y_test)

print('SGD Accuracy: ' + str(acc_sgd))

SGD Accuracy: 0.4486093365318443


In [38]:
# Nearest Neighbour 
from sklearn.neighbors import KNeighborsRegressor

knr_reg = KNeighborsRegressor()

parameters_knr = {'weights' : ['uniform', 'distance'], 'n_neighbors' : [5, 10, 15, 20]}

grid_knr = GridSearchCV(knr_reg, parameters_knr)
grid_knr.fit(X_train_scaled, y_train)

knr_reg = grid_knr.best_estimator_

knr_reg.fit(X_train_scaled, y_train)
pred_knr = knr_reg.predict(X_test_scaled)
acc_knr = mean_squared_error(pred_knr, y_test)

print('Best parameters: ' + str(grid_knr.best_params_))
print('K-Neighbor Regressor Accuracy: ' + str(acc_knr))



Best parameters: {'n_neighbors': 15, 'weights': 'distance'}
K-Neighbor Regressor Accuracy: 0.35782883758456563


In [40]:
# Decision Tree
from sklearn.tree import DecisionTreeRegressor

dt_reg = DecisionTreeRegressor()

paramters_dt = {'max_depth': [2, 5, 7]}

grid_dt = GridSearchCV(dt_reg, paramters_dt)
grid_dt.fit(X_train_scaled, y_train)

dt_reg = grid_dt.best_estimator_

dt_reg.fit(X_train_scaled, y_train)
pred_dt = dt_reg.predict(X_test_scaled)

acc_dt = mean_squared_error(pred_dt, y_test)

print('Best parameters: ' + str(grid_dt.best_params_))
print('Decision Tree Accuracy: ' + str(acc_dt))

Best parameters: {'max_depth': 5}
Decision Tree Accuracy: 0.4741071387454959




In [42]:
# MLP Regressor
from sklearn.neural_network import MLPRegressor

mlp_reg = MLPRegressor()

parameters_mlp = {'hidden_layer_sizes': [100, 1000], 'solver': ['sgd', 'adam'], 'learning_rate': ['constant', 'invscaling', 'adaptive'],
                 'max_iter': [200, 500, 1000]}

grd_mlp = GridSearchCV(mlp_reg, parameters_mlp)
grd_mlp.fit(X_train_scaled, y_train)

mlp_reg = grd_mlp.best_estimator_

mlp_reg.fit(X_train_scaled, y_train)
pred_mlp = mlp_reg.predict(X_test_scaled)
acc_mlp = mean_squared_error(pred_mlp, y_test)

print('Best parameters: ' + str(grd_mlp.best_params_))
print('MLP Regressor Accuracy: ' + str(acc_mlp))







Best parameters: {'hidden_layer_sizes': 1000, 'learning_rate': 'constant', 'max_iter': 500, 'solver': 'adam'}
MLP Regressor Accuracy: 0.4720174228805882


In [74]:
# XGBoost regressor
import xgboost

xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=.09, gamma=0, subsample=0.75,
                          colsample_bytree=1, max_depth=25)

xgb.fit(X_train_scaled, y_train)

pred_xgb = xgb.predict(X_test_scaled)
acc_xgb = mean_squared_error(pred_xgb, y_test)

print('XGB Accuracy: ' + str(acc_xgb))

  if getattr(data, 'base', None) is not None and \


XGB Accuracy: 0.3121596561534517


In [75]:
# Results summary
accuracies = {
    'Model': ['Random Forest', 'Mutli Layer Perceptron', 'Decision Tree',
              'Support Vector Machine', 'K-Nearest Neighbor', 'Stocastic Gradient Descent',
             'XGBoost'],
    'Accuracy': [acc_rf, acc_mlp, acc_dt, acc_svr, acc_knr, acc_sgd, acc_xgb]
    
}

model_performance = pd.DataFrame(accuracies)

model_performance.sort_values(by='Accuracy', ascending=True)

Unnamed: 0,Model,Accuracy
6,XGBoost,0.31216
0,Random Forest,0.33818
4,K-Nearest Neighbor,0.357829
3,Support Vector Machine,0.383516
5,Stocastic Gradient Descent,0.448609
1,Mutli Layer Perceptron,0.472017
2,Decision Tree,0.474107
