In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib



In [7]:
# Load the red-wine quality table; the file is semicolon-delimited.
df = pd.read_csv('winequality-red.csv', delimiter=';')

In [8]:
# Preview the first rows of the loaded frame to check the columns parsed correctly.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [9]:
# Report the frame dimensions as (rows, columns).
print(df.shape)

(1599, 12)


In [12]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [14]:
# Separate the prediction target (quality) from the input features.
y = df['quality']
X = df.drop(columns=['quality'])

In [24]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=123,
    stratify=y)

In [21]:
# Standardize the training features in one shot. The following cells inspect
# X_train_scaled, so this assignment must actually run: leaving it commented
# out makes them fail with NameError on a fresh kernel (Restart & Run All).
# The full scaled array is deliberately not printed; the mean/std checks in
# the next cells are the useful summary.
X_train_scaled = preprocessing.scale(X_train)

In [25]:
# Column-wise means of the scaled training features.
print(np.mean(X_train_scaled, axis=0))

[-4.47660973e-16  4.60360575e-17  1.14296419e-16 -5.39733088e-17
 -2.07162259e-16  1.11121518e-17  2.22243036e-17 -2.05400189e-14
  1.25408570e-15 -5.81006795e-16  4.80997428e-16]


In [26]:
# Column-wise standard deviations of the scaled training features.
print(np.std(X_train_scaled, axis=0))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [28]:
# Build a reusable scaler and fit it on the training features only, so the
# same fitted transform can be applied to other data later.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train);

In [30]:
# Apply the fitted scaler to the training features.
X_train_scaled = scaler.transform(X_train)

# Column-wise means of the transformed training features.
print(np.mean(X_train_scaled, axis=0))

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]


In [31]:

# NOTE(review): this cell repeats the previous cell verbatim (same transform
# of X_train, same printed means) -- candidate for removal, or for applying
# the fitted scaler to X_test instead. Confirm the intent.
X_train_scaled = scaler.transform(X_train)

print(X_train_scaled.mean(0))

[ 1.16664562e-16 -3.05550043e-17 -8.47206937e-17 -2.22218213e-17
  2.22218213e-17 -6.38877362e-17 -4.16659149e-18 -2.54439854e-15
 -8.70817622e-16 -4.08325966e-16 -1.17220107e-15]


In [32]:
# Column-wise standard deviations of the transformed training features.
print(np.std(X_train_scaled, axis=0))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [33]:
# Modeling pipeline: standardize the features, then fit a 100-tree random
# forest regressor. Scaling lives inside the pipeline so that it is re-fit on
# each cross-validation training fold.
# The forest is seeded (same seed as the train/test split) so that the grid
# search results and the reported metrics are reproducible across runs.
pipeline = make_pipeline(preprocessing.StandardScaler(),
                         RandomForestRegressor(n_estimators=100,
                                               random_state=123))

In [35]:
# Grid of random-forest settings to search. Keys use the pipeline's
# '<step name>__<parameter>' convention.
# max_features=None means "use all features", which is what 'auto' meant for
# regressors; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, so
# None keeps the same candidate while working on old and new versions alike.
hyperparameters = {
    'randomforestregressor__max_features': [None, 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [None, 5, 3, 1],
}

In [37]:
# Exhaustive search over the hyperparameter grid with 10-fold cross-validation.
clf = GridSearchCV(estimator=pipeline, param_grid=hyperparameters, cv=10)

# Run the search on the training data; the fitted search object is the
# cell's displayed result.
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split

In [38]:
# Show the hyperparameter combination selected by the grid search.
print (clf.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'log2'}


In [39]:
# Show the search's refit setting. Per the scikit-learn docs, True means the
# best estimator was re-trained on the full training set after the search.
print (clf.refit)

True


In [40]:
# Predict quality for the held-out test features using the tuned search object.
y_pred = clf.predict(X_test)

In [41]:
# Coefficient of determination (R^2) of the predictions on the test set.
print(r2_score(y_true=y_test, y_pred=y_pred))

0.47587577940553316


In [42]:
# Mean squared error of the predictions on the test set.
print(mean_squared_error(y_true=y_test, y_pred=y_pred))

0.3382034375


In [43]:
# Persist the fitted search object to disk for later reuse.
joblib.dump(value=clf, filename='rf_regressor.pkl')

['rf_regressor.pkl']

In [44]:
# Restore the persisted model. joblib files are pickle-based, so only load
# files you created yourself or otherwise trust.
clf2 = joblib.load(filename='rf_regressor.pkl')

# Run the restored model on the test features; the predictions are the
# cell's displayed result.
clf2.predict(X_test)

array([6.64, 5.81, 5.03, 5.45, 6.28, 5.51, 5.08, 4.77, 5.01, 6.05, 5.3 ,
       5.73, 5.67, 5.04, 5.72, 5.69, 6.61, 5.8 , 5.72, 6.95, 5.47, 5.66,
       5.02, 6.11, 5.92, 5.04, 5.49, 5.22, 5.96, 5.94, 5.87, 6.59, 5.97,
       5.01, 4.98, 5.79, 5.09, 6.02, 5.1 , 6.13, 4.88, 5.97, 6.55, 5.1 ,
       6.25, 5.42, 5.61, 5.58, 5.15, 6.28, 6.14, 5.38, 5.89, 5.12, 5.6 ,
       5.82, 5.35, 5.29, 4.95, 5.2 , 5.24, 5.16, 5.08, 5.91, 5.95, 5.24,
       6.31, 5.03, 5.19, 6.76, 5.75, 5.83, 5.09, 5.  , 5.33, 5.99, 5.2 ,
       5.08, 5.17, 5.34, 6.32, 5.74, 6.09, 6.37, 5.06, 6.06, 6.45, 6.36,
       5.77, 5.76, 5.92, 5.36, 6.36, 5.71, 5.78, 5.72, 6.69, 6.77, 5.58,
       6.8 , 5.11, 5.37, 5.22, 6.5 , 5.  , 4.75, 5.73, 5.02, 5.6 , 6.  ,
       5.83, 5.52, 6.1 , 5.55, 5.31, 5.24, 5.99, 5.09, 4.99, 5.89, 5.84,
       5.08, 5.8 , 6.07, 5.34, 5.35, 5.33, 5.96, 5.49, 5.41, 5.76, 6.13,
       5.14, 5.3 , 5.04, 6.49, 5.04, 5.16, 6.56, 5.4 , 5.25, 5.11, 5.64,
       6.04, 5.22, 5.38, 5.1 , 6.59, 5.72, 5.04, 5.