### Step 1. Import libraries and modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing                       
from sklearn.ensemble import RandomForestRegressor     
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib     # modified from 'from sklearn.externals import joblib'

### Step 2. Load red wine data

In [2]:
data = pd.read_csv('winequality-red.csv', sep=';')       # separating by ';'
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
print(data.shape)
data.describe()

(1599, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


### Step 3. Split data into training and test sets

In [4]:
# Separate our target (y) features from our input (X) features
y = data.quality
X = data.drop('quality', axis=1)

In [5]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=123, stratify=y)


### Step 4. Data preprocessing

In [6]:
# Pipeline with preprocessing (standardization) and model
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                        RandomForestRegressor(n_estimators=100))

### Step 5. Declare hyperparameters to tune

In [7]:
# List tunable hyperparameters
print(pipeline.get_params())

{'memory': None, 'steps': [('standardscaler', StandardScaler()), ('randomforestregressor', RandomForestRegressor())], 'verbose': False, 'standardscaler': StandardScaler(), 'randomforestregressor': RandomForestRegressor(), 'standardscaler__copy': True, 'standardscaler__with_mean': True, 'standardscaler__with_std': True, 'randomforestregressor__bootstrap': True, 'randomforestregressor__ccp_alpha': 0.0, 'randomforestregressor__criterion': 'squared_error', 'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__max_leaf_nodes': None, 'randomforestregressor__max_samples': None, 'randomforestregressor__min_impurity_decrease': 0.0, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__min_weight_fraction_leaf': 0.0, 'randomforestregressor__n_estimators': 100, 'randomforestregressor__n_jobs': None, 'randomforestregressor__oob_score': False, 'randomforestregressor__random_stat

In [8]:
# Declare hyperparameters to tune
hyperparameters = {'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'], 
                    'randomforestregressor__max_depth': [None, 5, 3, 1]}

### Step 6. Tune model using a cross-validation pipeline

In [9]:
# Sklearn cross-validation with pipeline
# cross-validation across the entire "grid" of hyperparameters
clf = GridSearchCV(pipeline, hyperparameters, cv=10)    
 
# Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('randomforestregressor',
                                        RandomForestRegressor())]),
             param_grid={'randomforestregressor__max_depth': [None, 5, 3, 1],
                         'randomforestregressor__max_features': ['auto', 'sqrt',
                                                                 'log2']})

In [10]:
# Best set of parameters found using CV
print(clf.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'sqrt'}


### Step 7. Refit on the entire training set

In [11]:
# Confirm model was retrained after tuning hyperparameters
print(clf.refit)

True


### Step 8. Evaluate model pipeline on test data

In [12]:
# Predict a new set of data
y_pred = clf.predict(X_test)

In [13]:
# Evaluate model performace
print(r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))

0.46976935649857743
0.34214375


### (Option) Save model for future use

In [14]:
# Save model to a .pkl file
joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']

In [15]:
# Load model from .pkl file
clf2 = joblib.load('rf_regressor.pkl')
 
# Predict data set using loaded model
clf2.predict(X_test)

array([6.42, 5.73, 4.94, 5.41, 6.35, 5.52, 4.92, 4.79, 5.05, 6.08, 5.3 ,
       5.71, 5.74, 5.03, 5.81, 5.66, 6.67, 5.71, 5.78, 6.99, 5.52, 5.6 ,
       5.04, 5.96, 5.95, 5.01, 5.49, 5.14, 6.01, 5.97, 5.85, 6.46, 6.  ,
       5.09, 5.  , 6.03, 5.  , 6.04, 5.13, 5.88, 4.92, 5.85, 6.69, 5.14,
       6.17, 5.34, 5.6 , 5.59, 5.05, 6.51, 5.96, 5.31, 5.75, 5.12, 5.66,
       5.75, 5.34, 5.48, 5.  , 5.18, 5.23, 5.12, 5.05, 5.84, 6.08, 5.35,
       6.32, 5.02, 5.19, 6.67, 5.79, 5.74, 5.01, 4.97, 5.44, 5.92, 5.24,
       5.14, 5.19, 5.29, 6.31, 5.54, 6.23, 6.29, 5.03, 5.99, 6.26, 6.45,
       5.91, 5.86, 5.87, 5.53, 6.47, 5.71, 5.75, 5.7 , 6.61, 6.82, 5.59,
       6.84, 5.09, 5.48, 5.15, 6.48, 5.08, 4.78, 5.71, 5.04, 5.56, 5.99,
       5.96, 5.35, 6.05, 5.41, 5.09, 5.18, 6.05, 5.03, 4.81, 6.04, 5.87,
       5.07, 5.73, 6.08, 5.29, 5.37, 5.37, 6.09, 5.47, 5.36, 5.86, 6.3 ,
       5.12, 5.28, 5.01, 6.39, 5.03, 5.08, 6.69, 5.59, 5.34, 5.04, 5.65,
       6.06, 5.28, 5.36, 5.12, 6.46, 5.82, 5.06, 5.