IMPORT LIBRARIES AND MODULES

In [1]:
import sklearn

In [4]:
import numpy as np
import pandas as pd

In [5]:
# Import train_test_split() function from the model_selection module
# This module contains utilities for choosing between models
from sklearn.model_selection import train_test_split
# Import the preprocessing module. It contains utilities for scaling,
# transforming, and wrangling data.
from sklearn import preprocessing
# Import random forest model
from sklearn.ensemble import RandomForestRegressor

In [7]:
# Import cross-validation pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
# Import evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score

LOAD WINE DATA

In [10]:
# Read the red-wine quality dataset from the CSV in the working directory
data = pd.read_csv(filepath_or_buffer='winequality-red.csv')
# Preview the first five rows (five is also the default for head())
data.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [11]:
# Report the dataset dimensions as a (rows, columns) tuple
n_rows, n_cols = data.shape
print((n_rows, n_cols))

(1599, 12)


In [12]:
# The frame holds 1,599 samples across 12 columns (the inputs plus the target).
# Show count / mean / std / min / quartiles / max for the numeric columns.
summary_stats = data.describe()
print(summary_stats)

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000   
mean      0.087467            15.874922             46.467792     0.996747   
std       0.047065            10.460157             32.895324     0.001887   
min       0.012000             1.000000         

In [14]:
# Here's the list of all the features:
# quality (target)
# fixed acidity
# volatile acidity
# citric acid
# residual sugar
# chlorides
# free sulfur dioxide
# total sulfur dioxide
# density
# pH
# sulphates
# alcohol

SPLIT DATA INTO TRAINING AND TEST SETS

In [15]:
# Target vector (y): the wine quality score
y = data['quality']
# Input matrix (X): every column except the target
X = data.drop(columns=['quality'])

In [16]:
# Hold out 20% of the rows as a test set for the final model evaluation.
# random_state pins the shuffle so the split can be reproduced, and
# stratify=y keeps the distribution of the target similar in the training
# and test subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)

DECLARE DATA PREPROCESSING

In [19]:
# Set up the modelling pipeline: standardize the features, then fit a
# random forest regressor with 100 trees. Keeping the scaler inside the
# pipeline means it is fitted together with the model whenever the pipeline
# is fitted, rather than once on the full data set.
# random_state is fixed (same seed as the train/test split) so that the
# forest's bootstrap sampling and feature sub-sampling are reproducible;
# without it every run yields slightly different scores.
pipeline = make_pipeline(preprocessing.StandardScaler(),
                         RandomForestRegressor(n_estimators=100,
                                               random_state=123))

DECLARE HYPERPARAMETERS TO TUNE

In [20]:
# Declare the hyperparameter grid to tune through cross-validation.
# Keys follow the pipeline convention "<step name>__<parameter name>".
# max_features: 1.0 considers all features at every split, which is what the
# legacy 'auto' option meant for regressors. 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, so the explicit value is used instead
# (a float fraction is accepted by older releases as well).
# max_depth: None lets the trees grow until the leaves are pure.
hyperparameters = {
    'randomforestregressor__max_features': [1.0, 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [None, 5, 3, 1],
}

TUNE MODEL USING CROSS-VALIDATION PIPELINE

In [21]:
# Exhaustive grid search over the declared hyperparameters, scoring each
# combination with 10-fold cross-validation of the whole pipeline
clf = GridSearchCV(estimator=pipeline, param_grid=hyperparameters, cv=10)
# Fit on the training split only; the fitted search object is the cell output
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              ccp_alpha=0.0,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              max_samples=None,
                            

REFIT ON THE ENTIRE TRAINING SET

In [22]:
# Check the search object's `refit` setting; True means the best parameter
# combination found is retrained on the entire training set
will_refit = clf.refit
print(will_refit)

True


EVALUATING MODEL PIPELINE ON TEST DATA

In [24]:
# Predict the target for the held-out test rows with the tuned pipeline
pred = clf.predict(X_test)
# Report R^2 first, then the mean squared error, one value per line
for metric in (r2_score, mean_squared_error):
    print(metric(y_test, pred))

0.48471263393667907
0.33250124999999997


In [None]:
# The R2 score and mean squared error indicate that the model does not
# fit the data perfectly.