In [51]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

#model families
from sklearn.ensemble import RandomForestRegressor

#cross validation tools
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

#metrics evaluate model performance
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix

#persist model
#from sklearn.externals import joblib
import joblib

In [None]:
#read data from api
url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(url, sep=";")
data.head()

In [None]:
data.describe()

In [14]:
#seperate data 
y = data.quality
X = data.drop('quality', axis=1)


In [15]:
#set aside 20% of the data to test and evaluate model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)

In [21]:
#standaraization => process of subtracting the means from each feature and then dividing by the stdv

#scaling
X_train_scaled = preprocessing.scale(X_train)


In [22]:
#fit transformer on training set
scaler = preprocessing.StandardScaler().fit(X_train)

In [23]:
#declare pipeline -- training ds centered around 0 but test is not. needs tuning
#cross validation pipeline
pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(n_estimators=100))

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestregressor', RandomForestRegressor())])

In [31]:
hyperparameters = { 'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1] }

In [32]:
clf = GridSearchCV(pipeline, hyperparameters, cv=10)

#fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('randomforestregressor',
                                        RandomForestRegressor())]),
             param_grid={'randomforestregressor__max_depth': [None, 5, 3, 1],
                         'randomforestregressor__max_features': ['auto', 'sqrt',
                                                                 'log2']})

In [42]:
print(clf.best_params_)
print(clf.refit)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'log2'}
True


In [46]:
y_pred = clf.predict(X_test)


array([6.52, 5.71, 4.97, 5.49, 6.33, 5.46, 4.91, 4.76, 5.03, 5.98, 5.35,
       5.71, 5.92, 5.08, 5.73, 5.62, 6.61, 5.81, 5.75, 6.97, 5.45, 5.56,
       5.07, 6.02, 5.87, 5.06, 5.49, 5.04, 5.9 , 5.96, 5.87, 6.54, 5.98,
       5.01, 5.01, 5.88, 5.05, 6.1 , 4.97, 6.05, 4.97, 5.64, 6.68, 5.03,
       6.24, 5.39, 5.47, 5.57, 5.11, 6.38, 5.94, 5.3 , 5.8 , 5.14, 5.55,
       5.7 , 5.28, 5.28, 5.01, 5.25, 5.28, 5.14, 5.  , 5.8 , 6.04, 5.17,
       6.43, 5.06, 5.17, 6.69, 5.78, 5.78, 5.09, 5.  , 5.4 , 5.97, 5.25,
       5.03, 5.31, 5.27, 6.31, 5.59, 6.28, 6.36, 5.11, 6.18, 6.38, 6.34,
       5.68, 5.8 , 5.84, 5.32, 6.43, 5.74, 5.7 , 5.75, 6.75, 6.67, 5.57,
       6.81, 5.14, 5.48, 5.09, 6.37, 5.02, 4.72, 5.72, 5.05, 5.61, 5.89,
       5.92, 5.46, 6.04, 5.4 , 5.02, 5.25, 5.9 , 5.01, 4.9 , 5.96, 5.84,
       5.05, 5.72, 6.14, 5.27, 5.24, 5.3 , 5.99, 5.54, 5.43, 5.8 , 6.1 ,
       5.21, 5.38, 5.08, 6.42, 5.02, 5.15, 6.64, 5.52, 5.28, 5.09, 5.43,
       6.13, 5.37, 5.47, 5.06, 6.54, 5.89, 5.13, 5.

In [56]:
print( r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
y_train

0.46754694594103763
0.3435778125


691     5
1475    7
1065    6
1159    5
227     5
       ..
463     5
558     6
1094    6
792     6
381     6
Name: quality, Length: 1279, dtype: int64

In [57]:
# confusion_matrix(y_test, y_pred)

y_pred

array([6.52, 5.71, 4.97, 5.49, 6.33, 5.46, 4.91, 4.76, 5.03, 5.98, 5.35,
       5.71, 5.92, 5.08, 5.73, 5.62, 6.61, 5.81, 5.75, 6.97, 5.45, 5.56,
       5.07, 6.02, 5.87, 5.06, 5.49, 5.04, 5.9 , 5.96, 5.87, 6.54, 5.98,
       5.01, 5.01, 5.88, 5.05, 6.1 , 4.97, 6.05, 4.97, 5.64, 6.68, 5.03,
       6.24, 5.39, 5.47, 5.57, 5.11, 6.38, 5.94, 5.3 , 5.8 , 5.14, 5.55,
       5.7 , 5.28, 5.28, 5.01, 5.25, 5.28, 5.14, 5.  , 5.8 , 6.04, 5.17,
       6.43, 5.06, 5.17, 6.69, 5.78, 5.78, 5.09, 5.  , 5.4 , 5.97, 5.25,
       5.03, 5.31, 5.27, 6.31, 5.59, 6.28, 6.36, 5.11, 6.18, 6.38, 6.34,
       5.68, 5.8 , 5.84, 5.32, 6.43, 5.74, 5.7 , 5.75, 6.75, 6.67, 5.57,
       6.81, 5.14, 5.48, 5.09, 6.37, 5.02, 4.72, 5.72, 5.05, 5.61, 5.89,
       5.92, 5.46, 6.04, 5.4 , 5.02, 5.25, 5.9 , 5.01, 4.9 , 5.96, 5.84,
       5.05, 5.72, 6.14, 5.27, 5.24, 5.3 , 5.99, 5.54, 5.43, 5.8 , 6.1 ,
       5.21, 5.38, 5.08, 6.42, 5.02, 5.15, 6.64, 5.52, 5.28, 5.09, 5.43,
       6.13, 5.37, 5.47, 5.06, 6.54, 5.89, 5.13, 5.