In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

#model families
from sklearn.ensemble import RandomForestRegressor

#cross validation tools
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

#metrics evaluate model performance
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix

#persist model
#from sklearn.externals import joblib
import joblib

In [None]:
#read data from api
url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(url, sep=";")
data.head()

In [None]:
data.describe()

In [None]:
#seperate data 
y = data.quality
X = data.drop('quality', axis=1)


In [None]:
#set aside 20% of the data to test and evaluate model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)

In [None]:
#standaraization => process of subtracting the means from each feature and then dividing by the stdv

#scaling
X_train_scaled = preprocessing.scale(X_train)


In [None]:
#fit transformer on training set
scaler = preprocessing.StandardScaler().fit(X_train)

In [None]:
#declare pipeline -- training ds centered around 0 but test is not. needs tuning
#cross validation pipeline
pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(n_estimators=100))

In [None]:
hyperparameters = { 'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1] }

In [10]:
clf = GridSearchCV(pipeline, hyperparameters, cv=10)

#fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('randomforestregressor',
                                        RandomForestRegressor())]),
             param_grid={'randomforestregressor__max_depth': [None, 5, 3, 1],
                         'randomforestregressor__max_features': ['auto', 'sqrt',
                                                                 'log2']})

In [11]:
print(clf.best_params_)
print(clf.refit)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'sqrt'}
True


In [12]:
y_pred = clf.predict(X_test)


In [13]:
print( r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
y_train

0.4775703129729403
0.33711


691     5
1475    7
1065    6
1159    5
227     5
       ..
463     5
558     6
1094    6
792     6
381     6
Name: quality, Length: 1279, dtype: int64

In [None]:
# confusion_matrix(y_test, y_pred)

y_pred