In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score



In [2]:
# Load the red-wine quality dataset with a single download.
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
# The file is semicolon-delimited, so the delimiter must be passed explicitly;
# with the default ',' every row would be parsed into one wide column.
data = pd.read_csv(dataset_url, sep=';')
# Expected data.shape: (1599, 12) -- 1599 examples; the 12 columns include the
# `quality` target that is split off from the input features later.
# All columns are numeric => no one-hot encoding needed.


In [7]:
# Separate the prediction target from the input features.
X = data.drop(labels=['quality'], axis=1)  # input matrix: every column except the target
y = data['quality']  # target vector: the quality column

In [8]:
# Hold out 20% of the rows as a test set. The split is stratified on y and uses
# a fixed seed so that re-running the cell yields the same partition.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    stratify=y,
)

In [None]:
# Preprocessing with the Transformer API: learn the scaling statistics on the
# training split only, then apply that same transformation to both splits.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)  # stores the per-feature mean and standard deviation of X_train
X_train_scaled = scaler.transform(X_train)  # standardize X_train with its own statistics
X_test_scaled = scaler.transform(X_test)  # standardize X_test, still using the X_train statistics



In [None]:
# Build the modelling pipeline: standardize the features with StandardScaler,
# then fit a random forest regressor on the scaled data.
# Fix: the keyword is `n_estimators` (plural); `n_estimator` raises a TypeError.
# The forest is seeded with the same value as the train/test split so that
# repeated runs of the notebook produce the same model.
pipeline = make_pipeline(
    preprocessing.StandardScaler(),
    RandomForestRegressor(n_estimators=100, random_state=123),
)


In [None]:
# Hyperparameter grid for the search. Keys follow the pipeline convention
# '<step name>__<parameter>', where make_pipeline names each step after its
# lowercased class name.
# `max_features='auto'` is no longer accepted by current scikit-learn releases;
# 1.0 (use all features) is what 'auto' meant for a regressor and is valid on
# both old and new versions.
hyperparameters = {
    'randomforestregressor__max_features': [1.0, 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [None, 5, 3, 1],  # None = no depth limit
}

In [None]:
# Cross-validated grid search: evaluate every combination of the values in
# `hyperparameters` with 10-fold cross-validation on the training split.
clf = GridSearchCV(
    estimator=pipeline,
    param_grid=hyperparameters,
    cv=10,
)
clf.fit(X_train, y_train)  # runs the search: fits and tunes the pipeline

In [None]:
# Predict wine quality for the held-out test rows using the fitted grid search
y_pred = clf.predict(X=X_test)

In [None]:
# Report test-set performance: R^2 first, then mean squared error.
# The function-call form of print is required on Python 3 and, with a single
# argument, prints the same text on Python 2.
print(r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))