# Regression

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import sklearn

# To create test/train splits
from sklearn.model_selection import train_test_split
# To help us go through different parameter configurations for
# each type of model.
from sklearn.model_selection import GridSearchCV
# To help us evaluate the model on each trial or "split"
from sklearn.model_selection import cross_val_score

# Naive Bayes
from sklearn.neighbors import KNeighborsRegressor
# Neural Networks
from sklearn.neural_network import MLPRegressor

# Confusion matrix
from sklearn.metrics import confusion_matrix

In [13]:
# Import our regression evaluation metrics
from sklearn.metrics import explained_variance_score, r2_score

In [14]:
# This is to stop the barrage of warning messages we'll get later
import warnings; warnings.simplefilter('ignore')

In [15]:
# Create a seed to use later.
seed = 42

In [16]:
# First import your dataset as usual
dataset = "datasets/winequality_red.csv"

df = pd.read_csv(dataset, delimiter=";")

df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [17]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [18]:
# Let's see how big our dataset is
n_samples, n_columns = df.shape
print("Number of samples: {}".format(n_samples))

Number of samples: 1599


In [19]:
# Step 1: Create visualization

# For this dataset, we can't get it to plot easily, we'll try this later

In [20]:
# Step 2: Separate features and labels
df_X = df.drop("quality", axis=1)
df_X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [21]:
df_y = df["quality"]
df_y.head()

0    5
1    5
2    5
3    6
4    5
Name: quality, dtype: int64

In [22]:
# Step 3: Split into test, train
# We want 70% train, 30% test
train_X, test_X, train_y, test_y = train_test_split(df_X, df_y,
                                                    test_size=0.3,
                                                    random_state=seed)

## Step 4: Train Estimators

To demonstrate how to do crossfold-validation (CV), we'll use two algorithms:
- [Neural Network](http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html)
- [KNN](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html)


Here's what we'll be doing:
- Defining a "grid" of possibilities for the models' parameters
- Training on "splits" of our data
- Keeping the best model of each type (KNN, NN) that performed best on our dataset
- Showing the accuracies and confusion matrices of those best-performing models

__JUPYTER PRO-TIP__: You can time the run-time of individual cells in Jupyter by putting `%%time` at the start of any cell.

In [23]:
%%time
# We're going to build nn_params, a list of dictionaries
# Each dictionary has the variable name to try modifying, and
# which values to try for it.
# All combinations of variables in the dictionary below will be tried

nn_params = [
    {
        "hidden_layer_sizes": [
            (20,), (30,), # models with 1 hidden layer
        ],
        "max_iter": [40, 100],
        "activation": ["logistic", "tanh", "relu"],
        "learning_rate_init": [1e-4, 1e-3],
    },
]

# nn_tester will test the permutations of parameters in nn_params
# using 5 trials (splitting train set into 5, training on 4 and testing
# on the 5th).
nn_model = MLPRegressor(random_state=seed)
nn_experimenter = GridSearchCV(nn_model, nn_params, cv=5)
nn_experimenter.fit(train_X, train_y)

print("Best parameter set found: ")
print(nn_experimenter.best_params_)

Best parameter set found: 
{'activation': 'tanh', 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.001, 'max_iter': 100}
CPU times: user 16.8 s, sys: 2.61 s, total: 19.4 s
Wall time: 18.6 s


## Step 5: Evaluation for Regression

Evaluating regression problems are slightly different, we use two different metrics for this:
- [explained_variance_score]() -- best possible score is `1.0`, lower values are worse
- [r2_score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.explained_variance_score.html#sklearn.metrics.explained_variance_score) -- best possible score is `1.0`, lower values are worse

We will use just `r2_score()` below for now but `explained_variance_score()` works similarly.

In [27]:
train_preds = nn_experimenter.predict(train_X)
test_preds = nn_experimenter.predict(test_X)

train_r2 = r2_score(train_preds, train_y)
test_r2 = r2_score(test_preds, test_y)

print("Train R2 score: {:.1f}%".format(train_r2))
print("Test R2 score {:.1f}%".format(test_r2))

Train R2 score: -4.3%
Test R2 score -4.3%


In [28]:
%%time

# Now we can do the same thing with KNN

# Notice here how we decide we only want one dictionary of params
# to try all permutations of.
knn_params = [
    {
        "n_neighbors": [2, 3, 4, 5, 8],
        "p": [1, 2],
        "algorithm": ["ball_tree", "kd_tree"]
    },
]

knn_model = KNeighborsRegressor()
knn_exp = GridSearchCV(knn_model, knn_params, cv=5)

knn_exp.fit(train_X, train_y)

print("Best parameter set found: ")
print(knn_exp.best_params_)

Best parameter set found: 
{'algorithm': 'ball_tree', 'n_neighbors': 8, 'p': 1}
CPU times: user 790 ms, sys: 7.25 ms, total: 797 ms
Wall time: 803 ms


In [29]:
train_preds = knn_exp.predict(train_X)
test_preds = knn_exp.predict(test_X)

train_r2 = r2_score(train_preds, train_y)
test_r2 = r2_score(test_preds, test_y)

print("Train R2 score: {:.1f}%".format(train_r2))
print("Test R2 score {:.1f}%".format(test_r2))

Train R2 score: -1.2%
Test R2 score -2.1%
