In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default styling to any matplotlib plots drawn later
sns.set()

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, confusion_matrix, precision_score, recall_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [36]:
# Load the two CSV files used in this notebook.
# Both paths are relative to the current working directory.
TTT_CSV, CANDY_CSV = './tic-tac-toe.csv', './candy-data.csv'
ttt = pd.read_csv(TTT_CSV)
candy = pd.read_csv(CANDY_CSV)

# Creating train, test, and validation datasets

## Create one holdout set

In [4]:
# Column 9 of `ttt` is the target; the first nine columns are one-hot
# encoded to form the feature matrix.
y = ttt.iloc[:, 9]
X = pd.get_dummies(ttt.iloc[:, :9])

# Single holdout: reserve 10% of the rows for testing and keep 90% for
# training. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1111
)

## Create two holdout sets

In [5]:
# Step 1: hold out 20% of the rows as the final test set. The remaining 80%
# is a temporary pool that is split once more below.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1111
)

# Step 2: carve the validation set out of the temporary pool. 25% of the 80%
# pool is 20% of all rows, so the result is roughly 60/20/20 train/val/test.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1111
)

# Accuracy metrics: regression models

## Mean absolute error

In [11]:
# Two aligned arrays with one entry per NBA team for 2017:
#   y_test      -> true number of wins
#   predictions -> predicted number of wins
y_test = np.array([
    53, 51, 51, 49, 43, 42, 42, 41, 41, 37, 36, 31, 29, 28, 20,
    67, 61, 55, 51, 51, 47, 43, 41, 40, 34, 33, 32, 31, 26, 24,
])
predictions = np.array([
    60, 62, 42, 42, 30, 50, 52, 42, 44, 35, 30, 30, 35, 40, 15,
    72, 58, 60, 40, 42, 45, 46, 40, 35, 25, 40, 20, 34, 25, 24,
])

In [12]:
# MAE by hand: the mean of the absolute differences between the true and
# predicted values.
n = len(predictions)
abs_errors = np.abs(y_test - predictions)
mae_one = abs_errors.sum() / n
print(f'Manual MAE: {mae_one: .2f}')

# Same metric via scikit-learn (imported as `mae`); the two should agree.
mae_two = mae(y_test, predictions)
print(f'Sklearn MAE: {mae_two: .2f}')

Manual MAE:  5.90
Sklearn MAE:  5.90


## Mean squared error

In [13]:
# MSE by hand. `n` is the sample count computed in the MAE cell; each squared
# error is divided by n before the terms are summed.
squared_errors = (y_test - predictions) ** 2
mse_one = np.sum(squared_errors / n)
print(f'Manual MSE: {mse_one: .2f}')

# Same metric via scikit-learn (imported as `mse`); the two should agree.
mse_two = mse(y_test, predictions)
print(f'Sklearn MSE: {mse_two: .2f}')

Manual MSE:  49.10
Sklearn MSE:  49.10


## Performance on data subsets

In [14]:
# Conference label per team: 15 'E' entries followed by 15 'W' entries,
# stored as one-character unicode strings.
labels = np.array(['E'] * 15 + ['W'] * 15, dtype='<U1')

In [18]:
# Boolean masks selecting the entries labelled 'E' and 'W'
east_teams = labels == 'E'
west_teams = labels == 'W'

# Slice the true and predicted values with each mask
true_east, preds_east = y_test[east_teams], predictions[east_teams]
true_west, preds_west = y_test[west_teams], predictions[west_teams]

# Report the MAE separately for each subset
mae_east = mae(true_east, preds_east)
mae_west = mae(true_west, preds_west)
print(f'MAE for Eastern conference: {mae_east: .2f}')
print(f'MAE for Western conference: {mae_west: .2f}')

MAE for Eastern conference:  6.73
MAE for Western conference:  5.07


# Classification metrics

## Confusion matrices

In [19]:
# Confusion matrix given for this exercise:
#            Pred: 0   Pred: 1
# Actual: 0  324(TN)   15(FP)
# Actual: 1  123(FN)   491(TP)
tn, fp = 324, 15
fn, tp = 123, 491
total = tn + fp + fn + tp  # 953 observations in all

# Accuracy: share of all observations that were predicted correctly
accuracy = (tn + tp) / total
print(f"The overall accuracy is {accuracy: 0.2f}")

# Precision: share of predicted positives that are actual positives
precision = tp / (tp + fp)
print(f"The precision is {precision: 0.2f}")

# Recall: share of actual positives that were predicted positive
recall = tp / (tp + fn)
print(f"The recall is {recall: 0.2f}")

The overall accuracy is  0.86
The precision is  0.97
The recall is  0.80


## Confusion matrices, again

In [31]:
# One-hot encode the nine feature columns and map the text label to 0/1
# ('positive' -> 1, 'negative' -> 0).
X = pd.get_dummies(ttt.iloc[:, :9])
label_codes = {'positive': 1, 'negative': 0}
y = ttt.iloc[:, 9].map(label_codes)

# NOTE(review): test_size=0.9 leaves only 10% of the rows for training —
# confirm this is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.9, random_state=1111
)

rfc = RandomForestClassifier(n_estimators=500, random_state=1111)

# Kept as the last expression so the fitted estimator's repr is displayed
rfc.fit(X_train, y_train)

RandomForestClassifier(n_estimators=500, random_state=1111)

In [32]:
# Predict labels for the held-out rows
y_pred = rfc.predict(X_test)

# Build the confusion matrix (rows: actual class, columns: predicted class),
# so the entry at [1, 1] counts the true positives.
cm = confusion_matrix(y_test, y_pred)
true_positives = cm[1, 1]
print(f'Confusion matrix:\n{cm}')
print(f'The number of true positives is {true_positives}')

Confusion matrix:
[[177 123]
 [ 92 471]]
The number of true positives is 471


## Precision vs. recall

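Suppose some of our players are sore losers who can't stand losing when they were certain they would win. A predicted win that turns out to be a loss (a false positive) is therefore the costly mistake, so our model needs to be as *precise* as possible.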

In [35]:
# Precision of the classifier's test-set predictions, printed as a percentage
score = precision_score(y_true=y_test, y_pred=y_pred)
print(f'The precision value is {score: .1%}')

The precision value is  79.3%


# The bias-variance tradeoff

In [37]:
# Preview the first rows of the candy dataset
candy.head()

Unnamed: 0,competitorname,chocolate,fruity,caramel,peanutyalmondy,nougat,crispedricewafer,hard,bar,pluribus,sugarpercent,pricepercent,winpercent
0,100 Grand,1,0,1,0,0,1,0,1,0,0.732,0.86,66.971725
1,3 Musketeers,1,0,0,0,1,0,0,1,0,0.604,0.511,67.602936
2,One dime,0,0,0,0,0,0,0,0,0,0.011,0.116,32.261086
3,One quarter,0,0,0,0,0,0,0,0,0,0.011,0.511,46.116505
4,Air Heads,0,1,0,0,0,0,0,0,0,0.906,0.511,52.341465


## Error due to under/over-fitting

In [41]:
# Target is the win percentage; features are every column except the target
# and the candy's name.
y = candy['winpercent']
X = candy.drop(columns=['competitorname', 'winpercent'])

# 80/20 train/test split with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1111
)

In [44]:
# Random forest regressor with 25 trees and max_features=2
rfr = RandomForestRegressor(n_estimators=25, max_features=2, random_state=1111)
rfr.fit(X_train, y_train)

# Compare the error on the data the model was fit on with the error on the
# held-out data
train_mae = mae(y_train, rfr.predict(X_train))
test_mae = mae(y_test, rfr.predict(X_test))
print(f'Train MAE: {train_mae: .2f}')
print(f'Test MAE: {test_mae: .2f}')

Train MAE:  3.88
Test MAE:  9.15


In [45]:
# Refit the regressor with max_features raised to 11
rfr = RandomForestRegressor(n_estimators=25, max_features=11, random_state=1111)
rfr.fit(X_train, y_train)

# Compare the error on the data the model was fit on with the error on the
# held-out data
train_mae = mae(y_train, rfr.predict(X_train))
test_mae = mae(y_test, rfr.predict(X_test))
print(f'Train MAE: {train_mae: .2f}')
print(f'Test MAE: {test_mae: .2f}')

Train MAE:  3.57
Test MAE:  10.05


In [46]:
# Refit the regressor with max_features set to 4
rfr = RandomForestRegressor(n_estimators=25, max_features=4, random_state=1111)
rfr.fit(X_train, y_train)

# Compare the error on the data the model was fit on with the error on the
# held-out data
train_mae = mae(y_train, rfr.predict(X_train))
test_mae = mae(y_test, rfr.predict(X_test))
print(f'Train MAE: {train_mae: .2f}')
print(f'Test MAE: {test_mae: .2f}')

Train MAE:  3.60
Test MAE:  8.79


## Am I underfitting?

In [49]:
# Back to the tic-tac-toe data: column 9 is the (unmapped) target and the
# first nine columns are one-hot encoded as features.
y = ttt.iloc[:, 9]
X = pd.get_dummies(ttt.iloc[:, :9])

# 80/20 train/test split with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1111
)

In [52]:
# Fit forests of increasing size and record the train and test accuracy of
# each one, rounded to two decimals.
train_scores = []
test_scores = []
tree_counts = [1, 2, 3, 4, 5, 10, 20, 50]
for n_trees in tree_counts:
    rfc = RandomForestClassifier(n_estimators=n_trees, random_state=1111)
    rfc.fit(X_train, y_train)

    # Predictions on the data the model saw and on the held-out data
    train_pred = rfc.predict(X_train)
    test_pred = rfc.predict(X_test)

    train_scores.append(round(accuracy_score(y_train, train_pred), 2))
    test_scores.append(round(accuracy_score(y_test, test_pred), 2))

print(f'Train scores:\n{train_scores}\n')
print(f'Test scores:\n{test_scores}\n')

Train scores:
[0.94, 0.93, 0.98, 0.97, 0.99, 1.0, 1.0, 1.0]

Test scores:
[0.83, 0.79, 0.89, 0.91, 0.91, 0.93, 0.97, 0.98]

