In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error as mae

In [14]:
# Read both course datasets once, up front; paths are relative to the working directory.
candy_data, ttt = (
    pd.read_csv(csv_path)
    for csv_path in ('./candy-data.csv', './tic-tac-toe.csv')
)

Starting with this course, I am changing how I format my markdown cells: I load the data first and then follow the outline of the DataCamp course.

# Introduction to Model Validation

## Seen vs. unseen data

In [3]:
# Quick look at the candy data: print its (rows, columns) shape, then show the
# first rows as the cell's displayed output (head() must stay the last expression).
print(candy_data.shape)
candy_data.head()

(85, 13)


Unnamed: 0,competitorname,chocolate,fruity,caramel,peanutyalmondy,nougat,crispedricewafer,hard,bar,pluribus,sugarpercent,pricepercent,winpercent
0,100 Grand,1,0,1,0,0,1,0,1,0,0.732,0.86,66.971725
1,3 Musketeers,1,0,0,0,1,0,0,1,0,0.604,0.511,67.602936
2,One dime,0,0,0,0,0,0,0,0,0,0.011,0.116,32.261086
3,One quarter,0,0,0,0,0,0,0,0,0,0.011,0.511,46.116505
4,Air Heads,0,1,0,0,0,0,0,0,0,0.906,0.511,52.341465


In [5]:
# Summarize every column (non-null count and dtype) plus the frame's memory usage;
# used here to confirm there are no missing values before modelling.
candy_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   competitorname    85 non-null     object 
 1   chocolate         85 non-null     int64  
 2   fruity            85 non-null     int64  
 3   caramel           85 non-null     int64  
 4   peanutyalmondy    85 non-null     int64  
 5   nougat            85 non-null     int64  
 6   crispedricewafer  85 non-null     int64  
 7   hard              85 non-null     int64  
 8   bar               85 non-null     int64  
 9   pluribus          85 non-null     int64  
 10  sugarpercent      85 non-null     float64
 11  pricepercent      85 non-null     float64
 12  winpercent        85 non-null     float64
dtypes: float64(3), int64(9), object(1)
memory usage: 8.8+ KB


In [6]:
# Target: each candy's win percentage.
y = candy_data['winpercent']

# Features: every other column except the candy's name (an object-dtype label column).
X = candy_data.drop(columns=['competitorname', 'winpercent'])

# An integer test_size is an absolute row count: exactly 35 rows are held out.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=35, random_state=42
)

In [7]:
# Random forest regressor with 50 trees and a fixed seed for reproducibility
model = RandomForestRegressor(random_state=1111, n_estimators=50)

# Train on the seen (training) rows only
model.fit(X_train, y_train)

# Predict for both partitions, then score each with mean absolute error
train_pred, test_pred = model.predict(X_train), model.predict(X_test)
train_error, test_error = mae(y_train, train_pred), mae(y_test, test_pred)

# Report seen vs. unseen error side by side
print(f'Model error on seen data: {train_error: .2f}')
print(f'Model error on unseen data: {test_error: .2f}')

Model error on seen data:  3.39
Model error on unseen data:  10.45


When models perform differently on training and testing data, you should look to model validation to ensure you have the best performing model.

# Regression models

## Set parameters and fit a model

In [10]:
# Start from a default-configured random forest regressor.
rfr = RandomForestRegressor()

# Update hyperparameters after construction through scikit-learn's set_params API.
# Unlike plain attribute assignment, set_params rejects unknown parameter names,
# so a misspelled hyperparameter raises an error instead of silently creating an
# attribute the model never reads.
rfr.set_params(n_estimators=100, max_depth=6, random_state=1111)

# Fit on X and y as currently defined (the candy features/target when the cells
# run top to bottom). As the cell's last expression, the fitted estimator's repr
# is displayed.
rfr.fit(X, y)

RandomForestRegressor(max_depth=6, random_state=1111)

The parameters were updated after the model was initialized. This approach is helpful when you need to update parameters.

## Feature importances

In [13]:
# Pair each feature column with its importance score from the fitted forest and
# report it rounded to two decimals.
for column_name, importance in zip(X.columns, rfr.feature_importances_):
    print(f'{column_name}: {importance: .2f}')

chocolate:  0.44
fruity:  0.03
caramel:  0.02
peanutyalmondy:  0.05
nougat:  0.01
crispedricewafer:  0.03
hard:  0.01
bar:  0.02
pluribus:  0.02
sugarpercent:  0.17
pricepercent:  0.19


No surprise that chocolate is the most important feature. `.feature_importances_` is a great way to see which variables are important to the model.

# Classification models

In [15]:
# Quick look at the tic-tac-toe data: print its (rows, columns) shape, then show
# the first rows as the cell's displayed output (head() must stay the last expression).
print(ttt.shape)
ttt.head()

(958, 10)


Unnamed: 0,Top-Left,Top-Middle,Top-Right,Middle-Left,Middle-Middle,Middle-Right,Bottom-Left,Bottom-Middle,Bottom-Right,Class
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


In [16]:
# Summarize every column (non-null count and dtype) plus the frame's memory usage;
# shows whether the columns are text and therefore need encoding before modelling.
ttt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Top-Left       958 non-null    object
 1   Top-Middle     958 non-null    object
 2   Top-Right      958 non-null    object
 3   Middle-Left    958 non-null    object
 4   Middle-Middle  958 non-null    object
 5   Middle-Right   958 non-null    object
 6   Bottom-Left    958 non-null    object
 7   Bottom-Middle  958 non-null    object
 8   Bottom-Right   958 non-null    object
 9   Class          958 non-null    object
dtypes: object(10)
memory usage: 75.0+ KB


## Classification predictions

In [29]:
# One-hot encode the board-position columns (everything except the label).
X = pd.get_dummies(ttt.drop(columns='Class'))

# Encode the outcome label numerically: positive -> 1, negative -> 0.
y = ttt['Class'].map({'positive': 1, 'negative': 0})

# A float test_size is a fraction: 80% of the rows are held out and only 20%
# are used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=1111
)

In [32]:
# Random forest classifier: 50 trees, depth capped at 6, seeded for reproducibility
rfc = RandomForestClassifier(random_state=1111, max_depth=6, n_estimators=50)

# Train on the training split
rfc.fit(X_train, y_train)

# Hard class labels and per-class probabilities for the held-out rows
class_preds, prob_preds = rfc.predict(X_test), rfc.predict_proba(X_test)

# Tally how many held-out rows were assigned to each class
print(f'binary predictions:\n{pd.Series(class_preds).value_counts()}')

# Probability row for the first held-out sample
print(f'The first predicted probabilties are: {prob_preds[0]}')

binary predictions:
1    563
0    204
dtype: int64
The first predicted probabilties are: [0.26524423 0.73475577]


## Reusing model parameters

In [35]:
# Classifier configured with 50 trees, max depth 6 and a fixed seed; it is only
# inspected in this cell, not fitted.
rfc = RandomForestClassifier(
    random_state=1111,
    max_depth=6,
    n_estimators=50,
)

# The estimator's repr summarizes how the model is configured
print(f'model:\n{rfc}\n')

# A single hyperparameter can be read back as a plain attribute
print(f'the random state is:\n{rfc.random_state}\n')

# get_params() returns the hyperparameters as a dict
print(f'parameters dict:\n{rfc.get_params()}')

model:
RandomForestClassifier(max_depth=6, n_estimators=50, random_state=1111)

the random state is:
1111

parameters dict:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 6, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': 1111, 'verbose': 0, 'warm_start': False}


## Random forest classifier

In [37]:
# Fresh classifier: 50 trees, max depth 6, fixed seed
rfc = RandomForestClassifier(random_state=1111, n_estimators=50, max_depth=6)

# Train, then label the held-out rows
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

# Show the first five predicted labels
print(f'preds:\n{y_pred[:5]}')

# Accuracy on the held-out rows, as reported by the estimator's score()
print(f'score: {rfc.score(X_test, y_test)}')

preds:
[1 1 1 1 1]
score: 0.817470664928292
