# Model assessment basics

In [43]:
import numpy as np
import pandas as pd
# NOTE(review): load_boston and fetch_mldata are no longer shipped by recent
# scikit-learn releases; this line needs an older version to import cleanly.
from sklearn.datasets import load_boston, load_iris, load_wine, load_digits, \
                             load_breast_cancer, load_diabetes, fetch_mldata
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Show floats in displayed frames/series with thousands separators and 3 decimals.
pd.set_option('display.float_format', '{:,.3f}'.format)

## Training set metric on random data

In [10]:
# Features and target are independent uniform draws, so there is nothing real
# to learn; the score below is measured on the same rows the forest was fit on.
X_train = np.random.random(size=(1000, 4))
y_train = np.random.random(size=1000)
rf = RandomForestRegressor(n_estimators=100).fit(X_train, y_train)
rf.score(X_train, y_train)

0.8521911296804429

## NYC rent testing

In [14]:
# Load the rent listings, keeping only the four predictors plus the price target.
features = ['bedrooms', 'bathrooms', 'latitude', 'longitude']
df_rent = pd.read_json("data/rent-train.json").loc[:, features + ['price']]
X = df_rent[features]   # predictors only
y = df_rent['price']    # target
X.head()

Unnamed: 0,bedrooms,bathrooms,latitude,longitude
10,3,1.5,40.715,-73.942
10000,2,1.0,40.795,-73.967
100004,1,1.0,40.739,-74.002
100007,1,1.0,40.754,-73.968
100013,4,1.0,40.824,-73.949


**Break out a 20% hold-out validation set**

In [15]:
# Random 80/20 split; no seed is set, so the partition differs on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
)

**Train model, compute metrics**

In [16]:
def test():
    """
    Fit a 50-tree random forest regressor on the current split and score it.

    The split is not passed in: the function reads the notebook-level names
    X_train, y_train, X_test and y_test, so callers select a split by
    rebinding those names before calling.

    Returns:
        Tuple (oob, r2, mae, mse) where oob is the fitted forest's
        oob_score_ and the other three are r2_score, mean_absolute_error
        and mean_squared_error of the predictions on X_test against y_test.
    """
    rf = RandomForestRegressor(n_estimators=50, oob_score=True, n_jobs=-1)
    rf.fit(X_train, y_train)
    oob = rf.oob_score_  # noted by the author as very poor on the rent data
    y_pred = rf.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # Callers build their own DataFrame from these tuples, so nothing is printed here.
    return (oob, r2, mae, mse)

**With the train/validation split held fixed, the error is fairly stable; the remaining variation comes from the randomness in RF training**

In [17]:
# Refit three times on the same split; test() returns one (OOB, R^2, MAE, MSE)
# tuple per fit, and each tuple becomes a row.
fixed_split_runs = [test() for _ in range(3)]
pd.DataFrame(fixed_split_runs, columns=['OOB', 'R^2', 'MAE', 'MSE'])

Unnamed: 0,OOB,R^2,MAE,MSE
0,-0.24,-1.382,564.634,285794218.569
1,-0.219,-0.516,531.9,181932479.737
2,-0.656,-4.08,643.307,609460959.343


**The metrics are very unstable when we draw different validation sets**

In [22]:
# Score a fresh forest on five independently drawn 80/20 splits.
# test() reads the split from the notebook-level names, so they are rebound
# on every pass before it is called.
results = []
for trial in range(5):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    metrics_row = test()
    results.append(metrics_row)
df_results = pd.DataFrame(data=results, columns=['OOB', 'R^2', 'MAE', 'MSE'])
df_results

Unnamed: 0,OOB,R^2,MAE,MSE
0,0.582,0.002,1150.354,2402630918.659
1,0.027,0.05,878.512,2053053487.605
2,-0.309,0.323,408.299,3852177.529
3,-0.152,-44.812,527.185,265146585.91
4,-0.155,-0.105,404.7,7275725.459


In [23]:
# Column-wise spread of each metric across the five random splits.
df_results.std(axis=0)

OOB               0.347
R^2              20.071
MAE             329.170
MSE   1,181,074,451.996
dtype: float64

**Try k-fold**

In [28]:
# Manual k-fold: every row lands in the validation fold exactly once.
k = 5
kf = KFold(n_splits=k, shuffle=True)
results = []
for fit_rows, held_rows in kf.split(X):
    # Rebind the notebook-level split that test() reads.
    X_train, y_train = X.iloc[fit_rows], y.iloc[fit_rows]
    X_test, y_test = X.iloc[held_rows], y.iloc[held_rows]
    results.append(test())
df_results_kfold = pd.DataFrame(data=results, columns=['OOB', 'R^2', 'MAE', 'MSE'])
df_results_kfold

Unnamed: 0,OOB,R^2,MAE,MSE
0,-0.039,0.718,381.575,1720111.443
1,-0.29,-1.074,592.666,254276294.859
2,-0.052,0.027,501.273,135483954.396
3,-0.148,-46.047,551.861,267634042.184
4,0.046,0.045,885.087,2064334690.084


In [30]:
# Column-wise spread across the k folds -- still unstable; rerun the k-fold
# cell several times to see how much it moves.
df_results_kfold.std(axis=0)

OOB             0.128
R^2            20.571
MAE           186.795
MSE   856,254,342.417
dtype: float64

In [32]:
# Library equivalent of the manual k-fold loop. The full X, y are passed in:
# cross_val_score builds the folds itself, whereas X_train/y_train at this point
# would only be the leftover training portion of the last manual fold.
k = 5
cvscore = cross_val_score(
    RandomForestRegressor(n_estimators=50),  # which model to use
    X, y,                                    # data for cross_val_score to split up
    cv=k,                                    # number of folds/chunks
    scoring='neg_mean_absolute_error')       # what error metric
# The scorer reports negated MAE, so flip the sign to read it as an error.
-cvscore

array([623.87163999, 422.87414852, 413.99398158, 511.28675747,
       401.09123528])

## Classifiers

In [53]:
# Binary classification example on the breast-cancer data set.
# confusion_matrix is imported with the other sklearn names in the imports cell.
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

# Random 80/20 hold-out split (unseeded, so it varies between runs).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

rf = RandomForestClassifier(n_estimators=30, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Rows are the true classes, columns are the predicted classes.
C = confusion_matrix(y_test, y_pred)
C

array([[35,  3],
       [ 1, 75]])

In [54]:
# Unpack the 2x2 matrix in row-major order: [[tn, fp], [fn, tp]].
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
(tn, fp, fn, tp)

(35, 3, 1, 75)