# Model assessment basics

In [115]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Show floats in DataFrame/Series output with thousands separators and 3 decimals.
pd.set_option('display.float_format', '{:,.3f}'.format)

## Training set metric on random data

In [7]:
# Features and target are drawn independently at random.
X_train = np.random.random(size=(1000, 4))
y_train = np.random.random(size=1000)
rf = RandomForestRegressor(n_estimators=100).fit(X_train, y_train)
# Score the forest on the same rows it was fitted to.
rf.score(X_train, y_train)

0.8474731797281314

## NYC rent testing

In [26]:
features = ['bedrooms', 'bathrooms', 'latitude', 'longitude']
# Load the rent data and keep only the model inputs plus the target column.
df_rent = pd.read_json("data/rent-train.json")[features + ['price']]
X = df_rent.drop(columns='price')
y = df_rent['price']
X.head()

Unnamed: 0,bedrooms,bathrooms,latitude,longitude
10,3,1.5,40.7145,-73.9425
10000,2,1.0,40.7947,-73.9667
100004,1,1.0,40.7388,-74.0018
100007,1,1.0,40.7539,-73.9677
100013,4,1.0,40.8241,-73.9493


**Hold out 20% of the data as a validation set**

In [33]:
# Randomly reserve 20% of the rows for validation; the rest is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
)

**Train model, compute metrics**

In [80]:
def test():
    """
    Fit a random forest on the current training split and score it on the
    current validation split.

    Takes no arguments: it reads the notebook-level variables ``X_train``,
    ``y_train``, ``X_test`` and ``y_test`` at call time, so rebinding those
    (e.g. by re-splitting the data) before calling changes the result.

    Returns:
        tuple: ``(oob, r2, mae, mse)`` where ``oob`` is the fitted forest's
        out-of-bag score and ``r2``, ``mae``, ``mse`` are the R^2, mean
        absolute error and mean squared error of the predictions on
        ``X_test`` compared with ``y_test``.
    """
    rf = RandomForestRegressor(n_estimators=50, oob_score=True, n_jobs=-1)
    rf.fit(X_train, y_train)
    oob = rf.oob_score_  # out-of-bag score; comes out very poor on this data

    y_pred = rf.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    return (oob, r2, mae, mse)

**On a fixed train/validation split the error is stable; the small differences between runs are just the random forest's own randomness**

In [82]:
# Refit three times without re-splitting the data.
repeat_scores = [test() for _ in range(3)]
pd.DataFrame(repeat_scores, columns=['OOB', 'R^2', 'MAE', 'MSE'])

Unnamed: 0,OOB,R^2,MAE,MSE
0,-0.197039,-0.002415,550.885808,140466800.0
1,-0.12859,-0.007982,558.240663,141246900.0
2,-0.520514,-0.012131,562.592616,141828300.0


**The metrics are very unstable when we draw different random validation sets**

In [110]:
metric_names = ['OOB', 'R^2', 'MAE', 'MSE']
results = []
for i in range(5):
    # Draw a fresh random 80/20 split; this rebinds the notebook-level
    # variables that test() reads.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    split_scores = test()
    results.append(split_scores)
df_results = pd.DataFrame(results, columns=metric_names)
df_results

Unnamed: 0,OOB,R^2,MAE,MSE
0,0.384,0.002,826.364,2039756335.63
1,-0.212,0.022,507.735,136327179.022
2,-0.403,0.732,374.084,1656298.227
3,-0.277,-0.3,729.761,309120920.387
4,0.526,0.002,841.569,2040343003.177


In [112]:
# Standard deviation of each metric column across the random re-splits.
df_results.std()

OOB               0.421
R^2               0.382
MAE             206.373
MSE   1,041,469,434.452
dtype: float64

**Try k-fold**

In [113]:
k = 5
kf = KFold(n_splits=k, shuffle=True)
results = []
for train_index, test_index in kf.split(X):
    # Rebind the notebook-level split variables that test() reads,
    # selecting rows by position for this fold.
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]
    fold_scores = test()
    results.append(fold_scores)
df_results_kfold = pd.DataFrame(results, columns=['OOB', 'R^2', 'MAE', 'MSE'])
df_results_kfold

Unnamed: 0,OOB,R^2,MAE,MSE
0,-0.183,-98.397,630.506,553412888.658
1,0.058,0.046,882.309,2060757019.849
2,-0.264,0.401,547.204,153225105.143
3,0.04,0.602,390.397,3187444.117
4,0.008,0.719,366.591,1635596.216


In [114]:
# Standard deviation of each metric column across the k folds.
df_results_kfold.std()

OOB             0.146
R^2            44.203
MAE           209.244
MSE   871,699,131.043
dtype: float64

In [120]:
k = 5
# NOTE(review): X_train / y_train are whatever an earlier cell last bound --
# reading top to bottom, that is the final k-fold split. Confirm this is the
# data intended for cross-validation.
cv_model = RandomForestRegressor(n_estimators=50)  # which model to use
cvscore = cross_val_score(
    cv_model,
    X_train, y_train,                   # what training data to split up
    cv=k,                               # number of folds/chunks
    scoring='neg_mean_absolute_error',  # what error metric
)
# The scorer is the negated MAE; flip the sign to display it as a plain error.
-cvscore

array([ 590.7696147 , 1002.89063176,  411.69693247,  739.98621123,
        452.6059509 ])