### Cross Validation

In [1]:
import numpy as np
import pandas as pd

Classifier

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pd.read_csv('./data/heart-disease.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
np.random.seed(42)

X = df.drop('target', axis=1)
y = df['target']

clf = RandomForestClassifier()
cv_acc = cross_val_score(clf, X, y, cv=5)

np.mean(cv_acc) # 5-fold - default metric (accuracy)

0.8248087431693989

In [5]:
# Precision
np.random.seed(42)
cv_precision = cross_val_score(clf, X, y, cv=5, scoring='precision')

np.mean(cv_precision)

0.8329547346025924

In [6]:
# Recall
np.random.seed(42)
cv_recall = cross_val_score(clf, X, y, cv=5, scoring='recall')

np.mean(cv_recall)

0.8545454545454545

In [7]:
# F1-score
np.random.seed(42)
cv_f1 = cross_val_score(clf, X, y, cv=5, scoring='f1')

np.mean(cv_f1)

0.8426854603423346

Regressor

In [8]:
from sklearn.ensemble import RandomForestRegressor

In [9]:
np.random.seed(42)

model = RandomForestRegressor()

In [10]:
# R^2
np.random.seed(42)
cv_r2 = cross_val_score(clf, X, y, cv=5)

np.mean(cv_r2)

0.8248087431693989

In [11]:
# MAE
np.random.seed(42)
cv_mae = cross_val_score(clf, X, y, cv=5, scoring='neg_mean_absolute_error')

np.mean(cv_mae)

-0.17519125683060108

In [12]:
# MSE
np.random.seed(42)
cv_mse = cross_val_score(clf, X, y, cv=5, scoring='neg_mean_squared_error')

np.mean(cv_mse)

-0.17519125683060108

In [13]:
from sklearn.metrics import SCORERS
sorted(SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']

Classification Metrics

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [15]:
np.random.seed(42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8360655737704918

In [16]:
y_preds = clf.predict(X_test)

In [17]:
print(f'Accuracy: {accuracy_score(y_test, y_preds) * 100:.2f}%')
print(f'Precision: {precision_score(y_test, y_preds):.2f}')
print(f'Recall: {recall_score(y_test, y_preds):.2f}')
print(f'F1-score: {f1_score(y_test, y_preds):.2f}')

Accuracy: 83.61%
Precision: 0.84
Recall: 0.84
F1-score: 0.84


In [18]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

np.random.seed(42)

model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.5300502155172415

In [19]:
y_preds = model.predict(X_test)

In [20]:
print(f'R^2: {r2_score(y_test, y_preds) * 100:.2f}')
print(f'MAE: {mean_absolute_error(y_test, y_preds) * 100:.2f}')
print(f'MSE: {mean_squared_error(y_test, y_preds) * 100:.2f}')

R^2: 53.01
MAE: 23.77
MSE: 11.72
