### Data processing
This file contains a comparison of several **supervised machine learning models** applied to pre-processed data.

In [40]:
import data_preprocessing as dp
import numpy as np
import pandas as pd
import warnings
import time

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    KFold,
    RandomizedSearchCV,
    train_test_split,
    cross_val_score,
    learning_curve,
    validation_curve
)
from sklearn.linear_model import (
    LogisticRegression,
    Ridge
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    roc_curve,
    roc_auc_score,
    f1_score
)
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import randint

Split of the dataset in target and input variables

In [4]:
y = dp.X['RainTomorrow']
X = dp.X.drop(['RainTomorrow'], axis=1)

Function to split of the dataset in 2 parts (80%-20%), which is used in some models.

In [5]:
def split_df(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)
    return X_train, X_test, y_train, y_test

In [6]:
X_train, X_test, y_train, y_test = split_df(X, y)

#### Logistic Regression

In [7]:
def logistic_regression(X_train, X_test, y_train, y_test):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        log_reg = LogisticRegression()
        log_reg.fit(X_train, y_train)
        y_pred = log_reg.predict(X_test)
    return log_reg, y_pred

In [43]:
log_reg, y_pred_log_reg = logistic_regression(X_train, X_test, y_train, y_test)

accuracy_log_reg = log_reg.score(X_test, y_test)
precision_log_reg = precision_score(y_test, y_pred_log_reg)
recall_log_reg = recall_score(y_test, y_pred_log_reg)
f1_log_reg = f1_score(y_test, y_pred_log_reg)

K-Fold Cross Validation

In [9]:
def logistic_regression_kfold(X, y, n_splits=5, max_iter=800):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        model = LogisticRegression(max_iter=max_iter)
        accuracy_scores = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
        precision_scores = cross_val_score(model, X, y, cv=kfold, scoring='precision')
        recall_scores = cross_val_score(model, X, y, cv=kfold, scoring='recall')
        f1_scores = cross_val_score(model, X, y, cv=kfold, scoring='f1')
        # evaluation metrics
        mean_accuracy_score = np.mean(accuracy_scores)
        mean_precision_score = np.mean(precision_scores)
        mean_recall_score = np.mean(recall_scores)
        mean_f1_score = np.mean(f1_scores)
    return accuracy_scores, mean_accuracy_score, precision_scores, mean_precision_score, recall_scores, mean_recall_score, f1_scores, mean_f1_score

In [47]:
start_time_lr_cv = time.time()
lr_acc, lr_acc_mean, _, _, _, _, _, _ = logistic_regression_kfold(X, y)
end_time_lr_cv = time.time()

_, _, lr_prec, lr_prec_mean, _, _, _, _ = logistic_regression_kfold(X, y)
_, _, _, _, lr_rec, lr_rec_mean, _, _ = logistic_regression_kfold(X, y)
_, _, _, _, _, _, lr_f1, lr_f1_mean = logistic_regression_kfold(X, y)

elapsed_time_lr_cv = end_time_lr_cv - start_time_lr_cv

#### Decision Tree

In [11]:
def decision_tree(X_train, X_test, y_train, y_test):
    dt = DecisionTreeClassifier()
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    return dt, y_pred

In [12]:
dt, y_pred_dt = decision_tree(X_train, X_test, y_train, y_test)

accuracy_dt = dt.score(X_test, y_test)
precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)

K-Fold Cross Validation

In [13]:
def decision_tree_kfold(X, y, n_splits=5, max_iter=800):
    dt = DecisionTreeClassifier()
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(dt, X, y, cv=kfold, scoring='accuracy')
    precision_scores = cross_val_score(dt, X, y, cv=kfold, scoring='precision')
    recall_scores = cross_val_score(dt, X, y, cv=kfold, scoring='recall')
    f1_scores = cross_val_score(dt, X, y, cv=kfold, scoring='f1')
    # evaluation metrics
    mean_accuracy_score = np.mean(accuracy_scores)
    mean_precision_score = np.mean(precision_scores)
    mean_recall_score = np.mean(recall_scores)
    mean_f1_score = np.mean(f1_scores)
    return accuracy_scores, mean_accuracy_score, precision_scores, mean_precision_score, recall_scores, mean_recall_score, f1_scores, mean_f1_score


In [14]:
dt_acc, dt_acc_mean, _, _, _, _, _, _ = decision_tree_kfold(X, y)
_, _, dt_prec, dt_prec_mean, _, _, _, _ = decision_tree_kfold(X, y)
_, _, _, _, dt_rec, dt_rec_mean, _, _ = decision_tree_kfold(X, y)
_, _, _, _, _, _, dt_f1, dt_f1_mean = decision_tree_kfold(X, y)

#### Random Forest

In [17]:
def random_forest(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(n_estimators=100) # 100 trees
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    return rf, y_pred

In [44]:
start_time_rf = time.time()
rf, y_pred_rf = random_forest(X_train, X_test, y_train, y_test)
end_time_rf = time.time()

accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

elapsed_time_rf = end_time_rf - start_time_rf

K-Fold Cross Validation

In [15]:
def random_forest_kfold(X, y, n_splits=5):
    rf = RandomForestClassifier(n_estimators=100) # 100 trees
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(rf, X, y, cv=kfold, scoring='accuracy')
    precision_scores = cross_val_score(rf, X, y, cv=kfold, scoring='precision')
    recall_scores = cross_val_score(rf, X, y, cv=kfold, scoring='recall')
    f1_scores = cross_val_score(rf, X, y, cv=kfold, scoring='f1')
    # evaluation metrics
    mean_accuracy_score = np.mean(accuracy_scores)
    mean_precision_score = np.mean(precision_scores)
    mean_recall_score = np.mean(recall_scores)
    mean_f1_score = np.mean(f1_scores)
    return accuracy_scores, mean_accuracy_score, precision_scores, mean_precision_score, recall_scores, mean_recall_score, f1_scores, mean_f1_score

In [16]:
rf_acc, rf_acc_mean, _, _, _, _, _, _ = decision_tree_kfold(X, y)
_, _, rf_prec, rf_prec_mean, _, _, _, _ = decision_tree_kfold(X, y)
_, _, _, _, rf_rec, rf_rec_mean, _, _ = decision_tree_kfold(X, y)
_, _, _, _, _, _, rf_f1, rf_f1_mean = decision_tree_kfold(X, y)

#### SVM
Linear, polinomial and radial basis function kernels.

In [19]:
def svm_linear(X_train, X_test, y_train, y_test):
    svm_linear_model = svm.SVC(kernel='linear', C=1, random_state=42)
    svm_linear_model.fit(X_train, y_train)
    y_pred = svm_linear_model.predict(X_test)
    return svm_linear_model, y_pred

In [32]:
def svm_poly(X_train, X_test, y_train, y_test):
    svm_poly_model = svm.SVC(kernel='poly', C=1, random_state=42)
    svm_poly_model.fit(X_train, y_train)
    y_pred = svm_poly_model.predict(X_test)
    return svm_poly_model, y_pred

In [33]:
def svm_rbf(X_train, X_test, y_train, y_test):
    svm_rbf_model = svm.SVC(kernel='rbf', C=1, random_state=42)
    svm_rbf_model.fit(X_train, y_train)
    y_pred = svm_rbf_model.predict(X_test)
    return svm_rbf_model, y_pred

In [36]:
svm_linear, y_pred_svm_lin = svm_linear(X_train, X_test, y_train, y_test)
accuracy_svm_linear = accuracy_score(y_test, y_pred_svm_lin)
precision_svm_linear = precision_score(y_test, y_pred_svm_lin)
recall_svm_linear = recall_score(y_test, y_pred_svm_lin)
f1_svm_linear = f1_score(y_test, y_pred_svm_lin)

In [37]:
svm_poly, y_pred_svm_poly = svm_poly(X_train, X_test, y_train, y_test)
accuracy_svm_poly = accuracy_score(y_test, y_pred_svm_poly)
precision_svm_poly = precision_score(y_test, y_pred_svm_poly)
recall_svm_poly = recall_score(y_test, y_pred_svm_poly)
f1_svm_poly = f1_score(y_test, y_pred_svm_poly)

In [38]:
svm_rbf, y_pred_svm_rbf = svm_rbf(X_train, X_test, y_train, y_test)
accuracy_svm_rbf = accuracy_score(y_test, y_pred_svm_rbf)
precision_svm_rbf = precision_score(y_test, y_pred_svm_rbf)
recall_svm_rbf = recall_score(y_test, y_pred_svm_rbf)
f1_svm_rbf = f1_score(y_test, y_pred_svm_rbf)

K-Fold Cross Validation

In [21]:
def svm_kfold(X, y, n_splits=5):
    svm_model = svm.SVC(kernel='linear', C=1, random_state=42)
    svm_model.fit(X, y)
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(svm_model, X, y, cv=kfold, scoring='accuracy')
    precision_scores = cross_val_score(svm_model, X, y, cv=kfold, scoring='precision')
    recall_scores = cross_val_score(svm_model, X, y, cv=kfold, scoring='recall')
    f1_scores = cross_val_score(svm_model, X, y, cv=kfold, scoring='f1')
    # evaluation metrics
    mean_accuracy_score = np.mean(accuracy_scores)
    mean_precision_score = np.mean(precision_scores)
    mean_recall_score = np.mean(recall_scores)
    mean_f1_score = np.mean(f1_scores)
    return accuracy_scores, mean_accuracy_score, precision_scores, mean_precision_score, recall_scores, mean_recall_score, f1_scores, mean_f1_score

In [22]:
svm_acc, svm_acc_mean, _, _, _, _, _, _ = decision_tree_kfold(X, y)
_, _, svm_prec, svm_prec_mean, _, _, _, _ = decision_tree_kfold(X, y)
_, _, _, _, svm_rec, svm_rec_mean, _, _ = decision_tree_kfold(X, y)
_, _, _, _, _, _, svm_f1, svm_f1_mean = decision_tree_kfold(X, y)

#### Accuracy comparison

In [53]:
# create a performance dataframe
performance = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM (Linear Kernel)'],
    'Accuracy': [accuracy_log_reg, accuracy_dt, accuracy_rf, accuracy_svm_linear_model],
    'Precision': [precision_log_reg, precision_dt, precision_rf, precision_svm_linear_model],
    'Recall': [recall_log_reg, recall_dt, recall_rf, recall_svm_linear_model],
    'F1-score': [f1_log_reg, f1_dt, f1_rf, f1_svm_linear_model]
})

print(performance)

# find the best performance
performance_sorted = performance.sort_values(by='F1-score', ascending=False)
best_model = performance_sorted.iloc[0]['Model']
best_f1_score = performance_sorted.iloc[0]['F1-score']

print("\nBest model: ", best_model)

                 Model  Accuracy  Precision    Recall  F1-score
0  Logistic Regression  0.825480   0.670034  0.373709  0.479807
1        Decision Tree  0.745804   0.414591  0.437559  0.425765
2        Random Forest  0.824671   0.659164  0.384977  0.486070
3  SVM (Linear Kernel)  0.804651   0.726027  0.149296  0.247664

Best model:  Random Forest


In [54]:
# create a performance dataframe for CV
performance_cv = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM (Linear Kernel)'],
    'Accuracy': [lr_acc_mean, dt_acc_mean, rf_acc_mean, svm_acc_mean],
    'Precision': [lr_prec_mean, dt_prec_mean, rf_prec_mean, svm_prec_mean],
    'Recall': [lr_rec_mean, dt_rec_mean, rf_rec_mean, svm_rec_mean],
    'F1-score': [lr_f1_mean, dt_f1_mean, rf_f1_mean, svm_f1_mean]
})

print(performance_cv)

# find the best performance
performance_sorted_cv = performance_cv.sort_values(by='F1-score', ascending=False)
best_model_cv = performance_cv.iloc[0]['Model']
best_f1_score_cv = performance_cv.iloc[0]['F1-score']

print("\nBest CV model: ", best_model_cv)

                 Model  Accuracy  Precision    Recall  F1-score
0  Logistic Regression  0.822863   0.683127  0.374372  0.483427
1        Decision Tree  0.745844   0.432635  0.450620  0.443794
2        Random Forest  0.746410   0.431223  0.450363  0.442156
3  SVM (Linear Kernel)  0.749080   0.431331  0.455335  0.445277

Best CV model:  Logistic Regression


In [55]:
# create a performance dataframe for SVMs
performance_svm = pd.DataFrame({
    'Model': ['Linear', 'Polynomial', 'RBF'],
    'Accuracy': [accuracy_svm_linear, accuracy_svm_poly, accuracy_svm_rbf],
    'Precision': [precision_svm_linear, precision_svm_poly, precision_svm_rbf],
    'Recall': [recall_svm_linear, recall_svm_poly, recall_svm_rbf],
    'F1-score': [f1_svm_linear, f1_svm_poly, f1_svm_rbf]
})

print(performance_svm)

# find the best performance
performance_sorted_svm = performance_svm.sort_values(by='F1-score', ascending=False)
best_model_svm = performance_svm.iloc[0]['Model']
best_f1_score_svm = performance_svm.iloc[0]['F1-score']

print("\nBest SVM model: ", best_model_svm)

        Model  Accuracy  Precision    Recall  F1-score
0      Linear  0.804651   0.726027  0.149296  0.247664
1  Polynomial  0.824874   0.749373  0.280751  0.408470
2         RBF  0.824267   0.740196  0.283568  0.410048

Best SVM model:  Linear


In [50]:
if elapsed_time_rf < elapsed_time_lr_cv:
    fastest = "Random Forest"
else:
    fastest = "Logistic Regression CV"

print("Random Forest time: ", elapsed_time_rf)
print("Logistic Regression CV time: ", elapsed_time_lr_cv)
print("\nFastest: ", fastest)

Random Forest time:  2.201897382736206
Logistic Regression CV time:  2.9596216678619385

Fastest:  Random Forest



#### Random Forest Improvement

In [65]:
param_distributions = {"n_estimators": randint(100, 1000),
                       "min_samples_split": randint(2, 20),
                       "min_samples_leaf": randint(1, 10),
                       "max_features": randint(1, 10)}

rf = RandomForestClassifier(max_depth=20)

random_search = RandomizedSearchCV(rf, param_distributions=param_distributions, n_iter=50, cv=5, n_jobs=-1)
random_search.fit(X_train, y_train)

best_params = random_search.best_params_

rf_impr = RandomForestClassifier(n_estimators=best_params["n_estimators"],
                                  max_depth=20,
                                  min_samples_split=best_params["min_samples_split"],
                                  min_samples_leaf=best_params["min_samples_leaf"],
                                  max_features=best_params["max_features"])

rf_impr.fit(X_train, y_train)

In [None]:
accuracy_rf_impr = accuracy_score(y_test, y_pred_rf_impr)
print(accuracy_rf_impr)