In [None]:
import pandas as pd
import numpy as np
from sklearn import ensemble
from sklearn import preprocessing
from sklearn import compose
from sklearn import pipeline
from sklearn import model_selection
from sklearn import metrics
import matplotlib.pyplot as plt
import pickle

In [None]:
# Show the installed scikit-learn version as this cell's output, so readers
# know which release the results below were produced with.
import sklearn
sklearn.__version__

In [None]:
def train_with_estimator(n, random_state=0):
    """Train, persist, and evaluate a random-forest outcome classifier.

    Reads ``../data/joined_cases_train.csv``, drops the location columns,
    one-hot encodes ``sex`` and fits a ``RandomForestClassifier`` with ``n``
    trees on an 80/20 train/test split. The fitted pipeline is pickled to
    ``../models/random_forest_classifier.pkl`` and reloaded, then evaluated:
    one-vs-rest ROC AUC, per-class ROC curves, confusion matrices and
    classification reports for the test and training splits, and the mean
    cross-validation accuracy over the full data set.

    Parameters
    ----------
    n : int
        Number of trees in the forest (``n_estimators``).
    random_state : int, optional
        Seed used for both the train/test split and the forest, so repeated
        runs print the same numbers. Defaults to 0.

    Returns
    -------
    None
        Results are printed and plotted rather than returned.
    """
    df = pd.read_csv('../data/joined_cases_train.csv')
    df = df.drop(["latitude", "longitude", "Combined_Key", "country"], axis=1)
    y = df.pop('outcome')
    x = df

    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        x, y, test_size=0.2, random_state=random_state
    )

    # `min_impurity_split` is intentionally not passed: it was removed from
    # scikit-learn and None was its default anyway. "sqrt" is what the
    # deprecated "auto" value meant for classifiers.
    rf1 = ensemble.RandomForestClassifier(
        n_estimators=n,
        criterion="gini",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features="sqrt",
        max_leaf_nodes=None,
        bootstrap=True,
        random_state=random_state,
    )

    # One-hot encode `sex`; every other column is passed through unchanged.
    # The transformer is fitted as part of the pipeline, on the training
    # split only.
    column_trans = compose.make_column_transformer(
        (preprocessing.OneHotEncoder(), ['sex']),
        remainder='passthrough'
    )
    pipe1 = pipeline.make_pipeline(column_trans, rf1)
    pipe1.fit(X_train, Y_train)

    # Persist the fitted pipeline, then reload it so the evaluation below
    # exercises exactly what was written to disk.
    file_path = "../models/random_forest_classifier.pkl"
    with open(file_path, 'wb') as fid:
        pickle.dump(pipe1, fid)
    # NOTE: pickle.load can execute arbitrary code; it is acceptable here only
    # because the file was written by this function a moment ago.
    with open(file_path, 'rb') as fid:
        pipe1 = pickle.load(fid)

    rf_probs = pipe1.predict_proba(X_test)
    rf_pred = pipe1.predict(X_test)
    rf_pred_train = pipe1.predict(X_train)

    rf_auc = metrics.roc_auc_score(Y_test, rf_probs, multi_class='ovr')
    print('Random Forest AUC:  %.3f' % (rf_auc))

    # Column i of `rf_probs` corresponds to `pipe1.classes_[i]`, so take the
    # class order from the fitted model instead of hard-coding it.
    class_labels = list(pipe1.classes_)
    binarized_y = preprocessing.label_binarize(Y_test, classes=class_labels)

    fig, ax = plt.subplots()
    for i, class_label in enumerate(class_labels):
        fpr, tpr, _ = metrics.roc_curve(binarized_y[:, i], rf_probs[:, i])
        roc_auc = metrics.auc(fpr, tpr)
        ax.plot(fpr, tpr, label='%s (area = %0.2f)' % (class_label, roc_auc))
    ax.set_xlabel("False positive rate")
    ax.set_ylabel("True positive rate")
    ax.set_title("One-vs-rest ROC curves (n_estimators = %s)" % n)
    ax.legend(loc="lower right")
    plt.show()

    print("Result on the test data")
    print(metrics.confusion_matrix(Y_test, rf_pred))
    print(metrics.classification_report(Y_test, rf_pred, digits=3))

    print("Result on the training data")
    print(metrics.confusion_matrix(Y_train, rf_pred_train))
    print(metrics.classification_report(Y_train, rf_pred_train, digits=3))

    # cross_val_score clones the pipeline, so the fitted `pipe1` is untouched.
    cross_validation_accuracy = model_selection.cross_val_score(pipe1, x, y).mean()
    print("The Cross Validation accuracy is %.3f" % (cross_validation_accuracy))

In [None]:
def detect_overfit():
    """Plot training vs. cross-validated accuracy against forest size.

    Reads ``../data/joined_cases_train.csv``, drops the location columns,
    one-hot encodes ``sex`` and runs a 5-fold validation curve for a default
    ``RandomForestClassifier`` over ``n_estimators`` in
    ``np.arange(1, 1000, 100)``. The mean accuracy for each setting is drawn
    with a shaded band of one standard deviation across folds, so a widening
    gap between the two curves is easy to spot.

    Returns
    -------
    None
        The result is shown as a matplotlib figure.
    """
    df = pd.read_csv('../data/joined_cases_train.csv')
    df = df.drop(["latitude", "longitude", "Combined_Key", "country"], axis=1)
    y = df.pop('outcome')

    # The encoder is fitted on the full frame before cross-validation; it only
    # learns the category levels of `sex`, not anything about the target.
    column_trans = compose.make_column_transformer(
        (preprocessing.OneHotEncoder(), ['sex']),
        remainder='passthrough'
    )
    x = column_trans.fit_transform(df)

    parameter_range = np.arange(1, 1000, 100)

    train_score, test_score = model_selection.validation_curve(
        ensemble.RandomForestClassifier(), x, y,
        param_name="n_estimators",
        param_range=parameter_range,
        cv=5,
        scoring="accuracy",
    )

    # Scores have one row per parameter value and one column per fold.
    mean_train_score = np.mean(train_score, axis=1)
    std_train_score = np.std(train_score, axis=1)
    mean_test_score = np.mean(test_score, axis=1)
    std_test_score = np.std(test_score, axis=1)

    fig, ax = plt.subplots()
    ax.plot(parameter_range, mean_train_score,
            label="Training Score", color='b')
    ax.fill_between(parameter_range,
                    mean_train_score - std_train_score,
                    mean_train_score + std_train_score,
                    color='b', alpha=0.2)
    ax.plot(parameter_range, mean_test_score,
            label="Cross Validation Score", color='g')
    ax.fill_between(parameter_range,
                    mean_test_score - std_test_score,
                    mean_test_score + std_test_score,
                    color='g', alpha=0.2)
    ax.set_xlabel("n_estimators")
    ax.set_ylabel("Accuracy")
    ax.set_title("Validation curve: random forest")
    # Without this call the `label=` values above are never displayed.
    ax.legend(loc="best")
    plt.show()


In [None]:
# Plot mean training vs. cross-validation accuracy over a sweep of n_estimators values.
detect_overfit()

In [None]:
# Fit, pickle, and evaluate the random-forest pipeline, passing n=100.
train_with_estimator(100)


In [None]:
# Fit, pickle, and evaluate the random-forest pipeline, passing n=200.
train_with_estimator(200)

In [None]:
# Fit, pickle, and evaluate the random-forest pipeline, passing n=500.
train_with_estimator(500)

In [None]:
# Fit, pickle, and evaluate the random-forest pipeline, passing n=1000.
train_with_estimator(1000)