# Final Project
## Nicholas Schenone - A13599911

- 3 trials
- 7 classifiers
    - SVM
    - Logistic Regression
    - Decision Tree
    - Perceptron
    - Multilayer Perceptron
    - KNN
    - Random Forest
- 3 datasets
    - Heart Disease: https://www.kaggle.com/ronitf/heart-disease-uci
    - Mushroom: https://archive.ics.uci.edu/ml/datasets/Mushroom
    - Adult Data Set: https://archive.ics.uci.edu/ml/datasets/Adult
- 2 partitions (20/80, 80/20)
- 2 accuracies per run (train and test)

### Imports

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

import json

import seaborn as sns

### Pre-Process Data

In [None]:
def adult_pre_process(data_path="data/adult/adult.csv", split=0.2):
    """Load the Adult CSV and return scaled features, labels and one split.

    The last CSV column is treated as the target. The target is separated
    from the features *before* one-hot encoding: encoding the whole frame
    turns a categorical target into complementary indicator columns, and
    slicing "everything but the last column" would then leave one of those
    indicators inside X (label leakage).

    Args:
        data_path: Path of the CSV file to read.
        split: Fraction of rows placed in the test partition.

    Returns:
        Tuple ``(X, y, X_train, X_test, y_train, y_test)`` where ``X`` is the
        standardised one-hot feature matrix and ``y`` a 1-D label array.
    """
    df_adult = pd.read_csv(data_path)

    feature_frame = df_adult.iloc[:, :-1]
    target_frame = df_adult.iloc[:, -1:]

    X = pd.get_dummies(feature_frame)
    # NOTE(review): the scaler is fit on all rows before splitting, so the
    # test rows influence the scaling of the training rows. Kept as-is so the
    # returned X stays the fully scaled matrix.
    X = StandardScaler().fit_transform(X)

    # get_dummies leaves a numeric target untouched and expands a categorical
    # one; taking the last column yields the indicator of the last category,
    # which is the label the previous implementation produced.
    y = pd.get_dummies(target_frame).iloc[:, -1]
    y = y.values.ravel()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split)

    return X, y, X_train, X_test, y_train, y_test

def heart_pre_process(data_path="data/heart_disease/heart.csv", split=0.2):
    """Load the heart-disease CSV and return scaled features, labels and a split.

    Every column except the last is used as a feature and standardised with
    ``StandardScaler``; the last column is the label.

    Args:
        data_path: Path of the CSV file to read.
        split: Fraction of rows placed in the test partition.

    Returns:
        Tuple ``(X, y, X_train, X_test, y_train, y_test)``.
    """
    frame = pd.read_csv(data_path)
    feature_frame, label_column = frame.iloc[:, :-1], frame.iloc[:, -1]

    X = StandardScaler().fit_transform(feature_frame)
    y = label_column.values.ravel()

    # train_test_split yields X_train, X_test, y_train, y_test in that order.
    partitions = train_test_split(X, y, test_size=split)
    return (X, y, *partitions)

def mushroom_pre_process(data_path="data/mushroom/mushroom.csv", split=0.2):
    """Load the header-less Mushroom CSV and return features, labels and a split.

    Column 0 is treated as the target. It is separated from the features
    *before* one-hot encoding: encoding the whole frame expands the target
    into several indicator columns, and taking only the first as ``y`` would
    leave the remaining indicator(s) inside X (label leakage).

    Args:
        data_path: Path of the CSV file to read (read with ``header=None``).
        split: Fraction of rows placed in the test partition.

    Returns:
        Tuple ``(X, y, X_train, X_test, y_train, y_test)`` where ``X`` is the
        standardised one-hot feature matrix and ``y`` a 1-D label array.
    """
    df_mushroom = pd.read_csv(data_path, header=None)

    target_frame = df_mushroom.iloc[:, :1]
    feature_frame = df_mushroom.iloc[:, 1:]

    X = pd.get_dummies(feature_frame)
    # NOTE(review): the scaler is fit on all rows before splitting, so the
    # test rows influence the scaling of the training rows.
    X = StandardScaler().fit_transform(X)

    # First column after encoding the target alone: the indicator of its first
    # category (or the raw column if it is numeric), as before.
    y = pd.get_dummies(target_frame).iloc[:, 0]
    y = y.values.ravel()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split)

    return X, y, X_train, X_test, y_train, y_test

def pre_process(dataset, split=0.2):
    """Dispatch to the pre-processing function of the named dataset.

    Args:
        dataset: One of ``"happy"``, ``"mush"``, ``"heart"`` or ``"adult"``.
        split: Test-set fraction forwarded to the dataset loader.

    Returns:
        Whatever the selected loader returns.

    Raises:
        ValueError: If ``dataset`` is not a known name (previously this fell
            through and returned ``None``, which only failed later when the
            caller unpacked the result).
    """
    if dataset == "happy":
        # NOTE(review): happiness_pre_process is not defined in the visible
        # part of this file; this branch raises NameError unless it is
        # defined elsewhere.
        return happiness_pre_process(split=split)
    if dataset == "mush":
        return mushroom_pre_process(split=split)
    if dataset == "heart":
        return heart_pre_process(split=split)
    if dataset == "adult":
        return adult_pre_process(split=split)
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected 'happy', 'mush', 'heart' or 'adult'"
    )

In [None]:
# Heart disease: 20% of the rows held out for testing.
(
    heart_X,
    heart_y,
    heart_X_train,
    heart_X_test,
    heart_y_train,
    heart_y_test,
) = heart_pre_process(split=0.2)

# Mushroom: 20% of the rows held out for testing.
(
    mush_X,
    mush_y,
    mush_X_train,
    mush_X_test,
    mush_y_train,
    mush_y_test,
) = mushroom_pre_process(split=0.2)

# Adult: 80% of the rows held out, i.e. only 20% are used for training here.
(
    adult_X,
    adult_y,
    adult_X_train,
    adult_X_test,
    adult_y_train,
    adult_y_test,
) = adult_pre_process(split=0.8)

### Classifiers and Functions

In [None]:
# SVM
def clf_SVM(param_grid):
    """Build an ``svm.SVC`` from the "C", "gamma" and "kernel" entries of *param_grid*.

    ``max_iter`` is fixed at 10000. A missing key raises ``KeyError``.
    """
    settings = {name: param_grid[name] for name in ("C", "gamma", "kernel")}
    return svm.SVC(max_iter=10000, **settings)

# Logistic Regression
def clf_log(param_grid):
    """Build a ``LogisticRegression`` from the "C" and "penalty" entries of *param_grid*.

    The solver is fixed to "liblinear" and ``max_iter`` to 10000. A missing
    key raises ``KeyError``.
    """
    settings = {name: param_grid[name] for name in ("C", "penalty")}
    return LogisticRegression(solver="liblinear", max_iter=10000, **settings)

# Decision Tree
def clf_tree(param_grid):
    """Build a ``DecisionTreeClassifier`` from the "criterion" and "max_depth" entries."""
    split_criterion = param_grid["criterion"]
    depth_limit = param_grid["max_depth"]
    return DecisionTreeClassifier(criterion=split_criterion, max_depth=depth_limit)

# Perceptron
def clf_perc(param_grid):
    """Build a ``Perceptron`` from *param_grid*.

    Reads the keys "penalty", "alpha", "max_iter", "tol" and
    "early_stopping"; a missing key raises ``KeyError``.
    """
    wanted = ("penalty", "alpha", "max_iter", "tol", "early_stopping")
    settings = {name: param_grid[name] for name in wanted}
    return Perceptron(**settings)

# Multi-Layer Perceptron
def clf_mlp(param_grid):
    """Build an ``MLPClassifier`` from *param_grid*.

    Reads the keys "activation", "solver", "hidden_layer_sizes", "max_iter",
    "tol" and "early_stopping"; a missing key raises ``KeyError``.
    """
    wanted = (
        "activation",
        "solver",
        "hidden_layer_sizes",
        "max_iter",
        "tol",
        "early_stopping",
    )
    settings = {name: param_grid[name] for name in wanted}
    return MLPClassifier(**settings)

# KNN
def clf_knn(param_grid):
    """Build a ``KNeighborsClassifier`` using the "n_neighbors" entry of *param_grid*."""
    neighbour_count = param_grid["n_neighbors"]
    return KNeighborsClassifier(n_neighbors=neighbour_count)

# Random Forest
def clf_rf(param_grid):
    """Build a ``RandomForestClassifier`` from *param_grid*.

    Reads the keys "bootstrap", "max_depth", "max_features",
    "min_samples_leaf", "min_samples_split" and "n_estimators"; a missing key
    raises ``KeyError``.
    """
    wanted = (
        "bootstrap",
        "max_depth",
        "max_features",
        "min_samples_leaf",
        "min_samples_split",
        "n_estimators",
    )
    settings = {name: param_grid[name] for name in wanted}
    return RandomForestClassifier(**settings)

# General
def clf(model, param_grid):
    """Create the classifier identified by *model* from *param_grid*.

    Args:
        model: One of "svm", "log", "tree", "perc", "mlp", "knn" or "rf".
        param_grid: Mapping of hyperparameter name to value, forwarded to the
            matching ``clf_*`` factory.

    Returns:
        The classifier built by the matching factory.

    Raises:
        ValueError: If *model* is not a known name (previously the function
            fell through and returned ``None``).
    """
    known_models = ("svm", "log", "tree", "perc", "mlp", "knn", "rf")
    if model not in known_models:
        raise ValueError(
            f"Unknown model {model!r}; expected one of {', '.join(known_models)}"
        )

    factories = {
        "svm": clf_SVM,
        "log": clf_log,
        "tree": clf_tree,
        "perc": clf_perc,
        "mlp": clf_mlp,
        "knn": clf_knn,
        "rf": clf_rf,
    }
    return factories[model](param_grid)
    
def train_model(classifier, X_train, y_train):
    """Fit *classifier* in place on the training data; returns nothing."""
    fit = classifier.fit
    fit(X_train, y_train)

def hyper_tune(X_train, y_train, estimator, param_grid, k_top=3):
    """Run a randomized hyperparameter search and return the best candidates.

    Uses ``RandomizedSearchCV`` with 10-fold cross-validation, 20 sampled
    candidates, all CPU cores and verbose progress output.

    Args:
        X_train: Training features.
        y_train: Training labels.
        estimator: Unfitted estimator to tune.
        param_grid: Mapping of parameter name to candidate values.
        k_top: Number of best-ranked parameter sets to return.

    Returns:
        List of up to ``k_top`` parameter dicts, best rank first.
    """
    search = RandomizedSearchCV(estimator=estimator, param_distributions=param_grid, cv=10, n_iter=20, n_jobs=-1, verbose=10)
    search.fit(X_train, y_train)

    results = pd.DataFrame(search.cv_results_)
    # Stable sort keeps tied candidates in sampling order.
    ranked = results.sort_values(by='rank_test_score', kind='stable')

    # Select by POSITION after sorting. Label-based ``.loc[i]`` would return
    # the rows whose original index is 0..k_top-1, i.e. the first sampled
    # candidates rather than the best-ranked ones.
    out = ranked['params'].head(k_top).tolist()

    print(f"Best {k_top} params:", out)
    return out

def evalModel(classifer, X_test, y_test):
    """Score a fitted classifier on the given data.

    Args:
        classifer: Fitted estimator exposing ``predict``. (The parameter name
            keeps its original spelling so existing callers keep working.)
        X_test: Feature matrix to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        Tuple ``(accuracy, precision, recall, f_score)``; precision, recall
        and F1 are macro-averaged.
    """
    # Use the argument. The previous body read a global named ``classifier``
    # and therefore ignored whatever estimator was passed in.
    y_pred = classifer.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    f_score = f1_score(y_test, y_pred, average="macro")

    return (accuracy, precision, recall, f_score)

## Hyperparameter Tuning

### SVM

In [None]:
# Candidate SVC settings: regularisation strength, kernel coefficient, kernel.
svm_param_grid = dict(
    C=[1, 10, 100, 1000, 10000],
    gamma=[1e-6, 1e-5, 1e-4, 1e-3, 1e-2],
    kernel=["linear", "rbf"],
)

In [None]:
# SVM search on the mushroom training data; the returned parameter list is saved as JSON.
best_param_grid_mush = hyper_tune(
    X_train=mush_X_train,
    y_train=mush_y_train,
    estimator=svm.SVC(),
    param_grid=svm_param_grid,
)

with open('params/svm/best_param_grid_mush', 'w') as out_file:
    json.dump(best_param_grid_mush, out_file)

In [None]:
# SVM search on the heart training data; the returned parameter list is saved as JSON.
best_param_grid_heart = hyper_tune(
    X_train=heart_X_train,
    y_train=heart_y_train,
    estimator=svm.SVC(),
    param_grid=svm_param_grid,
)

with open('params/svm/best_param_grid_heart', 'w') as out_file:
    json.dump(best_param_grid_heart, out_file)

In [None]:
# SVM search on the adult training data; the returned parameter list is saved as JSON.
best_param_grid_adult = hyper_tune(
    X_train=adult_X_train,
    y_train=adult_y_train,
    estimator=svm.SVC(),
    param_grid=svm_param_grid,
)

with open('params/svm/best_param_grid_adult', 'w') as out_file:
    json.dump(best_param_grid_adult, out_file)

### Logistic Regression

In [None]:
# Candidate logistic-regression settings: inverse regularisation strength and penalty type.
log_param_grid = dict(
    C=[1, 10, 100, 1000, 10000],
    penalty=["l1", "l2"],
)

In [None]:
# Mushroom Logistic Regression Tuning
# The estimator mirrors clf_log (liblinear, max_iter=10000): the grid contains
# the "l1" penalty, which a default-constructed LogisticRegression cannot fit,
# and tuning should use the same configuration as the final model.
best_param_grid_mush = hyper_tune(
    mush_X_train,
    mush_y_train,
    LogisticRegression(solver="liblinear", max_iter=10000),
    log_param_grid,
)

with open('params/log/best_param_grid_mush', 'w') as f:
    json.dump(best_param_grid_mush, f)

In [None]:
# Heart Logistic Regression Tuning
# The estimator mirrors clf_log (liblinear, max_iter=10000): the grid contains
# the "l1" penalty, which a default-constructed LogisticRegression cannot fit,
# and tuning should use the same configuration as the final model.
best_param_grid_heart = hyper_tune(
    heart_X_train,
    heart_y_train,
    LogisticRegression(solver="liblinear", max_iter=10000),
    log_param_grid,
)

with open('params/log/best_param_grid_heart', 'w') as f:
    json.dump(best_param_grid_heart, f)

In [None]:
# Adult Logistic Regression Tuning
# The estimator mirrors clf_log (liblinear, max_iter=10000): the grid contains
# the "l1" penalty, which a default-constructed LogisticRegression cannot fit,
# and tuning should use the same configuration as the final model.
best_param_grid_adult = hyper_tune(
    adult_X_train,
    adult_y_train,
    LogisticRegression(solver="liblinear", max_iter=10000),
    log_param_grid,
)

with open('params/log/best_param_grid_adult', 'w') as f:
    json.dump(best_param_grid_adult, f)

### Decision Tree

In [None]:
# Candidate decision-tree settings: split criterion and maximum depth.
tree_param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[4, 6, 8, 12],
)

In [None]:
# Decision-tree search on the mushroom training data; the result is saved as JSON.
best_param_grid_mush = hyper_tune(
    X_train=mush_X_train,
    y_train=mush_y_train,
    estimator=DecisionTreeClassifier(),
    param_grid=tree_param_grid,
)

with open('params/tree/best_param_grid_mush', 'w') as out_file:
    json.dump(best_param_grid_mush, out_file)

In [None]:
# Decision-tree search on the heart training data; the result is saved as JSON.
best_param_grid_heart = hyper_tune(
    X_train=heart_X_train,
    y_train=heart_y_train,
    estimator=DecisionTreeClassifier(),
    param_grid=tree_param_grid,
)

with open('params/tree/best_param_grid_heart', 'w') as out_file:
    json.dump(best_param_grid_heart, out_file)

In [None]:
# Decision-tree search on the adult training data; the result is saved as JSON.
best_param_grid_adult = hyper_tune(
    X_train=adult_X_train,
    y_train=adult_y_train,
    estimator=DecisionTreeClassifier(),
    param_grid=tree_param_grid,
)

with open('params/tree/best_param_grid_adult', 'w') as out_file:
    json.dump(best_param_grid_adult, out_file)

### Perceptron

In [None]:
# Candidate perceptron settings.
perc_param_grid = dict(
    penalty=[None, "l1", "l2", "elasticnet"],
    alpha=[0.001, 0.0001, 0.00001],
    max_iter=[500, 1000, 2000],
    tol=[1e-4, 1e-3, 1e-2],
    early_stopping=[True, False],
)

In [None]:
# Perceptron search on the mushroom training data; the result is saved as JSON.
best_param_grid_mush = hyper_tune(
    X_train=mush_X_train,
    y_train=mush_y_train,
    estimator=Perceptron(),
    param_grid=perc_param_grid,
)

with open('params/perc/best_param_grid_mush', 'w') as out_file:
    json.dump(best_param_grid_mush, out_file)

In [None]:
# Perceptron search on the heart training data; the result is saved as JSON.
best_param_grid_heart = hyper_tune(
    X_train=heart_X_train,
    y_train=heart_y_train,
    estimator=Perceptron(),
    param_grid=perc_param_grid,
)

with open('params/perc/best_param_grid_heart', 'w') as out_file:
    json.dump(best_param_grid_heart, out_file)

In [None]:
# Perceptron search on the adult training data; the result is saved as JSON.
best_param_grid_adult = hyper_tune(
    X_train=adult_X_train,
    y_train=adult_y_train,
    estimator=Perceptron(),
    param_grid=perc_param_grid,
)

with open('params/perc/best_param_grid_adult', 'w') as out_file:
    json.dump(best_param_grid_adult, out_file)

### Multi-Layer Perceptron

In [None]:
# Candidate multi-layer perceptron settings (single hidden layer of varying width).
mlp_param_grid = dict(
    hidden_layer_sizes=[(100,), (50,), (200,), (25,)],
    activation=["identity", "logistic", "tanh", "relu"],
    solver=["lbfgs", "sgd", "adam"],
    max_iter=[200, 100, 300],
    tol=[1e-4, 1e-3, 1e-5],
    early_stopping=[True, False],
)

In [None]:
# Multi-layer perceptron search on the mushroom training data; the result is saved as JSON.
best_param_grid_mush = hyper_tune(
    X_train=mush_X_train,
    y_train=mush_y_train,
    estimator=MLPClassifier(),
    param_grid=mlp_param_grid,
)

with open('params/mlp/best_param_grid_mush', 'w') as out_file:
    json.dump(best_param_grid_mush, out_file)

In [None]:
# Multi-layer perceptron search on the heart training data; the result is saved as JSON.
best_param_grid_heart = hyper_tune(
    X_train=heart_X_train,
    y_train=heart_y_train,
    estimator=MLPClassifier(),
    param_grid=mlp_param_grid,
)

with open('params/mlp/best_param_grid_heart', 'w') as out_file:
    json.dump(best_param_grid_heart, out_file)

In [None]:
# Multi-layer perceptron search on the adult training data; the result is saved as JSON.
best_param_grid_adult = hyper_tune(
    X_train=adult_X_train,
    y_train=adult_y_train,
    estimator=MLPClassifier(),
    param_grid=mlp_param_grid,
)

with open('params/mlp/best_param_grid_adult', 'w') as out_file:
    json.dump(best_param_grid_adult, out_file)

### KNN

In [None]:
# Candidate neighbourhood sizes for k-nearest neighbours.
knn_param_grid = dict(
    n_neighbors=[1, 3, 5, 9, 15, 25, 50, 75, 100],
)

In [None]:
# K-nearest-neighbours search on the mushroom training data; the result is saved as JSON.
best_param_grid_mush = hyper_tune(
    X_train=mush_X_train,
    y_train=mush_y_train,
    estimator=KNeighborsClassifier(),
    param_grid=knn_param_grid,
)

with open('params/knn/best_param_grid_mush', 'w') as out_file:
    json.dump(best_param_grid_mush, out_file)

In [None]:
# K-nearest-neighbours search on the heart training data; the result is saved as JSON.
best_param_grid_heart = hyper_tune(
    X_train=heart_X_train,
    y_train=heart_y_train,
    estimator=KNeighborsClassifier(),
    param_grid=knn_param_grid,
)

with open('params/knn/best_param_grid_heart', 'w') as out_file:
    json.dump(best_param_grid_heart, out_file)

In [None]:
# K-nearest-neighbours search on the adult training data; the result is saved as JSON.
best_param_grid_adult = hyper_tune(
    X_train=adult_X_train,
    y_train=adult_y_train,
    estimator=KNeighborsClassifier(),
    param_grid=knn_param_grid,
)

with open('params/knn/best_param_grid_adult', 'w') as out_file:
    json.dump(best_param_grid_adult, out_file)

### Random Forest

In [None]:
# Candidate random-forest settings.
rf_param_grid = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    # 'auto' was dropped from the list: for classifiers it meant the same as
    # 'sqrt', and recent scikit-learn releases reject it, which made every
    # candidate that sampled it fail. Add e.g. 'log2' to widen the search.
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
}

In [None]:
# Random-forest search on the mushroom training data; the result is saved as JSON.
best_param_grid_mush = hyper_tune(
    X_train=mush_X_train,
    y_train=mush_y_train,
    estimator=RandomForestClassifier(),
    param_grid=rf_param_grid,
)

with open('params/rf/best_param_grid_mush', 'w') as out_file:
    json.dump(best_param_grid_mush, out_file)

In [None]:
# Random-forest search on the heart training data; the result is saved as JSON.
best_param_grid_heart = hyper_tune(
    X_train=heart_X_train,
    y_train=heart_y_train,
    estimator=RandomForestClassifier(),
    param_grid=rf_param_grid,
)

with open('params/rf/best_param_grid_heart', 'w') as out_file:
    json.dump(best_param_grid_heart, out_file)

In [None]:
# Random-forest search on the adult training data; the result is saved as JSON.
best_param_grid_adult = hyper_tune(
    X_train=adult_X_train,
    y_train=adult_y_train,
    estimator=RandomForestClassifier(),
    param_grid=rf_param_grid,
)

with open('params/rf/best_param_grid_adult', 'w') as out_file:
    json.dump(best_param_grid_adult, out_file)

## Main Training Loop

- 3 trials
    - 3 datasets
        - 7 models
            - 2 splits (80/20, 20/80)    

In [None]:
# Experiment configuration for the main training loop.
models = ['log', 'svm', 'tree', 'perc', 'mlp', 'knn', 'rf']  # keys understood by clf()
datasets = ['adult', 'mush', 'heart']  # keys understood by pre_process()
splits = [0.2, 0.8]  # test-set fractions
num_trials = 3

In [None]:
# Results are nested as: dataset -> split -> trial -> model -> {"train", "test"}.
dataset_scores = {}
for dataset in datasets:

    split_scores = {}
    for split in splits:

        # One partition per (dataset, split); every trial and model reuses it.
        X, y, X_train, X_test, y_train, y_test = pre_process(dataset=dataset, split=split)

        trial_scores = {}
        for i in range(num_trials):

            model_scores = {}
            for model in models:

                # Saved parameter list for this model/dataset; trial i uses entry i.
                with open(f'params/{model}/best_param_grid_{dataset}', 'r') as f:
                    best_param_grid = json.load(f)

                classifier = clf(model=model, param_grid=best_param_grid[i])

                print(f"Training {dataset}-{split}-{i}-{model}")
                train_model(classifier, X_train, y_train)

                print(f"Evaluating {dataset}-{split}-{i}-{model}")
                metric_names = ("accuracy", "precision", "recall", "f1_score")
                test = dict(zip(metric_names, evalModel(classifier, X_test, y_test)))
                train = dict(zip(metric_names, evalModel(classifier, X_train, y_train)))

                classifier_eval = {"train": train, "test": test}
                model_scores[f"model_{model}"] = classifier_eval

            trial_scores[f"trial_{i}"] = model_scores

        split_scores[f"split_{split}"] = trial_scores

    dataset_scores[f"data_{dataset}"] = split_scores

# Persist all scores so the result tables can be rebuilt without retraining.
with open('scores/dataset_scores', 'w') as f:
    json.dump(dataset_scores, f)

## Dataset Results

In [None]:
# Reload the scores written by the training loop.
with open('scores/dataset_scores', 'r') as scores_file:
    dataset_scores = json.loads(scores_file.read())

In [None]:
def data_table_acc(_split_name, _model_name, dataset_scores=dataset_scores, models=models):
    """Build a one-row accuracy summary for one model and one split.

    For each of the Adult, Mushroom and Heart datasets the train and test
    accuracies of every trial are collected and reported as
    ``"mean ± std%"`` (percent, two decimals). Two further columns hold the
    mean of the per-dataset train and test means.

    Args:
        _split_name: Split key in the scores dict, "split_0.2" or "split_0.8".
        _model_name: Model key in the scores dict, e.g. "model_svm".
        dataset_scores: Nested scores dict (dataset -> split -> trial ->
            model -> train/test -> metric). The default is bound when the
            function is defined.
        models: Unused; kept for backward compatibility.

    Returns:
        A ``DataFrame`` with a single row indexed by the model's display name.
    """
    metric = "accuracy"

    # (key in dataset_scores, column label), in output column order.
    dataset_labels = (
        ("data_adult", "Adult"),
        ("data_mush", "Mushroom"),
        ("data_heart", "Heart"),
    )

    disp_split = {
        "split_0.2" : "80/20",
        "split_0.8" : "20/80"
    }

    disp_model = {
        # Label corrected from "Logarithmic Regression".
        'model_log': "Logistic Regression",
        'model_svm': "Support Vector Machine",
        'model_tree': "Decision Tree",
        'model_perc': "Perceptron",
        'model_mlp': "Multi-Layer Perceptron",
        'model_knn': "K-Nearest Neighbor",
        'model_rf': "Random Forest"
    }

    split_label = disp_split[_split_name]

    row = {}
    partition_means = {"train": [], "test": []}
    for dataset_key, dataset_label in dataset_labels:
        # A dataset absent from the scores yields empty arrays (NaN cells).
        if dataset_key in dataset_scores:
            trials = dataset_scores[dataset_key][_split_name]
        else:
            trials = {}

        for partition, partition_label in (("train", "Train"), ("test", "Test")):
            values = np.asarray(
                [trial[_model_name][partition][metric] for trial in trials.values()]
            )
            row[f"{dataset_label} {partition_label} Acc {split_label}"] = (
                f"{np.around(100 * values.mean(), decimals=2)} ± {np.around(100 * values.std(), decimals=2)}%"
            )
            partition_means[partition].append(values.mean())

    row[f"Avg Train Acc {split_label}"] = f"{np.around(100 * np.array(partition_means['train']).mean(), decimals=2)}%"
    row[f"Avg Test Acc {split_label}"] = f"{np.around(100 * np.array(partition_means['test']).mean(), decimals=2)}%"

    return pd.DataFrame([row], index=[f"{disp_model[_model_name]}"])

In [None]:
def data_table_f1(_split_name, _model_name, dataset_scores=dataset_scores, models=models):
    """Build a one-row F1-score summary for one model and one split.

    For each of the Adult, Mushroom and Heart datasets the train and test F1
    scores of every trial are collected and reported as ``"mean ± std%"``
    (percent, two decimals). Two further columns hold the mean of the
    per-dataset train and test means.

    Args:
        _split_name: Split key in the scores dict, "split_0.2" or "split_0.8".
        _model_name: Model key in the scores dict, e.g. "model_svm".
        dataset_scores: Nested scores dict (dataset -> split -> trial ->
            model -> train/test -> metric). The default is bound when the
            function is defined.
        models: Unused; kept for backward compatibility.

    Returns:
        A ``DataFrame`` with a single row indexed by the model's display name.
    """
    metric = "f1_score"

    # (key in dataset_scores, column label), in output column order.
    dataset_labels = (
        ("data_adult", "Adult"),
        ("data_mush", "Mushroom"),
        ("data_heart", "Heart"),
    )

    disp_split = {
        "split_0.2" : "80/20",
        "split_0.8" : "20/80"
    }

    disp_model = {
        # Label corrected from "Logarithmic Regression".
        'model_log': "Logistic Regression",
        'model_svm': "Support Vector Machine",
        'model_tree': "Decision Tree",
        'model_perc': "Perceptron",
        'model_mlp': "Multi-Layer Perceptron",
        'model_knn': "K-Nearest Neighbor",
        'model_rf': "Random Forest"
    }

    split_label = disp_split[_split_name]

    row = {}
    partition_means = {"train": [], "test": []}
    for dataset_key, dataset_label in dataset_labels:
        # A dataset absent from the scores yields empty arrays (NaN cells).
        if dataset_key in dataset_scores:
            trials = dataset_scores[dataset_key][_split_name]
        else:
            trials = {}

        for partition, partition_label in (("train", "Train"), ("test", "Test")):
            values = np.asarray(
                [trial[_model_name][partition][metric] for trial in trials.values()]
            )
            row[f"{dataset_label} {partition_label} F1 {split_label}"] = (
                f"{np.around(100 * values.mean(), decimals=2)} ± {np.around(100 * values.std(), decimals=2)}%"
            )
            partition_means[partition].append(values.mean())

    row[f"Avg Train F1 {split_label}"] = f"{np.around(100 * np.array(partition_means['train']).mean(), decimals=2)}%"
    row[f"Avg Test F1 {split_label}"] = f"{np.around(100 * np.array(partition_means['test']).mean(), decimals=2)}%"

    return pd.DataFrame([row], index=[f"{disp_model[_model_name]}"])

### 80/20 Split

In [None]:
# Accuracy table for the 80/20 split: one row per model, in model_list order.
model_list = ['model_log', 'model_svm', 'model_tree', 'model_perc', 'model_mlp', 'model_knn','model_rf']
data_tables = [data_table_acc("split_0.2", model_key) for model_key in model_list]
data_table_split_02_acc = pd.concat(data_tables)
# Optional: sort by 'Avg Test Acc 80/20', or save with .to_csv("tables/data_table_split_02_acc.csv")
data_table_split_02_acc

In [None]:
# F1 table for the 80/20 split: one row per model, in model_list order.
model_list = ['model_log', 'model_svm', 'model_tree', 'model_perc', 'model_mlp', 'model_knn','model_rf']
data_tables = [data_table_f1("split_0.2", model_key) for model_key in model_list]
data_table_split_02_f1 = pd.concat(data_tables)
# Optional: sort by 'Avg Test F1 80/20', or save with .to_csv("tables/data_table_split_02_f1.csv")
data_table_split_02_f1

### 20/80 Split

In [None]:
# Accuracy table for the 20/80 split: one row per model, in model_list order.
model_list = ['model_log', 'model_svm', 'model_tree', 'model_perc', 'model_mlp', 'model_knn','model_rf']
data_tables = [data_table_acc("split_0.8", model_key) for model_key in model_list]
data_table_split_08_acc = pd.concat(data_tables)
# Optional: sort by 'Avg Test Acc 20/80', or save with .to_csv("tables/data_table_split_08_acc.csv")
data_table_split_08_acc

In [None]:
# F1 table for the 20/80 split: one row per model, in model_list order.
model_list = ['model_log', 'model_svm', 'model_tree', 'model_perc', 'model_mlp', 'model_knn','model_rf']
data_tables = [data_table_f1("split_0.8", model_key) for model_key in model_list]
data_table_split_08_f1 = pd.concat(data_tables)
# Optional: sort by 'Avg Test F1 20/80', or save with .to_csv("tables/data_table_split_08_f1.csv")
data_table_split_08_f1

### Ranked Avg Test Acc

In [None]:
# Rank models by average test accuracy over both splits.
acc_02 = data_table_split_02_acc["Avg Test Acc 80/20"]
acc_08 = data_table_split_08_acc["Avg Test Acc 20/80"]
df_acc = pd.concat((acc_02, acc_08), axis=1)

# Strip the "%" suffix by assigning the cleaned column back. An in-place
# replace on a selected column acts on a temporary object under pandas
# Copy-on-Write, leaving df_acc unchanged and breaking the float cast below.
for acc_column in ('Avg Test Acc 80/20', 'Avg Test Acc 20/80'):
    df_acc[acc_column] = df_acc[acc_column].replace(regex=True, to_replace=r'%', value=r'')

# Row-wise mean of the two splits, assigned as a Series.
df_acc['Avg Test Acc'] = df_acc.astype('float64').mean(axis=1)
df_acc.sort_values(by=['Avg Test Acc'], ascending=False)

### Ranked Avg F1 Score

In [None]:
# Rank models by average test F1 score over both splits.
f1_02 = data_table_split_02_f1["Avg Test F1 80/20"]
f1_08 = data_table_split_08_f1["Avg Test F1 20/80"]
df_f1 = pd.concat((f1_02, f1_08), axis=1)

# Strip the "%" suffix by assigning the cleaned column back. An in-place
# replace on a selected column acts on a temporary object under pandas
# Copy-on-Write, leaving df_f1 unchanged and breaking the float cast below.
for f1_column in ('Avg Test F1 80/20', 'Avg Test F1 20/80'):
    df_f1[f1_column] = df_f1[f1_column].replace(regex=True, to_replace=r'%', value=r'')

# Row-wise mean of the two splits, assigned as a Series.
df_f1['Avg Test F1'] = df_f1.astype('float64').mean(axis=1)
df_f1.sort_values(by=['Avg Test F1'], ascending=False)