This notebook is used to train classifiers.

In [None]:
# Mount Google Drive at /content/drive (requires a Google Colab runtime).
# NOTE(review): the data path defined later is the relative "drive/MyDrive/...",
# which presumably relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import copy
from random import sample
from joblib import dump, load
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import metrics
from sklearn.metrics import classification_report, f1_score, fbeta_score, make_scorer, accuracy_score, confusion_matrix, plot_confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, learning_curve, validation_curve
from sklearn.utils.class_weight import compute_class_weight

plt.style.use('ggplot')
%matplotlib inline

# Silence warnings (added to hide sklearn deprecation notices).
# NOTE: rebinding warnings.warn to a no-op mutes every warning issued through
# warnings.warn in this process, not only sklearn deprecations.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (no-op replacement for warnings.warn)."""
    return None


warnings.warn = warn

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

In [None]:
# Data folder on the mounted drive, and the sub-folder where models are stored
path = "drive/MyDrive/UNI/IPOTERI/data/cad/"
path_models = path + "models/"

# File-name suffixes appended to the CSV / model file names (both empty here)
suffix_old = suffix = ""

### Read Data

In [None]:
# Load the three splits, using the first CSV column as the index.
# The training file name is built with `suffix`, validation and test with `suffix_old`.
df_train = pd.read_csv("{}train{}.csv".format(path, suffix), index_col=0)
df_valid = pd.read_csv("{}valid{}.csv".format(path, suffix_old), index_col=0)
df_test = pd.read_csv("{}test{}.csv".format(path, suffix_old), index_col=0)

# # Use only top 7 variables
# top_variables = [
#     "Hyperlipemia\nHistoty of hyperlipemia",
#     "FE",
#     "Previous CABG",
#     "Diabetes\nHistory of diabetes",
#     "Previous Myocardial Infarction",
#     "Smoke\nHistory of smoke",
#     "Documented resting \nor exertional ischemia",
#     "Survive7Y"
# ]
# df_train = df_train.loc[:, top_variables]
# df_valid = df_valid.loc[:, top_variables]
# df_test = df_test.loc[:, top_variables]

# Convert every split to a NumPy array
train = df_train.to_numpy()
valid = df_valid.to_numpy()
test = df_test.to_numpy()

# All columns but the last are features; the last column is the label
X_train, y_train = train[:, :-1], train[:, -1]
X_valid, y_valid = valid[:, :-1], valid[:, -1]
X_test, y_test = test[:, :-1], test[:, -1]

# Column names of the training frame (the label column is included as last entry)
feat_names = df_train.columns.tolist()

# Class distribution of each split, one line per split (train, valid, test)
from collections import Counter
print(*map(Counter, (y_train, y_valid, y_test)), sep="\n")

In [None]:
# Numerical features that can be standardized
feat_names_num = ["Age", "FE", "Creatinina"]

# Preprocess only the numerical features
def get_preprocess_std_num(feat_names):
    """Build a ColumnTransformer that standardizes only the numerical columns.

    Parameters
    ----------
    feat_names : list
        Column names, in the same order as the columns of the feature matrix.
        Entries of ``feat_names_num`` that do not appear in it are skipped.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer that applies ``StandardScaler`` to the matching
        column positions and passes every other column through unchanged.
    """
    # Positions of the numerical features that are actually present
    num_feat_index = [
        feat_names.index(name) for name in feat_names_num if name in feat_names
    ]
    return ColumnTransformer(
        transformers=[('stand', StandardScaler(), num_feat_index)],
        remainder="passthrough",
    )

# Scaler restricted to the numerical columns, and a plain StandardScaler
preprocess_std = get_preprocess_std_num(feat_names)
preprocess_std_all = StandardScaler()

# Fit on the training split only, then transform train and valid with the fitted object
process_tmp = preprocess_std.fit(X_train)
X_train_std, X_valid_std = (
    process_tmp.transform(split) for split in (X_train, X_valid)
)

In [None]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : dict
        Mapping with the keys ``'rank_test_score'``, ``'mean_test_score'``,
        ``'std_test_score'`` and ``'params'`` (e.g. a search's ``cv_results_``).
    n_top : int, default 3
        Ranks 1..n_top are reported. For each rank only the first matching
        candidate is printed; ranks with no candidate print nothing.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        matches = np.flatnonzero(ranks == rank)
        if matches.size == 0:
            continue
        # Ties share a rank: only the first tied candidate is shown
        first = matches[0]
        mean_score = results['mean_test_score'][first]
        std_score = results['std_test_score'][first]
        print(f"Model rank: {rank}")
        print(f"Mean validation score: {mean_score:.3f} (std: {std_score:.3f})")
        print(f"Parameters: {results['params'][first]}")
        print("")
            

# Evaluate models
def evaluate(pipe, X, y, plot=False):
    """Print classification metrics of a fitted estimator on one data split.

    Parameters
    ----------
    pipe : estimator
        Fitted object exposing ``predict`` and ``predict_proba``.
    X, y : array-like
        Features and true labels of the split to score.
    plot : bool, default False
        If True the confusion matrix is drawn with ``plot_confusion_matrix``;
        otherwise it is printed as text.
    """
    predictions = pipe.predict(X)
    print(classification_report(y, predictions, digits=3))

    # ROC AUC is computed from column 1 of the predicted probabilities
    positive_scores = pipe.predict_proba(X)[:, 1]
    print(f"auc macro {roc_auc_score(y, positive_scores):.3f}")

    if not plot:
        print("confusion matrix")
        print(confusion_matrix(y, predictions))
        return

    plot_confusion_matrix(pipe, X, y, normalize=None, values_format='')
    plt.grid(False)


# Train and evaluate
def train_and_evaluate(
    preprocess, 
    model, 
    hyperparams, 
    X_train, 
    y_train, 
    X_valid, 
    y_valid, 
    scoring="f1_macro", 
    iter=5000, 
    save=False, 
    savename=""
):
    """Tune a preprocess+model pipeline with a randomized search and evaluate it.

    Parameters
    ----------
    preprocess : transformer
        First pipeline step (named ``'preprocess'``).
    model : estimator
        Second pipeline step (named ``'model'``).
    hyperparams : dict
        Passed to ``RandomizedSearchCV`` as ``param_distributions``; keys must
        address pipeline steps (e.g. ``'model__C'``).
    X_train, y_train : array-like
        Data the search is fitted on (2-fold cross-validation).
    X_valid, y_valid : array-like
        Held-out data, used only for the printed evaluation.
    scoring : str, default "f1_macro"
        Scoring passed to the search.
    iter : int, default 5000
        Number of sampled parameter settings (``n_iter``).
    save : bool, default False
        If True the best pipeline is dumped to
        ``{path_models}{savename}{suffix}.joblib``.
    savename : str, default ""
        File-name stem used when ``save`` is True.

    Returns
    -------
    The refitted best estimator found by the search.
    """
    pipeline = Pipeline(steps=[('preprocess', preprocess), ('model', model)])

    search = RandomizedSearchCV(
        pipeline,
        param_distributions=hyperparams,
        n_iter=iter,
        scoring=scoring,
        cv=2,
        n_jobs=-1,                # use all processors
        refit=True,               # refit the best model at the end
        return_train_score=True,
        verbose=True,
    )
    search.fit(X_train, y_train)
    best = search.best_estimator_

    # Metrics on the training split first, then on the validation split
    for features, labels in ((X_train, y_train), (X_valid, y_valid)):
        evaluate(best, features, labels)
    report(search.cv_results_, n_top=5)

    if save:
        dump(best, f"{path_models}{savename}{suffix}.joblib")

    return best

### Training


In [None]:
from sklearn.linear_model import LogisticRegression

# LogisticRegression accepts only some penalty / solver / dual combinations:
#   - 'newton-cg', 'lbfgs' and 'sag' support the l2 penalty only;
#   - 'liblinear' supports l1 and l2 and is the only solver with a dual
#     formulation (l2 penalty only);
#   - 'saga' supports l1, l2 and elasticnet, the latter requiring `l1_ratio`.
# Sampling the three settings independently makes most candidates fail to fit,
# so the space is split into sub-spaces that contain valid combinations only.
# When given a list of dicts, RandomizedSearchCV first picks one dict uniformly
# and then samples the parameters from it.

# Settings shared by every sub-space
logreg_shared = {
    'model__warm_start': [True, False],
    'model__C': stats.randint(1, 10),
    'model__max_iter': stats.randint(50, 500),
}

hyperparams = [
    # l2 penalty, primal formulation: every solver
    {**logreg_shared,
     'model__penalty': ['l2'],
     'model__dual': [False],
     'model__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
    # l2 penalty, dual formulation: liblinear only
    {**logreg_shared,
     'model__penalty': ['l2'],
     'model__dual': [True],
     'model__solver': ['liblinear']},
    # l1 penalty: liblinear or saga
    {**logreg_shared,
     'model__penalty': ['l1'],
     'model__dual': [False],
     'model__solver': ['liblinear', 'saga']},
    # elasticnet penalty: saga only, with an explicit l1/l2 mixing ratio
    {**logreg_shared,
     'model__penalty': ['elasticnet'],
     'model__dual': [False],
     'model__solver': ['saga'],
     'model__l1_ratio': stats.uniform(0, 1)},
]

model = LogisticRegression(class_weight="balanced")
train_and_evaluate(preprocess_std, model, hyperparams, X_train, y_train, X_valid, y_valid, scoring="f1_macro", iter=5000, save=True, savename=f"logreg2")
# metrics.plot_roc_curve(pipe, X_valid, y_valid)

# import math
# w = logreg.coef_[0]
# feature_importance = pd.DataFrame(df_feat.columns[:-1], columns=["features"])
# feature_importance["importance"] = pow(math.e, w)
# feature_importance = feature_importance.sort_values(by = ["importance"], ascending=False)
# feature_importance

In [None]:
from sklearn.svm import SVC

# Class-weighted SVC; probability=True enables predict_proba, which evaluate() calls
model = SVC(class_weight="balanced", probability=True)

# Search space for the SVC step of the pipeline
hyperparams = dict(
    model__C=stats.randint(100, 600),
    model__kernel=['rbf', 'poly', 'sigmoid'],
    model__degree=stats.randint(5, 200),
    model__gamma=['scale', 'auto'],
    model__coef0=stats.uniform(0.0, 1),
    model__max_iter=[400, 800, 1200, 1600],
)

train_and_evaluate(
    preprocess_std, model, hyperparams,
    X_train, y_train, X_valid, y_valid,
    scoring="f1_macro", iter=5000, save=True, savename="svc2",
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()

# Search space for the k-nearest-neighbours step of the pipeline
hyperparams = dict(
    model__n_neighbors=stats.randint(2, 100),
    model__weights=('uniform', 'distance'),
    model__algorithm=('ball_tree', 'kd_tree'),
    model__leaf_size=stats.randint(10, 60),
)

train_and_evaluate(
    preprocess_std, model, hyperparams,
    X_train, y_train, X_valid, y_valid,
    scoring="f1_macro", iter=5000, save=True, savename="knn2",
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Search space for the random-forest step of the pipeline
hyperparams = {
    'model__n_estimators': stats.randint(10, 200),
    'model__criterion': ('gini', 'entropy'),
    # An integer min_samples_split must be at least 2; a lower bound of 1 made
    # every candidate that drew the value 1 fail to fit.
    'model__min_samples_split': stats.randint(2, 8),
    'model__min_samples_leaf': stats.randint(1, 5),
    'model__max_features': ('sqrt', 'log2', None),
    'model__class_weight': ['balanced', 'balanced_subsample'],
}

model = RandomForestClassifier()
train_and_evaluate(preprocess_std, model, hyperparams, X_train, y_train, X_valid, y_valid, scoring="f1_macro", iter=5000, save=True, savename=f"rf2")

# feature importance use permutation importance
# importance = rf_rand.best_estimator_["model"].feature_importances_
# plt.bar(list(range(len(importance))), importance)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

model = AdaBoostClassifier()

# Search space for the AdaBoost step of the pipeline
hyperparams = dict(
    model__n_estimators=stats.randint(10, 100),
    model__learning_rate=stats.uniform(0.2, 1),
)

train_and_evaluate(
    preprocess_std, model, hyperparams,
    X_train, y_train, X_valid, y_valid,
    scoring="f1_macro", iter=5000, save=True, savename="adaboost2",
)

In [None]:
from sklearn.neural_network import MLPClassifier
import random

# Number of candidate architectures pre-sampled for each network depth
n_layer_candidates = 50

# Calling `rvs` inline inside the dict draws a single value when the dict is
# built, which left only one fixed architecture per depth for the whole search.
# Pre-sample a pool of architectures instead; the search then picks uniformly
# from the pool (half two-layer, half one-layer networks).
two_layer_sizes = [
    [int(first), int(second)]
    for first, second in zip(
        stats.randint.rvs(100, 300, size=n_layer_candidates),
        stats.randint.rvs(50, 150, size=n_layer_candidates),
    )
]
one_layer_sizes = [
    [int(width)] for width in stats.randint.rvs(50, 300, size=n_layer_candidates)
]

# Search space for the MLP step of the pipeline
hyperparams = {
    'model__hidden_layer_sizes': two_layer_sizes + one_layer_sizes,
    'model__solver': ['sgd', 'adam'],
    'model__learning_rate_init': stats.uniform(0.0005, 0.005),
    'model__learning_rate': ('constant', 'adaptive'),
    'model__alpha': stats.uniform(0, 1),
    'model__early_stopping': [True],
    'model__max_iter': stats.randint(300, 500),
}

model = MLPClassifier()
train_and_evaluate(preprocess_std, model, hyperparams, X_train, y_train, X_valid, y_valid, scoring="f1_macro", iter=2500, save=True, savename=f"nn2")

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()

# Search space for the gradient-boosting step of the pipeline
hyperparams = dict(
    model__learning_rate=stats.uniform(0.03, 0.2),
    model__n_estimators=stats.randint(10, 100),
    model__max_depth=stats.randint(2, 6),
    model__max_features=('sqrt', 'log2', None),   # regularization
    model__subsample=(0.25, 0.5, 0.75, 1),        # regularization
)

train_and_evaluate(
    preprocess_std, model, hyperparams,
    X_train, y_train, X_valid, y_valid,
    scoring="f1_macro", iter=5000, save=True, savename="gb2",
)

In [None]:
import xgboost as xgb

# Single-threaded booster: the randomized search itself already runs with n_jobs=-1
model = xgb.XGBClassifier(n_jobs=1)

# Search space for the XGBoost step of the pipeline
hyperparams = dict(
    model__booster=['gbtree', 'gblinear', 'dart'],
    model__eta=stats.uniform(0.05, 0.5),
    model__gamma=stats.uniform(0, 0.2),
    model__max_depth=[2, 3, 4, 6],
    model__n_estimators=stats.randint(10, 100),
    model__subsample=[0.25, 0.5, 0.75, 1],      # Stochastic regularization
    model__lambda=stats.uniform(0.5, 1.5),      # L2 regularization
    model__alpha=stats.uniform(0, 0.5),         # L1 regularization
    model__scale_pos_weight=[0.2, 0.4, 0.8, 1, 2],
)

# Only 5 candidates are sampled here and the result is not saved
train_and_evaluate(
    preprocess_std, model, hyperparams,
    X_train, y_train, X_valid, y_valid,
    scoring="f1_macro", iter=5, save=False, savename="xgb2",
)

### Evaluate

In [None]:
# File stems of saved models; only the entry selected below is loaded
models = [
    "logreg_random_svmsmote_logreg",
    "svc_random_bordersmote_svc",
    "knn2_random_svmsmote_knn2",
    "rf2_random_svmsmote_rf2",
    "adaboost2_random_svmsmote_adaboost2",
    "nn_random_svmsmote_nn",
    "gb_random_svmsmote_gb",
    "xgb2_random_svmsmote_xgb2",
    "xgb2",
    "nn1_random_svmsmote_nn1",
]

# Load the model at index 9 of the list from the models folder
name = models[9] + ".joblib"
model = load(f"{path_models}{name}")

# model.fit(X_train, y_train)
# evaluate(model, X_train, y_train)

# Score the loaded model on the validation split, then on the test split
evaluate(model, X_valid, y_valid)
evaluate(model, X_test, y_test)

# Cell output: shape of the training feature matrix
X_train.shape

In [None]:
# Load the object stored in "tree4.joblib" (kept in a one-element list) and
# evaluate it on a validation and a test split.
# NOTE(review): X_valid4, y_valid4, X_test4 and y_test4 are not defined anywhere
# in the visible notebook; unless they come from a cell not shown here, this
# cell raises NameError on a fresh "Restart & Run All".
tree = [load(path_models+f"tree4.joblib")]
evaluate(tree[0], X_valid4, y_valid4)
evaluate(tree[0], X_test4, y_test4)

### Feature Selection
Testing with algorithms such as RFECV and SFS.
- RFECV selects features by recursively considering smaller and smaller sets of features, ranked by the model's feature importances.
- SFS is a greedy procedure that starts with zero features and finds the single feature that maximizes a cross-validated score when an estimator is trained on that feature alone. It then repeats the procedure, adding one new feature to the set of selected features at each step, and stops when the desired number of selected features is reached (n_features_to_select).

In [None]:
# !pip uninstall scikit-learn -y
# !pip install -U scikit-learn
# !pip install imblearn

In [None]:
# # feature selection
# from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif, chi2
# selector = SelectKBest(chi2, k=15)
# selector.fit(X_train, y_train)
# # scores = -np.log10(selector.pvalues_)
# # scores /= scores.max()
# # plt.bar(range(train.shape[-1]-1), scores)
# plt.bar(range(train.shape[-1]-1), selector.pvalues_)

In [None]:
# from sklearn.feature_selection import RFECV, SequentialFeatureSelector

# # apply the feature selection on the dataset
# model = load(path_models+"logreg_60feats.joblib")
# # selector = RFECV(model["model"], min_features_to_select=1, cv=4, scoring="f1_macro")
# selector = SequentialFeatureSelector(model["model"], n_features_to_select=50, direction="backward", cv=2, scoring="f1_macro")
# selector = selector.fit(X_train_std, y_train)
# X_train_select = selector.transform(X_train)
# X_valid_select = selector.transform(X_valid)
# X_test_select = selector.transform(X_test)

# # saving the feature selected dataset
# suffix = "_60feats_SFS50_logreg"
# df_train_select = pd.DataFrame(np.concatenate((X_train_select, np.expand_dims(y_train, 1)), axis=1), columns=new_feat_names)
# df_valid_select = pd.DataFrame(np.concatenate((X_valid_select, np.expand_dims(y_valid, 1)), axis=1), columns=new_feat_names)
# df_test_select = pd.DataFrame(np.concatenate((X_test_select, np.expand_dims(y_test, 1)), axis=1), columns=new_feat_names)
# # df_train_select.to_csv(path+f"train{suffix}.csv")
# # df_valid_select.to_csv(path+f"valid{suffix}.csv")
# # df_test_select.to_csv(path+f"test{suffix}.csv")

# new_feat_names = list(pd.Index(feat_names[:-1])[selector.get_support()])
# new_feat_names.append(feat_names[-1])
# print(X_train_select.shape)
# print(new_feat_names)