In [None]:
import pandas as pd
import numpy as np
import itertools
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_validate, KFold, ParameterGrid
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.metrics import RocCurveDisplay, roc_auc_score
from sklearn.preprocessing import StandardScaler
from scipy.special import rel_entr
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
import warnings
import pickle

# Two feature tables, each keyed by (record, augm_idx).
df_feat = pd.read_csv('df_train', index_col=["record", "augm_idx"])
df_lbp = pd.read_csv('df_lbp_train', index_col=["record", "augm_idx"])

# Labels are read per record and joined against df_lbp; only the "target"
# column is kept, wrapped back into a one-column DataFrame.
target = pd.DataFrame(
    pd.read_csv('targets_train', index_col="record").join(df_lbp)["target"]
)

# Side-by-side concatenation of both feature sets.
df = pd.concat([df_feat, df_lbp], axis=1)

# Replace the on-disk keys by a dense 600 x 10 grid of (record, augm_idx).
# The hard-coded sizes must match the row count of the CSV files, otherwise
# set_index raises a length-mismatch error.
index = pd.MultiIndex.from_tuples(
    list(itertools.product(np.arange(600), np.arange(10))),
    names=["record", "augm_idx"],
)
df = df.set_index(index)
target = target.set_index(index)

# Square matrix with the same dense labelling on rows and columns.
# NOTE(review): presumably a pairwise distance matrix between samples - confirm.
kernel = pd.read_csv('df_gram_tr.csv.gz', index_col=[0, 1], header=[0, 1]).set_index(index)
kernel.columns = index
# Rescale by the largest entry and map through exp(-x) before it is used as a
# precomputed kernel.
kernel = np.exp(-kernel / kernel.values.max())
print(kernel.values.min())


## Base estimators + LogisticRegression

In [None]:
# Silence every warning category from here on.
warnings.simplefilter(action='ignore')


# Value handed to every estimator that accepts n_jobs.
N_JOBS = -1

# Ordered hold-out split: shuffle=False keeps the first 4800 rows for training.
X_train, X_test, y_train, y_test = train_test_split(
    df, target, train_size=4800, shuffle=False, random_state=42
)

# Column names of the two feature families.
feat_col_names = df_feat.columns.tolist()
lbp_col_names = df_lbp.columns.tolist()

# Feature branch: keep only the df_feat columns (imputing missing values),
# standardise them, then classify with a random forest.
feat_classifier = RandomForestClassifier()
scaler = StandardScaler()
feat_col_trans = ColumnTransformer(
    transformers=[('selection', SimpleImputer(), feat_col_names)]
)
feat_pipe = Pipeline(
    steps=[
        ('selection', feat_col_trans),
        ('preprocess', scaler),
        ('classifier', feat_classifier),
    ]
)

def kl_div(x, y, split=512):
    """Symmetrised KL divergence averaged over two segments of the inputs.

    Both vectors are cut at position ``split``. On each segment the relative
    entropy is evaluated in both directions and the two sums are averaged; the
    result is the mean of the two per-segment values.

    Parameters
    ----------
    x, y : array-like, 1-D, same length
        Non-negative vectors. NOTE(review): presumably two concatenated
        512-bin histograms - confirm against the data.
    split : int, default 512
        Position separating the first segment from the second. The default
        reproduces the previously hard-coded boundary.

    Returns
    -------
    float
        Mean of the two symmetrised divergences. An empty segment contributes
        0. ``scipy.special.rel_entr`` yields ``inf`` where the first argument
        is positive and the second is zero, so the result can be ``inf``.
    """
    def _symmetric(p, q):
        # average of KL(p || q) and KL(q || p), both as plain sums of rel_entr
        return np.mean([np.sum(rel_entr(p, q)), np.sum(rel_entr(q, p))])

    first = _symmetric(x[:split], y[:split])
    second = _symmetric(x[split:], y[split:])
    return np.mean([first, second])

# The LBP branch has no preprocessing steps, so the "pipeline" is the bare SVC:
# both names are bound to the same estimator object.
lbp_pipe = lbp_classifier = SVC()



def cv_gen_zero(cv, df):
    """Build (train, test) index pairs whose test side is restricted to key 0.

    ``cv.split`` is run on ``df.loc[:, 0, :]`` and the integer arrays it yields
    are then used as first-level keys in ``df.loc``. The train side selects
    with ``df.loc[tr, :, :]``, the test side with ``df.loc[te, 0, :]``.

    NOTE(review): ``cv.split`` returns positions, which are used here as
    labels; this presumably relies on the first index level of ``df`` being
    exactly 0..n-1 - confirm against the caller.
    NOTE(review): key 0 on the second level presumably denotes the
    un-augmented copy of each record - confirm.

    Parameters:
        cv: splitter object exposing ``split(X)``.
        df: DataFrame indexed by a two-level MultiIndex.

    Returns:
        list of ``(train_index, test_index)`` tuples, each element being the
        ``.index`` of the corresponding ``df.loc`` selection.
    """
    return [(df.loc[tr, :, :].index, df.loc[te, 0, :].index) for tr, te in cv.split(df.loc[:, 0, :])]

def cv_gen(cv, df):
    """Build (train, test) index pairs that keep every second-level key.

    ``cv.split`` is run on ``df.loc[:, 0, :]`` and the integer arrays it yields
    are then used as first-level keys in ``df.loc``; both sides select with
    ``df.loc[<keys>, :, :]``.

    NOTE(review): ``cv.split`` returns positions, which are used here as
    labels; this presumably relies on the first index level of ``df`` being
    exactly 0..n-1 - confirm against the caller.

    Parameters:
        cv: splitter object exposing ``split(X)``.
        df: DataFrame indexed by a two-level MultiIndex.

    Returns:
        list of ``(train_index, test_index)`` tuples, each element being the
        ``.index`` of the corresponding ``df.loc`` selection.
    """
    return [(df.loc[tr, :, :].index, df.loc[te, :, :].index) for tr, te in cv.split(df.loc[:, 0, :])]

# Number of folds shared by both split lists.
n_splits = 5
# Two KFold objects with identical settings (same fold count, same seed), so
# `cv` and `cv_zero` are derived from the same splitter configuration.
cv = cv_gen(KFold(n_splits=n_splits, shuffle=True, random_state=42), X_train)
cv_zero = cv_gen_zero(KFold(n_splits=n_splits, shuffle=True, random_state=42), X_train)


In [None]:
# lbp classifier parameters TO UPDATE
# Bare class name of the estimator (e.g. "SVC"); it is used in output file names.
lbp_classifier_name = type(lbp_classifier).__name__
lbp_params = {
    # Alternative grid kept for reference (looks like a KNN setup using kl_div):
    # 'classifier__n_neighbors': [5, 10],
    # 'classifier__weights': ["uniform", "distance"],
    # 'classifier__n_jobs': [N_JOBS],
    # 'classifier__metric': [kl_div]
    'kernel': ['precomputed'],
    'C': list(np.logspace(-1, 1, 3)),
    'random_state': [42],
    'probability': [True],
}

# feat classifier parameters TO UPDATE
feat_classifier_name = type(feat_classifier).__name__
feat_params = {
    'classifier__n_estimators': [10000],
    # NOTE(review): scikit-learn documents "entropy" and "log_loss" as the same
    # split criterion; the list is left untouched because cached prediction
    # files are numbered by position in this grid.
    'classifier__criterion': ["gini", "entropy", "log_loss"],
    # None = unlimited depth; 5 is a tentative cap that has not been tuned
    'classifier__max_depth': [None, 5],
    'classifier__max_features': ["sqrt", "log2", None],
    'classifier__class_weight': ["balanced"],
    'classifier__random_state': [42],
    'classifier__n_jobs': [N_JOBS],
}

# Meta-classifier fitted on the two base-estimator probabilities.
final_clf = LogisticRegression(random_state=42, n_jobs=N_JOBS)

# Materialise the grids so they can be enumerated and measured with len().
lbp_param_grid = list(ParameterGrid(lbp_params))
feat_param_grid = list(ParameterGrid(feat_params))

# One row per grid entry, one None placeholder per fold.
feat_predictions = [[None] * n_splits for _ in range(len(feat_param_grid))]
lbp_predictions = [[None] * n_splits for _ in range(len(lbp_param_grid))]

# Accumulators, one entry per (lbp params, feat params) combination.
results, importances, importances_lr = [], [], []
def _read_cached_probs(path, expected_index):
    """Load one fold's cached probabilities, or return None on a cache miss.

    Cache files are written with ``DataFrame.to_csv`` from a frame indexed by
    (record, augm_idx), so the first TWO csv columns are index levels. Reading
    only the first one as index leaves ``augm_idx`` as a data column, and the
    values then no longer fit the single probability column they are assigned to.

    Parameters
    ----------
    path : str
        Location of the cached csv file.
    expected_index : pandas.MultiIndex
        Index of the rows about to be filled; a file whose index differs is
        treated as stale.

    Returns
    -------
    numpy.ndarray of shape (n, 1), or None when the file is missing, unreadable,
    malformed or does not match ``expected_index``.
    """
    try:
        cached = pd.read_csv(path, index_col=[0, 1])
    except (OSError, ValueError):
        # missing/unreadable file or parse error -> recompute instead of failing
        return None
    if cached.shape[1] != 1 or not cached.index.equals(expected_index):
        return None
    return cached.values


# Grid search over every (lbp params, feat params) pair.
# NOTE: cache files are named by grid position (i / k) and fold number (j) only,
# so they must be deleted whenever a parameter grid or the folds change.
# NOTE: the cache directories must already exist; to_csv does not create them.
for i, lbp_p in enumerate(lbp_param_grid):
    lbp_pipe.set_params(**lbp_p)
    for k, feat_p in tqdm(list(enumerate(feat_param_grid))):
        feat_pipe.set_params(**feat_p)
        scores = []
        importances_c = []
        # Out-of-fold probabilities; 2/3 is only a placeholder that each fold overwrites.
        feat_prob_te = pd.DataFrame(2 / 3, index=X_train.index, columns=["feat_prob"])
        lbp_prob_te = pd.DataFrame(2 / 3, index=X_train.index, columns=["lbp_prob"])
        for j, (train_index, test_index) in enumerate(cv):
            # split
            X_tr, X_te = X_train.loc[train_index], X_train.loc[test_index]
            y_tr, y_te = y_train.loc[train_index], y_train.loc[test_index]

            # feat classifier: reuse cached fold predictions when available
            feat_cache = f"feat_predictions_new/{feat_classifier_name}_prob_te_{k}_{j}"
            cached = _read_cached_probs(feat_cache, test_index)
            if cached is not None:
                feat_prob_te.loc[test_index] = cached
            else:
                feat_pipe.fit(X_tr, y_tr)
                feat_pred = feat_pipe.predict_proba(X_te)
                # keep the positive-class column as an (n, 1) block
                feat_prob_te.loc[test_index] = feat_pred[:, 1:2]
                feat_prob_te.loc[test_index].to_csv(feat_cache)
                # importances exist only for folds that were actually fitted
                importances_c.append(feat_pipe['classifier'].feature_importances_)

            # lbp classifier: reuse cached fold predictions when available
            lbp_cache = f"lbp_predictions_new/{lbp_classifier_name}_te_{i}_{j}"
            cached = _read_cached_probs(lbp_cache, test_index)
            if cached is not None:
                lbp_prob_te.loc[test_index] = cached
            else:
                # precomputed kernel: columns are always the training samples,
                # rows are the test samples (predict) or training samples (fit)
                X_tr_svm = kernel[train_index]
                X_te_svm = X_tr_svm.loc[test_index]
                X_tr_svm = X_tr_svm.loc[train_index]
                lbp_pipe.fit(X_tr_svm, y_tr)
                lbp_pred = lbp_pipe.predict_proba(X_te_svm)
                lbp_prob_te.loc[test_index] = lbp_pred[:, 1:2]
                lbp_prob_te.loc[test_index].to_csv(lbp_cache)

        importances.append(importances_c)

        # Stack both probability columns once; the frame is the same for every fold.
        probs_te = pd.concat([lbp_prob_te, feat_prob_te], axis=1)
        importances_lr_c = []
        for j, (train_index, test_index) in enumerate(cv_zero):
            final_clf.fit(probs_te.loc[train_index], y_train.loc[train_index])
            y_pr = final_clf.predict_proba(probs_te.loc[test_index])

            # evaluate on the positive-class probability
            scores.append(roc_auc_score(y_train.loc[test_index], y_pr[:, 1]))
            importances_lr_c.append(final_clf.coef_)
        r = (lbp_p, feat_p, np.mean(scores))
        importances_lr.append(importances_lr_c)
        results.append(r)


Save the results to disk

In [None]:
# Rank the grid points by their third field (the mean score), best first.
sorted_results = sorted(results, key=lambda entry: entry[2], reverse=True)

# (destination, object) pairs, written in this order.
_dumps = [
    (f'results_new/unsorted_{lbp_classifier_name}_{feat_classifier_name}.pickle', results),
    (f'results_new/{lbp_classifier_name}_{feat_classifier_name}.pickle', sorted_results),
    (f'results_new/importances_lr_{lbp_classifier_name}_{feat_classifier_name}.pickle', importances_lr),
    (f'results_new/importances_{lbp_classifier_name}_{feat_classifier_name}.pickle', importances),
]
for _path, _payload in _dumps:
    with open(_path, 'wb') as outf:
        pickle.dump(obj=_payload, file=outf)
