In [None]:
import pandas as pd
import numpy as np
import itertools
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_validate, KFold, ParameterGrid
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.metrics import RocCurveDisplay, roc_auc_score
from sklearn.preprocessing import StandardScaler
from scipy.special import rel_entr
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
import warnings
import pickle

# ---- Training tables --------------------------------------------------------
# Both feature files are read with a two-level (record, augm_idx) row index.
df_feat = pd.read_csv('df_train', index_col=["record", "augm_idx"])
df_lbp = pd.read_csv('df_lbp_train', index_col=["record", "augm_idx"])

# Labels are indexed by record only; joining them against df_lbp presumably
# yields one label per (record, augm_idx) row -- TODO confirm on the real files.
target = pd.read_csv('targets_train', index_col="record").join(df_lbp)["target"].to_frame()
df = pd.concat([df_feat, df_lbp], axis=1)

# ---- Test tables ------------------------------------------------------------
df_feat_kag = pd.read_csv('df_test', index_col=0)
df_lbp_kag = pd.read_csv('df_lbp_test', index_col=0)
df_kag = pd.concat([df_feat_kag, df_lbp_kag], axis=1)

# ---- Dense (record, augm_idx) index -----------------------------------------
# 600 x 10 label pairs, applied row-by-row to the training frames. This assumes
# the training tables hold exactly 6000 rows in record-major order.
index = pd.MultiIndex.from_tuples(
    list(itertools.product(np.arange(600), np.arange(10))),
    names=["record", "augm_idx"],
)
df = df.set_index(index)
target = target.set_index(index)

# ---- Precomputed kernel matrices --------------------------------------------
kernel = pd.read_csv('df_gram_tr.csv.gz', index_col=[0,1], header=[0,1])
kernel_kag = pd.read_csv('df_gram_te.csv.gz', index_col=0, header=[0,1])

# The test matrix is rescaled first on purpose: it must be divided by the
# maximum of the *raw* training matrix, which is overwritten further down.
kernel_kag.columns = index
kernel_kag = np.exp(-kernel_kag / kernel.values.max())
print(kernel_kag.values.min())

# Same transform for the training matrix, after relabelling rows and columns.
kernel = kernel.set_index(index)
kernel.columns = index
kernel = np.exp(-kernel / kernel.values.max())
print(kernel.values.min())


## Base estimators + LogisticRegression

In [None]:
# Suppress every warning for the rest of the session.
warnings.simplefilter('ignore')

# Forwarded as `n_jobs` to the estimators configured in later cells.
N_JOBS = -1

# Aliases used by the modelling cells below.
X_train, X_test, y_train = df, df_kag, target

# Column names of the two feature families inside the combined frame.
feat_col_names = list(df_feat.columns.values)
lbp_col_names = list(df_lbp.columns.values)

# Feature branch: keep only the df_feat columns (imputing missing values),
# standardise them, then classify with a random forest.
feat_classifier = RandomForestClassifier()
scaler = StandardScaler()
feat_col_trans = ColumnTransformer(
    transformers=[('selection', SimpleImputer(), feat_col_names)],
)
feat_pipe = Pipeline(
    steps=[
        ('selection', feat_col_trans),
        ('preprocess', scaler),
        ('classifier', feat_classifier),
    ],
)

def kl_div(x, y):
    """Averaged two-way relative entropy of two split vectors.

    Both inputs are cut at position 512. For each part, the summed relative
    entropy is evaluated in both directions (``x`` against ``y`` and ``y``
    against ``x``) and the two sums are averaged. The return value is the mean
    of the two per-part results.
    """
    def _two_way(p, q):
        # Mean of sum(rel_entr(p, q)) and sum(rel_entr(q, p)).
        return np.mean([np.sum(rel_entr(p, q)), np.sum(rel_entr(q, p))])

    split = 512
    head = _two_way(x[:split], y[:split])
    tail = _two_way(x[split:], y[split:])
    return np.mean([head, tail])

# The LBP branch has no preprocessing steps, so the "pipeline" name is just a
# second reference to the very same SVC instance.
lbp_pipe = lbp_classifier = SVC()



def cv_gen_zero(cv, df):
    """Build (train, test) row-index pairs, testing only on second-level key 0.

    ``cv.split`` is run on the rows selected by ``df.loc[:, 0, :]``. Each pair
    of arrays it yields is then used as first-level *labels* of ``df``: the
    training part keeps every row under those labels, the test part keeps only
    the rows whose second-level key is 0.

    NOTE(review): splitters usually yield positions, so this presumably relies
    on the first index level being labelled 0..n-1 -- confirm against caller.

    Returns a list of ``(train_index, test_index)`` tuples of row indexes.
    """
    folds = []
    for tr, te in cv.split(df.loc[:, 0, :]):
        train_rows = df.loc[tr, :, :].index
        test_rows = df.loc[te, 0, :].index
        folds.append((train_rows, test_rows))
    return folds

def cv_gen(cv, df):
    """Build (train, test) row-index pairs that keep whole first-level groups.

    ``cv.split`` is run on the rows selected by ``df.loc[:, 0, :]``. Each pair
    of arrays it yields is then used as first-level *labels* of ``df``, and
    every row under those labels goes to the corresponding side of the fold.

    NOTE(review): splitters usually yield positions, so this presumably relies
    on the first index level being labelled 0..n-1 -- confirm against caller.

    Returns a list of ``(train_index, test_index)`` tuples of row indexes.
    """
    folds = []
    for tr, te in cv.split(df.loc[:, 0, :]):
        train_rows = df.loc[tr, :, :].index
        test_rows = df.loc[te, :, :].index
        folds.append((train_rows, test_rows))
    return folds

# Number of folds; also sizes the per-fold placeholder lists built later.
n_splits = 5

# Materialised list of (train_index, test_index) pairs over X_train.
cv = cv_gen(
    KFold(n_splits=n_splits, shuffle=True, random_state=42),
    X_train,
)


In [None]:
# Short class names of the two base estimators.
lbp_classifier_name = type(lbp_classifier).__name__
feat_classifier_name = type(feat_classifier).__name__

# Search space of the LBP classifier (a single combination).
lbp_params = {
    'kernel': ['precomputed'],
    'C': [0.1],  # regularization parameter
    'random_state': [42],
    'probability': [True],
}

# Search space of the feature classifier (a single combination). The
# `classifier__` prefix routes each value to the 'classifier' pipeline step.
feat_params = {
    'classifier__n_estimators': [10000],
    'classifier__criterion': ["entropy"],
    'classifier__max_depth': [5],
    'classifier__max_features': [None],
    'classifier__class_weight': ["balanced"],
    'classifier__random_state': [42],
    'classifier__n_jobs': [N_JOBS],
}

# Meta-learner fitted on the base estimators' probabilities.
final_clf = LogisticRegression(random_state=42, n_jobs=N_JOBS)

# Expand both search spaces into explicit lists of parameter dicts.
lbp_param_grid = list(ParameterGrid(lbp_params))
feat_param_grid = list(ParameterGrid(feat_params))

# One row of `n_splits` empty slots per parameter combination.
feat_predictions = [[None] * n_splits for _ in feat_param_grid]
lbp_predictions = [[None] * n_splits for _ in lbp_param_grid]

# Configure both base estimators with the first entry of their grid.
lbp_pipe.set_params(**lbp_param_grid[0])
feat_pipe.set_params(**feat_param_grid[0])

# Out-of-fold probability holders, one row per training sample, pre-filled
# with 2/3 until the cross-validation loop overwrites them.
feat_prob_te = pd.DataFrame(
    {"feat_prob": np.ones(X_train.shape[0]) * 2 / 3},
    index=X_train.index,
)
lbp_prob_te = pd.DataFrame(
    {"lbp_prob": np.ones(X_train.shape[0]) * 2 / 3},
    index=X_train.index,
)
# Cross-validated (out-of-fold) probabilities of both base estimators.
# For every fold the estimators are fitted on the training rows and their
# second predict_proba column is written into the rows of the held-out part.
for j, (train_index, test_index) in enumerate(cv):
    # Split features and labels by row index.
    X_tr, X_te = X_train.loc[train_index], X_train.loc[test_index]
    y_tr, y_te = y_train.loc[train_index], y_train.loc[test_index]

    # fit and predict base estimators
    
    # Feature pipeline: fitted from scratch on this fold's training rows.
    feat_pipe.fit(X_tr, y_tr)
    feat_pred = feat_pipe.predict_proba(X_te)
    # 1:2 keeps a 2-D (n, 1) slice so it matches the single-column frame.
    feat_prob_te.loc[test_index] = feat_pred[:,1:2]

    # Kernel branch: first keep only the training *columns*, then cut out the
    # test rows and the training rows. Order matters: X_te_svm must be taken
    # before X_tr_svm is narrowed down to the training rows.
    X_tr_svm = kernel[train_index]
    X_te_svm = X_tr_svm.loc[test_index]
    X_tr_svm = X_tr_svm.loc[train_index]
    lbp_pipe.fit(X_tr_svm, y_tr)
    lbp_pred = lbp_pipe.predict_proba(X_te_svm)
    lbp_prob_te.loc[test_index] = lbp_pred[:,1:2]

# ---- Refit the base estimators on all training rows and score the test set --
# Re-applying the first grid entry is a no-op after the cell above, but keeps
# this step correct if the estimators were reconfigured in between.
lbp_pipe.set_params(**(lbp_param_grid[0]))
feat_pipe.set_params(**(feat_param_grid[0]))

feat_pipe.fit(X_train, y_train)
feat_pred = feat_pipe.predict_proba(X_test)
feat_prob_te_te = pd.DataFrame(feat_pred[:,1], index=X_test.index).rename(columns={0:"feat_prob"})

# The SVC uses a precomputed kernel: fit on the train-vs-train matrix and
# predict from the test-vs-train matrix.
# NOTE(review): this assumes kernel_kag rows are in the same order as X_test.
lbp_pipe.fit(kernel, y_train)
lbp_pred = lbp_pipe.predict_proba(kernel_kag)
# This column must be called "lbp_prob" (it used to be "feat_prob"): the
# meta-learner is fitted on columns ["lbp_prob", "feat_prob"], and recent
# scikit-learn versions reject prediction input whose column names differ
# from those seen during fit.
lbp_prob_te_te = pd.DataFrame(lbp_pred[:,1], index=X_test.index).rename(columns={0:"lbp_prob"})

# ---- Stacking ---------------------------------------------------------------
# Meta-features for training are the out-of-fold probabilities; for the test
# set they are the probabilities of the fully refitted base estimators. Both
# frames share the same column names and order.
probs_te = pd.concat([lbp_prob_te, feat_prob_te], axis=1)
probs_te_te = pd.concat([lbp_prob_te_te, feat_prob_te_te], axis=1)
final_clf.fit(probs_te, y_train)
y_pr = final_clf.predict_proba(probs_te_te)

# Second predict_proba column of the meta-learner, one value per test row.
y1 = [x[1] for x in y_pr]


In [None]:
# Submission file: one 'target' value per test row, with the test index
# written out under the header 'id'.
result = pd.DataFrame({'target': y1}, index=X_test.index)
result.to_csv('kaggle.csv', index_label='id')
