In [None]:
from collections import Counter

import numpy as np
import pandas as pd
import tqdm
import matplotlib.pyplot as plt
from matplotlib import pyplot

from sklearn.preprocessing import normalize, StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, cross_validate, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import f1_score

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedBaggingClassifier

import hyperopt as hp

In [None]:
# Feature tables: the 'Unnamed: 0' column is dropped right after reading.
X_train = pd.read_csv("features_ata.csv").drop(columns='Unnamed: 0')
X_test = pd.read_csv("test_features_ata.csv").drop(columns='Unnamed: 0')

# Training labels, kept both as a DataFrame (`labels`) and as a flat 1-D array.
labels = pd.read_csv("y_train.csv").drop(columns='id')
Y_train = labels.to_numpy().reshape(-1)

# Flat 1-D array read from class_3_pred.csv; further down, entries equal to 1
# mark the test samples whose prediction is overwritten with class 3.
pred_3 = pd.read_csv("class_3_pred.csv").drop(columns='id').to_numpy().reshape(-1)

In [None]:
def make_submission(prediction_, name='submission.csv'):
    """Write predictions to a CSV file with the columns ``id`` and ``y``.

    Parameters
    ----------
    prediction_ : array-like
        Values placed in the ``y`` column, one row per entry.
    name : str, default 'submission.csv'
        Path of the CSV file to write.

    The ``id`` column holds the row index of the frame built from
    ``prediction_`` and is written before ``y``.
    """
    submission = pd.DataFrame(data=prediction_, columns=['y'])
    # Put the row index in front of the predictions as the 'id' column.
    submission.insert(0, 'id', submission.index)
    submission.to_csv(name, header=True, index=False)

# Visualize data

In [None]:
X_train = pd.DataFrame(X_train)
# One 40-bin histogram per feature column, each in its own figure and titled
# with the column position.
# NOTE(review): the loop is hard-wired to the first 64 columns — confirm that
# X_train has at least that many.
for col in range(64):
    fig, ax = plt.subplots()
    ax.set_title(col)
    X_train.iloc[:, col].hist(bins=40, ax=ax)

In [None]:
# Split the samples by whether their label is 3, for visual comparison.
class_3 = X_train[Y_train == 3]
rest = X_train[Y_train != 3]

# Scatter every feature against the next one (column i vs. column i+1),
# drawing the class-3 samples on top of all other samples.
for i in range(63):
    plt.figure()
    plt.title(i)
    plt.scatter(rest.iloc[:, i], rest.iloc[:, i + 1], label='rest')
    plt.scatter(class_3.iloc[:, i], class_3.iloc[:, i + 1], label='3')
    # The labels passed to scatter are only shown once a legend is drawn.
    plt.legend()

# PRE-PROCESSING
---

## Outlier detection

In [None]:
# outlier detection
# Local Outlier Factor with 10 neighbours; samples predicted as 1 are treated
# as inliers and kept.
clf = LocalOutlierFactor(contamination='auto', n_neighbors=10)
outlier_detection = np.asarray(clf.fit_predict(X_train))
inliers = np.flatnonzero(outlier_detection == 1)

# Restrict features and labels to the same inlier rows.
X_train, Y_train = X_train.iloc[inliers], Y_train[inliers]

## Normalization

In [None]:
# normalization
# Scale every feature column to unit L2 norm. The column norms are computed on
# the training set only and re-used for the test set: normalizing the two sets
# independently would put the same feature on a different scale in each set
# (a column's L2 norm depends on the rows it contains), so a model fitted on
# the training features would see shifted values at prediction time.
X_train, feature_norms = normalize(X_train, axis=0, return_norm=True)
# Guard against all-zero columns so the division below is always defined.
feature_norms = np.where(feature_norms == 0, 1.0, feature_norms)
X_test = np.asarray(X_test, dtype=float) / feature_norms

## split data into 1 vs rest

In [None]:
# 1 vs rest split
# Boolean mask of the samples whose label is 3.
is_class_3 = Y_train == 3

# Features and labels of every sample that is not class 3.
rest = X_train[~is_class_3]
rest_labels = Y_train[~is_class_3]

# Binary target with the dtype of Y_train: 1 where the label is 3, else 0.
class_3 = is_class_3.astype(Y_train.dtype)

## feature selection

In [None]:
# Fit an XGBoost classifier on the non-class-3 samples; its per-feature
# importances are what the feature-selection cell thresholds.
model = XGBClassifier(
    n_estimators=100,
    reg_alpha=0.1,
    eval_metric='mlogloss',
    n_jobs=-1,
)
model.fit(rest, rest_labels)

# feature importances (printed as a one-element list holding the array)
importances = model.feature_importances_
print([importances])
# Optional bar plot of the importances:
# plt.bar(range(len(importances)), importances)
# plt.show()

In [None]:
# Keep only the features whose importance in the fitted model exceeds 0.001;
# the same column mask is applied to all three feature matrices.
selected = model.feature_importances_ > 0.001
rest = rest[:, selected]
X_test = X_test[:, selected]
X_train = X_train[:, selected]

## train a model

In [None]:
# 5-fold cross-validation of a 100-tree XGBoost model on the non-class-3
# data, scored with micro-averaged F1; the fitted estimators are returned too.
classifier = XGBClassifier(eval_metric='mlogloss', n_estimators=100)
scores = cross_validate(
    classifier,
    rest,
    rest_labels,
    cv=5,
    scoring='f1_micro',
    return_estimator=True,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Per-fold test scores, followed by their mean.
fold_scores = scores["test_score"]
print(fold_scores)
print(np.mean(fold_scores))

In [None]:
# train-test split
# Hold out 20% of the non-class-3 samples (fixed seed) for a quick check.
x_train, x_test, y_train, y_test = train_test_split(rest, rest_labels, test_size=0.2, random_state=42)

# eval_metric must be a metric name XGBoost knows; 'mlogloss' is its
# multiclass log loss.
model = XGBClassifier(n_estimators=100, eval_metric='mlogloss')
model.fit(x_train, y_train)
pred = model.predict(x_test)
# Micro-averaged F1 on the held-out split, the same metric as the 'f1_micro'
# scoring used by the cross-validation cells.
print(f1_score(y_test, pred, average='micro'))

# Explore different models
---

# Baseline model (rbf-kernelized SVM)

In [None]:
# define the models (model2 is created but not evaluated in this cell)
model = SVC()
model2 = RandomForestClassifier()
# evaluation protocol: 10-fold stratified CV repeated 3 times, fixed seed
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)
n_scores = cross_val_score(
    model,
    rest,
    rest_labels,
    cv=cv,
    scoring='f1_micro',
    error_score='raise',
    n_jobs=-1,
)
# report mean and standard deviation of the micro-F1 scores
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

# EXPLORE BAGGING ALGORITHMS
---
- source: https://machinelearningmastery.com/bagging-ensemble-with-python/

## Simple bagged decision tree classifier

In [None]:
# XGBoost with per-tree feature subsampling
# define the models (clf and model2 are created but not evaluated in this cell)
clf = DecisionTreeClassifier(random_state=1)
# XGBoost has no `max_features` argument (an unknown keyword is reported as
# unused and has no effect); `colsample_bytree` is its setting for the
# fraction of features sampled for each tree.
model = XGBClassifier(n_estimators=100, colsample_bytree=0.5)
model2 = GradientBoostingClassifier()
# evaluate the model: 10-fold stratified CV repeated 3 times, fixed seed
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, rest, rest_labels, scoring='f1_micro', cv=cv, n_jobs=-1, error_score='raise')
# report mean and standard deviation of the micro-F1 scores
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

# Explore Hyperparameters

## explore number of trees

In [None]:
# get a list of models to evaluate
def get_models():
    """Return XGBoost classifiers keyed by their number of trees (as a string)."""
    # number of trees to consider
    tree_counts = (10, 20, 30, 40, 50, 100)
    return {str(count): XGBClassifier(n_estimators=count) for count in tree_counts}

# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Cross-validate ``model`` on ``(X, y)`` and return the per-fold scores.

    The evaluation uses 10-fold stratified cross-validation (a single repeat,
    ``random_state=1``) with micro-averaged F1 scoring and ``n_jobs=-1``.
    """
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)
    return cross_val_score(model, X, y, scoring='f1_micro', cv=folds, n_jobs=-1)
 
# get the models to evaluate
models = get_models()
# cross-validate every candidate on the non-class-3 data, printing its
# mean and standard deviation as soon as it is available
results = []
for name, model in models.items():
    scores = evaluate_model(model, rest, rest_labels)
    results.append(scores)
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# candidate names in evaluation order, used as box labels
names = list(models)
# box plot of the score distributions, one box per candidate
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

## explore number of samples in bootstrap

In [None]:
# get a list of models to evaluate
def get_models():
    """Return 100-tree XGBoost classifiers keyed by their subsample ratio.

    The ratios come from ``np.arange(0.1, 1.1, 0.1)`` (10% to 100% in 10%
    increments); each key is the ratio formatted with one decimal place.
    """
    ratios = np.arange(0.1, 1.1, 0.1)
    return {'%.1f' % ratio: XGBClassifier(subsample=ratio, n_estimators=100) for ratio in ratios}
 
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Return the cross-validation scores of ``model`` on ``(X, y)``.

    Scores are micro-averaged F1 values from a 10-fold stratified split
    (one repeat, ``random_state=1``), computed with ``n_jobs=-1``.
    """
    splitter = RepeatedStratifiedKFold(random_state=1, n_repeats=1, n_splits=10)
    fold_scores = cross_val_score(model, X, y, cv=splitter, scoring='f1_micro', n_jobs=-1)
    return fold_scores
 
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    # Evaluate on the non-class-3 data (rest, rest_labels): this is the data
    # every other evaluation cell and the final model are fitted on, so the
    # subsample ratio is compared on the problem it will be used for.
    scores = evaluate_model(model, rest, rest_labels)
    # store the results
    results.append(scores)
    names.append(name)
    # summarize the performance along the way
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

## Train a model with the hyperparameters chosen based on the inspections above

In [None]:
# bagging svm
# define the models: a bagged ensemble of class-balanced SVCs is evaluated;
# clf and model2 are created here but not used in this cell
clf = DecisionTreeClassifier(random_state=1)
model = BaggingClassifier(
    base_estimator=SVC(class_weight='balanced'),
    max_samples=0.4,
    n_estimators=100,
)
model2 = GradientBoostingClassifier(subsample=1.0, n_estimators=100)
# evaluation protocol: 10-fold stratified CV repeated 3 times, fixed seed
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)
n_scores = cross_val_score(
    model,
    rest,
    rest_labels,
    cv=cv,
    scoring='f1_micro',
    error_score='raise',
    n_jobs=-1,
)
# report mean and standard deviation of the micro-F1 scores
print('F_micro score: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

In [None]:
# `model` is created but not used below; the submission comes from model2.
model = BaggingClassifier(max_samples=0.4, n_estimators=100)
# Gradient boosting (100 trees, 40% subsample) fitted on the non-class-3 data.
model2 = GradientBoostingClassifier(subsample=0.4, n_estimators=100)
model2.fit(rest, rest_labels)
pred = model2.predict(X_test)
# Wherever pred_3 equals 1, overwrite the prediction with class 3.
class_3_rows = pred_3 == 1
pred[class_3_rows] = 3
make_submission(pred)

# Try to use XGB

In [None]:
# 1 vs rest split
# Mask of the samples whose label is not 3.
not_class_3 = Y_train != 3
rest, rest_labels = X_train[not_class_3], Y_train[not_class_3]
# Binary target with the dtype of Y_train: 0 for "not 3", 1 for class 3.
class_3 = np.where(not_class_3, 0, 1).astype(Y_train.dtype)

In [None]:
# 10-tree XGBoost model evaluated with 10-fold stratified CV repeated 3 times
# (fixed seed) on the non-class-3 data, scored with micro-averaged F1
model = XGBClassifier(eval_metric='mlogloss', n_estimators=10)
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)
scores = cross_val_score(
    model,
    rest,
    rest_labels,
    cv=cv,
    scoring='f1_micro',
    n_jobs=-1,
)
# summarize performance as the mean over all folds and repeats
print('Mean F1: %.5f' % scores.mean())

## best tuned SVM

In [None]:
# SVC with gamma=100 and C=20
model = SVC(C=20, gamma=100)
# evaluation protocol: 10-fold stratified CV repeated 3 times, fixed seed
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)
n_scores = cross_val_score(
    model,
    rest,
    rest_labels,
    cv=cv,
    scoring='f1_micro',
    error_score='raise',
    n_jobs=-1,
)
# report mean and standard deviation of the micro-F1 scores
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))

# majority vote from previous classifiers

In [None]:
# Read ann_pred.csv, drop its 'id' column and flatten what remains to 1-D.
ann_pred = pd.read_csv("ann_pred.csv").drop(columns='id').to_numpy().reshape(-1)

In [None]:
# Load the XGBoost predictions as a flat 1-D array. Reading "ann_pred.csv"
# here would make xgb_pred a copy of ann_pred and count the ANN vote twice in
# the majority vote.
# NOTE(review): assumes the XGBoost predictions are stored as "xgb_pred.csv",
# by analogy with "ann_pred.csv" — confirm the actual file name.
xgb_pred = pd.read_csv("xgb_pred.csv").drop(columns='id')
xgb_pred = xgb_pred.to_numpy().reshape(-1)

In [None]:
# Read submission-0.csv ... submission-5.csv, drop the 'id' column of each and
# flatten what remains to 1-D. Only s0 is used in the ensemble below.
s0, s1, s2, s3, s4, s5 = (
    pd.read_csv("submission-%d.csv" % k).drop(columns='id').to_numpy().reshape(-1)
    for k in range(6)
)

# NOTE(review): tuned_svc_pred and bagged_svm_pred are not defined in any cell
# visible here; they must already exist in the kernel for this cell to run.
# One row per voter, one column per entry of tuned_svc_pred.
ensemble = np.zeros((5, len(tuned_svc_pred)))
for row, votes in enumerate((tuned_svc_pred, bagged_svm_pred, ann_pred, xgb_pred, s0)):
    ensemble[row, :] = votes

In [None]:
# Majority vote: for every column of `ensemble` take the most frequent label
# among its rows. Counter.most_common breaks ties in favour of the label that
# appears first, i.e. the voter stored in the lowest row.
# Counter comes from the standard-library `collections` module and has to be
# imported for this cell to run.
n_samples = ensemble.shape[1]
final_pred = np.zeros(n_samples)
for i in tqdm.trange(n_samples):
    votes = Counter(ensemble[:, i])
    final_pred[i] = votes.most_common(1)[0][0]

make_submission(final_pred)