In [1]:
import numpy as np
import pandas as pd
import tqdm
import matplotlib.pyplot as plt
from matplotlib import pyplot
from sklearn.preprocessing import normalize
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE

In [3]:
X_train = pd.read_csv("features_ata.csv").drop(columns='Unnamed: 0')
labels = pd.read_csv("y_train.csv").drop(columns='id')
X_test =  pd.read_csv("test_features_ata.csv").drop(columns='Unnamed: 0')
Y_train = labels.to_numpy()
Y_train = Y_train.reshape(-1)

In [4]:
def make_submission(prediction_, name='submission.csv'):
    dt = pd.DataFrame(data=prediction_, columns=['y'])
    dt['id'] = dt.index
    dt = dt[['id', 'y']]
    dt.to_csv(name, header=True, index=False)

# Normalization

In [5]:
# normalization
X_train = normalize(X_train, axis=0)
X_test = normalize(X_test, axis = 0)

# performe label balancing
- source: https://towardsdatascience.com/machine-learning-target-feature-label-imbalance-problem-and-solutions-98c5ae89ad0

In [6]:
#print relative size of classes
tot = len(labels)
for i in range(4):
    percentage = round(100*np.array(labels==i).sum()/tot ,1)
    print(f" {percentage} % of samples belong to class {i}")

 59.2 % of samples belong to class 0
 8.7 % of samples belong to class 1
 28.8 % of samples belong to class 2
 3.3 % of samples belong to class 3


In [7]:
smote = SMOTE(random_state = 14)
X_train_balanced, Y_train_balanced = smote.fit_sample(X_train, Y_train)

In [9]:
#again print relative size of classes for balanced data
print("for balanced data we get the following frequency of labels")
tot = len(Y_train_balanced)
for i in range(4):
    percentage = round(100*np.array(Y_train_balanced==i).sum()/tot ,1)
    print(f" {percentage} % of samples belong to class {i}")

for balanced data we get the following frequency of labels
 25.0 % of samples belong to class 0
 25.0 % of samples belong to class 1
 25.0 % of samples belong to class 2
 25.0 % of samples belong to class 3


# Explore different models
---

## train-test split

In [26]:
# train-test split
x_train, x_test, y_train, y_test = train_test_split(X_train_balanced, Y_train_balanced, test_size=0.2, random_state=42)

# Baseline model (rbf-kernelized SVM)

In [29]:
# outlier detection 
svc = SVC(class_weight='balanced')
svc.fit(x_train, y_train)
svc.score(x_test, y_test)
#pred = svc.predict(X_test)
#make_submission(pred)

0.8164191419141914

# EXPLORE BAGGING ALGORITHMS
---
- source: https://machinelearningmastery.com/bagging-ensemble-with-python/

## Simple bagged decision tree classifier

In [25]:
# bagging svm
# define the model
clf = DecisionTreeClassifier(random_state=1)
model = BaggingClassifier(base_estimator=clf)
# evaluate the model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_train_balanced, Y_train_balanced, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

Accuracy: 0.888 (0.009)


# Explore Hyperparameters

## explore number of trees

In [None]:
# get a list of models to evaluate
def get_models():
    models = dict()
    # define number of trees to consider
    n_trees = [10, 50, 100, 500, 1000]
    for n in n_trees:
        models[str(n)] = BaggingClassifier(n_estimators=n)
    return models

# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    # define the evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # evaluate the model and collect the results
    scores = cross_val_score(model, X_train, Y_train, scoring='accuracy', cv=cv, n_jobs=-1)
    return scores
 
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    # evaluate the model
    scores = evaluate_model(model, X_train, Y_train)
    # store the results
    results.append(scores)
    names.append(name)
    # summarize the performance along the way
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

## explore number of samples in bootstrap

In [None]:
# get a list of models to evaluate
def get_models():
    models = dict()
    # explore ratios from 10% to 100% in 10% increments
    for i in np.arange(0.1, 1.1, 0.1):
        key = '%.1f' % i
        models[key] = BaggingClassifier(max_samples=i, n_estimators = 100)
    return models
 
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    # define the evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # evaluate the model and collect the results
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    return scores
 
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    # evaluate the model
    scores = evaluate_model(model, X_train, Y_train)
    # store the results
    results.append(scores)
    names.append(name)
    # summarize the performance along the way
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

## Train a model with the hyperparameter choosen with the knowledge from the above inspections

In [31]:
# bagging svm
# define the model
clf = DecisionTreeClassifier(random_state=1)
model = BaggingClassifier(base_estimator=clf, n_estimators=50, max_samples=0.5)
# evaluate the model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_train, Y_train, scoring='f1_macro', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('F_macro score: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

Accuracy: 0.789 (0.016)


### same with balanced labels

In [37]:
# bagging svm
# define the model
clf = DecisionTreeClassifier(random_state=1)
model = BaggingClassifier(base_estimator=clf, n_estimators=50, max_samples=0.5)
# evaluate the model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_train_balanced, Y_train_balanced, scoring='f1_macro', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('F1_macro score: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

Accuracy: 0.889 (0.010)


# Train model on whole dataset

In [41]:
# define the model
clf = DecisionTreeClassifier(random_state=1)
model = BaggingClassifier(base_estimator=clf, n_estimators=50, max_samples=0.5)
# fit the model on the whole dataset
model.fit(X_train, Y_train)
# make a single prediction
prediction = model.predict(X_test)

In [33]:
# make a submission
make_submission(prediction)