In [279]:
"""
Created on: Thu. 1 Sep. 2022
Author: Mélina Verger
"""

# For data manipulation
import pandas as pd
pd.set_option('display.max_columns', None)

# For resampling
from sklearn.utils import resample

# ML models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# For cross-validation
from sklearn.model_selection import cross_val_score

# Metric
from sklearn.metrics import accuracy_score

# SMOTE
from imblearn.over_sampling import SMOTE

In [280]:
SPLIT = "7030"  # "8020" 
DATA = "stAll"  # "stInfo"

In [281]:
# Load train and test sets
X_train = pd.read_csv("./data/X_train" + "_" + DATA + "_" + SPLIT + ".csv")
X_test = pd.read_csv("./data/X_test" + "_" + DATA + "_" + SPLIT + ".csv")
y_train = pd.read_csv("./data/y_train" + "_" + DATA + "_" + SPLIT + ".csv")
y_test = pd.read_csv("./data/y_test" + "_" + DATA + "_" + SPLIT + ".csv")

In [282]:
y_train.squeeze()

0       0.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
3577    1.0
3578    0.0
3579    1.0
3580    1.0
3581    0.0
Name: final_result, Length: 3582, dtype: float64

In [283]:
type(pd.Series(y_train.squeeze()))

pandas.core.series.Series

## Up- and down-sampling

In [284]:
# X = pd.concat([X_train, y_train], axis=1)

In [285]:
# # Positive class imbalance
# round(X["final_result"].value_counts()[1] / (X["final_result"].value_counts()[1] + X["final_result"].value_counts()[0]) * 100, 2)

In [286]:
# X["final_result"].value_counts()

In [287]:
# mean_class = int((X["final_result"].value_counts()[1] + X["final_result"].value_counts()[0])/2)
# print(mean_class)

In [288]:
# X_majority = X[X["final_result"] == 1]
# X_minority = X[X["final_result"] == 0]

In [289]:
# X_majority_downsample = resample(X_majority,
#                                  replace=False,
#                                  n_samples=len(X_minority),
#                                  random_state=0)


In [290]:
# X_minority_upsample = resample(X_minority,
#                                replace=True,
#                                n_samples=len(X_majority),
#                                random_state=0)

In [291]:
# Xs = pd.concat([X_majority_downsample, X_minority]).sample(frac=1)  # shuffled

In [292]:
# X_majority_downsample.shape

In [293]:
# X_minority.shape

In [294]:
# Xs_train = Xs.drop(columns=["final_result"])
# ys_train = Xs["final_result"]

## SMOTE

In [295]:
oversample = SMOTE()
Xs_train, ys_train = oversample.fit_resample(X_train, y_train)

In [296]:
ys_train = ys_train.squeeze()

## Out-of-the-box classifiers

In [297]:
if not os.path.exists("./models"):
    os.mkdir("models")
else:
    print("The folder 'models' already exists." )

The folder 'models' already exists.


In [298]:
clf_lr = LogisticRegression(random_state=0)
clf_svc = LinearSVC(random_state=0)
clf_knearest = KNeighborsClassifier()
clf_dt = DecisionTreeClassifier(random_state=0)

In [299]:
clf_lr.fit(Xs_train, ys_train)
clf_svc.fit(Xs_train, ys_train)
clf_knearest.fit(Xs_train, ys_train)
clf_dt.fit(Xs_train, ys_train)

#### Logistic regression

In [300]:
scores_train = cross_val_score(clf_lr, Xs_train, ys_train, cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_train.mean(), scores_train.std()))

print(f"Accuracy on the test set: {round(accuracy_score(y_test, clf_lr.predict(X_test)), 2)}")

0.73 accuracy with a standard deviation of 0.02
Accuracy on the test set: 0.69


### SVC

In [301]:
scores_train = cross_val_score(clf_svc, Xs_train, ys_train, cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_train.mean(), scores_train.std()))

print(f"Accuracy on the test set: {round(accuracy_score(y_test, clf_lr.predict(X_test)), 2)}")

0.75 accuracy with a standard deviation of 0.02
Accuracy on the test set: 0.69


### K-nearest

In [302]:
scores_train = cross_val_score(clf_knearest, Xs_train, ys_train, cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_train.mean(), scores_train.std()))

print(f"Accuracy on the test set: {round(accuracy_score(y_test, clf_lr.predict(X_test)), 2)}")

0.77 accuracy with a standard deviation of 0.03
Accuracy on the test set: 0.69


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


### DT

In [303]:
scores_train = cross_val_score(clf_dt, Xs_train, ys_train, cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_train.mean(), scores_train.std()))

print(f"Accuracy on the test set: {round(accuracy_score(y_test, clf_lr.predict(X_test)), 2)}")

0.75 accuracy with a standard deviation of 0.02
Accuracy on the test set: 0.69
