### Modules import

In [1]:
#from google.colab import files
import pandas as pd
import sklearn
import os
import modelHandler
from xgboost import XGBClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import KMeansSMOTE, SMOTE, SVMSMOTE

### Data load

In [51]:
df_train = pd.read_csv('../Data/train.csv', index_col = 0)
df_test = pd.read_csv('../Data/test_x.csv', index_col = 0)


cols_train = ['Hour', 'Sensor_beta', 'Sensor_gamma',
       'Sensor_alpha_plus']
target_train = ['Insect']

x_train = df_train[cols_train].to_numpy()
y_train = df_train[target_train].to_numpy().ravel()

x_test = df_test[cols_train].to_numpy() 

In [5]:
unique, counts = np.unique(y_train, return_counts=True)
dict(zip(unique, counts))

{0: 3519, 1: 2793, 2: 689}

In [6]:
x_train.shape

(7001, 4)

After the EDA and several model tests such as knn, svm and dt, I have seen that the best performing model is the XGBClassifier and it is the one I will focus on this notebook (the one I will hand)

## Model training

### Grid search

In [7]:
X_train0, X_test0, y_train0, y_test0 = train_test_split(x_train, y_train, test_size=0.2, random_state=42) #Training and validation

SMOTE to balance data proportions

In [8]:
sm = SMOTE(random_state = 42, n_jobs = -1)
x0, y0 = sm.fit_resample(X_train0, y_train0)
unique, counts = np.unique(y0, return_counts=True)
dict(zip(unique, counts))

{0: 2823, 1: 2823, 2: 2823}

In [None]:
x = modelHandler.ModelHandler(X = x0, Y = y0, model = 'XGB')

In [None]:
x.fit() 

1200 fits failed out of a total of 7200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1200 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/xgboost/sklearn.py", line 732, in fit
    callbacks=callbacks)
  File "/usr/local/lib/python3.7/dist-packages/xgboost/training.py", line 216, in train
    xgb_model=xgb_model, callbacks=callbacks)
  File "/usr/local/lib/python3.7/dist-packages/xgboost/training.py", line 74, in _train_internal
    bst.update(dtrain, i, obj)
  File "/usr/local/lib/python3.7/dist-pack

[INFO] The best parameters are {'booster': 'gbtree', 'colsample_bytree': 1.0, 'gamma': 0.5, 'max_depth': 32, 'min_child_weight': 0.5, 'subsample': 0.6}
[INFO] The best score is 0.9367
[INFO] The best parameters according to ci are {'booster': 'gbtree', 'colsample_bytree': 1.0, 'gamma': 0.5, 'max_depth': 32, 'min_child_weight': 0.5, 'subsample': 0.6}
[INFO] Train acc  is : 0.9976


In [None]:
print(x.model)

XGBClassifier(colsample_bytree=1.0, gamma=0.5, max_depth=32,
              min_child_weight=0.5, objective='multi:softprob', subsample=0.6)


The best parameters are saved for further study after grid search

In [None]:
x.save('xgb_Model')

In [None]:
##{'colsample_bytree': 1.0, 'gamma': 0, 'max_depth': 16, 'min_child_weight': 1, 'n_estimators': 80, 'subsample': 0.8}

## Subsampling


In [12]:
X_train0, X_test0, y_train0, y_test0 = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [13]:
mod = XGBClassifier(**{'booster': 'gbtree', 'colsample_bytree': 1.0, 'gamma': 0.5, 'max_depth': 32, 'min_child_weight': 0.5, 'subsample': 0.6})
mod.fit(X_train0, y_train0)
xgb2 = sklearn.metrics.classification_report(mod.predict(X_train0), y_train0)
print(xgb2)



              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2824
           1       1.00      1.00      1.00      2234
           2       1.00      1.00      1.00       542

    accuracy                           1.00      5600
   macro avg       1.00      1.00      1.00      5600
weighted avg       1.00      1.00      1.00      5600



In [14]:
xgb3 = sklearn.metrics.classification_report(mod.predict(X_test0), y_test0)
print(xgb3)

              precision    recall  f1-score   support

           0       0.94      0.90      0.92       723
           1       0.95      0.93      0.94       571
           2       0.62      0.85      0.72       107

    accuracy                           0.91      1401
   macro avg       0.84      0.89      0.86      1401
weighted avg       0.92      0.91      0.91      1401



### Equaling all data proportion (cutting data)

In [40]:
X_train0, X_test0, y_train0, y_test0 = train_test_split(x_train, y_train, test_size=0.3, random_state=42)

In [41]:
unique, counts = np.unique(y_train0, return_counts=True)
dict(zip(unique, counts))

{0: 2475, 1: 1936, 2: 489}

In [42]:
tu = np.zeros((500*2 + 489,4))
tu[:500] = X_train0[y_train0 == 1][:500]
tu[500:1000] = X_train0[y_train0 == 0][:500]
tu[1000:] = X_train0[y_train0 == 2]
tuy = np.zeros((500*2 + 489))
tuy[:500] = y_train0[y_train0 == 1][:500]
tuy[500:1000] = y_train0[y_train0 == 0][:500]
tuy[1000:] = y_train0[y_train0 == 2]

In [43]:
tu, tuy = sklearn.utils.shuffle(tu, tuy)

In [44]:
unique, counts = np.unique(tuy, return_counts=True)
dict(zip(unique, counts))

{0.0: 500, 1.0: 500, 2.0: 489}

In [45]:
mod2 = XGBClassifier(**{'booster': 'gbtree', 'colsample_bytree': 1.0, 'gamma': 0.5, 'max_depth': 32, 'min_child_weight': 0.5, 'subsample': 0.6})
mod2.fit(tu, tuy)
xgb2 = sklearn.metrics.classification_report(mod2.predict(tu), tuy)
print(xgb2)



              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       500
         1.0       1.00      1.00      1.00       499
         2.0       1.00      1.00      1.00       490

    accuracy                           1.00      1489
   macro avg       1.00      1.00      1.00      1489
weighted avg       1.00      1.00      1.00      1489



In [46]:
xgb3 = sklearn.metrics.classification_report(mod.predict(X_test0), y_test0)
print(xgb3)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1043
           1       1.00      1.00      1.00       858
           2       1.00      1.00      1.00       200

    accuracy                           1.00      2101
   macro avg       1.00      1.00      1.00      2101
weighted avg       1.00      1.00      1.00      2101



#  SMOTE

In [21]:
X_train0, X_test0, y_train0, y_test0 = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [22]:
sm = SMOTE(random_state = 42, n_jobs = -1)
x0, y0 = sm.fit_resample(X_train0, y_train0)
unique, counts = np.unique(y0, return_counts=True)
dict(zip(unique, counts))

{0: 2823, 1: 2823, 2: 2823}

In [23]:
mod3 = XGBClassifier(**{'booster': 'gbtree', 'colsample_bytree': 1.0, 'gamma': 0.5, 'max_depth': 32, 'min_child_weight': 0.5, 'subsample': 0.6})
mod3.fit(x0, y0)
xgb3 = sklearn.metrics.classification_report(mod3.predict(x0), y0)
print(xgb3)



              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2822
           1       1.00      1.00      1.00      2824
           2       1.00      1.00      1.00      2823

    accuracy                           1.00      8469
   macro avg       1.00      1.00      1.00      8469
weighted avg       1.00      1.00      1.00      8469



In [24]:
xgb3 = sklearn.metrics.classification_report(mod3.predict(X_test0), y_test0)
print(xgb3)

              precision    recall  f1-score   support

           0       0.93      0.91      0.92       704
           1       0.95      0.92      0.94       574
           2       0.67      0.80      0.73       123

    accuracy                           0.91      1401
   macro avg       0.85      0.88      0.86      1401
weighted avg       0.91      0.91      0.91      1401



# SVMSMOTE

In [25]:
X_train0, X_test0, y_train0, y_test0 = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [26]:
sm = SVMSMOTE(random_state = 42, n_jobs = -1)
x0, y0 = sm.fit_resample(X_train0, y_train0)
unique, counts = np.unique(y0, return_counts=True)
dict(zip(unique, counts))

{0: 2823, 1: 2823, 2: 2823}

In [27]:
mod3 = XGBClassifier(**{'booster': 'gbtree', 'colsample_bytree': 1.0, 'gamma': 0.5, 'max_depth': 32, 'min_child_weight': 0.5, 'subsample': 0.6})
mod3.fit(x0, y0)
xgb3 = sklearn.metrics.classification_report(mod3.predict(x0), y0)
print(xgb3)



              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2826
           1       1.00      1.00      1.00      2823
           2       1.00      1.00      1.00      2820

    accuracy                           1.00      8469
   macro avg       1.00      1.00      1.00      8469
weighted avg       1.00      1.00      1.00      8469



In [28]:
xgb3 = sklearn.metrics.classification_report(mod3.predict(X_test0), y_test0)
print(xgb3)

              precision    recall  f1-score   support

           0       0.92      0.90      0.91       710
           1       0.95      0.92      0.94       573
           2       0.62      0.77      0.69       118

    accuracy                           0.90      1401
   macro avg       0.83      0.87      0.85      1401
weighted avg       0.91      0.90      0.90      1401



# KMEANSMOTE

In [29]:
X_train0, X_test0, y_train0, y_test0 = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [30]:
sm = KMeansSMOTE(random_state = 42, n_jobs = -1)
x0, y0 = sm.fit_resample(X_train0, y_train0)
unique, counts = np.unique(y0, return_counts=True)
dict(zip(unique, counts))

{0: 2823, 1: 2828, 2: 2823}

In [31]:
mod3 = XGBClassifier(**{'booster': 'gbtree', 'colsample_bytree': 1.0, 'gamma': 0.5, 'max_depth': 32, 'min_child_weight': 0.5, 'subsample': 0.6})
mod3.fit(x0, y0)
xgb3 = sklearn.metrics.classification_report(mod3.predict(x0), y0)
print(xgb3)



              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2823
           1       1.00      1.00      1.00      2828
           2       1.00      1.00      1.00      2823

    accuracy                           1.00      8474
   macro avg       1.00      1.00      1.00      8474
weighted avg       1.00      1.00      1.00      8474



In [32]:
xgb3 = sklearn.metrics.classification_report(mod3.predict(X_test0), y_test0)
print(xgb3)

              precision    recall  f1-score   support

           0       0.94      0.90      0.92       720
           1       0.95      0.92      0.94       574
           2       0.62      0.85      0.72       107

    accuracy                           0.91      1401
   macro avg       0.84      0.89      0.86      1401
weighted avg       0.92      0.91      0.91      1401



# Normalitzant

Not achieved better results by normalazing data so I did not input the code

## Best result

Best result obtained by cutting data, getting the best f1-score.

In [48]:
mod2.save_model('../models/XGBClassifier.json')

In [None]:
# To load: 
# model2 = xgb.XGBRegressor()
# model2.load_model('../models/XGBClassifier.json')

### Store results as csv

In [54]:
df = {'Insect' : mod.predict(x_test)}

df = pd.DataFrame(df)

In [55]:
df.to_csv('results.csv')