In [None]:
# Uncomment to install the dependencies into the kernel's environment:
# %pip install -r requirements.txt

In [None]:
from collections import Counter
from datetime import datetime, timezone
import hashlib
from pathlib import Path

from joblib import dump, load
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
import xgboost as xgb
from xgboost import plot_importance

from ds_helpers import temporada_alta, dif_min, get_periodo_dia

In [None]:
# Read the SCL flights CSV and discard its final row in a single expression,
# then display the resulting frame.
# NOTE(review): the reason the last row is dropped is not visible here — confirm.
df = pd.read_csv('../datasets/dataset_SCL.csv').iloc[:-1]
df

In [None]:
# Add the engineered columns to `df` using the helpers imported from ds_helpers.
fecha_i = df['Fecha-I']
df['temporada_alta'] = fecha_i.apply(temporada_alta)
# dif_min is computed row by row; atraso_15 is 1 where it exceeds 15, else 0.
df['dif_min'] = df.apply(dif_min, axis=1)
over_threshold = df['dif_min'].gt(15)
df['atraso_15'] = np.where(over_threshold, 1, 0)
df['periodo_dia'] = fecha_i.apply(get_periodo_dia)

In [None]:
# Columns kept for modelling: six predictors followed by the binary target.
selected_columns = ['OPERA', 'MES', 'TIPOVUELO', 'SIGLAORI', 'SIGLADES', 'DIANOM', 'atraso_15']
data = df[selected_columns]
# The target is also exposed on its own as a Series.
label = data.atraso_15

In [None]:
# Display the `data` frame (modelling columns plus target) for a visual check.
data

In [None]:
# Categorical columns that receive an integer code from a LabelEncoder.
label_encoded_cols = ['OPERA', 'TIPOVUELO', 'SIGLAORI', 'SIGLADES']

# Keep the fitted encoders instead of throwing them away: the category -> integer
# mapping is part of the model contract and must be reproducible at inference time
# (inspect with `encoders[col].classes_`, or persist alongside the model).
encoders = {col: LabelEncoder().fit(data[col]) for col in label_encoded_cols}

# Day names are mapped explicitly (1..7); any value not in this dict becomes NaN.
day_to_number = {
    'Lunes': 1,
    'Martes': 2,
    'Miercoles': 3,
    'Jueves': 4,
    'Viernes': 5,
    'Sabado': 6,
    'Domingo': 7,
}

# Build the feature matrix: encoded categoricals, MES passed through unchanged,
# DIANOM mapped to its day number, and the target column removed.
features = data.assign(
    **{col: encoders[col].transform(data[col]) for col in label_encoded_cols},
    DIANOM=data['DIANOM'].map(day_to_number),
).drop(columns=['atraso_15'])
features

In [None]:
# Stratified hold-out split: 33% of the rows go to the test set, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.33,
    random_state=1,
    stratify=label,
)

In [None]:
# Sanity check: the split must neither lose nor duplicate rows.
assert len(y_train) + len(y_test) == len(label)
assert len(x_train) + len(x_test) == len(features)

In [None]:
# Class proportions in each split. `normalize=True` is passed explicitly (the
# previous positional '%' only worked because a non-empty string is truthy), and
# `sort_index()` aligns both results by class label so the element-wise
# comparison below always compares the same class.
train_dist = y_train.value_counts(normalize=True).sort_index()
test_dist = y_test.value_counts(normalize=True).sort_index()

print(y_train.size)
print(y_test.size)
# These two 2-value arrays must be close to each other.
print(train_dist.values)
print(test_dist.values)
np.allclose(train_dist.values, test_dist.values, atol=0.01)

In [None]:
# Ratio of class-0 to class-1 samples in the training labels, passed to XGBoost
# as scale_pos_weight.
counter = Counter(y_train)
n_negative, n_positive = counter[0], counter[1]
cls_weight = n_negative / n_positive

modelxgb = xgb.XGBClassifier(random_state=None, scale_pos_weight=cls_weight)

# Hyper-parameter grid explored exhaustively by the search below.
parameters = dict(
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[50, 100, 150],
    subsample=[0.5, 0.9],
    max_depth=[10, 20],
)

# 3-fold cross-validated grid search using all available cores.
grid_search = GridSearchCV(
    estimator=modelxgb,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
# fit() returns the fitted search object itself.
modelxgb_GridCV = grid_search.fit(x_train, y_train)

In [None]:
# Predict on the held-out set with the refitted best estimator.
y_pred_xgb_grid = modelxgb_GridCV.predict(x_test)

# The confusion matrix must be printed explicitly: a bare expression that is not
# the last statement of a cell is evaluated and then silently discarded.
print(confusion_matrix(y_test, y_pred_xgb_grid))
print(classification_report(y_test, y_pred_xgb_grid))

In [None]:
best_model = modelxgb_GridCV.best_estimator_
print(modelxgb_GridCV.best_params_)
print(best_model)

# Short fingerprint of the winning estimator's parameters, used in the file name.
phash = hashlib.sha1(str(best_model.get_params()).encode('utf-8')).hexdigest()
# datetime.utcnow() is deprecated since Python 3.12; an aware UTC timestamp
# yields the same YYYYMMDD stamp.
mname = f"{datetime.now(timezone.utc).strftime('%Y%m%d')}_{phash[:7]}_xgb_m1.joblib"

# Make sure the output directory exists so the (expensive) search result is not
# lost to a FileNotFoundError at the very last step.
Path('../models').mkdir(parents=True, exist_ok=True)
dump(best_model, f'../models/{mname}')