In [None]:
import pandas as pd
from xgboost import XGBClassifier
from DataCleaner import DataCleaner
from WeatherFeatures import WeatherFeatures
from DatetimeFeatures import DatetimeFeatures
from GeographicFeatures import GeographicFeatures

from sklearn.metrics import f1_score
from utils import get_evaluation_data, transform_features, pre_clean,\
                  post_clean, NON_BOOLEAN_FEATURES, predictions_to_labels

import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation notices from the libraries imported
# above; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## Run params:
Set the paths to your train and test `.csv` data files below:

In [3]:
# Input CSV locations -- point these at your own train and test files.
train_path = "data/tmp_train.csv"
test_path = "data/tmp_test.csv"

## Data loading:

In [None]:
# Unpack the six objects returned by get_evaluation_data: three X splits
# followed by the three matching y splits.
(X_train, X_val, X_test,
 y_train, y_val, y_test) = get_evaluation_data(train_path, test_path)

## Feature creation:

In [None]:
# Apply each feature builder in turn (geographic, then datetime, then weather).
# Every pass feeds the splits produced by the previous one back in, so the
# order of this tuple matters.
for feature_builder in (GeographicFeatures, DatetimeFeatures, WeatherFeatures):
    X_train, X_val, X_test = transform_features(
        feature_builder, X_train, X_val, X_test, None, force_fit=False
    )

## Data cleaning:

In [None]:
# Step 1: preliminary cleaning of all three splits.
X_train, X_val, X_test = pre_clean(X_train, X_val, X_test)

# Step 2: run DataCleaner on the NON_BOOLEAN_FEATURES columns only. The labels
# are passed in with transform_y=True and are reassigned from the result.
(X_train_non_bool, X_val_non_bool, X_test_non_bool,
 y_train, y_val, y_test) = transform_features(
    DataCleaner,
    X_train[NON_BOOLEAN_FEATURES],
    X_val[NON_BOOLEAN_FEATURES],
    X_test[NON_BOOLEAN_FEATURES],
    y_train, y_val, y_test,
    transform_y=True,
    force_fit=True,
)

# Step 3: combine the cleaned non-boolean columns with the full splits.
X_train, X_val, X_test = post_clean(
    X_train, X_val, X_test,
    X_train_non_bool, X_val_non_bool, X_test_non_bool,
)

## Model creation and training:

In [None]:
# Hyper-parameters unpacked into the XGBClassifier constructor.
param = dict(
    n_estimators=250,
    learning_rate=0.31,
    max_depth=9,
    colsample_bytree=0.85,
    subsample=0.7,
    min_child_weight=3.14,
    reg_lambda=0.23,
    reg_alpha=0.14,
    n_jobs=-1,
)

def f1_eval(y_pred, dtrain):
    """Custom XGBoost evaluation metric: weighted-F1 error (1 - F1).

    Parameters
    ----------
    y_pred : array-like, 2-D
        Per-class scores for each evaluated row; the predicted class is the
        arg-max along axis 1.
    dtrain : xgboost.DMatrix
        The data set currently being evaluated. Its labels are used as the
        ground truth.

    Returns
    -------
    (str, float)
        The metric name ``'f1_avg_err'`` and ``1 - weighted F1``. The score is
        reported as an error so that a lower value means a better model.
    """
    # Take the ground truth from the evaluated DMatrix instead of the notebook
    # global `y_val`: the labels then always match the rows in `y_pred`, even
    # if `y_val` is stale or more entries are added to `eval_set`.
    y_true = dtrain.get_label().astype(int)
    f1_err = 1 - f1_score(y_true, y_pred.argmax(1), average='weighted')
    return 'f1_avg_err', f1_err

# Build the classifier from `param`, then fit it while scoring the validation
# split with the custom f1_eval metric; early_stopping_rounds is set to 10.
clf = XGBClassifier(**param)
clf.fit(
    X_train,
    y_train,
    eval_metric=f1_eval,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=10,
)

## Predictions:

In [None]:
# Predict on the test split with the fitted classifier.
preds = clf.predict(X_test)
# Convert the raw predictions with the project helper predictions_to_labels
# (presumably encoded class ids -> label names; verify against utils).
preds_names = predictions_to_labels(preds)