In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import f1_score, precision_score, recall_score
import matplotlib.pyplot as plt

In [2]:
# Suffix identifying which dataset to use; it is passed to load_split_data below,
# which reads data/X_<suffix>.pkl and data/y_<suffix>.pkl.
data_file_suffix = '20210726'

In [3]:
def train_test_split(X, y, train_idx=None, test_idx=None):
    """Split features and labels into train/test parts by index label.

    Both ``X`` and ``y`` are indexed with ``.loc`` using the supplied label
    collections, so the labels must exist in the index of each object.

    Parameters
    ----------
    X : label-indexed feature container supporting ``.loc``
    y : label-indexed target container supporting ``.loc``
    train_idx : labels selecting the training rows
    test_idx : labels selecting the test rows

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    train_part = (X.loc[train_idx], y.loc[train_idx])
    test_part = (X.loc[test_idx], y.loc[test_idx])
    return (*train_part, *test_part)


def load_split_data(suffix=None, split=False, window=14):
    """Load the pickled feature matrix and ``buy`` labels, dropping incomplete rows.

    Reads ``data/X_{suffix}.pkl`` and ``data/y_{suffix}.pkl`` (relative to the
    current working directory), takes the ``buy`` attribute of the label object
    as the target, and removes every row whose features contain a missing value.

    Parameters
    ----------
    suffix : str, optional
        File-name suffix of the dataset; ``None`` selects ``'DEFAULT'``.
    split : bool, default False
        When true, rows selected by the index slice ``:'2019'`` form the
        training set and rows selected by ``'2020':`` form the test set.
    window : int, default 14
        Not used by this function; retained so existing callers keep working.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` when ``split`` is true,
        otherwise ``(X, y)``.
    """
    # Identity check: ``None`` is a singleton and ``==`` can be overridden.
    if suffix is None:
        suffix = 'DEFAULT'

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    X = pd.read_pickle(f'data/X_{suffix}.pkl')
    y = pd.read_pickle(f'data/y_{suffix}.pkl').buy

    # Drop NA rows: the mask is computed on X and applied to both X and y so
    # features and labels stay aligned.
    na_rows = X.isna().any(axis=1)
    X = X[~na_rows]
    y = y[~na_rows]

    if not split:
        return X, y

    # NOTE(review): the '2019'/'2020' label slices presumably rely on a
    # date-like index - confirm against the pickled data.
    return train_test_split(X, y, X.loc[:'2019'].index, X.loc['2020':].index)
    
# Load the dataset named by data_file_suffix with the year-based split enabled
# (training rows from the ':2019' slice, test rows from the '2020:' slice).
X_train, y_train, X_test, y_test = load_split_data(
    suffix=data_file_suffix,
    split=True,
)
# Unsplit alternative, left disabled:
#X_train, y_train = load_split_data(suffix=data_file_suffix, split=False)

In [4]:
# Alternative estimator, left disabled (GradientBoostingClassifier is not
# imported in this notebook's import cell):
#clf = GradientBoostingClassifier(random_state=0, learning_rate=0.06).fit(X_train, y_train)

# Fit the XGBoost classifier on the training split; random_state is fixed so
# the fit is repeatable, and n_jobs=-1 is passed through to XGBoost.
clf = xgb.XGBClassifier(
    n_estimators=20,
    random_state=42,
    learning_rate=0.1,
    n_jobs=-1,
    gamma=0.9,
    max_depth=4,
).fit(X_train, y_train)





In [5]:
# Score the fitted classifier on the held-out test split.
# f1_score, precision_score and recall_score are already imported in the
# notebook's first cell, so they are not re-imported here.
pred = clf.predict(X_test)

# Compute all metrics first so a failure in one does not leave partial output.
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)

print(f'Precision = {round(precision, 4)}')
print(f'Recall = {round(recall, 4)}')
print(f'F1-Score = {round(f1, 4)}')

Precision = 0.5891
Recall = 0.0627
F1-Score = 0.1133




In [None]:
import datetime
import pickle

# Local wall-clock time formatted as yymmddHHMM; the cell that pickles the
# model embeds this value in the output file name.
timestamp = format(datetime.datetime.now(), '%y%m%d%H%M')
print(f'Pickle model file timestamp: {timestamp}')

In [None]:
# Persist the fitted classifier. The context manager closes the file even if
# pickling raises, so the handle is not leaked and the data is flushed to disk.
with open(f'models/n_xgb_{timestamp}.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Show the column labels of the training feature frame as a plain list.
print([*X_train.columns])