## Preparation
Run all cells.

In [1]:
import xgb_model as x
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn import model_selection, metrics 
import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np
from xgboost import XGBRegressor
import csv
from scipy.stats import spearmanr
from sklearn import preprocessing
import pyinputplus as pyip
from datetime import datetime

In [2]:
# Load the data; only the first element of what load_data() returns is used here.
train = x.load_data()[0]
# Feature list before interactions are added (the cell log reports 310 features);
# it is replaced by the reload on the last line of this cell.
features = x.load_features(train)
target = 'target'
# Numeric era id: drop the first three characters of the `era` string and cast to int
# (presumably labels look like "era12" — confirm against the data).
train["erano"] = train.era.str.slice(3).astype(int)
# Per-row era numbers; passed as `groups` to cross_val_score in cv_test().
eras = train.erano
# Add interaction features (the cell log reports 2nd-order intelligence x dexterity interactions).
train = x.feature_interactions_intel_dexte(train)
# Reload the feature list after the interaction step (the cell log then reports 661 features).
features = x.load_features(train)

09:31:17 Loading data from round 264...09:31:58 Done.
09:31:58 Loaded 310 features.
09:31:58 Adding 2nd order interactions between intelligence and dexterity features...09:32:02 Done.
09:32:02 Loaded 661 features.


In [3]:
class TimeSeriesSplitGroups(_BaseKFold):
    """Expanding-window cross-validator that never splits a group.

    The unique group labels are sorted (``np.unique``) and cut into
    ``n_splits + 1`` consecutive chunks. Each split tests on one chunk and
    trains on every group that sorts before it. The first chunk, which is only
    ever used for training, also absorbs the remainder when the number of
    groups is not divisible by ``n_splits + 1``.

    Splits are yielded starting with the latest test window and moving
    backwards.

    Parameters
    ----------
    n_splits : int, default=5
        Number of train/test splits to generate.
    """

    def __init__(self, n_splits=5):
        super().__init__(n_splits, shuffle=False, random_state=None)

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` arrays of row positions.

        Parameters
        ----------
        X : array-like
            Data to split; only its number of rows is used.
        y : array-like, optional
            Ignored apart from the length consistency check.
        groups : array-like
            Group label of every row (pandas Series, NumPy array or list).
            The sorted order of the labels is taken as the time order.

        Raises
        ------
        ValueError
            If ``groups`` is None, or if there are fewer distinct groups than
            ``n_splits + 1``.
        """
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_folds = self.n_splits + 1
        # Work on a plain array so any array-like works, not only a pandas Series.
        group_values = np.asarray(groups)
        group_list = np.unique(group_values)
        n_groups = len(group_list)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds ={0} greater"
                 " than the number of groups: {1}.").format(n_folds,
                                                            n_groups))
        indices = np.arange(n_samples)
        test_size = n_groups // n_folds
        test_starts = range(test_size + n_groups % n_folds,
                            n_groups, test_size)
        # Latest test window first, matching the original iteration order.
        for test_start in reversed(test_starts):
            train_groups = group_list[:test_start]
            test_groups = group_list[test_start:test_start + test_size]
            yield (indices[np.isin(group_values, train_groups)],
                   indices[np.isin(group_values, test_groups)])


# def spearman(y_true, y_pred): 
#     return spearmanr(y_pred, y_true).correlation

# Scoring metric used for cross-validation. Per the original author's note,
# submissions are scored by Spearman correlation and this function is a copy of
# that correlation function.
def numerai_corr(y_true, y_pred):
    """Correlate percentile-ranked predictions with the raw targets.

    Only the predictions are ranked (ties are broken by order of appearance);
    the targets are used as given. Returns the off-diagonal entry of the 2x2
    Pearson correlation matrix.
    """
    pct_ranks = pd.Series(y_pred).rank(method="first", pct=True)
    corr_matrix = np.corrcoef(pct_ranks, y_true)
    return corr_matrix[0, 1]


def cv_makemodels(lr_list, ne_list, cs_list, md_list, ss_list, ga_list, tree_method='gpu_hist'):
    """Build one XGBRegressor per combination of the given hyperparameters.

    Parameters
    ----------
    lr_list, ne_list, cs_list, md_list, ss_list, ga_list : iterable
        Candidate values for ``learning_rate``, ``n_estimators``,
        ``colsample_bytree``, ``max_depth``, ``subsample`` and ``gamma``.
    tree_method : str, default='gpu_hist'
        Passed to every model unchanged. The default keeps the original
        behaviour; pass another method to run without a GPU.

    Returns
    -------
    list of XGBRegressor
        Ordered with ``lr_list`` varying slowest and ``ga_list`` fastest,
        the same order the nested loops produced. The plotting helpers that
        reshape the score list rely on this order.
    """
    from itertools import product

    grid = product(lr_list, ne_list, cs_list, md_list, ss_list, ga_list)
    return [
        XGBRegressor(learning_rate=lr, n_estimators=ne, colsample_bytree=cs,
                     max_depth=md, subsample=ss, gamma=ga,
                     tree_method=tree_method)
        for lr, ne, cs, md, ss, ga in grid
    ]

In [4]:
def cv_test(model_list):
    """Cross-validate every model in ``model_list`` and tabulate the scores.

    Uses the notebook globals ``train``, ``features``, ``target`` and ``eras``
    (eras are the CV groups) and splits with ``TimeSeriesSplitGroups(5)``,
    scoring each fold with ``numerai_corr``.

    After each model a line is appended to ``progress.txt``. This is
    best-effort: a failure to write is reported but does not abort the run.

    Parameters
    ----------
    model_list : list
        Estimators exposing ``get_params()`` (e.g. from ``cv_makemodels``).

    Returns
    -------
    pandas.DataFrame
        One row per model: its parameters followed by ``test_completed``,
        ``cv_corr_mean``, ``cv_corr_std`` and ``cv_corr_sharpe``. The frame is
        only built after every model has been evaluated, so an interrupted
        run returns nothing.
    """
    # Loop-invariant helpers: build them once instead of once per model.
    splitter = TimeSeriesSplitGroups(5)
    scorer = metrics.make_scorer(numerai_corr, greater_is_better=True)
    n_models = len(model_list)

    cv_scores = []
    for position, model in enumerate(model_list, start=1):
        print(f'{x.get_time()} Testing model {position}/{n_models}:\n{model}...')
        scores = model_selection.cross_val_score(
                    model,
                    train[features],
                    train[target],
                    cv=splitter,
                    groups=eras,
                    scoring=scorer)
        mean_score = np.mean(scores)
        std_score = np.std(scores, ddof=0)
        # Mean/std ratio; a zero std yields inf/nan (NumPy warns) rather than raising.
        sharpe_score = mean_score/std_score
        cv_scores.append([mean_score, std_score, sharpe_score])
        print(f'{x.get_time()} Mean = {mean_score}, Std = {std_score}, Sharpe = {sharpe_score}\n')

        # Best-effort progress log; `with` closes the file even if the write fails.
        try:
            with open('progress.txt', 'a') as progress_file:
                progress_file.write(
                    f"\n{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} Evaluated {position}/{n_models} model(s)")
        except OSError as err:
            print(f'{x.get_time()} Warning: could not update progress.txt ({err})')

    model_params_df = pd.DataFrame([model.get_params() for model in model_list])

    completed_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    cv_scores_df = pd.DataFrame([{'test_completed': completed_at,
                                  'cv_corr_mean': mean, 'cv_corr_std': std, 'cv_corr_sharpe': sharpe}
                                 for mean, std, sharpe in cv_scores])
    cv_test_df = pd.concat([model_params_df, cv_scores_df], axis=1)
    return cv_test_df

In [5]:
def append_cv_scores(cv_test_df):
    """Return previously saved results with ``cv_test_df`` appended.

    Reads ``cv_test.csv`` from the working directory (first column as index).
    If the file does not exist, ``cv_test_df`` is returned unchanged.

    Any other read failure (empty or corrupt file, permission error, ...) is
    deliberately NOT swallowed. Treating it as "no file" would make the next
    export overwrite the existing history with only the new rows.

    Parameters
    ----------
    cv_test_df : pandas.DataFrame
        New results, e.g. the frame returned by ``cv_test``.

    Returns
    -------
    pandas.DataFrame
        Old rows followed by new rows, with a fresh 0..n-1 index.
    """
    try:
        previous = pd.read_csv('cv_test.csv', index_col=0)
    except FileNotFoundError:
        print(f'{x.get_time()} No file named cv_test.csv')
        return cv_test_df
    return pd.concat([previous, cv_test_df], axis=0).reset_index(drop=True)

In [6]:
def export_cv_scores(combined):
    """Write ``combined`` to ``cv_test.csv`` in the working directory.

    The export is unconditional: no confirmation is requested before the file
    is overwritten. A start message and a completion message, each prefixed
    with ``x.get_time()``, are printed on the same output line.
    """
    destination = 'cv_test.csv'
    print(f'{x.get_time()} Exporting...', end='', flush=True)
    combined.to_csv(destination)
    print(f'{x.get_time()} Done.')

## CV testing
1. Enter the hyperparameter values to test in the lists in the cell below.
1. Run all cells.

In [7]:
# Hyperparameter grid: cv_makemodels builds one XGBRegressor per combination.
lr_list = [0.008, 0.010, 0.012]  # learning_rate
ne_list = [2000, 3000, 4000]  # n_estimators
cs_list = [0.08, 0.10, 0.12]  # colsample_bytree
md_list = [5]  # max_depth
ss_list = [0.75]  # subsample
ga_list = [0]  # gamma
model_list = cv_makemodels(lr_list, ne_list, cs_list, md_list, ss_list, ga_list)

In [8]:
# Report the grid size before cross-validation is launched.
print(f'About to test {len(model_list)} models...')

About to test 27 models...


In [9]:
# Cross-validate every model in the grid. cv_test() only returns after the last
# model finishes, so interrupting this cell leaves cv_test_df unassigned or stale.
cv_test_df = cv_test(model_list)
cv_test_df

09:32:24 Testing model 1/27:
XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=0.08, gamma=0, gpu_id=None,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.008, max_delta_step=None, max_depth=5,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=2000, n_jobs=None, num_parallel_tree=None,
             random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=0.75, tree_method='gpu_hist',
             validate_parameters=None, verbosity=None)...


KeyboardInterrupt: 

In [16]:
# Merge this run's results with any previously saved cv_test.csv, then write the file back.
# NOTE(review): the saved output of the cell above ends in KeyboardInterrupt and this cell's
# execution count is not consecutive, so cv_test_df presumably came from a different,
# completed run — confirm before trusting the exported rows.
combined = append_cv_scores(cv_test_df)
export_cv_scores(combined)

19:35:14 Exporting...19:35:14 Done.


## Plotting
Testing area – not every cell in this section is functional.

In [None]:
# `cv_scores` only exists inside cv_test(), so rebuild it from the returned frame:
# one [mean, std, sharpe] row per model, in grid order. The plotting helpers
# below expect this shape.
cv_scores = cv_test_df[['cv_corr_mean', 'cv_corr_std', 'cv_corr_sharpe']].values.tolist()
cv_means = [score[0] for score in cv_scores]
cv_means

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
# Reset every matplotlib rcParam to the library defaults.
mpl.rcParams.update(mpl.rcParamsDefault)

In [None]:
def plot_cv_3var(cv_scores, p1_list, p1_name, p2_list, p2_name, p3_list, p3_name, title='661-feature CV grid search'):
    """Plot mean CV score against ``p3``, one line per ``(p1, p2)`` pair.

    Parameters
    ----------
    cv_scores : sequence
        One entry per model whose first element is the mean CV score.
        Entries must be ordered with ``p1`` varying slowest and ``p3``
        fastest, and must number exactly
        ``len(p1_list) * len(p2_list) * len(p3_list)``; all other
        hyperparameters are therefore assumed to be fixed.
    p1_list, p2_list, p3_list : sequence
        Values of the three varied hyperparameters.
    p1_name, p2_name, p3_name : str
        Names used in the legend (p1, p2) and on the x axis (p3).
    title : str
        Figure title.

    Raises
    ------
    ValueError
        From the reshape, if the number of scores does not match the grid.
    """
    means = np.array([score[0] for score in cv_scores])
    # A (p1, p2, p3) cube makes each line a direct lookup. The previous flat
    # slicing used len(p1_list) as the slice width, which is only correct
    # when p1_list and p3_list happen to have the same length.
    means = means.reshape(len(p1_list), len(p2_list), len(p3_list))

    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(111)
    ax.set_prop_cycle('color', plt.cm.tab20(np.linspace(0, 1, 20)))

    for p1_idx, p1_value in enumerate(p1_list):
        for p2_idx, p2_value in enumerate(p2_list):
            ax.plot(p3_list, means[p1_idx, p2_idx],
                    label=f'{p1_name}={p1_value}, {p2_name}={p2_value}')

    # Axis decoration is done once, after all lines exist.
    ax.set_xlabel(p3_name)
    ax.set_ylabel('mean_spearman')
    ax.legend(bbox_to_anchor=(1.05, 1))
    ax.set_title(title)

In [None]:
# Mean CV score against colsample_bytree, one line per (learning_rate, n_estimators) pair.
plot_cv_3var(cv_scores, lr_list, 'lr', ne_list, 'ne', cs_list, 'cs', title='661-feature CV grid search')

In [None]:
def plot_cv(cv_scores, param1_list, param2_list, param1_name, param2_name, title='661-feature CV grid search'):
    """Draw mean CV score against ``param2``, one line per ``param1`` value.

    ``cv_scores`` holds one entry per model with the mean score first, ordered
    so that ``param1`` varies slowest; it must contain exactly
    ``len(param1_list) * len(param2_list)`` entries. Lines are drawn on the
    current pyplot figure.
    """
    mean_grid = np.array([entry[0] for entry in cv_scores]).reshape(
        len(param1_list), len(param2_list))
    for row_means, param1_value in zip(mean_grid, param1_list):
        plt.plot(param2_list, row_means, label=f'{param1_name}={param1_value}')
    plt.xlabel(f'{param2_name}')
    plt.ylabel('mean_spearman')
    plt.title(title)
    plt.legend()

In [None]:
# NOTE(review): plot_cv reshapes the scores to len(cs_list) x len(md_list). With the grid
# defined above (27 models, 3 x 1 here) that reshape cannot succeed, so this call is
# presumably left over from a colsample_bytree/max_depth-only experiment — confirm.
plot_cv(cv_scores, cs_list, md_list, 'colsample_bytree', 'max_depth')

In [None]:
def cv_df(df, param_list, cv_scores, param_string = ['param1', 'param2']):
    """Add CV scores to a two-parameter table and return the updated table.

    Each ``(param_list[i], cv_scores[i])`` pair is written to the cell whose
    row label is ``'<name1>=<value1>'`` and column label is
    ``'<name2>=<value2>'`` (values formatted with three decimals). The score is
    stored as ``str(cv_scores[i])``. Missing rows and columns are created;
    cells that are never written stay NaN.

    The input frame is never modified; a new object-dtype frame, sorted by row
    label, is returned. ``DataFrame.append`` (removed in pandas 2.0) is no
    longer used.

    Parameters
    ----------
    df : pandas.DataFrame
        Existing table (may be empty).
    param_list : sequence of pairs
        Numeric ``(value1, value2)`` for every score.
    cv_scores : sequence
        Scores aligned with ``param_list``.
    param_string : sequence of two str
        Parameter names used as label prefixes.

    Raises
    ------
    ValueError
        If ``param_list`` and ``cv_scores`` differ in length.
    """
    if len(param_list) != len(cv_scores):
        raise ValueError('Lengths of cv_scores and param_list not equal')

    row_labels = list(df.index)
    col_labels = list(df.columns)
    known_rows = set(row_labels)
    known_cols = set(col_labels)

    cells = []
    for params, score in zip(param_list, cv_scores):
        row = f'{param_string[0]}={params[0]:.3f}'
        col = f'{param_string[1]}={params[1]:.3f}'
        if row not in known_rows:
            known_rows.add(row)
            row_labels.append(row)
        if col not in known_cols:
            known_cols.add(col)
            col_labels.append(col)
        cells.append((row, col, str(score)))

    # Grow the table once, as object dtype, so string cells never have to be
    # written into a NaN-filled float column.
    table = df.reindex(index=row_labels, columns=col_labels).astype(object)
    for row, col, value in cells:
        table.at[row, col] = value
    return table.sort_index()

In [None]:
# Start from an empty table; cv_df() creates rows and columns as scores are added.
df = pd.DataFrame()

In [None]:
# NOTE(review): `param_list` is not defined in any cell visible in this notebook; cv_df expects
# one (learning_rate, n_estimators) pair per entry of cv_scores — define it before running.
df = cv_df(df, param_list, cv_scores, ['lr', 'ne'])

In [None]:
# Display the lr x ne score table.
df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
def cv_heatmap(df, title, filename):
    """Draw a heatmap of mean CV scores and save it to ``filename``.

    Every non-missing cell of ``df`` is expected to be the string form of a
    score list as written by ``cv_df`` (e.g. ``'[mean, std, sharpe]'``). The
    first number is extracted and plotted. Missing (NaN) cells stay NaN instead
    of raising, so a partially filled grid can still be drawn.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of stringified score lists.
    title : str
        Figure title.
    filename : str
        Path handed to ``plt.savefig``.
    """
    def _first_score(cell):
        # Combinations that were never tested are NaN, not strings.
        if isinstance(cell, float) and np.isnan(cell):
            return np.nan
        # Strip the surrounding brackets, keep the first comma-separated number.
        return float(cell[1: -1].split(', ')[0])

    # Column-wise Series.map replaces the deprecated element-wise DataFrame.applymap.
    means = df.apply(lambda column: column.map(_first_score))
    sns.heatmap(means)
    plt.title(title)
    plt.savefig(filename)

In [None]:
# Draw the lr x ne table as a heatmap and save it to lr_ne.png.
# NOTE(review): the title says colsample_bytree=0.5, which is not among the cs_list values
# defined above — presumably from an earlier run; confirm before reusing the figure.
cv_heatmap(df, 'learning_rate vs n_estimators\ncolsample_bytree=0.5, max_depth=5', 'lr_ne.png')

## Analysis

In [None]:
# Reload the accumulated CV results from cv_test.csv (the file export_cv_scores writes).
cv = pd.read_csv('cv_test.csv', index_col=0)
cv

In [None]:
# Drop every column containing a missing value (presumably the model parameters that were
# left at None when the models were created — confirm against the CSV).
cv = cv.dropna(axis=1)
cv.dtypes

In [None]:
# Keep only float64/int64 columns so the frame can be fed to corr().
cv = cv.select_dtypes(include=['float64', 'int64'])
cv.dtypes

In [None]:
# Remove the std and sharpe score columns; cv_corr_mean is the only score column kept.
cv = cv.drop(columns=['cv_corr_std', 'cv_corr_sharpe'])

In [None]:
# Correlation of every remaining column with the mean CV score.
cv.corr().cv_corr_mean

In [None]:
# Hand-built interaction term: learning_rate multiplied by n_estimators.
cv['lr_x_ne'] = cv['learning_rate'] * cv['n_estimators']

In [None]:
# Same correlation table, now including lr_x_ne.
cv.corr().cv_corr_mean

In [None]:
import sklearn
import sklearn.preprocessing  # import the submodule explicitly instead of relying on an earlier cell

# Expand `cv` with every degree-2 term (squares and pairwise products) of its columns.
# Note: this rebinds `cv`, so re-running the cell expands the already expanded frame.
interactions = sklearn.preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
interactions.fit(cv)
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old method only on versions that predate it.
if hasattr(interactions, 'get_feature_names_out'):
    col_names = list(interactions.get_feature_names_out(cv.columns))
else:
    col_names = interactions.get_feature_names(cv.columns)

df_interact = pd.DataFrame(interactions.transform(cv), columns=col_names, index=cv.index)
df_interact = df_interact.drop(columns=cv.columns) # drop original features from df_interact
cv = pd.concat([cv, df_interact], axis=1)

In [None]:
# Display the frame including the generated degree-2 columns.
cv

In [None]:
# Correlation of every column (including the degree-2 terms) with the mean CV score, ascending.
cv.corr().cv_corr_mean.sort_values()