# Imports

In [None]:
# data
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# machine learning
import sklearn
from sklearn import metrics, linear_model, svm
import xgboost
import lightgbm

# other
import gc
import json
from tqdm import tqdm
from itertools import product
import functools
import random
from numerapi import NumerAPI
from timeit import default_timer
import re
from scipy.stats import spearmanr
import time

# save variables
import pickle
import joblib

# my utils
from my_utils import *

# Download data

In [None]:
# Open a Numerai API client and ask it for the current round number.
napi = NumerAPI()
current_round = napi.get_current_round()

# filenames = napi.list_datasets()

# Download both live files (regular and int8); the second argument of each
# call carries the round number in its file name.
live_downloads = (
    ('v4/live.parquet', f'v4/live_{current_round}.parquet'),
    ('v4/live_int8.parquet', f'v4/live_int8_{current_round}.parquet'),
)
for dataset_name, target_path in live_downloads:
    napi.download_dataset(dataset_name, target_path)

# Dataframes

## Features and columns to read

- feature sets: `all`, `small`, `medium`, `v2_equivalent_features`, `v3_equivalent_features`, `fncv3_features`
- feature groups: `features_all[0:210]`, `features_all[210:420]`, `features_all[420:630]`, `features_all[630:840]`, `features_all[840:1050]`, `features_all[1050:1191]`

In [None]:
# with open('v4/features.json', 'r') as f:
#     feature_metadata = json.load(f)

# features_all = list(feature_metadata['feature_stats'].keys())
# features_small = feature_metadata['feature_sets']['small']
# features_medium = feature_metadata['feature_sets']['medium']
# features_v2 = feature_metadata['feature_sets']['v2_equivalent_features']
# features_v3 = feature_metadata['feature_sets']['v3_equivalent_features']
# features_fncv3 = feature_metadata['feature_sets']['fncv3_features']

# features = features_all
# target = 'target_nomi_v4_20'
# n_features = len(features)

# read_columns = ['era', 'data_type'] + features + [target] 

# df_feature_metadata = pd.DataFrame(feature_metadata['feature_stats'])
# df_feature_metadata

## Train

In [None]:
# df_train = pd.read_parquet('v4/train_int8.parquet', columns=read_columns)
# df_train['era'] = df_train['era'].astype('int32')
# df_train.info(memory_usage='deep')
# df_train

## Validation + Test

In [None]:
# df_validation = pd.read_parquet('v4/validation_int8.parquet', columns=read_columns)
# df_validation['era'] = df_validation['era'].astype('int32')
# df_validation.info(memory_usage='deep')
# df_validation

In [None]:
# df_test = df_validation.loc[df_validation['data_type'] == 'test']
# df_test.info(memory_usage='deep')
# df_test

## Live

In [None]:
# df_live = pd.read_parquet('v4/live_int8.parquet', columns=read_columns)
# df_live.info(memory_usage='deep')
# df_live

# Analyse data

## Number of examples as a function of the era

In [None]:
# x_train = df_train.groupby('era').size().index.values
# y_train = df_train.groupby('era').size().values
# x_validation = df_validation[df_validation.data_type == 'validation'].groupby('era').size().index.values
# y_validation = df_validation[df_validation.data_type == 'validation'].groupby('era').size().values
# x_test = df_validation[df_validation.data_type == 'test'].groupby('era').size().index.values
# y_test = df_validation[df_validation.data_type == 'test'].groupby('era').size().values

# fig, ax = plt.subplots()
# ax.plot(x_train, y_train, label='train')
# ax.plot(x_validation, y_validation, label='validation')
# ax.plot(x_test, y_test, label='test')
# ax.set_xlabel('era')
# ax.set_ylabel('number of examples')
# ax.legend()

## Feature correlation heatmap

In [None]:
# feature_correlations = df_train[df_train.era==1][features].corr()
# plt.figure(figsize = (8,8))
# plt.imshow(feature_correlations)
# for a in [210, 420, 630, 840, 1050]:
#     plt.axvline(a, color = 'orange')
#     plt.axhline(a, color = 'orange')

# feature_groups = [features[0:210],
#                   features[210:420],
#                   features[420:630],
#                   features[840:1050],
#                   features[1050:-1]]

## Correlation of feature with target as a function of the era

In [None]:
# eras_train = df_train.era.unique()
# target_correlations = np.array([np.corrcoef(df_train[df_train.era == e][[target] + features].T)[0, 1:] for e in eras_train])
# target_correlations = pd.DataFrame(target_correlations)
# target_correlations.rename(columns = dict(enumerate(features)), inplace = True)
# target_correlations.insert(0, 'era', eras_train)
# joblib.dump(target_correlations, 'saved-variables/target_correlations.pkl')
# target_correlations = joblib.load('saved-variables/target_correlations.pkl')

In [None]:
# x = target_correlations['era']
# y = target_correlations['feature_untidy_withdrawn_bargeman']

# fig, ax = plt.subplots()
# ax.plot(x, y)
# ax.set_xlabel('era')
# ax.set_ylabel('correlation with target')

# Test models

In this section we test many models without tuning hyperparameters (i.e., we just use the defaults for each model). The goal is to identify which models look the most promising. For each model we want to consider the time to train, as well as the performance metrics listed below.

Performance metrics:

- Pearson correlation (`np.corrcoef`)
- rank correlation / Spearman correlation (`scipy.stats.spearmanr`)
- `sklearn.metrics.r2_score`
- `sklearn.metrics.mean_squared_error`

Models worth trying at first

- `sklearn.linear_model.LinearRegression()`
- `sklearn.linear_model.LogisticRegression()`
- `sklearn.linear_model.SGDRegressor()` (Stochastic Gradient Descent regressor)
- `sklearn.linear_model.Lasso()`
- `sklearn.linear_model.ElasticNet()`
- `sklearn.linear_model.Ridge()`
- `sklearn.svm.SVR(kernel='rbf')` (Support Vector Machine / Regression)
- `sklearn.svm.SVR(kernel='linear')`
- `lightgbm.LGBMRegressor()`
- `xgboost.XGBRegressor()`

Ensembles

- `sklearn.ensemble.RandomForestRegressor()`
- `sklearn.ensemble.ExtraTreesRegressor()`
- `sklearn.ensemble.BaggingRegressor()`
- `sklearn.ensemble.AdaBoostRegressor()`
- `sklearn.ensemble.GradientBoostingRegressor()`

`BlockRegressor` class

In [None]:
class BlockRegressor:
    """Average of independent base models fitted on a 4 x 6 grid of data blocks.

    Model ``(i, j)`` is fitted on ``X_block(df, i, j)`` against ``y_rows(df, i)``
    and predicts from ``X_cols(df, j)``. Those helpers, together with ``TARGET``,
    ``era_subsample`` and ``string_from_class``, are defined outside this class.
    NOTE(review): from their usage ``i`` appears to pick an era subsample and
    ``j`` a feature-column group -- confirm against their definitions.

    Attributes:
        base_regressor: Zero-argument callable (typically an estimator class)
            used to create every base model with its default settings.
        times: ``(4, 6)`` float array with the wall-clock fit time, in seconds,
            of each base model (zeros until ``fit`` is called).
        base_models: ``(4, 6)`` object array holding the base models.
    """

    # Grid dimensions. They have to agree with the partition implemented by the
    # external X_block / y_rows / X_cols helpers.
    N_ROW_BLOCKS = 4
    N_COL_BLOCKS = 6

    def __init__(self, base_regressor):
        """Create one untrained base model per block.

        Args:
            base_regressor: Callable invoked without arguments, once per block.
        """
        self.base_regressor = base_regressor
        shape = (self.N_ROW_BLOCKS, self.N_COL_BLOCKS)
        self.times = np.zeros(shape)
        # Fill a pre-allocated object array rather than calling np.array() on a
        # nested list: np.array() inspects the elements and may build a deeper
        # (or non-object) array when the instances behave like sequences.
        self.base_models = np.empty(shape, dtype=object)
        for i, j in self._blocks():
            self.base_models[i, j] = base_regressor()

    def _blocks(self):
        """Iterate over all ``(i, j)`` block indices, ``j`` varying fastest."""
        return product(range(self.N_ROW_BLOCKS), range(self.N_COL_BLOCKS))

    def _targets_and_predictions(self, df, y_pred):
        """Return ``(y_true, y_pred)``, predicting from ``df`` if ``y_pred`` is None."""
        y_true = df[TARGET].to_numpy()
        if y_pred is None:
            y_pred = self.predict(df)
        return y_true, y_pred

    @staticmethod
    def _era_rank(eras, y_pred):
        """Percentile-rank ``y_pred`` within each era, keeping the input row order.

        ``GroupBy.rank`` is used instead of ``GroupBy.apply`` because the row
        order (and index) returned by ``apply`` depends on the pandas version,
        which could silently misalign the ranks with the targets.
        """
        frame = pd.DataFrame({'era': eras, 'y': y_pred})
        return frame.groupby('era')['y'].rank(pct=True, method='first').to_numpy()

    @staticmethod
    def _metrics_row(name, i, j, fit_time, y_true, y_pred, r_pred):
        """Build one result row; ``r_pred`` holds the per-era ranks of ``y_pred``."""
        return {
            'model': name,
            'i': i,
            'j': j,
            'time': fit_time,
            'r2': metrics.r2_score(y_true, y_pred),
            'corr': np.corrcoef(y_true, y_pred)[0, 1],
            'r_corr': spearmanr(y_true, y_pred)[0],
            'n_corr': np.corrcoef(y_true, r_pred)[0, 1],
            'mse': metrics.mean_squared_error(y_true, y_pred),
        }

    def fit(self, df):
        """Fit every base model on its block and record each fit time.

        Args:
            df: DataFrame accepted by ``X_block`` and ``y_rows``.

        Returns:
            ``self``, so calls can be chained.
        """
        for i in range(self.N_ROW_BLOCKS):
            # The targets depend only on the row block, so fetch them once.
            y = y_rows(df, i)
            for j in range(self.N_COL_BLOCKS):
                X = X_block(df, i, j)
                # Only the fit call itself is timed, not the block extraction.
                t0 = default_timer()
                self.base_models[i, j].fit(X, y)
                self.times[i, j] = default_timer() - t0
        return self

    def predict(self, df):
        """Return the unweighted mean of all base-model predictions for ``df``."""
        y_sum = 0
        for i, j in self._blocks():
            # Every model of column group j is applied to all rows of df.
            y_sum += self.base_models[i, j].predict(X_cols(df, j))
        return y_sum / self.base_models.size

    def score(self, df, y_pred = None):
        """Return the R^2 score of the predictions against ``df[TARGET]``."""
        y_true, y_pred = self._targets_and_predictions(df, y_pred)
        return metrics.r2_score(y_true, y_pred)

    def corr(self, df, y_pred = None):
        """Return the Pearson correlation between target and predictions."""
        y_true, y_pred = self._targets_and_predictions(df, y_pred)
        return np.corrcoef(y_true, y_pred)[0, 1]

    def r_corr(self, df, y_pred = None):
        """Return the Spearman rank correlation between target and predictions."""
        y_true, y_pred = self._targets_and_predictions(df, y_pred)
        return spearmanr(y_true, y_pred)[0]

    def n_corr(self, df, y_pred = None):
        """Return the Pearson correlation between target and per-era ranked predictions."""
        y_true, y_pred = self._targets_and_predictions(df, y_pred)
        return np.corrcoef(y_true, self._era_rank(df.era, y_pred))[0, 1]

    def mse(self, df, y_pred = None):
        """Return the mean squared error of the predictions."""
        y_true, y_pred = self._targets_and_predictions(df, y_pred)
        return metrics.mean_squared_error(y_true, y_pred)

    def to_dataframe(self, df, y_pred = None):
        """Collect fit times and metrics for the ensemble and for every base model.

        Args:
            df: DataFrame to evaluate on; must contain ``era`` and ``TARGET``.
            y_pred: Optional precomputed ensemble predictions for ``df``;
                computed with ``predict`` when omitted.

        Returns:
            DataFrame with columns ``model, i, j, time, r2, corr, r_corr,
            n_corr, mse``. The first row describes the whole ensemble
            (``i = j = -1``, ``time`` is the total fit time); the remaining 24
            rows describe base model ``(i, j)`` evaluated on its own block.
        """
        y_true, y_pred = self._targets_and_predictions(df, y_pred)
        name = string_from_class(self.base_regressor)

        rows = [
            self._metrics_row(
                name, -1, -1, self.times.sum(),
                y_true, y_pred, self._era_rank(df.era, y_pred),
            )
        ]

        # First and last era *by position*. Plain ``df['era'][0]`` / ``[-1]`` are
        # label lookups: ``[-1]`` raises KeyError on an integer index and only
        # works on other indexes through a deprecated positional fallback.
        e0 = df['era'].iloc[0]
        e1 = df['era'].iloc[-1]

        for i in range(self.N_ROW_BLOCKS):
            y_true_i = y_rows(df, i)
            # Eras of the rows in row block i. This assumes X_block / y_rows
            # select exactly the rows whose era is in era_subsample(e0, e1, i).
            eras_i = df.loc[df.era.isin(era_subsample(e0, e1, i)), 'era']
            for j in range(self.N_COL_BLOCKS):
                y_pred_ij = self.base_models[i, j].predict(X_block(df, i, j))
                rows.append(
                    self._metrics_row(
                        name, i, j, self.times[i, j],
                        y_true_i, y_pred_ij, self._era_rank(eras_i, y_pred_ij),
                    )
                )

        columns = ['model', 'i', 'j', 'time', 'r2', 'corr', 'r_corr', 'n_corr', 'mse']
        return pd.DataFrame(rows, columns=columns)


`sklearn.linear_model.LinearRegression()`

In [None]:
# c = linear_model.LinearRegression

# df = pd.read_parquet('v4/train_int8.parquet', columns=READ_COLUMNS)
# df['era'] = df['era'].astype('int32')

# model = BlockRegressor(c)
# model.fit(df)

# y_pred = model.predict(df)
# results = model.to_dataframe(df, y_pred)

# joblib.dump(results, 'saved-variables/target_correlations.pkl')
# results.to_parquet(f'saved-variables/{string_from_class(c)}.parquet')

`sklearn.linear_model.LogisticRegression()`

This does not work because `LogisticRegression` is actually a classifier.

`sklearn.linear_model.SGDRegressor()` (Stochastic Gradient Descent regressor)

In [None]:
# Benchmark SGDRegressor; BlockRegressor instantiates it with default settings.
c = linear_model.SGDRegressor

# Load the training data and store the era as a 32-bit integer.
df = pd.read_parquet('v4/train_int8.parquet', columns=READ_COLUMNS)
df['era'] = df['era'].astype('int32')

model = BlockRegressor(c)
model.fit(df)

# NOTE: the metrics are in-sample -- the model is scored on the frame it was
# fitted on. The predictions are computed once and reused by to_dataframe.
y_pred = model.predict(df)
results = model.to_dataframe(df, y_pred)

# Save under a per-model name. The pickle used to be written to
# 'saved-variables/target_correlations.pkl', so every model run overwrote the
# same, unrelated file.
joblib.dump(results, f'saved-variables/{string_from_class(c)}.pkl')
results.to_parquet(f'saved-variables/{string_from_class(c)}.parquet')

`sklearn.linear_model.Lasso()`

In [None]:
# Benchmark Lasso; BlockRegressor instantiates it with default settings.
c = linear_model.Lasso

# Load the training data and store the era as a 32-bit integer.
df = pd.read_parquet('v4/train_int8.parquet', columns=READ_COLUMNS)
df['era'] = df['era'].astype('int32')

model = BlockRegressor(c)
model.fit(df)

# NOTE: the metrics are in-sample -- the model is scored on the frame it was
# fitted on. The predictions are computed once and reused by to_dataframe.
y_pred = model.predict(df)
results = model.to_dataframe(df, y_pred)

# Save under a per-model name. The pickle used to be written to
# 'saved-variables/target_correlations.pkl', so every model run overwrote the
# same, unrelated file.
joblib.dump(results, f'saved-variables/{string_from_class(c)}.pkl')
results.to_parquet(f'saved-variables/{string_from_class(c)}.parquet')

`sklearn.linear_model.ElasticNet()`

In [None]:
# Benchmark ElasticNet; BlockRegressor instantiates it with default settings.
c = linear_model.ElasticNet

# Load the training data and store the era as a 32-bit integer.
df = pd.read_parquet('v4/train_int8.parquet', columns=READ_COLUMNS)
df['era'] = df['era'].astype('int32')

model = BlockRegressor(c)
model.fit(df)

# NOTE: the metrics are in-sample -- the model is scored on the frame it was
# fitted on. The predictions are computed once and reused by to_dataframe.
y_pred = model.predict(df)
results = model.to_dataframe(df, y_pred)

# Save under a per-model name. The pickle used to be written to
# 'saved-variables/target_correlations.pkl', so every model run overwrote the
# same, unrelated file.
joblib.dump(results, f'saved-variables/{string_from_class(c)}.pkl')
results.to_parquet(f'saved-variables/{string_from_class(c)}.parquet')

`sklearn.linear_model.Ridge()`

In [None]:
# Benchmark Ridge; BlockRegressor instantiates it with default settings.
c = linear_model.Ridge

# Load the training data and store the era as a 32-bit integer.
df = pd.read_parquet('v4/train_int8.parquet', columns=READ_COLUMNS)
df['era'] = df['era'].astype('int32')

model = BlockRegressor(c)
model.fit(df)

# NOTE: the metrics are in-sample -- the model is scored on the frame it was
# fitted on. The predictions are computed once and reused by to_dataframe.
y_pred = model.predict(df)
results = model.to_dataframe(df, y_pred)

# Save under a per-model name. The pickle used to be written to
# 'saved-variables/target_correlations.pkl', so every model run overwrote the
# same, unrelated file.
joblib.dump(results, f'saved-variables/{string_from_class(c)}.pkl')
results.to_parquet(f'saved-variables/{string_from_class(c)}.parquet')

`sklearn.svm.SVR(kernel='rbf')` (Support Vector Machine / Regression)

`sklearn.svm.SVR(kernel='linear')`

`lightgbm.LGBMRegressor()`

`xgboost.XGBRegressor()`