# Imports

In [None]:
# standard library
import json
import os

# Numerai API
from numerapi import NumerAPI

# data
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# machine learning models
from sklearn.linear_model import LinearRegression

# other
from tqdm import tqdm

# Download data

In [None]:
# API client, constructed without explicit arguments.
napi = NumerAPI()
round = napi.get_current_round()
# NOTE(review): 695 is a hard-coded offset from round number to era number —
# confirm it still matches the dataset's era numbering.
era = round + 695

filenames = napi.list_datasets()

# (remote dataset name, local destination) pairs, fetched in this order.
# The live file name is tagged with the current round.
for remote_name, local_path in (
    ('v4/features.json', '../data/features.json'),
    ('v4/train_int8.parquet', '../data/train.parquet'),
    ('v4/validation_int8.parquet', '../data/validation.parquet'),
    ('v4/live_int8.parquet', f'../data/live_{round}.parquet'),
):
    napi.download_dataset(remote_name, local_path)
del remote_name, local_path

# Features

In [None]:
# Load the feature metadata JSON downloaded alongside the datasets.
with open('../data/features.json') as f:
    FEATURE_METADATA = json.load(f)
del f

# Feature name lists: the full set comes from the 'feature_stats' keys, the
# others are the named subsets stored under 'feature_sets'.
FEATURES_L = list(FEATURE_METADATA['feature_stats'])
FEATURES_M, FEATURES_S, FEATURES_2, FEATURES_3, FEATURES_N = (
    FEATURE_METADATA['feature_sets'][set_name]
    for set_name in (
        'medium',
        'small',
        'v2_equivalent_features',
        'v3_equivalent_features',
        'fncv3_features',
    )
)

# Column names used throughout the notebook.
ERA, DATA = 'era', 'data_type'
Y_TRUE, Y_PRED, Y_RANK = 'target_nomi_v4_20', 'target_prediction', 'prediction'

# Model inputs (currently the full feature list) and the columns read from disk.
X_COLS = FEATURES_L
COLUMNS = [ERA, DATA, *X_COLS, Y_TRUE]

# Per-era regression parameters: one weight per feature, then the intercept.
ITC = 'intercept'
COEFS = [*X_COLS, ITC]

# Dataframes

In [None]:
# Read only the columns the notebook needs from the training set.
df = pd.read_parquet(path='../data/train.parquet', columns=COLUMNS)
# Store the era as a 32-bit integer so it can be used as a numeric regressor.
df[ERA] = df[ERA].astype(np.int32)

# Compute coefs of linear regression by era

In [None]:
def coefs_linreg(df):
    """Fit a linear regression on the given rows and return its parameters.

    The model regresses ``Y_TRUE`` on ``X_COLS``. The returned one-row
    DataFrame has one column per feature (its fitted weight), an ``ITC``
    column with the intercept, and a ``'corr'`` column holding the Pearson
    correlation between the target and the percentile rank of the model's
    predictions on these same rows.
    """
    features = df[X_COLS]
    target = df[Y_TRUE]

    reg = LinearRegression().fit(features, target)

    # Rank the in-sample predictions as percentiles before correlating.
    ranked = pd.Series(reg.predict(features)).rank(pct=True)

    # Each value is a single-element list so the dict becomes a one-row frame.
    row = {name: [weight] for name, weight in zip(X_COLS, reg.coef_)}
    row[ITC] = [reg.intercept_]
    row['corr'] = [np.corrcoef(target, ranked)[0, 1]]
    return pd.DataFrame(row)

In [None]:
# Fit one regression per era. Each group produces a one-row frame, so the
# result's index has the era as its first level.
df_coefs = df.groupby(ERA).apply(coefs_linreg)
# Take the era numbers from the group keys instead of from row position, so
# they stay correct even when the eras present are not exactly 1..N
# (e.g. gaps or a subsampled training set).
df_coefs[ERA] = df_coefs.index.get_level_values(0).to_numpy()

# Do a linear regression to predict the coefs as a function of the era

In [None]:
# Design matrix: the era number as a single column, shape (n_eras, 1).
X = df_coefs[[ERA]].to_numpy()
# Targets: every feature weight plus the intercept, one row per era.
y = df_coefs[COEFS].to_numpy()

# Fit a linear trend over era for all parameters at once (multi-output fit).
coef_predictor = LinearRegression().fit(X, y)

In [None]:
# Store the fitted trend next to the observed values, under 'pred_<name>'.
pred_coefs = [f'pred_{name}' for name in COEFS]
df_coefs[pred_coefs] = coef_predictor.predict(X)

# Predict coefficients for current era. Make final predictions

In [None]:
# Extrapolate the parameter trend to the current era (a single-row query).
coef_predictions = coef_predictor.predict(np.array([[era]]))
# The columns follow COEFS: all feature weights first, intercept last.
w = np.array(coef_predictions[0, :-1])
b = coef_predictions[0, -1]

In [None]:
# Load this round's live data and score it with the extrapolated linear model.
df_liv = pd.read_parquet(path=f'../data/live_{round}.parquet', columns=COLUMNS)
df_liv[Y_PRED] = df_liv[X_COLS].dot(w) + b

# Plots for coefs as a function of the era

In [None]:
# savefig does not create missing directories, so make sure the output
# folder exists before writing the first figure.
os.makedirs('figures', exist_ok=True)

# One figure per parameter: the per-era fitted value against its linear trend.
for c in tqdm(COEFS):
    fig, ax = plt.subplots()
    ax.plot(df_coefs[ERA], df_coefs[c], label='coefs')
    ax.plot(df_coefs[ERA], df_coefs['pred_' + c], label='linreg')
    ax.set_xlabel('era')
    ax.set_ylabel('coef')
    ax.set_title(f'coef for {c} as function of era')
    ax.legend()

    fig.savefig(f'figures/{c}.png')
    # Close each figure explicitly so figures do not accumulate in memory.
    plt.close(fig)