In [None]:
from xgboost import XGBRegressor, XGBClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Path of the training CSV; it is read once into `df`, which later cells reuse.
TRAINING_DATAPATH = 'data/numerai_datasets_04.04.21/numerai_training_data.csv'
df = pd.read_csv(TRAINING_DATAPATH)
# Model inputs are the columns whose name begins with "feature".
features = list(filter(lambda column: column.startswith("feature"), df))

In [None]:
import math
from sklearn.utils import class_weight

In [None]:
# Load the tournament file and keep only the rows flagged as validation.
tournament_data = pd.read_csv("data/numerai_datasets_04.04.21/numerai_tournament_data.csv")
validation_data = tournament_data.loc[tournament_data.data_type == 'validation']

# Give X_val and Y_val the same fresh 0..n-1 index. Previously only X_val was
# re-indexed, so Y_val kept the tournament file's row labels and a lookup such
# as Y_val[i] was resolved against those labels rather than by position.
X_val = validation_data[features].reset_index(drop=True)
Y_val = validation_data["target"].reset_index(drop=True)

In [None]:


def ar1(x):
    """Return the lag-1 autocorrelation of ``x``.

    ``x`` without its last element is correlated against ``x`` without its
    first element, and the off-diagonal entry of the resulting 2x2 correlation
    matrix is returned.
    """
    leading = x[:-1]
    trailing = x[1:]
    correlation_matrix = np.corrcoef(leading, trailing)
    return correlation_matrix[0, 1]

def autocorr_penalty(x):
    """Return ``sqrt(1 + 2 * sum(((n - i) / n) * p**i for i in 1..n-1))``.

    Here ``n`` is ``len(x)`` and ``p`` is the lag-1 autocorrelation ``ar1(x)``.
    """
    length = len(x)
    rho = ar1(x)
    weighted_terms = []
    for lag in range(1, length):
        # Higher lags get a smaller (length - lag) / length weight.
        weighted_terms.append(((length - lag) / length) * rho ** lag)
    return np.sqrt(1 + 2 * np.sum(weighted_terms))

def smart_sharpe(x):
    """Return ``mean(x)`` over the sample std (``ddof=1``) times ``autocorr_penalty(x)``."""
    penalised_volatility = np.std(x, ddof=1) * autocorr_penalty(x)
    return np.mean(x) / penalised_volatility

def spearmanr(target, pred):
    """Correlate targets with predictions and return the coefficient.

    The earlier implementation packed both inputs into an ``(n, 2)`` array and
    handed it to ``np.corrcoef`` with its default ``rowvar=True``. That treats
    every *row* as a variable, so ``[0, 1]`` was the correlation between the
    first two samples (and an ``n x n`` matrix was built on the way). The two
    inputs are now passed as two separate variables.

    Despite the name, no ranking is applied to either input: the value is the
    Pearson correlation of the raw values.

    Parameters
    ----------
    target, pred : array-like of numbers, equal length
        pandas Series, NumPy arrays or plain sequences.

    Returns
    -------
    float
        The correlation coefficient, or ``0`` when it is undefined, e.g. when
        one input is constant or fewer than two samples are given.

    Raises
    ------
    ValueError
        If ``target`` and ``pred`` differ in length.
    """
    target_values = np.asarray(target, dtype=float).ravel()
    pred_values = np.asarray(pred, dtype=float).ravel()
    if target_values.shape != pred_values.shape:
        raise ValueError("target and pred must have the same length")
    if target_values.size < 2:
        # A correlation needs at least two samples; treat it like the NaN case.
        return 0

    # A constant input yields 0/0 inside corrcoef; silence the warning and map
    # the resulting NaN to 0 below.
    with np.errstate(divide="ignore", invalid="ignore"):
        score = np.corrcoef(target_values, pred_values)[0, 1]

    if score == -1:
        # Diagnostic kept from the original: show the inputs behind a perfectly
        # inverted score.
        print(np.column_stack((target_values, pred_values)))
    if np.isnan(score):
        score = 0
    return score

def calcMaxAccuracy2(Y_true, Y_pred):
    """Return the fraction of positions at which ``Y_pred`` equals ``Y_true``.

    Both inputs are converted to NumPy arrays first, so elements are always
    matched by position. Indexing a pandas Series with ``series[i]`` looks up
    the *label* ``i`` instead, which fails or misaligns when the index is not
    ``0..n-1``.

    Parameters
    ----------
    Y_true, Y_pred : array-like, equal length
        Expected and predicted values.

    Returns
    -------
    float
        Number of matching positions divided by the number of samples;
        ``0.0`` for empty input (previously a ``ZeroDivisionError``).

    Raises
    ------
    ValueError
        If the two inputs differ in shape.
    """
    true_values = np.asarray(Y_true)
    pred_values = np.asarray(Y_pred)
    if true_values.shape != pred_values.shape:
        raise ValueError("Y_true and Y_pred must have the same shape")

    sample_count = pred_values.size
    if sample_count == 0:
        return 0.0

    # Integer count over integer total reproduces the original loop's result exactly.
    matches = int(np.count_nonzero(pred_values == true_values))
    return matches / sample_count

def _balanced_sample_weights(targets, classes):
    """Return one 'balanced' class weight per row of ``targets`` as a float array."""
    # `classes` and `y` are keyword-only in current scikit-learn releases.
    per_class = class_weight.compute_class_weight('balanced', classes=classes, y=targets)
    weight_of = dict(zip(classes.tolist(), per_class))
    return targets.map(weight_of).to_numpy(dtype='float')


def era_boost_train(df, features, proportion=0.5, trees_per_step=10, num_iters=200):
    """Fit an XGBClassifier, then repeatedly continue training on the worst eras.

    After an initial fit on all rows, each of the ``num_iters - 1`` iterations:
    predicts on the full frame, computes the accuracy per era, selects the eras
    whose accuracy is at or below the ``proportion`` quantile, prints and plots
    diagnostics, and refits on those eras starting from the current booster.
    Rows are weighted with 'balanced' class weights in every fit.

    Unlike the earlier version, ``df`` is not modified (no ``pred`` column is
    written into it).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``features`` columns plus ``"target"`` and ``"era"``.
        Targets are expected to take the values 0, 0.25, 0.5, 0.75 and 1.
    features : list of str
        Column names used as model inputs.
    proportion : float
        Quantile of the per-era accuracy at or below which an era counts as "worst".
    trees_per_step : int
        Amount added to ``model.n_estimators`` before each refit.
    num_iters : int
        Total number of fits, including the initial one.

    Returns
    -------
    XGBClassifier
        The model after the last refit.

    Notes
    -----
    NOTE(review): when ``xgb_model`` is passed to ``fit``, XGBoost appears to
    add ``n_estimators`` further rounds rather than train up to that total, so
    each refit may add far more than ``trees_per_step`` trees — confirm against
    the installed version before relying on the tree count.

    NOTE(review): recent XGBoost releases may require integer class labels
    ``0..k-1``; the fractional targets used here are passed through unchanged —
    verify with the installed version.
    """
    classes = np.array([0, 0.25, 0.5, 0.75, 1])
    targets = df["target"]
    target_values = targets.to_numpy()
    era_labels = df["era"].to_numpy()

    model = XGBClassifier(max_depth=5, objective='multi:softprob', learning_rate=0.01, n_estimators=1000, n_jobs=-1, colsample_bytree=0.1)
    model.fit(df[features], targets, sample_weight=_balanced_sample_weights(targets, classes))

    for iteration in range(num_iters - 1):
        print(f"iteration {iteration}")
        # score each era
        print("predicting on train")
        preds = model.predict(df[features])

        print("getting per era scores")
        # Per-era accuracy: mean of the positional hit/miss flags within each era.
        # groupby returns the eras in sorted order, which is also the plot order.
        is_correct = pd.Series(np.asarray(preds) == target_values)
        era_scores = is_correct.groupby(era_labels).mean()

        # List the worst eras from lowest to highest accuracy.
        ranked_scores = era_scores.sort_values()
        worst_eras = ranked_scores[ranked_scores <= ranked_scores.quantile(proportion)].index
        print(list(worst_eras))
        worst_df = df[df["era"].isin(worst_eras)]

        era_scores.plot(kind="bar")
        print("performance over time")
        plt.show()
        if era_scores.sum() > 0:
            print("autocorrelation")
            print(ar1(era_scores))
            print("mean correlation")
            print(np.mean(era_scores))
            print("sharpe")
            print(np.mean(era_scores)/np.std(era_scores))
            print("smart sharpe")
            print(smart_sharpe(era_scores))

        model.n_estimators += trees_per_step
        booster = model.get_booster()
        print("fitting on worst eras")

        # Class balance differs on the subset, so the weights are recomputed.
        worst_targets = worst_df["target"]
        worst_weights = _balanced_sample_weights(worst_targets, classes)
        model.fit(worst_df[features], worst_targets, sample_weight=worst_weights, xgb_model=booster)
    return model

# num_iters=20: one initial fit followed by 19 refits on the worst eras.
boost_model = era_boost_train(
    df,
    features,
    num_iters=20,
    trees_per_step=10,
    proportion=0.5,
)

In [None]:
# NOTE(review): `pred_df` is not assigned in any cell visible above, so this
# cell raises NameError on a fresh kernel — confirm where it should come from.
pred_df

In [None]:
# Split the training frame into the model inputs and the target column.
X_train, Y_train = df[features], df["target"]

In [None]:
# Accuracy on the training rows. Labels and predictions are compared as NumPy
# arrays so that elements are matched by position. The `*4` / `int()` scaling
# used here before gives the same result for the label values 0, 0.25, ..., 1,
# so both splits now share one comparison rule.
Y_train_pred = boost_model.predict(X_train)
accuracy = calcMaxAccuracy2(Y_train.to_numpy(), Y_train_pred)
print("train accuracy = ", accuracy)

# Accuracy on the validation rows. The earlier `Y_val[i]` lookup went through
# the Series index, which is positional only when that index is 0..n-1; going
# through to_numpy() removes that dependency.
y_pred = boost_model.predict(X_val)
accuracy = calcMaxAccuracy2(Y_val.to_numpy(), y_pred)
print("validation accuracy = ", accuracy)

In [None]:
# Display the training-set predictions computed in the previous cell.
Y_train_pred

In [None]:
# Pair each validation id with its predicted value and plot the distribution.
predictions_df = validation_data[["id"]].assign(pred=y_pred)
predictions_df["pred"].hist(bins=10)

In [None]:
# `training_data` is never defined in this notebook; the training frame is `df`,
# the same frame Y_train_pred was predicted from (X_train = df[features]).
# assumes the training CSV has an "id" column like the tournament file — TODO confirm
predictions_df = df["id"].to_frame()
predictions_df["pred"] = Y_train_pred
predictions_df.pred.hist(bins = 10)