In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy.stats import rankdata
import pickle
import lightgbm as lgb

In [2]:
def get_features(dataframe, unimportant=set()):
    feature_columns = dataframe.columns
    columns_to_remove = []
    for feature in feature_columns:
        if not feature.startswith('feature_'):
            columns_to_remove.append(feature)
    feature_columns = feature_columns.drop(columns_to_remove)
    return sorted(feature_columns)

def get_X_array(df, feature_columns):
    return df.loc[:, feature_columns].values

def get_Y_array(df):
    return df.loc[:, 'target_kazutsugi'].values


In [3]:
feature_columns_final = pickle.load(open('214_feature_columns_final_q01.pickle', 'rb'))

In [4]:
from sklearn.preprocessing import MinMaxScaler
def _neutralize(df, columns, by, proportion=1.0):
    scores = df[columns]
    exposures = df[by].values
    scores = scores - (proportion * exposures).dot(np.linalg.pinv(exposures).dot(scores))
    return scores / scores.std()
def _normalize(df):
    X = (df.rank(method="first") - 0.5) / len(df)
    return scipy.stats.norm.ppf(X)
def normalize_and_neutralize(df, columns, by, proportion=1.0):
    # Convert the scores to a normal distribution
    df[columns] = _normalize(df[columns])
    df[columns] = _neutralize(df, columns, by, proportion)
    return df[columns]

In [5]:
TOURNAMENT_NAME = "kazutsugi"
TARGET_NAME = "target_kazutsugi"
PREDICTION_NAME = "prediction_kazutsugi"

BENCHMARK = 0
BAND = 0.2


# Submissions are scored by spearman correlation
def score(df):
    # method="first" breaks ties based on order in array
    return np.corrcoef(
        df[TARGET_NAME],
        df[PREDICTION_NAME].rank(pct=True, method="first")
    )[0,1]


# The payout function
def payout(scores):
    return ((scores - BENCHMARK)/BAND).clip(lower=-1, upper=1)


In [6]:
cur_folder = 'numerai_233'

In [7]:
import subprocess
indatafile = open('%s/numerai_tournament_data.csv' % (cur_folder, ), 'r')
p1file = open('%s/part1.csv' % (cur_folder, ), 'w')
all_lines = indatafile.readlines()
total_size = len(all_lines)
first_half = int(1691469 / 2)
for ln in all_lines[:first_half]:
    p1file.write(ln)
p1file.close()
p2file = open('%s/part2.csv' % (cur_folder, ), 'w')
p2file.write(all_lines[0])
for ln in all_lines[first_half:]:
    p2file.write(ln)
p2file.close()

In [8]:
mdl_0 = pickle.load(open('210model_remove03.pickle', 'rb'))

usef_columns = pickle.load(open('210features_remove03.pickle', 'rb'))
neut_columns = pickle.load(open('210features_neut03.pickle', 'rb'))

df2 = pd.read_csv('%s/part1.csv' % (cur_folder, ))
print(df2.shape)

df2[PREDICTION_NAME] = mdl_0.predict(get_X_array(df2, usef_columns))
validation_data = df2[df2.data_type == "validation"]
validation_correlations3 = validation_data.groupby("era").apply(score)
print(f"On validation the correlation has mean {validation_correlations3.mean()} and std {validation_correlations3.std()}")
print(f"On validation the average per-era payout is {payout(validation_correlations3).mean()}")

df2["preds_neutralized"] = df2.groupby("era").apply(
    lambda x: normalize_and_neutralize(x, [PREDICTION_NAME], feature_columns_final, 1.0) # neutralize by 50% within each era
)
scaler = MinMaxScaler()
df2["preds_neutralized"] = scaler.fit_transform(df2[["preds_neutralized"]]) # transform back to 0-1

df2[PREDICTION_NAME] = df2["preds_neutralized"]

validation_data = df2[df2.data_type == "validation"]
validation_correlations4 = validation_data.groupby("era").apply(score)
print(f"On validation the correlation has mean {validation_correlations4.mean()} and std {validation_correlations4.std()}")
print(f"On validation the average per-era payout is {payout(validation_correlations4).mean()}")

ff_final = []
ff_final.insert(0, 'id')
ff_final.insert(0, 'era')
ff_final.insert(0, 'data_type')
ff_final.insert(0, 'target_kazutsugi')
ff_final.insert(0, 'prediction_kazutsugi')

df2 = df2[ff_final]


df2 = df2.set_index("id")
df2[PREDICTION_NAME].to_csv('%s/kazutsugi' % (cur_folder, ) + '_submission_part1.csv', header='s')

df2 = pd.read_csv('%s/part2.csv' % (cur_folder, ))
print(df2.shape)

df2[PREDICTION_NAME] = mdl_0.predict(get_X_array(df2, usef_columns))
validation_data = df2[df2.data_type == "validation"]
validation_correlations3 = validation_data.groupby("era").apply(score)
print(f"On validation the correlation has mean {validation_correlations3.mean()} and std {validation_correlations3.std()}")
print(f"On validation the average per-era payout is {payout(validation_correlations3).mean()}")

df2["preds_neutralized"] = df2.groupby("era").apply(
    lambda x: normalize_and_neutralize(x, [PREDICTION_NAME], feature_columns_final, 1.0) # neutralize by 50% within each era
)
scaler = MinMaxScaler()
df2["preds_neutralized"] = scaler.fit_transform(df2[["preds_neutralized"]]) # transform back to 0-1

df2[PREDICTION_NAME] = df2["preds_neutralized"]

validation_data = df2[df2.data_type == "validation"]
validation_correlations4 = validation_data.groupby("era").apply(score)
print(f"On validation the correlation has mean {validation_correlations4.mean()} and std {validation_correlations4.std()}")
print(f"On validation the average per-era payout is {payout(validation_correlations4).mean()}")

ff_final = []
ff_final.insert(0, 'id')
ff_final.insert(0, 'era')
ff_final.insert(0, 'data_type')
ff_final.insert(0, 'target_kazutsugi')
ff_final.insert(0, 'prediction_kazutsugi')

df2 = df2[ff_final]


df2 = df2.set_index("id")
df2[PREDICTION_NAME].to_csv('%s/kazutsugi' % (cur_folder, ) + '_submission_part2.csv', header='s')

p1file = open('%s/kazutsugi_submission_part1.csv' % (cur_folder, ), 'r')
p2file = open('%s/kazutsugi_submission_part2.csv' % (cur_folder, ), 'r')
poutfile = open('%s/kazutsugi_submission.csv' % (cur_folder, ), 'w')
for ln in p1file.readlines():
    poutfile.write(ln)
for ln in p2file.readlines()[1:]:
    poutfile.write(ln)
poutfile.close()

df1 = pd.read_csv('%s/kazutsugi_submission.csv' % (cur_folder, ))

df1['prediction_kazutsugi'] = df1['prediction_kazutsugi'] - df1['prediction_kazutsugi'].min()
df1['prediction_kazutsugi'] = df1['prediction_kazutsugi'] / df1['prediction_kazutsugi'].max()

df1 = df1.set_index("id")
df1['prediction_kazutsugi'].to_csv('%s/kazutsugi' % (cur_folder, ) + '_jackerparker6.csv', header='s')


(845733, 314)
On validation the correlation has mean 0.040312616405054034 and std 0.02408954258205016
On validation the average per-era payout is 0.20156308202527018
On validation the correlation has mean 0.038254395585296475 and std 0.022160438957453385
On validation the average per-era payout is 0.1912719779264824
(812848, 314)
On validation the correlation has mean 0.017075857728340953 and std 0.02906686178417882
On validation the average per-era payout is 0.08537928864170478
On validation the correlation has mean 0.02194948957558552 and std 0.02512039143759398
On validation the average per-era payout is 0.10974744787792759


In [13]:
len(usef_columns), len(neut_columns), len(feature_columns_final)

(196, 119, 49)

In [15]:
len(set(usef_columns).intersection(set(neut_columns))), len(set(usef_columns).intersection(set(neut_columns)))

119

In [None]:
mdl_0 = pickle.load(open('210model_remove03.pickle', 'rb'))

usef_columns = pickle.load(open('210features_remove03.pickle', 'rb'))
neut_columns = pickle.load(open('210features_neut03.pickle', 'rb'))

df2 = pd.read_csv('%s/part1.csv' % (cur_folder, ))
print(df2.shape)

df2[PREDICTION_NAME] = mdl_0.predict(get_X_array(df2, usef_columns))
validation_data = df2[df2.data_type == "validation"]
validation_correlations3 = validation_data.groupby("era").apply(score)
print(f"On validation the correlation has mean {validation_correlations3.mean()} and std {validation_correlations3.std()}")
print(f"On validation the average per-era payout is {payout(validation_correlations3).mean()}")

# df2["preds_neutralized"] = df2.groupby("era").apply(
#     lambda x: normalize_and_neutralize(x, [PREDICTION_NAME], feature_columns_final, 1.0) # neutralize by 50% within each era
# )
# scaler = MinMaxScaler()
# df2["preds_neutralized"] = scaler.fit_transform(df2[["preds_neutralized"]]) # transform back to 0-1

# df2[PREDICTION_NAME] = df2["preds_neutralized"]

# validation_data = df2[df2.data_type == "validation"]
# validation_correlations4 = validation_data.groupby("era").apply(score)
# print(f"On validation the correlation has mean {validation_correlations4.mean()} and std {validation_correlations4.std()}")
# print(f"On validation the average per-era payout is {payout(validation_correlations4).mean()}")

ff_final = []
ff_final.insert(0, 'id')
ff_final.insert(0, 'era')
ff_final.insert(0, 'data_type')
ff_final.insert(0, 'target_kazutsugi')
ff_final.insert(0, 'prediction_kazutsugi')

df2 = df2[ff_final]


df2 = df2.set_index("id")
df2[PREDICTION_NAME].to_csv('%s/kazutsugi' % (cur_folder, ) + '_submission_part1.csv', header='s')

df2 = pd.read_csv('%s/part2.csv' % (cur_folder, ))
print(df2.shape)

df2[PREDICTION_NAME] = mdl_0.predict(get_X_array(df2, usef_columns))
validation_data = df2[df2.data_type == "validation"]
validation_correlations3 = validation_data.groupby("era").apply(score)
print(f"On validation the correlation has mean {validation_correlations3.mean()} and std {validation_correlations3.std()}")
print(f"On validation the average per-era payout is {payout(validation_correlations3).mean()}")

# df2["preds_neutralized"] = df2.groupby("era").apply(
#     lambda x: normalize_and_neutralize(x, [PREDICTION_NAME], feature_columns_final, 1.0) # neutralize by 50% within each era
# )
# scaler = MinMaxScaler()
# df2["preds_neutralized"] = scaler.fit_transform(df2[["preds_neutralized"]]) # transform back to 0-1

# df2[PREDICTION_NAME] = df2["preds_neutralized"]

# validation_data = df2[df2.data_type == "validation"]
# validation_correlations4 = validation_data.groupby("era").apply(score)
# print(f"On validation the correlation has mean {validation_correlations4.mean()} and std {validation_correlations4.std()}")
# print(f"On validation the average per-era payout is {payout(validation_correlations4).mean()}")

ff_final = []
ff_final.insert(0, 'id')
ff_final.insert(0, 'era')
ff_final.insert(0, 'data_type')
ff_final.insert(0, 'target_kazutsugi')
ff_final.insert(0, 'prediction_kazutsugi')

df2 = df2[ff_final]


df2 = df2.set_index("id")
df2[PREDICTION_NAME].to_csv('%s/kazutsugi' % (cur_folder, ) + '_submission_part2.csv', header='s')

p1file = open('%s/kazutsugi_submission_part1.csv' % (cur_folder, ), 'r')
p2file = open('%s/kazutsugi_submission_part2.csv' % (cur_folder, ), 'r')
poutfile = open('%s/kazutsugi_submission.csv' % (cur_folder, ), 'w')
for ln in p1file.readlines():
    poutfile.write(ln)
for ln in p2file.readlines()[1:]:
    poutfile.write(ln)
poutfile.close()

df1 = pd.read_csv('%s/kazutsugi_submission.csv' % (cur_folder, ))

df1['prediction_kazutsugi'] = df1['prediction_kazutsugi'] - df1['prediction_kazutsugi'].min()
df1['prediction_kazutsugi'] = df1['prediction_kazutsugi'] / df1['prediction_kazutsugi'].max()

df1 = df1.set_index("id")
df1['prediction_kazutsugi'].to_csv('%s/kazutsugi' % (cur_folder, ) + '_jackerparker3.csv', header='s')


In [9]:
mdl_0 = pickle.load(open('210model_remove03.pickle', 'rb'))

usef_columns = pickle.load(open('210features_remove03.pickle', 'rb'))
neut_columns = pickle.load(open('210features_neut03.pickle', 'rb'))

df2 = pd.read_csv('%s/part1.csv' % (cur_folder, ))
print(df2.shape)

df2[PREDICTION_NAME] = mdl_0.predict(get_X_array(df2, usef_columns))
validation_data = df2[df2.data_type == "validation"]
validation_correlations3 = validation_data.groupby("era").apply(score)
print(f"On validation the correlation has mean {validation_correlations3.mean()} and std {validation_correlations3.std()}")
print(f"On validation the average per-era payout is {payout(validation_correlations3).mean()}")

df2["preds_neutralized"] = df2.groupby("era").apply(
    lambda x: normalize_and_neutralize(x, [PREDICTION_NAME], neut_columns, 1.0) # neutralize by 50% within each era
)
scaler = MinMaxScaler()
df2["preds_neutralized"] = scaler.fit_transform(df2[["preds_neutralized"]]) # transform back to 0-1

df2[PREDICTION_NAME] = df2["preds_neutralized"]

validation_data = df2[df2.data_type == "validation"]
validation_correlations4 = validation_data.groupby("era").apply(score)
print(f"On validation the correlation has mean {validation_correlations4.mean()} and std {validation_correlations4.std()}")
print(f"On validation the average per-era payout is {payout(validation_correlations4).mean()}")

ff_final = []
ff_final.insert(0, 'id')
ff_final.insert(0, 'era')
ff_final.insert(0, 'data_type')
ff_final.insert(0, 'target_kazutsugi')
ff_final.insert(0, 'prediction_kazutsugi')

df2 = df2[ff_final]


df2 = df2.set_index("id")
df2[PREDICTION_NAME].to_csv('%s/kazutsugi' % (cur_folder, ) + '_submission_part1.csv', header='s')

df2 = pd.read_csv('%s/part2.csv' % (cur_folder, ))
print(df2.shape)

df2[PREDICTION_NAME] = mdl_0.predict(get_X_array(df2, usef_columns))
validation_data = df2[df2.data_type == "validation"]
validation_correlations3 = validation_data.groupby("era").apply(score)
print(f"On validation the correlation has mean {validation_correlations3.mean()} and std {validation_correlations3.std()}")
print(f"On validation the average per-era payout is {payout(validation_correlations3).mean()}")

df2["preds_neutralized"] = df2.groupby("era").apply(
    lambda x: normalize_and_neutralize(x, [PREDICTION_NAME], neut_columns, 1.0) # neutralize by 50% within each era
)
scaler = MinMaxScaler()
df2["preds_neutralized"] = scaler.fit_transform(df2[["preds_neutralized"]]) # transform back to 0-1

df2[PREDICTION_NAME] = df2["preds_neutralized"]

validation_data = df2[df2.data_type == "validation"]
validation_correlations4 = validation_data.groupby("era").apply(score)
print(f"On validation the correlation has mean {validation_correlations4.mean()} and std {validation_correlations4.std()}")
print(f"On validation the average per-era payout is {payout(validation_correlations4).mean()}")

ff_final = []
ff_final.insert(0, 'id')
ff_final.insert(0, 'era')
ff_final.insert(0, 'data_type')
ff_final.insert(0, 'target_kazutsugi')
ff_final.insert(0, 'prediction_kazutsugi')

df2 = df2[ff_final]


df2 = df2.set_index("id")
df2[PREDICTION_NAME].to_csv('%s/kazutsugi' % (cur_folder, ) + '_submission_part2.csv', header='s')

p1file = open('%s/kazutsugi_submission_part1.csv' % (cur_folder, ), 'r')
p2file = open('%s/kazutsugi_submission_part2.csv' % (cur_folder, ), 'r')
poutfile = open('%s/kazutsugi_submission.csv' % (cur_folder, ), 'w')
for ln in p1file.readlines():
    poutfile.write(ln)
for ln in p2file.readlines()[1:]:
    poutfile.write(ln)
poutfile.close()

df1 = pd.read_csv('%s/kazutsugi_submission.csv' % (cur_folder, ))

df1['prediction_kazutsugi'] = df1['prediction_kazutsugi'] - df1['prediction_kazutsugi'].min()
df1['prediction_kazutsugi'] = df1['prediction_kazutsugi'] / df1['prediction_kazutsugi'].max()

df1 = df1.set_index("id")
df1['prediction_kazutsugi'].to_csv('%s/kazutsugi' % (cur_folder, ) + '_jackerparker2.csv', header='s')


(845733, 314)
On validation the correlation has mean 0.040312616405054034 and std 0.02408954258205016
On validation the average per-era payout is 0.20156308202527018
On validation the correlation has mean 0.029215425636591134 and std 0.01804659818969637
On validation the average per-era payout is 0.14607712818295565
(812848, 314)
On validation the correlation has mean 0.017075857728340953 and std 0.02906686178417882
On validation the average per-era payout is 0.08537928864170478
On validation the correlation has mean 0.02617351996274575 and std 0.014085384958865058
On validation the average per-era payout is 0.13086759981372875


In [7]:
df2 = pd.read_csv('%s/kazutsugi' % (cur_folder, ) + '_jackerparker2.csv')
df6 = pd.read_csv('%s/kazutsugi' % (cur_folder, ) + '_jackerparker6.csv')

In [8]:
df10 = df2.copy()
df10['prediction_kazutsugi'] = rankdata(df2['prediction_kazutsugi']) + rankdata(df6['prediction_kazutsugi'])
df10 = df10.set_index("id")
df10['prediction_kazutsugi'] = df10['prediction_kazutsugi'] - df10['prediction_kazutsugi'].min()
df10['prediction_kazutsugi'] = df10['prediction_kazutsugi'] / df10['prediction_kazutsugi'].max()
df10.to_csv('%s/kazutsugi' % (cur_folder, ) + '_jackerparker10.csv', header='s')