In [326]:
import numpy as np
import pandas as pd
import scipy
from scipy.stats import norm

## Make a Toy DataFrame Containing Prediction Data

In [327]:
# Universe of tickers; the frame is indexed by ticker label.
tickers = list("ABCDE")

# Toy predictions with no entry for "D" -- a missing ticker!
predictions = pd.Series([0.4, 0.2, 0.8, 0.3], index=["A", "B", "C", "E"])

# Reindexing onto the full universe leaves the missing ticker as NaN.
df = predictions.reindex(tickers).to_frame(name="preds")
df

Unnamed: 0,preds
A,0.4
B,0.2
C,0.8
D,
E,0.3


## Preprocessing

In [328]:
# Rank the raw predictions into (0, 1): the i-th smallest of n valid
# predictions maps to (i - 0.5) / n, which centers the ranks on 0.5.
# Series.rank() leaves NaN predictions as NaN, so the denominator must count
# only the non-missing predictions. Counting on the "preds" column itself
# (instead of len(df.dropna())) keeps n correct even if df carries other
# columns with missing values, because DataFrame.dropna() drops a row when
# ANY column is NaN.
n_valid_preds = df["preds"].count()
df["ranked_preds"] = (df["preds"].rank() - 0.5) / n_valid_preds
df

Unnamed: 0,preds,ranked_preds
A,0.4,0.625
B,0.2,0.125
C,0.8,0.875
D,,
E,0.3,0.375


In [329]:
# Sanity check: the ranked predictions should be centered on 0.5.
ranked_mean = df["ranked_preds"].mean()
ranked_median = df["ranked_preds"].median()
print(f"mean and median: \n{ranked_mean}\n{ranked_median}")

mean and median: 
0.5
0.5


In [330]:
# The ranked predictions are centered on 0.5, so 0.5 is a neutral stand-in
# for tickers that had no prediction.
rank_is_missing = df["ranked_preds"].isna()
df["ranked_preds"] = df["ranked_preds"].mask(rank_is_missing, 0.5)
df

Unnamed: 0,preds,ranked_preds
A,0.4,0.625
B,0.2,0.125
C,0.8,0.875
D,,0.5
E,0.3,0.375


## Neutralization

In [331]:
# Two toy features to neutralize against. Values are listed in row order
# of df (one value per ticker).
toy_features = {
    "feature1": [-0.2, 0.3, 0.1, -0.5, 0.3],
    "feature2": [0.3, 0.1, -0.2, -0.3, 0.1],
}
for feature_name, feature_values in toy_features.items():
    df[feature_name] = feature_values

In [332]:
# Re-rank into a uniform distribution over every row of df. This differs
# from the first ranking because the 0.5 placeholders for missing
# predictions now take part, so the denominator is the full row count.
n_rows = len(df)
df["ranked_preds2"] = df["ranked_preds"].rank().sub(0.5).div(n_rows)

In [333]:
# Gaussianize: push the uniform ranks through the standard-normal inverse
# CDF (percent-point function) to make the data more natural for the
# linear neutralization step.
# `norm` is imported explicitly from scipy.stats: a bare `import scipy` does
# not guarantee that the `scipy.stats` submodule is loaded, and on older
# SciPy releases `scipy.stats.norm` then fails with AttributeError.
df["gaussianized_preds"] = norm.ppf(df["ranked_preds2"])
# Single-column DataFrame (not a Series) of the gaussianized predictions.
scores = df[["gaussianized_preds"]]
df

Unnamed: 0,preds,ranked_preds,feature1,feature2,ranked_preds2,gaussianized_preds
A,0.4,0.625,-0.2,0.3,0.7,0.524401
B,0.2,0.125,0.3,0.1,0.1,-1.281552
C,0.8,0.875,0.1,-0.2,0.9,1.281552
D,,0.5,-0.5,-0.3,0.5,0.0
E,0.3,0.375,0.3,0.1,0.3,-0.524401


In [334]:
# Feature exposures as a plain 2-D array: one row per ticker, one column
# per feature.
exposures = df[["feature1", "feature2"]].to_numpy()
gaussianized = df["gaussianized_preds"].to_numpy()

# Least-squares coefficients of the predictions on the features, obtained
# through the pseudo-inverse of the exposure matrix.
feature_betas = np.linalg.pinv(exposures).dot(gaussianized)

# Subtract the part of the predictions explained by the features.
df["neutralized_preds"] = df["gaussianized_preds"] - exposures.dot(feature_betas)

In [335]:
# Display the current state of the frame.
df

Unnamed: 0,preds,ranked_preds,feature1,feature2,ranked_preds2,gaussianized_preds,neutralized_preds
A,0.4,0.625,-0.2,0.3,0.7,0.524401,0.54901
B,0.2,0.125,0.3,0.1,0.1,-1.281552,-0.944742
C,0.8,0.875,0.1,-0.2,0.9,1.281552,1.235272
D,,0.5,-0.5,-0.3,0.5,0.0,-0.651949
E,0.3,0.375,0.3,0.1,0.3,-0.524401,-0.187591


In [336]:
# After neutralization the exposure to the features should be ~0; this cell
# spot-checks the correlation with feature1 only.
feature1_corr_matrix = np.corrcoef(df["feature1"], df["neutralized_preds"])
feature1_corr_matrix[0, 1]

7.357446759471349e-17

In [337]:
# How correlated the neutralized predictions stay with the original ranked
# predictions depends on how neutral to the features the originals already
# were.
original_corr_matrix = np.corrcoef(df["neutralized_preds"], df["ranked_preds"])
original_corr_matrix[0, 1]

0.9124255165448361

## Get the Final Correlation with the Target

In [338]:
# Toy target values, listed in row order of df (one value per ticker).
target_values = [0.4, 0.2, 0.8, 1.0, 0.6]
df["target"] = target_values

In [339]:
# Rank once more in case neutralization produced a strange distribution
# (for example if one of the features was submitted exactly).
# method="first" breaks ties by order of appearance, although none should
# remain at this point.
final_ranks = df["neutralized_preds"].rank(method="first")

# Pearson correlation between the final ranks and the target.
score = np.corrcoef(final_ranks, df["target"])[0, 1]
print("final score:", score)

final score: 0.30000000000000004
