In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Parquet file holding the preprocessed training table.
TRAIN_PARQUET = "../input/janestreet-preprocessing/train.parquet"
train = pd.read_parquet(TRAIN_PARQUET)
# Print column names, dtypes and memory usage of the loaded frame.
train.info()

***

In [None]:
# Keep rows from date 86 onward that carry a strictly positive weight,
# then renumber the index from zero.
# NOTE(review): the cutoff 86 is hard-coded; its rationale is not shown here — confirm.
df = (
    train
    .query("date >= 86")
    .query("weight > 0")
    .reset_index(drop=True)
)
# Number of remaining rows, expressed in millions.
print(df.shape[0] / 1e6)

In [None]:
# Binary indicators: 1 where the response is strictly positive, 0 otherwise.
# `gt(0).astype(int)` already produces a fresh frame, independent of `df`.
actions = df[["resp_1","resp_2","resp_3","resp_4","resp"]].gt(0).astype(int)
# acum = how many of the four resp_1..resp_4 indicators are 1 (range 0..4).
actions["acum"] = actions[["resp_1","resp_2","resp_3","resp_4"]].sum(axis=1)
actions

In [None]:
# Non-null count of every other column within each resp indicator group (0 / 1).
actions.groupby("resp").agg("count")

In [None]:
# Breakdown of samples whose resp indicator is 0 by how many of the
# resp_1..resp_4 indicators are also 0.
# The subset and its size are computed once instead of re-filtering the whole
# frame for every numerator and denominator.
resp0_rows = actions.query("resp == 0")
n_resp0 = len(resp0_rows)
print("# samples with resp=0 :", n_resp0)
# acum counts the indicators equal to 1, so (4 - acum) of them are 0.
for n_ones in range(5):
    n_match = int((resp0_rows["acum"] == n_ones).sum())
    print(f"% samples with resp=0 | {4 - n_ones}/4 resp_*=0: ", 100 * n_match / n_resp0)

In [None]:
# Breakdown of samples whose resp indicator is 1 by how many of the
# resp_1..resp_4 indicators are also 1.
# The subset and its size are computed once instead of re-filtering the whole
# frame for every numerator and denominator.
resp1_rows = actions.query("resp == 1")
n_resp1 = len(resp1_rows)
print("# samples with resp=1 :", n_resp1)
# acum is exactly the number of indicators equal to 1; report from 4 down to 0.
for n_ones in range(4, -1, -1):
    n_match = int((resp1_rows["acum"] == n_ones).sum())
    print(f"% samples with resp=1 | {n_ones}/4 resp_*=1: ", 100 * n_match / n_resp1)

***
### Correlation analysis

In [None]:
# Correlation between resp and resp_1
# Pearson (linear association) is printed first, then Spearman (rank association).
for corr_test in (stats.pearsonr, stats.spearmanr):
    print(corr_test(df["resp"].to_numpy(), df["resp_1"].to_numpy()))

In [None]:
# Correlation between resp and resp_2
# Pearson (linear association) is printed first, then Spearman (rank association).
for corr_test in (stats.pearsonr, stats.spearmanr):
    print(corr_test(df["resp"].to_numpy(), df["resp_2"].to_numpy()))

In [None]:
# Correlation between resp and resp_3
# Pearson (linear association) is printed first, then Spearman (rank association).
for corr_test in (stats.pearsonr, stats.spearmanr):
    print(corr_test(df["resp"].to_numpy(), df["resp_3"].to_numpy()))

In [None]:
# Correlation between resp and resp_4
# Pearson (linear association) is printed first, then Spearman (rank association).
for corr_test in (stats.pearsonr, stats.spearmanr):
    print(corr_test(df["resp"].to_numpy(), df["resp_4"].to_numpy()))

In [None]:
# pearson
# Coefficients of resp against each response column, ending with 1 for resp itself.
# NOTE(review): these literals look pasted from earlier cell outputs, presumably in
# the order resp_1, resp_2, resp_3, resp_4, resp — re-check them if the data changes.
correlations = [
    0.4512157484199811,
    0.596432491057167,
    0.8176558055871396,
    0.9561611811400158,
    1,
]
T = np.sum(correlations)
print("T:", T)
# Each coefficient as a fraction of the total T.
for share in np.asarray(correlations) / T:
    print(share)

In [None]:
# total weight of resp+resp_4
# Read the last two entries of `correlations` (resp_4 and resp) instead of retyping
# the literal, so the numerator cannot drift from the list that produced T.
(correlations[-2] + correlations[-1]) / T

In [None]:
# spearman
# Coefficients of resp against each response column, ending with 1 for resp itself.
# NOTE(review): these literals look pasted from earlier cell outputs, presumably in
# the order resp_1, resp_2, resp_3, resp_4, resp — re-check them if the data changes.
correlations = [
    0.40644028129967835,
    0.5388620969420198,
    0.7832930727088052,
    0.9390918675717485,
    1.,
]
T = np.sum(correlations)
print("T:", T)
# Each coefficient as a fraction of the total T.
for share in np.asarray(correlations) / T:
    print(share)

In [None]:
# total weight of resp+resp_4
# Read the last two entries of `correlations` (resp_4 and resp) instead of retyping
# the literal, so the numerator cannot drift from the list that produced T.
(correlations[-2] + correlations[-1]) / T

In [None]:
# Pearson correlation between the 0/1 indicators of resp and resp_1.
stats.pearsonr(actions["resp"].to_numpy(), actions["resp_1"].to_numpy())

In [None]:
# Pearson correlation between the 0/1 indicators of resp and resp_2.
stats.pearsonr(actions["resp"].to_numpy(), actions["resp_2"].to_numpy())

In [None]:
# Pearson correlation between the 0/1 indicators of resp and resp_3.
stats.pearsonr(actions["resp"].to_numpy(), actions["resp_3"].to_numpy())

In [None]:
# Pearson correlation between the 0/1 indicators of resp and resp_4.
stats.pearsonr(actions["resp"].to_numpy(), actions["resp_4"].to_numpy())

In [None]:
# Coefficients for the 0/1 indicators, ending with 1 for resp itself.
# NOTE(review): these literals look pasted from earlier cell outputs, presumably in
# the order resp_1, resp_2, resp_3, resp_4, resp — re-check them if the data changes.
correlations = [
    0.29274992128071264,
    0.3947637162946275,
    0.6256287383834749,
    0.8145742627784853,
    1,
]
T = np.sum(correlations)
print("T:", T)
# Each coefficient as a fraction of the total T.
for share in np.asarray(correlations) / T:
    print(share)

In [None]:
# Combined share of resp and resp_4 in the total T.
# Read both terms from `correlations` (its last two entries) instead of retyping
# the literal, so the numerator cannot drift from the list that produced T.
(correlations[-1] + correlations[-2]) / T

In [None]:
# Rows with resp indicator 0 where exactly one of resp_1..resp_4 is 1:
# count, per column, how often that column is the single positive one.
actions.query("resp==0 & acum==1").loc[:, ["resp_1","resp_2","resp_3","resp_4"]].eq(1).sum()

In [None]:
# Rows with resp indicator 1 where exactly one of resp_1..resp_4 is 1:
# count, per column, how often that column is 0.
# NOTE(review): the mirror image of the resp==0 cell (a single disagreeing column)
# would filter on acum==3 instead — confirm which one is intended.
actions.query("resp==1 & acum==1").loc[:, ["resp_1","resp_2","resp_3","resp_4"]].eq(0).sum()

***