In [None]:
import numpy as np
from dask.distributed import Client, LocalCluster
import dask.array as da
import dask.dataframe as dd
import pandas as pd

import paths
import math

In [None]:
# Housekeeping
# Display option: never truncate the column list when a pandas frame is shown.
pd.set_option('display.max_columns', None)

# Start a local Dask cluster with six worker processes and connect a client
# to it; later cells submit work through `client`.
cluster = LocalCluster(n_workers=6)
client = Client(cluster)

In [None]:
# Lazily read every "*.part" CSV under EngineeredTrainData into one Dask
# dataframe and discard the "Unnamed: 0" column.
# NOTE(review): "Unnamed: 0" is presumably a pandas index that was written out
# with the CSVs — confirm against the notebook that produced these files.
sg_with_ss_calcs = (
    dd.read_csv(paths.processed + "/EngineeredTrainData/*.part")
    .drop("Unnamed: 0", axis=1)
)

In [None]:
# persist() does not modify the collection in place: it returns a NEW
# collection backed by the in-memory results. Rebind the name so every later
# cell works from the persisted data instead of re-reading the CSV files.
sg_with_ss_calcs = sg_with_ss_calcs.persist()
# Keep the dataframe summary as this cell's displayed output.
sg_with_ss_calcs

In [None]:
# Peek at the first five rows as a quick sanity check of the loaded data.
sg_with_ss_calcs.head(n=5)

In [None]:
# Split the data by the T1_Win_Indicator flag and persist both halves so the
# per-column printStats calls do not have to recompute the filter each time.
#   s_dub: rows where T1_Win_Indicator == 0
#   f_dub: rows where T1_Win_Indicator == 1
s_dub = sg_with_ss_calcs[
    sg_with_ss_calcs["T1_Win_Indicator"].eq(0)
].persist()
f_dub = sg_with_ss_calcs[
    sg_with_ss_calcs["T1_Win_Indicator"].eq(1)
].persist()

In [None]:

# Function that computes the standardized difference between two column means
def printStats(column1, column2, size):
    """Return the standardized mean difference between two lazy columns.

    The value is ``(mean1 - mean2) / sqrt((var1 + var2) / 2)``: the gap
    between the two means expressed in units of the averaged standard
    deviation. No sample-size term is applied, so this is an effect size
    rather than a classical pooled t-statistic (which would additionally
    divide by ``sqrt(1/size + 1/size)``).

    Despite its name, this function prints nothing.

    Args:
        column1: object whose ``mean()`` and ``var()`` each return something
            with a ``compute()`` method (e.g. a Dask Series).
        column2: second column, same protocol as ``column1``.
        size: currently unused; kept so existing callers keep working.

    Returns:
        The statistic as a float, or ``None`` when it is undefined: both
        variances are zero, a variance is NaN, or the result is otherwise
        not finite.
    """
    mean1 = column1.mean().compute()
    var1 = column1.var().compute()

    mean2 = column2.mean().compute()
    var2 = column2.var().compute()

    pooled_stdev = math.sqrt((var1 + var2) / 2)

    # A zero or NaN spread (e.g. constant columns) makes the ratio undefined.
    # Returning None instead of raising ZeroDivisionError / yielding inf lets
    # callers skip the column with a simple None check.
    if pooled_stdev == 0 or math.isnan(pooled_stdev):
        return None

    t = (mean1 - mean2) / pooled_stdev

    # NaN means (e.g. an all-missing column) also produce an unusable value.
    if not math.isfinite(t):
        return None

    return t
     
    

In [None]:

# Columns from this position onward are the ones that get scored.
# NOTE(review): presumably the first 42 columns are identifiers / raw inputs
# and the rest are engineered features — confirm against the dataset schema.
FIRST_FEATURE_COL = 42
# A column is kept only if the absolute value of its statistic exceeds this.
MIN_ABS_STAT = 0.6

size = len(f_dub)
cols = f_dub.columns[FIRST_FEATURE_COL:]

# Using Dask futures to calculate the per-column statistics in parallel.
# NOTE(review): printStats calls .compute() from inside a worker task; Dask's
# "launch tasks from tasks" guidance warns this can tie up worker threads.
# If this cell ever stalls, consider one dask.compute over all columns.
futures = [client.submit(printStats, f_dub[s], s_dub[s], size) for s in cols]

# Collect results in column order, keeping only statistics whose magnitude is
# above the threshold. printStats may signal "no usable value" with None.
feature_set = []
for col, future in zip(cols, futures):
    real_stat = future.result()
    if real_stat is not None and abs(real_stat) > MIN_ABS_STAT:
        feature_set.append((col, real_stat))

# Sorting by largest absolute statistic first (stable for ties)
feature_set.sort(key=lambda e: abs(e[1]), reverse=True)

print("T-Statistics")
print("Features: ", len(feature_set))
for name, value in feature_set:
    print(name, ": ", value)
