In [1]:
import pandas as pd
import numpy as np

In [2]:
%%time
df = pd.read_csv("data/train.csv")

Wall time: 13.5 s


## Target

In [3]:
# Pull the label column out as a one-field record array and start the
# dictionary of datasets with it.
target = df[['target']].to_records(index=False)
data = dict(target=target)

## Original dataset with features

In [14]:
# Dimensions of the loaded training frame as (rows, columns).
df.shape

(200000, 202)

In [34]:
%%time
vanilla = df.drop(['ID_code', 'target'], axis=1).to_records(index=False)
print(vanilla.shape)

data['vanilla'] = vanilla

(200000,)
Wall time: 1.34 s


## Dataset sorted by density

In [5]:
def density_factor(var, _df=None, verbosity=False):
    """
    Absolute difference between the mean of a variable for target 0 and target 1.

    Parameters
    ----------
    var : str
        Name of the feature column to compare.
    _df : pandas.DataFrame, optional
        Frame containing `var` and a `target` column with values 0 and 1.
        When omitted, the notebook-level `df` is looked up at call time.
    verbosity : bool, default False
        When True, print the group shapes, the group means and the difference.

    Returns
    -------
    float
        abs(mean of `var` where target == 0 - mean of `var` where target == 1),
        rounded to 4 decimals. NaN if either group has no rows.

    Notes
    -----
    The group means are kept at full precision; only the final difference is
    rounded. Missing values in `var` are skipped by the mean.
    """
    if _df is None:
        # Resolve the notebook frame at call time. A `_df=df` default would be
        # frozen to whatever `df` was when the function was defined.
        _df = df

    var_df = _df[[var, 'target']]
    var_0 = var_df[var_df['target'] == 0]
    var_1 = var_df[var_df['target'] == 1]

    var_0_mean = var_0[var].mean()
    var_1_mean = var_1[var].mean()

    # Round once, at the end, so the 4-decimal resolution is real.
    diff = np.round(np.abs(var_0_mean - var_1_mean), 4)

    if verbosity:
        print("var:", var)
        print("Shape:", var_0.shape, var_1.shape)
        print("Mean:", var_0_mean, var_1_mean)
        print("Difference between mean of 0 and 1:", diff)

    return diff

In [6]:
%%time
features_list = df.columns[2:]
target_diff_df = pd.DataFrame(features_list, columns=['feature'])
target_diff_df['diff'] = [density_factor(var=f, _df=df) for f in features_list]

density_top_100 = target_diff_df.sort_values(by='diff', ascending=False).head(100)
density_tail_100 = target_diff_df.sort_values(by='diff', ascending=False).tail(100)

Wall time: 5.56 s


In [7]:
# Convert both ranking tables to record arrays (index dropped) in one step.
density_top_100, density_tail_100 = (
    ranking.to_records(index=False)
    for ranking in (density_top_100, density_tail_100)
)

In [8]:
# Register both ranking tables in the dictionary of datasets.
data.update(
    density_top_100=density_top_100,
    density_tail_100=density_tail_100,
)

## Transformation: Log

In [85]:
# Feature-only frame (row id and label removed), then its element-wise
# natural log, with columns laid out as in `features_list`.
temp = df.drop(columns=['ID_code', 'target'])
temp_log = pd.DataFrame(data=np.log(temp), columns=features_list)

  
  


In [86]:
# Summary statistics of the log-transformed features. Columns whose count is
# below the row count or whose mean/min is -inf contain values the log could
# not map to a finite number.
temp_log.describe()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
count,200000.0,72481.0,200000.0,199999.0,200000.0,58231.0,200000.0,200000.0,107425.0,200000.0,...,149178.0,199741.0,180104.0,158535.0,200000.0,91172.0,128351.0,200000.0,200000.0,82129.0
mean,2.324693,-inf,2.340091,1.864811,2.393902,-inf,1.674986,2.783924,-inf,2.009804,...,-inf,1.906492,-inf,1.265692,2.874267,-inf,-inf,2.181528,2.745043,-inf
std,0.304704,,0.255375,0.336458,0.150443,,0.162816,0.213609,,0.169996,...,,0.496247,,0.963259,0.179656,,,0.104829,0.201914,
min,-0.895508,-inf,0.750047,-4.122744,1.624287,-inf,0.853266,1.67704,-inf,1.378892,...,-inf,-5.115996,-inf,-9.21034,2.162679,-inf,-inf,1.785171,1.840439,-inf
25%,2.134622,0.08286948,2.165903,1.659009,2.290834,0.4960977,1.561864,2.635035,0.3491769,1.889914,...,0.9063616,1.642118,0.1973744,0.873341,2.749179,-0.7987299,1.00454,2.110553,2.626818,1.122524
50%,2.35373,0.8240439,2.358965,1.920592,2.407688,1.243693,1.683636,2.800739,1.012364,2.032035,...,1.560815,1.995136,0.7386223,1.507937,2.888033,-0.0735389,1.688896,2.184725,2.768458,1.836207
75%,2.546174,1.350849,2.527064,2.119155,2.506434,1.791134,1.792259,2.94984,1.432367,2.14995,...,2.004432,2.25288,1.1235,1.92912,3.015365,0.4749912,2.089553,2.261065,2.893961,2.315254
max,3.01136,2.339573,2.962847,2.57933,2.813695,2.847905,2.133894,3.321136,2.317602,2.411493,...,2.914571,2.816396,2.128517,2.905906,3.329658,1.452293,2.908075,2.48494,3.261134,3.349929


In [87]:
# Dimensions of the log-transformed frame as (rows, columns).
temp_log.shape

(200000, 200)

After the logarithmic transformation is done, use only the features that do not return any `nan`.

In [96]:
%%time
temp_log_is_null = temp_log.isnull().values.sum(axis=0)
temp_log_features = [features_list[i] for i, x in enumerate(temp_log_is_null) if x == 0]
log_features = temp_log[temp_log_features].to_records(index=False)

data['log_features'] = log_features

Wall time: 586 ms


## Show keys

In [97]:
# List the names of the datasets collected in the dictionary.
data.keys()

dict_keys(['density_top_100', 'log_features', 'vanilla', 'target', 'density_tail_100'])

## Save dictionary

In [98]:
%%time
np.save('data/dictionary.npy', data)

Wall time: 7.77 s


## Load dictionary
Sanity check

In [99]:
%%time
data2 = np.load('data/dictionary.npy')

Wall time: 3.81 s


In [100]:
# np.load hands back an array wrapping the saved dict; .item() unwraps it so
# the stored dataset names can be compared with the ones that were saved.
data2.item().keys()

dict_keys(['density_top_100', 'log_features', 'target', 'vanilla', 'density_tail_100'])

In [101]:
# Two ways of reaching a dataset inside the loaded object: indexing with an
# empty tuple, or calling .item(), both followed by a lookup on the dict.
# Only the value of the last expression is displayed by the notebook; the
# first line is evaluated but its result is discarded.
data2[()]['log_features']
data2.item().get('log_features')

rec.array([(2.18891235, 2.47721884, 2.43892379, 1.6329005 , 2.92459067, 1.74867798, 2.64003542, 2.17462671, 2.67890285, 1.74897374, 0.90486571, 2.59767714, 1.59546068, 2.55754509, 2.82678686, 2.41519924, 2.44899446, 2.48861643, 1.77877554, 2.55873006, 1.64356877, 2.42186035, 1.68309721, 2.31417764, 2.16042227, 1.75882297, 1.61161554, 3.15318037, 2.5012966 , 2.62736062, 3.06651221, 2.07924152, 1.91406559, 2.30171471, 2.69746774, 2.58853066, 0.36485143, 2.68852753, 1.79929765, 2.25576599, 2.64691211, 3.19305327, 1.91105248, 2.27488496, 2.4852316 , 2.62879774, 2.73736302, 2.55168485, 2.26704081, 1.88211802, 2.42181597, 2.16319658, 2.44806111, 1.39346856, 2.91872703, 2.81519313, 2.49937646, 1.67021311, 1.28534079, 2.53879491, 0.99339991, 1.70254567, 2.0466988 , 2.61915341, 2.19904514, 2.86643761, 2.91899701, 2.14750899, 2.54790492),
           (2.44239921, 2.62892041, 2.51464343, 1.726474  , 2.80540677, 2.09002286, 2.64076302, 1.69276752, 2.61741773, 2.62665936, 1.21538741, 2.63124115, 1.6