In [1]:
import pandas as pd
import numpy as  np

from TargetEncoderv2 import TargetEncoder
from FeatureSelector import FeatureSelector

from sklearn.metrics import *
from sklearn.model_selection import *

import lightgbm as lgb
import logging

In [3]:
#Read train and test with features
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }

train = pd.read_csv("../input/train_featureset1_v2.csv", dtype=dtypes, compression='gzip')
test = pd.read_csv("../input/test_featureset1_v2.csv", dtype=dtypes, compression='gzip')

In [4]:
from sklearn.feature_selection import *
feats = ['app','device','os','channel','ip_hour_day_count','ip_count','app_count',
         'device_count','os_count','channel_count','hourofday_count','ip_app_count',
         'app_device_count','app_os_count','app_channel_count','app_hourofday_count',
         'device_hourofday_count','os_hourofday_count','channel_hourofday_count',
         'channel_os_count','ip_mean','app_mean','device_mean','os_mean','channel_mean',
         'hourofday_mean','ip_app_mean','app_device_mean','app_os_mean','app_channel_mean',
         'app_hourofday_mean','device_hourofday_mean','os_hourofday_mean','channel_hourofday_mean','channel_os_mean']

In [14]:
for col in feats:
    print("ANOVA F Score of feature {} is {}".format(col, f_classif(train[col].fillna(train[col].mean()).reshape(-1,1), train['is_attributed'])))
    print("Chi Score for feature {} is {}".format(col, chi2(train[col].fillna(train[col].mean()).reshape(-1,1), train['is_attributed'])))
    if ((train[col].mean() - test[col].mean())/train[col].std() >= 0.1):
        print("WARNING - CHECK THIS FEATURE OUT MORE CLOSELY")
    print("Stats for train")
    print(train[col].fillna(train[col].mean()).describe())
    print("Stats for test")
    print(test[col].fillna(train[col].mean()).describe())

  
  This is separate from the ipykernel package so we can avoid doing imports until


ANOVA F Score of feature app is (array([416.79503491]), array([1.87951837e-92]))
Chi Score for feature app is (array([7691.23260593]), array([0.]))
Stats for train
count    100000.00000
mean         12.04788
std          14.94150
min           1.00000
25%           3.00000
50%          12.00000
75%          15.00000
max         551.00000
Name: app, dtype: float64
Stats for test
count    1.879047e+07
mean     1.221480e+01
std      1.164924e+01
min      0.000000e+00
25%      3.000000e+00
50%      1.200000e+01
75%      1.800000e+01
max      5.210000e+02
Name: app, dtype: float64
ANOVA F Score of feature device is (array([0.04833755]), array([0.82598218]))
Chi Score for feature device is (array([149.67133928]), array([2.04549138e-34]))
Stats for train
count    100000.000000
mean         21.771250
std         259.667767
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
max        3867.000000
Name: device, dtype: float64
Stats for test


  
  This is separate from the ipykernel package so we can avoid doing imports until


count    1.879047e+07
mean     1.730513e+00
std      2.597038e+01
min      0.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      3.031000e+03
Name: device, dtype: float64
ANOVA F Score of feature os is (array([3.82365233]), array([0.05053675]))
Chi Score for feature os is (array([524.40885328]), array([4.64884628e-116]))
Stats for train
count    100000.000000
mean         22.818280
std          55.943136
min           0.000000
25%          13.000000
50%          18.000000
75%          19.000000
max         866.000000
Name: os, dtype: float64
Stats for test


  
  This is separate from the ipykernel package so we can avoid doing imports until


count    1.879047e+07
mean     1.873312e+01
std      1.135059e+01
min      0.000000e+00
25%      1.300000e+01
50%      1.800000e+01
75%      1.900000e+01
max      6.040000e+02
Name: os, dtype: float64
ANOVA F Score of feature channel is (array([54.50797756]), array([1.5601407e-13]))
Chi Score for feature channel is (array([3408.98126278]), array([0.]))
Stats for train
count    100000.000000
mean        268.832460
std         129.724248
min           3.000000
25%         145.000000
50%         258.000000
75%         379.000000
max         498.000000
Name: channel, dtype: float64
Stats for test


  
  This is separate from the ipykernel package so we can avoid doing imports until


count    1.879047e+07
mean     2.648059e+02
std      1.355254e+02
min      0.000000e+00
25%      1.350000e+02
50%      2.360000e+02
75%      4.010000e+02
max      4.980000e+02
Name: channel, dtype: float64
ANOVA F Score of feature ip_hour_day_count is (array([0.67452452]), array([0.41148054]))
Chi Score for feature ip_hour_day_count is (array([1.84423201]), array([0.17445574]))
Stats for train
count    100000.000000
mean          1.493280
std           2.020593
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max          28.000000
Name: ip_hour_day_count, dtype: float64
Stats for test


  
  This is separate from the ipykernel package so we can avoid doing imports until


count    1.879047e+07
mean     1.133618e+03
std      4.505620e+03
min      1.000000e+00
25%      6.200000e+01
50%      1.310000e+02
75%      3.120000e+02
max      4.023100e+04
Name: ip_hour_day_count, dtype: float64
ANOVA F Score of feature ip_count is (array([2.82765282]), array([0.09265681]))
Chi Score for feature ip_count is (array([651.62248223]), array([9.91411445e-144]))
Stats for train
count    100000.000000
mean         28.337405
std          80.810724
min           1.000000
25%           2.000000
50%           7.000000
75%          28.337405
max         647.000000
Name: ip_count, dtype: float64
Stats for test
count    1.879047e+07
mean     3.316915e+01
std      9.558080e+01
min      1.000000e+00
25%      3.000000e+00
50%      8.000000e+00
75%      2.833741e+01
max      6.690000e+02
Name: ip_count, dtype: float64
ANOVA F Score of feature app_count is (array([425.07722911]), array([3.01227795e-94]))
Chi Score for feature app_count is (array([1449444.59780162]), array([0.]))
Stat

count    1.879047e+07
mean     6.213384e+02
std      5.523803e+02
min      1.000000e+00
25%      1.150000e+02
50%      2.800000e+02
75%      1.180000e+03
max      1.459000e+03
Name: os_hourofday_count, dtype: float64
ANOVA F Score of feature channel_hourofday_count is (array([86.63635126]), array([1.32954006e-20]))
Chi Score for feature channel_hourofday_count is (array([13054.05388586]), array([0.]))
Stats for train
count    100000.000000
mean        125.600253
std         137.627037
min           1.000000
25%          40.000000
50%          81.000000
75%         156.000000
max         662.000000
Name: channel_hourofday_count, dtype: float64
Stats for test
count    1.879047e+07
mean     9.855884e+01
std      7.254250e+01
min      1.000000e+00
25%      4.300000e+01
50%      7.700000e+01
75%      1.380000e+02
max      6.880000e+02
Name: channel_hourofday_count, dtype: float64
ANOVA F Score of feature channel_os_count is (array([44.82555475]), array([2.16525639e-11]))
Chi Score for featu

count    1.879047e+07
mean     3.345597e-03
std      2.567090e-02
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: app_hourofday_mean, dtype: float64
ANOVA F Score of feature device_hourofday_mean is (array([2536.05503698]), array([0.]))
Chi Score for feature device_hourofday_mean is (array([150.46280995]), array([1.37341408e-34]))
Stats for train
count    100000.000000
mean          0.002064
std           0.011204
min           0.000000
25%           0.001033
50%           0.001513
75%           0.001954
max           1.000000
Name: device_hourofday_mean, dtype: float64
Stats for test
count    1.879047e+07
mean     2.913513e-03
std      1.601988e-02
min      0.000000e+00
25%      8.770007e-04
50%      1.366120e-03
75%      1.929012e-03
max      1.000000e+00
Name: device_hourofday_mean, dtype: float64
ANOVA F Score of feature os_hourofday_mean is (array([3032.89981233]), array([0.]))
Chi Score for feature os_hourofday_m

In [None]:
#remove 
#ip_app_count
#ip_mean
#hourofday_mean
#ip_app_mean