In [1]:
import pandas as pd
import numpy as np
import calendar
from sklearn.feature_selection import mutual_info_classif
from sklearn import metrics
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [2]:
# Load the merged dataset from its pickle file and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
dataset_path = 'data/merged_with_missing.pickle'
data = pd.read_pickle(dataset_path)
data.head()

Unnamed: 0,session_id,time,user_id,product_id,event,discount,purchase_id,made_purchase,known_product,is_user_invalid,...,duration,syntetic_duration,agg_time_spent,weekday,hour,weekday_sin,weekday_cos,hour_sin,hour_cos,ends_with_purchase
0,100001.0,2020-01-17 16:08:57,102.0,1001.0,view_product,0,,False,True,False,...,00:02:32.306757,True,00:07:44.613515,4,16,-0.433884,-0.900969,-0.866025,-0.5,False
1,100002.0,2020-01-07 05:10:41,102.0,1277.0,view_product,20,,False,True,False,...,00:03:45,False,00:05:12.306757,1,5,0.781831,0.62349,0.965926,0.258819,False
2,100002.0,2020-01-07 05:14:26,102.0,1276.0,view_product,20,,False,True,False,...,00:02:32.306757,True,00:03:45,1,5,0.781831,0.62349,0.965926,0.258819,False
3,100003.0,2020-01-16 09:37:42,102.0,1276.0,view_product,0,,False,True,False,...,00:00:40,False,00:06:17.306757,3,9,0.433884,-0.900969,0.707107,-0.707107,False
4,100003.0,2020-01-16 09:38:22,102.0,1277.0,view_product,0,,False,True,False,...,00:02:32.306757,True,00:08:57.306757,3,9,0.433884,-0.900969,0.707107,-0.707107,False


# Missing data analysis

To decide whether the missing data is an MCAR or a MAR problem, we use `mutual_info_score` from scikit-learn's `metrics` module to estimate the mutual information between each feature and each target indicator ('is_user_invalid', 'is_delivery_invalid', 'is_price_invalid', 'known_product').

Mutual information (MI) between two random variables is a non-negative value, which measures the dependency between the variables. It is equal to zero if and only if two random variables are independent, and higher values mean higher dependency.

### Invalid user info

In [3]:
# Score every column that contains no missing values against the
# is_user_invalid flag, then list the scores in ascending order.
user_invalid_flag = data['is_user_invalid']
columns = data.dropna(axis=1).columns.to_series()
user_info = columns.apply(
    lambda name: metrics.mutual_info_score(data[name], user_invalid_flag)
)
user_info.sort_values()

ends_with_purchase     0.000002
event                  0.000003
made_purchase          0.000003
known_product          0.000009
is_price_invalid       0.000014
name_2                 0.000014
syntetic_duration      0.000015
cat_3                  0.000020
cat_0                  0.000038
cat_2                  0.000055
discount               0.000057
cat_1                  0.000084
weekday_sin            0.000084
weekday                0.000084
weekday_cos            0.000084
hour_sin               0.000438
hour_cos               0.000454
hour                   0.000455
product                0.000786
duration               0.000998
is_delivery_invalid    0.004806
female                 0.057365
session_id             0.090562
agg_time_spent         0.121231
time                   0.194409
surname                0.194888
street                 0.194888
name_1                 0.194888
city                   0.194888
is_user_invalid        0.194888
dtype: float64

### Invalid delivery info

In [4]:
# Restrict the analysis to rows where made_purchase is True. The subset is
# built once and reused, rather than re-filtering the whole frame inside the
# per-column callback.
purchases = data[data.made_purchase]
delivery_invalid_flag = purchases.is_delivery_invalid

# Only columns with no missing values within the purchase subset are scored.
columns = purchases.dropna(axis=1).columns.to_series()
delivery_info = columns.apply(
    lambda c: metrics.mutual_info_score(purchases[c], delivery_invalid_flag)
)
delivery_info.sort_values()

ends_with_purchase     0.000000
name_2                 0.000000
duration               0.000000
made_purchase          0.000000
syntetic_duration      0.000000
event                  0.000000
discount               0.000531
weekday                0.000778
weekday_sin            0.000778
weekday_cos            0.000778
is_price_invalid       0.000963
cat_3                  0.002268
hour_sin               0.003119
hour_cos               0.003344
hour                   0.003380
female                 0.025391
cat_2                  0.033347
is_user_invalid        0.121829
city                   0.122716
known_product          0.142053
cat_0                  0.142116
cat_1                  0.143369
name_1                 0.143428
surname                0.152253
street                 0.154015
product                0.154724
agg_time_spent         0.220211
purchase_time          0.323026
purchase_id            0.323026
time                   0.323026
session_id             0.323026
is_deliv

### Invalid price info

In [5]:
# Score every column that contains no missing values against the
# is_price_invalid flag, then list the scores in ascending order.
price_invalid_flag = data['is_price_invalid']
columns = data.dropna(axis=1).columns.to_series()
price_info = columns.apply(
    lambda name: metrics.mutual_info_score(data[name], price_invalid_flag)
)
price_info.sort_values()

female                 0.000004
name_2                 0.000004
is_user_invalid        0.000014
ends_with_purchase     0.000063
discount               0.000066
weekday                0.000076
weekday_cos            0.000076
weekday_sin            0.000076
syntetic_duration      0.000081
is_delivery_invalid    0.000094
city                   0.000105
made_purchase          0.000174
event                  0.000174
hour_sin               0.000549
hour_cos               0.000550
hour                   0.000577
duration               0.000825
name_1                 0.001884
surname                0.003035
known_product          0.003064
street                 0.003092
cat_3                  0.008480
agg_time_spent         0.016810
cat_0                  0.027105
cat_1                  0.050821
cat_2                  0.073548
session_id             0.105790
time                   0.218753
is_price_invalid       0.219191
product                0.219191
dtype: float64

In [23]:
# Distinct product names among the rows flagged with an invalid price.
price_invalid_rows = data.loc[data['is_price_invalid']]
price_invalid_rows.set_index('product').index.unique()

Index(['call of duty 2 (pc)', 'plantronics savi w740', 'oki b840dn',
       'bioshock infinite (xbox 360)', 'jabra talk'],
      dtype='object', name='product')

The prices of these products are invalid.

### Invalid product info

In [24]:
# Score every column that contains no missing values against the
# known_product flag, then list the scores in ascending order.
known_product_flag = data['known_product']
columns = data.dropna(axis=1).columns.to_series()
product_info = columns.apply(
    lambda name: metrics.mutual_info_score(data[name], known_product_flag)
)
product_info.sort_values()

female                 7.987249e-08
syntetic_duration      4.787389e-06
is_user_invalid        9.021084e-06
ends_with_purchase     1.054449e-05
event                  1.349698e-05
made_purchase          1.349698e-05
name_2                 1.494774e-05
discount               1.598062e-05
weekday_cos            6.307612e-05
weekday_sin            6.307612e-05
weekday                6.307612e-05
city                   9.557460e-05
hour_cos               3.598095e-04
hour_sin               4.027123e-04
hour                   4.329925e-04
duration               8.714671e-04
name_1                 1.735197e-03
surname                2.923973e-03
street                 2.975770e-03
is_price_invalid       3.063506e-03
is_delivery_invalid    4.995716e-03
cat_3                  7.480535e-03
agg_time_spent         7.173252e-02
cat_2                  8.082401e-02
session_id             9.171798e-02
time                   2.000048e-01
known_product          2.004034e-01
cat_1                  2.004

# Conclusions

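The high mutual information score of the `time` feature suggests that there were random problems with the system's data aggregation module. Note, however, that a column whose values are almost all unique, such as `time`, scores close to the target's own entropy by construction (compare its score with the target's self-score in each table above), so this signal should be interpreted with caution. In most cases the invalid data seems to occur at random, and thus it can be dropped safely.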