# Feature Engineering and Exploration

This notebook creates monthly features for each user and then explores which combination of temporal attributes is best to retain for the final data matrix.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from functools import reduce

In [2]:
# Transaction-level tables (historical and new-merchant purchases).
trx = pd.read_csv('../data/historical_transactions.csv')
new_merchant_trx = pd.read_csv('../data/new_merchant_transactions.csv')

# Merchant table and the train / test card tables.
merchants = pd.read_csv('../data/merchants.csv')
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Tag each transaction with the file it came from, so the two transaction
# tables can still be told apart once they are concatenated.
trx['new'] = False
new_merchant_trx['new'] = True

We need to de-normalize the purchase amounts and loyalty scores, following the work of [raddar](https://www.kaggle.com/raddar/towards-de-anonymizing-the-data-some-insights).

In [10]:
# Constants of the linear transform that is inverted below (taken from the
# analysis linked in the markdown cell above).
PURCHASE_AMOUNT_SCALE = 0.00150265118
PURCHASE_AMOUNT_OFFSET = 497.06


def _denormalize_purchase_amount(amount):
    """Return `amount / scale + offset`, rounded to 2 decimals."""
    return np.round(amount / PURCHASE_AMOUNT_SCALE + PURCHASE_AMOUNT_OFFSET, 2)


# Apply the same inverse transform to both transaction tables.
new_merchant_trx['purchase_amount_new'] = _denormalize_purchase_amount(
    new_merchant_trx['purchase_amount']
)
trx['purchase_amount_new'] = _denormalize_purchase_amount(trx['purchase_amount'])

# NOTE(review): presumably `target` is a log2-scaled loyalty score, so
# 2 ** target recovers the raw value -- confirm against the linked analysis.
train['target_raw'] = 2 ** train['target']

In [11]:
# Draw a reproducible sample of 1000 distinct cards.
#
# `np.random.choice` reads NumPy's global random state, which is independent
# of Python's `random` module, so NumPy has to be seeded as well; seeding only
# `random` leaves the sample different on every run. The `random` seed is kept
# in case later cells draw from the stdlib generator.
random.seed(903)
np.random.seed(903)
sample = np.random.choice(trx.card_id.unique(), size=1000, replace=False)

# Sanity check: number of distinct sampled cards that appear in `trx`.
trx[trx.card_id.isin(sample)].card_id.unique().size

1000

In [12]:
# Keep only the sampled cards' rows from each transaction table, then stack
# historical rows on top of new-merchant rows (original row labels are kept).
sampled_historical = trx[trx['card_id'].isin(sample)]
sampled_new_merchant = new_merchant_trx[new_merchant_trx['card_id'].isin(sample)]
trx_sample = pd.concat([sampled_historical, sampled_new_merchant])
trx_sample.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,new,purchase_amount_new
10570,Y,C_ID_9c199928d9,60,N,0,A,307,M_ID_666290188f,-1,-0.671775,2018-01-09 16:17:19,1.0,16,19,False,50.0
10571,Y,C_ID_9c199928d9,11,N,0,A,690,M_ID_e0f8a4be8b,0,-0.71687,2018-02-15 22:10:54,2.0,23,1,False,19.99
10572,Y,C_ID_9c199928d9,11,N,0,A,544,M_ID_8cfd16febe,-6,-0.476731,2017-08-12 12:43:37,2.0,23,29,False,179.8
10573,Y,C_ID_9c199928d9,11,N,0,A,705,M_ID_b692f6f5d1,-2,-0.455138,2017-12-30 20:35:03,2.0,23,33,False,194.17
10574,Y,C_ID_9c199928d9,69,N,0,A,879,M_ID_00a6ca8a8a,-1,-0.698823,2018-01-30 13:10:03,1.0,9,29,False,32.0


In [13]:
# Cumulative count of distinct merchants per card, by month_lag:
#   1. sort so the earliest month_lag of every (card, merchant) pair comes first,
#   2. keep only that first occurrence,
#   3. count first occurrences per (card, month_lag),
#   4. pivot month_lag into columns and accumulate left to right.
(
    trx_sample
    .sort_values(['card_id', 'month_lag'])
    .drop_duplicates(['card_id', 'merchant_id'])
    .groupby(['card_id', 'month_lag'])['merchant_id']
    .size()
    .unstack()
    .fillna(0)
    .cumsum(axis=1)
    .head()
)

month_lag,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
C_ID_000c778b05,0.0,0.0,0.0,0.0,0.0,1.0,5.0,7.0,8.0,12.0,14.0,16.0,16.0,18.0,18.0,18.0
C_ID_00136926d8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31.0,40.0,63.0,85.0,95.0
C_ID_002198cdf1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,13.0,19.0,23.0,42.0,51.0,57.0,62.0,72.0
C_ID_007ad8000a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,4.0,4.0,5.0
C_ID_007d884232,4.0,4.0,6.0,8.0,8.0,8.0,9.0,10.0,11.0,14.0,15.0,18.0,18.0,23.0,23.0,23.0


In [19]:
# One row per (card, merchant) pair, carrying the earliest month_lag in which
# that card transacted with that merchant.
monthly_new_merchants = (
    trx_sample
    .sort_values(['card_id', 'month_lag'])
    .drop_duplicates(['card_id', 'merchant_id'])
    [['card_id', 'month_lag', 'merchant_id']]
)
monthly_new_merchants.head(15)

Unnamed: 0,card_id,month_lag,merchant_id
28379126,C_ID_000c778b05,-8,M_ID_e19e124dda
28379109,C_ID_000c778b05,-7,M_ID_c1a7024db1
28379120,C_ID_000c778b05,-7,M_ID_f92c0bfe89
28379123,C_ID_000c778b05,-7,M_ID_c745609933
28379128,C_ID_000c778b05,-7,M_ID_eb5c15699e
28379119,C_ID_000c778b05,-6,M_ID_c89e1bbe83
28379124,C_ID_000c778b05,-6,M_ID_fb8b483978
28379118,C_ID_000c778b05,-5,M_ID_d5ed70c9b4
28379110,C_ID_000c778b05,-4,M_ID_c7796182d7
28379111,C_ID_000c778b05,-4,M_ID_508ecbdcde


In [113]:
# Nested lookup {card_id: {month_lag: [merchant_id, ...]}} of the merchants each
# card used for the first time in each month. Months in which a card has no
# first-time merchant are filled with '' (an empty string, not an empty list).
new_merch_dict = (
    monthly_new_merchants
    .groupby(['card_id', 'month_lag'])['merchant_id']
    .apply(list)
    .unstack(level='card_id')
    .fillna('')
    .to_dict()
)

In [114]:
# Show only the first two cards: the full dict has one entry per sampled card
# and would flood the notebook output.
{card: new_merch_dict[card] for card in list(new_merch_dict)[:2]}

{'C_ID_000c778b05': {-13: '',
  -12: '',
  -11: '',
  -10: '',
  -9: '',
  -8: ['M_ID_e19e124dda'],
  -7: ['M_ID_c1a7024db1',
   'M_ID_f92c0bfe89',
   'M_ID_c745609933',
   'M_ID_eb5c15699e'],
  -6: ['M_ID_c89e1bbe83', 'M_ID_fb8b483978'],
  -5: ['M_ID_d5ed70c9b4'],
  -4: ['M_ID_c7796182d7',
   'M_ID_508ecbdcde',
   'M_ID_2857277b5c',
   'M_ID_72cb7913d3'],
  -3: ['M_ID_03bce12537', 'M_ID_ac1db31b2b'],
  -2: ['M_ID_c3e75d50fb', 'M_ID_462252c479'],
  -1: '',
  0: ['M_ID_60c9775184', 'M_ID_dcc7de5216'],
  1: '',
  2: ''},
 'C_ID_00136926d8': {-13: '',
  -12: '',
  -11: '',
  -10: '',
  -9: '',
  -8: '',
  -7: '',
  -6: '',
  -5: '',
  -4: '',
  -3: '',
  -2: ['M_ID_98227dab01',
   'M_ID_f5e44ecb50',
   'M_ID_ceddb97df2',
   'M_ID_bf2d99f08e',
   'M_ID_6a818966f1',
   'M_ID_23e4a4ff5a',
   'M_ID_50f575c681',
   'M_ID_9023e3101d',
   'M_ID_18f3c665fe',
   'M_ID_23b313c34d',
   'M_ID_3a6c7625ad',
   'M_ID_71d0981a4b',
   'M_ID_2fd0706c4d',
   'M_ID_00a6ca8a8a',
   'M_ID_fea5e4d04c',
   'M_ID

In [117]:
def f(group, merch_lookup=None):
    """Sum `purchase_amount_new` spent at merchants that are new to the card this month.

    Intended to be used with ``groupby(['card_id', 'month_lag']).apply(f)``, so
    every row of `group` is expected to share one `card_id` and one `month_lag`;
    only the first row is consulted for those keys.

    Parameters
    ----------
    group : pandas.DataFrame
        Transactions with the columns `card_id`, `month_lag`, `merchant_id`
        and `purchase_amount_new`.
    merch_lookup : dict, optional
        Nested mapping ``{card_id: {month_lag: [merchant_id, ...]}}``. An entry
        may also be ``''`` (or be absent) to mean "no new merchants". Defaults
        to the notebook-level `new_merch_dict`.

    Returns
    -------
    float
        Total `purchase_amount_new` over the rows whose `merchant_id` is listed
        for this card and month; 0.0 when nothing is listed or `group` is empty.
    """
    if merch_lookup is None:
        merch_lookup = new_merch_dict

    # Nothing to look up (and no first row to read keys from) in an empty group.
    if group.empty:
        return 0.0

    card_id = group['card_id'].iloc[0]
    month_lag = group['month_lag'].iloc[0]

    # A card or month missing from the lookup means "no new merchants" rather
    # than an error; the '' placeholder written by fillna('') means the same.
    new_merch = merch_lookup.get(card_id, {}).get(month_lag)
    if new_merch is None or isinstance(new_merch, str):
        new_merch = []

    is_new = group['merchant_id'].isin(list(new_merch))
    return group.loc[is_new, 'purchase_amount_new'].sum()

In [133]:
# Per (card, month_lag): total spend, and the part of it spent at merchants the
# card had not used in any earlier month (computed by `f`).
grouped = trx_sample.groupby(['card_id', 'month_lag'])
groups = grouped['purchase_amount_new'].sum().to_frame('amt_total')
groups['amt_new'] = grouped.apply(f)
groups

Unnamed: 0_level_0,Unnamed: 1_level_0,amt_total,amt_new
card_id,month_lag,Unnamed: 2_level_1,Unnamed: 3_level_1
C_ID_000c778b05,-8,49.52,49.52
C_ID_000c778b05,-7,167.71,167.71
C_ID_000c778b05,-6,359.98,359.98
C_ID_000c778b05,-5,36.37,36.37
C_ID_000c778b05,-4,120.59,120.59
C_ID_000c778b05,-3,511.67,511.67
C_ID_000c778b05,-2,359.78,359.78
C_ID_000c778b05,-1,68.92,0.00
C_ID_000c778b05,0,249.60,135.35
C_ID_00136926d8,-2,11971.84,11971.84
