In [1]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import os
import time
import warnings
import gc
gc.collect()
import os
from six.moves import urllib
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
warnings.filterwarnings('ignore')
%matplotlib inline
plt.style.use('seaborn')
from scipy.stats import norm, skew
from sklearn.preprocessing import StandardScaler

In [2]:
# to make this notebook's output stable across runs
np.random.seed(123)
gc.collect()
# To plot pretty figures
%matplotlib inline
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [3]:
#Reduce the memory usage - by Panchajanya Banerjee
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column is left untouched. The frame is modified
    in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.
    use_float16 : bool, default True
        When true (the original behaviour) float columns may be stored as
        float16, which keeps only about three significant decimal digits.
        Pass False to make float32 the narrowest float dtype considered.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = {'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith('int'):
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column that reaches exactly the dtype's
                # min/max still fits. An empty column yields NaN for min/max,
                # every comparison is False and the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min < c_min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max are NaN (all-missing column)
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting footprint
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [108]:
# Read the merchants table, then downcast its numeric columns to save memory
merchants = pd.read_csv('merchants_clean.csv')
merchants = reduce_mem_usage(merchants)

Mem. usage decreased to 16.91 Mb (71.2% reduction)


In [109]:
# 'Unnamed: 0' is presumably the row index that was written into the CSV.
# errors='ignore' keeps this cell re-runnable: a second execution no longer
# raises KeyError once the column is already gone.
merchants = merchants.drop(columns=['Unnamed: 0'], errors='ignore')

In [64]:
# Display the column labels of the merchants frame
merchants.columns

Index(['merchant_id', 'merchant_group_id', 'merchant_category_id',
       'subsector_id', 'numerical_1', 'numerical_2', 'category_1',
       'most_recent_sales_range', 'most_recent_purchases_range',
       'avg_sales_lag3', 'avg_purchases_lag3', 'active_months_lag3',
       'avg_sales_lag6', 'avg_purchases_lag6', 'active_months_lag6',
       'avg_sales_lag12', 'avg_purchases_lag12', 'active_months_lag12',
       'category_4', 'city_id', 'state_id', 'category_2'],
      dtype='object')

In [110]:
# Preview the first two rows of the merchants frame
merchants.head(2)

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,M_ID_838061e48c,8265,292,8,-0.057465,-0.057465,0,4,4,-0.4,...,-2.25,18.671875,6,-2.32,13.914062,12,0,189,9,1.0
1,M_ID_9339d880ad,3147,310,19,-0.057465,-0.057465,0,4,4,-0.72,...,-0.74,1.291992,6,-0.57,1.6875,12,0,18,16,1.0


In [83]:
# Remove the group/category/subsector/city/state identifier columns
merchants = merchants.drop(
    columns=[
        'merchant_group_id',
        'merchant_category_id',
        'subsector_id',
        'city_id',
        'state_id',
    ]
)

In [100]:
# Map every column after the first to a 'new_merchant_'-prefixed name ...
d = {col: 'new_merchant_{}'.format(col) for col in merchants.columns[1:]}
# ... and give merchant_id the name of the per-card mode column it is merged on
d["merchant_id"] = "new_merchant_id_<lambda>"


In [101]:
# Apply the prefixed column names from `d`. Only the columns are renamed: the
# former `index=str` argument turned every row label into a string, which no
# visible cell relies on and which only inflates the index's memory footprint.
merchants = merchants.rename(columns=d)
merchants.head(2)

Unnamed: 0,new_merchant_id_<lambda>,new_merchant_numerical_1,new_merchant_numerical_2,new_merchant_category_1,new_merchant_most_recent_sales_range,new_merchant_most_recent_purchases_range,new_merchant_avg_sales_lag3,new_merchant_avg_purchases_lag3,new_merchant_active_months_lag3,new_merchant_avg_sales_lag6,new_merchant_avg_purchases_lag6,new_merchant_active_months_lag6,new_merchant_avg_sales_lag12,new_merchant_avg_purchases_lag12,new_merchant_active_months_lag12,new_merchant_category_4,new_merchant_category_2
0,M_ID_838061e48c,-0.057465,-0.057465,0,4,4,-0.4,9.664062,3,-2.25,18.671875,6,-2.32,13.914062,12,0,1.0
1,M_ID_9339d880ad,-0.057465,-0.057465,0,4,4,-0.72,1.75,3,-0.74,1.291992,6,-0.57,1.6875,12,0,1.0


In [66]:
# Read the new-merchant transactions, then downcast numeric columns to save memory
new_transactions = pd.read_csv('new_merchant_transactions_clean.csv')
new_transactions = reduce_mem_usage(new_transactions)

Mem. usage decreased to 106.71 Mb (52.5% reduction)


In [67]:
# Display the column labels of the transactions frame
new_transactions.columns

Index(['Unnamed: 0', 'authorized_flag', 'card_id', 'city_id', 'category_1',
       'installments', 'category_3', 'merchant_category_id', 'merchant_id',
       'month_lag', 'purchase_amount', 'purchase_date', 'category_2',
       'state_id', 'subsector_id'],
      dtype='object')

In [68]:
from scipy import stats  # no longer used in this cell; kept in case other cells rely on the name


def _most_frequent(values):
    """Return the most frequent non-null value of a Series (smallest on ties), or NaN if there is none."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan


agg_func = {
    # Per card: number of distinct merchants and the most frequent merchant.
    # scipy.stats.mode no longer accepts non-numeric data (deprecated in
    # SciPy 1.9, removed in 1.11), so pandas' own mode is used instead.
    # The aggregator must stay a lambda: pandas labels the output column with
    # the function's name, and the merge with `merchants` further down joins
    # on 'new_merchant_id_<lambda>'.
    'merchant_id': ['nunique', lambda x: _most_frequent(x)],
}

new_transactions = new_transactions.groupby(['card_id']).agg(agg_func)
# Flatten the (column, aggregator) MultiIndex into 'new_<column>_<aggregator>' labels
new_transactions.columns = ['new_' + '_'.join(col).strip() for col in new_transactions.columns.values]
# Turn card_id back into a regular column
new_transactions = new_transactions.reset_index()

In [69]:
# Preview the first two rows of the per-card aggregation
new_transactions.head(2)

Unnamed: 0,card_id,new_merchant_id_nunique,new_merchant_id_<lambda>
0,C_ID_00007093c1,2,M_ID_00a6ca8a8a
1,C_ID_0001238066,25,M_ID_00a6ca8a8a


In [79]:
# Left-join the merchant attributes onto each card via its most frequent merchant id
new_transactions = pd.merge(
    new_transactions,
    merchants,
    how='left',
    on='new_merchant_id_<lambda>',
)

In [80]:
# Preview the first rows of the merged frame
new_transactions.head()

Unnamed: 0,card_id,new_merchant_id_nunique,new_merchant_id_<lambda>,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,C_ID_00007093c1,2,M_ID_00a6ca8a8a,34.0,189.0,6.0,-0.047546,-0.057465,1.0,4.0,...,8.42,15.859375,6.0,8.57,16.21875,7.0,1.0,0.0,0.0,3.0
1,C_ID_0001238066,25,M_ID_00a6ca8a8a,34.0,189.0,6.0,-0.047546,-0.057465,1.0,4.0,...,8.42,15.859375,6.0,8.57,16.21875,7.0,1.0,0.0,0.0,3.0
2,C_ID_0001506ef0,1,M_ID_ab756f937e,32730.0,264.0,32.0,-0.047546,-0.047546,0.0,2.0,...,1.03,1.041016,6.0,0.98,1.00293,12.0,1.0,107.0,19.0,3.0
3,C_ID_0001793786,31,M_ID_0360f86430,106049.0,7.0,35.0,-0.047546,-0.057465,0.0,3.0,...,1.19,1.145508,6.0,1.37,1.391602,12.0,1.0,54.0,9.0,1.0
4,C_ID_000183fdda,11,M_ID_113378fe3b,34.0,130.0,15.0,1.162109,1.092773,0.0,2.0,...,1.03,1.045898,6.0,0.96,0.996094,12.0,1.0,126.0,3.0,3.0
