# Exploratory analysis with historical transactions

In [1]:
# Library imports
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(context="notebook", palette="Spectral", style = 'darkgrid', font_scale = 1.5, color_codes=True)
import warnings
warnings.filterwarnings('ignore')
import os
import matplotlib.pyplot as plt

# Constants
n_rows_to_load = 10000000

In [2]:
# Import the origin datasets that we will be using
hist_transactions = pd.read_csv("data/historical_transactions.csv")
train_set = pd.read_csv("data/train.csv", index_col='card_id', parse_dates=["first_active_month"])

##  Target variable (loyalty score)

Some inspiration analysis comes from: https://www.kaggle.com/sudalairajkumar/simple-exploration-notebook-elo

In [3]:
# Plot the target value to see what type of scores we are looking for
 
print("Statistics of target column (loyalty score)\n")

print(train_set['target'].describe())

#plt.figure(figsize=(8,5))
#plt.scatter(range(0,len(train_set)), np.sort(train_set['target'].values))
#plt.ylabel('Loyalty score (target)')
#plt.show()

Statistics of target column (loyalty score)

count    201917.000000
mean         -0.393636
std           3.850500
min         -33.219281
25%          -0.883110
50%          -0.023437
75%           0.765453
max          17.965068
Name: target, dtype: float64


We can see that majority of scores are in (-1,1) range with some extreme outliers in either direction.

## Identifying linear relationships between variables

*Hint: we have found none of statistical significance*

In [4]:
# Truncate data to only include transactions in 2017
hist_transactions = hist_transactions[(hist_transactions['purchase_date'] >= '2017-01-01 00:00:00') & (hist_transactions['purchase_date'] < '2018-01-01 00:00:00')]

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37
6,Y,C_ID_4e6213e9bc,88,N,0,A,278,M_ID_5e8220e564,-11,-0.716855,2017-03-21 00:10:51,1.0,16,37
7,Y,C_ID_4e6213e9bc,3,N,0,A,80,M_ID_9d41786a50,-3,-0.657049,2017-11-18 20:05:55,1.0,16,37
8,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-8,-0.737967,2017-06-01 22:02:56,1.0,16,37
9,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_74ba14b5fc,-11,-0.715352,2017-03-16 15:41:22,1.0,16,37
10,Y,C_ID_4e6213e9bc,88,N,0,A,683,M_ID_1449f22bfb,-9,-0.734135,2017-05-09 12:42:07,1.0,16,34


In [5]:
hist_train = pd.merge(
    hist_transactions,
    train_set,
    how='inner',
    left_on='card_id',
    right_index=True,
    suffixes=('_trans', '_train')
)

hist_groupby_id = hist_train[['card_id', 'purchase_amount','merchant_id']].groupby('card_id').agg({
    'purchase_amount': {
        'purchase_mean': 'mean',
        'purchase_sum': 'sum',
        'purchase_max': 'max',
        'n_transactions': 'count'
    },
    'merchant_id': 'nunique'
})

MemoryError: 

In [None]:
hist_groupby_id_w_train = pd.merge(
    pd.merge(hist_groupby_id['purchase_amount'],hist_groupby_id['merchant_id'],on="card_id"),
    train_set,
    how="inner", # only look at it if it exists in the train set
    left_index=True,
    right_index=True)

hist_groupby_id_w_train.rename(columns={'nunique':'n_unique_merchants'},inplace=True)

Use pairplots to check if there is a relationship between any purchase amount statistic and the loyalty score that seems to be immediately apparent.

In [None]:
cols_to_show = [
    'purchase_mean',
    'purchase_sum',
    'purchase_max',
    'n_transactions',
    'n_unique_merchants',
    'target'
]
sns.pairplot(hist_groupby_id_w_train[cols_to_show].sample(5000))

The most interesting result we see here is that as the number of transactions increases, we see that the deviation of the loyalty score starts to decrease. We see the same decrease in variability as a user's mean, total, and max purchase amounts increase, but to a much lesser extent.