In [1]:
from utils import *
from scipy.spatial.distance import cdist

  dfs = dict([(_df_name_reference.get(csv), pd.read_csv( os.path.join(_path, csv) ).rename(columns = _translation_reference.get(_table_reference.get(csv)))) for csv in os.listdir(_path) if csv.endswith('.txt')])


# Handling NA / NaN

In [2]:
for key, df in dfs.items():
    nas = df.isna().sum().replace(0,np.nan).dropna() / df.shape[0] * 100
    print(key, '\n\t', nas)

transaction 
	 trans_spending_category    12.813584
transaction_type            4.066077
merch_x_coord              60.049890
merch_y_coord              60.049890
dtype: float64
branch_visit 
	 total_daily_transactions    0.011494
dtype: float64
statement 
	 Series([], dtype: float64)
customer_demog 
	 customer_home_x_coord    0.003876
customer_home_y_coord    0.003876
customer_work_x_coord    0.006782
customer_work_x_coord    0.006782
customer_income_level    2.373824
akbank_banking_age       0.004845
dtype: float64
transfer 
	 Series([], dtype: float64)
transaction_statement 
	 Series([], dtype: float64)
atm 
	 atm_x_coord    2.948569
atm_y_coord    2.948569
dtype: float64


In [3]:
for key, df in dfs.items():
    print(key, df.columns)

transaction Index(['customer_id', 'transaction_date', 'transaction_time',
       'transaction_total', 'trans_spending_category', 'merch_id',
       'online_payment', 'transaction_type', 'currency', 'merch_x_coord',
       'merch_y_coord'],
      dtype='object')
branch_visit Index(['customer_id', 'branch_visit_date', 'branch_visit_time',
       'total_daily_transactions', 'branch_id', 'branch_x_coord',
       'branch_y_coord'],
      dtype='object')
statement Index(['customer_id', 'cc_id', 'end_of_month_date', 'statement_amount_TL',
       'statement_amount_USD', 'statement_amount_Euro', 'statement_date',
       'statement_due_date'],
      dtype='object')
customer_demog Index(['customer_id', 'customer_segment', 'branch_id',
       'customer_main_branch_x_coord', 'customer_main_branch_y_coord',
       'customer_home_x_coord', 'customer_home_y_coord',
       'customer_work_x_coord', 'customer_work_x_coord', 'customer_gender',
       'customer_marital_status', 'customer_education_level',


**Impute customer demographic nans** only as we don't use the other missing categories in our final dataset

In [4]:
median_val = np.nanmedian(dfs.get('customer_demog')['customer_income_level'])
dfs.get('customer_demog')['customer_income_level'] = dfs.get('customer_demog')['customer_income_level'].replace(np.nan, median_val)

In [5]:
for key, df in dfs.items():
    nas = df.isna().sum().replace(0,np.nan).dropna() / df.shape[0] * 100
    print(key, '\n\t', nas)

transaction 
	 trans_spending_category    12.813584
transaction_type            4.066077
merch_x_coord              60.049890
merch_y_coord              60.049890
dtype: float64
branch_visit 
	 total_daily_transactions    0.011494
dtype: float64
statement 
	 Series([], dtype: float64)
customer_demog 
	 customer_home_x_coord    0.003876
customer_home_y_coord    0.003876
customer_work_x_coord    0.006782
customer_work_x_coord    0.006782
akbank_banking_age       0.004845
dtype: float64
transfer 
	 Series([], dtype: float64)
transaction_statement 
	 Series([], dtype: float64)
atm 
	 atm_x_coord    2.948569
atm_y_coord    2.948569
dtype: float64


# Feature Engineering

## Transfers

In [83]:
df_transfer = dfs.get('transfer').copy()
df_transfer.set_index('customer_id', inplace = True)
df_transfer.head()

Unnamed: 0_level_0,internal_transfer_date,destination_id,currency,internal_transaction_type,internal_transfer_amount,destination_type
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6839524,04SEP2014:00:00:00,4839175,TURK_LIRASI,DIGER_ODEMELER,265.0,GERCEK
6852526,04SEP2014:00:00:00,15252635,TURK_LIRASI,DIGER_ODEMELER,4000.0,GERCEK
6900211,04SEP2014:00:00:00,7514119,TURK_LIRASI,DIGER_ODEMELER,250.0,GERCEK
6918588,04SEP2014:00:00:00,16275084,TURK_LIRASI,DIGER_ODEMELER,2250.0,GERCEK
6938781,04SEP2014:00:00:00,3553710,TURK_LIRASI,DIGER_ODEMELER,65.0,GERCEK


## Branch Visits

In [110]:
df_branch = dfs.get('branch_visit').copy()

Converts a visit date and time to categorical (dummy encoded) day of week and categorical (dummy encoded) time of day

In [111]:
df_visit_time = pd.to_datetime(df_branch['branch_visit_time'], format='%d%b%Y:%H:%M:%S.000000')
day_of_week = pd.get_dummies(df_visit_time.dt.dayofweek, prefix = 'BV_DoW') #Branch Visit Day of Week
time_of_day = pd.get_dummies(pd.cut(df_visit_time.dt.hour, bins=4)) #Branch Visit Time of Day
time_of_day.columns = ['BV_very_early','BV_early','BV_later','BV_late']

df_branch = df_branch.join(day_of_week).join(time_of_day)
df_branch = df_branch.drop(['branch_visit_date','branch_visit_time','branch_id'],axis=1)
df_branch = df_branch.groupby('customer_id')[df_branch.columns[2:]].sum()
df_branch.head()

Unnamed: 0_level_0,branch_x_coord,branch_y_coord,BV_DoW_0,BV_DoW_1,BV_DoW_2,BV_DoW_3,BV_DoW_4,BV_DoW_5,BV_DoW_6,BV_very_early,BV_early,BV_later,BV_late
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1560993,120.885052,85.121291,0,1,2,0,0,0,0,0,3,0,0
1566513,163.059159,119.511353,0,0,2,1,1,0,0,1,2,1,0
1567711,287.116353,200.630018,1,2,1,3,0,0,0,4,2,1,0
1569761,450.438796,317.342252,5,0,3,0,3,0,0,3,4,4,0
1569938,77.844623,64.71732,2,0,0,0,0,0,0,1,0,1,0


Finding centered branch location and the average distance from the center (distance not currently working)

In [112]:
df_branch['branch_x_mean'] = df_branch.groupby('customer_id')['branch_x_coord'].transform('mean')
df_branch['branch_y_mean'] = df_branch.groupby('customer_id')['branch_y_coord'].transform('mean')
branch_mean = df_branch.groupby('customer_id')[['branch_y_mean','branch_x_mean','branch_x_coord','branch_y_coord']].apply(
    lambda df: cdist(df[['branch_x_coord','branch_y_coord']], np.array([df[['branch_x_mean','branch_y_mean']].iloc[0,:]])).mean()).to_frame()
branch_mean.columns = ['branch_mean_distance']

df_branch = df_branch.merge(branch_mean,on='customer_id',how='left')
df_branch.head()

Unnamed: 0_level_0,branch_x_coord,branch_y_coord,BV_DoW_0,BV_DoW_1,BV_DoW_2,BV_DoW_3,BV_DoW_4,BV_DoW_5,BV_DoW_6,BV_very_early,BV_early,BV_later,BV_late,branch_x_mean,branch_y_mean,branch_mean_distance
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1560993,120.885052,85.121291,0,1,2,0,0,0,0,0,3,0,0,120.885052,85.121291,0.0
1566513,163.059159,119.511353,0,0,2,1,1,0,0,1,2,1,0,163.059159,119.511353,0.0
1567711,287.116353,200.630018,1,2,1,3,0,0,0,4,2,1,0,287.116353,200.630018,0.0
1569761,450.438796,317.342252,5,0,3,0,3,0,0,3,4,4,0,450.438796,317.342252,0.0
1569938,77.844623,64.71732,2,0,0,0,0,0,0,1,0,1,0,77.844623,64.71732,0.0


In [113]:
df_branch = df_branch.drop(['branch_x_coord','branch_y_coord','branch_x_mean','branch_y_mean'],axis=1)
df_branch.head()

Unnamed: 0_level_0,BV_DoW_0,BV_DoW_1,BV_DoW_2,BV_DoW_3,BV_DoW_4,BV_DoW_5,BV_DoW_6,BV_very_early,BV_early,BV_later,BV_late,branch_mean_distance
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1560993,0,1,2,0,0,0,0,0,3,0,0,0.0
1566513,0,0,2,1,1,0,0,1,2,1,0,0.0
1567711,1,2,1,3,0,0,0,4,2,1,0,0.0
1569761,5,0,3,0,3,0,0,3,4,4,0,0.0
1569938,2,0,0,0,0,0,0,1,0,1,0,0.0


## Payments

In [11]:
df_payment = dfs.get('transaction_statement').copy()
df_payment.set_index('customer_id', inplace = True)
df_payment.head()

Unnamed: 0_level_0,cc_id,payment_date,statement_currency,payment_amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9008066,186504394,31DEC2014:00:00:00,TURK_LIRASI,2451.71
4618889,77327431,31DEC2014:00:00:00,TURK_LIRASI,100.0
25504657,161523393,31DEC2014:00:00:00,TURK_LIRASI,828.74
8166770,112547717,31DEC2014:00:00:00,TURK_LIRASI,50.0
21966278,122448646,31DEC2014:00:00:00,TURK_LIRASI,288.5


In [12]:
df_payment=df_payment[df_payment.statement_currency=='TURK_LIRASI']

# Get the date range of transaction data; used to calculate monthly averages
dates = pd.to_datetime(df_payment['payment_date'], format='%d%b%Y:%H:%M:%S')
transaction_range = (dates.max().to_period('M') - dates.min().to_period('M')).n + 1

In [13]:
# Average amount paid for any given payment
payment_mean = df_payment.groupby('customer_id')['payment_amount'].mean()

# Variation of payment amount
payment_std = df_payment.groupby('customer_id')['payment_amount'].std()

# Average times customer makes a payment in a month
payment_freq = df_payment.groupby('customer_id')['payment_amount'].count() / transaction_range

In [14]:
df_payment = pd.concat([payment_mean, payment_std, payment_freq], axis = 1, keys = ['payment_mean', 'payment_std', 'payment_monthly_freq'])
df_payment.head()

Unnamed: 0_level_0,payment_mean,payment_std,payment_monthly_freq
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1560993,118.0625,171.504379,0.333333
1566513,573.497333,635.42824,2.5
1567711,741.538462,391.35347,2.166667
1569761,856.911765,681.381552,1.416667
1569938,748.461538,607.158053,1.083333


## Transactions

In [97]:
df_transactions = dfs.get('transaction').copy()
df_transactions.head()

Unnamed: 0,customer_id,transaction_date,transaction_time,transaction_total,trans_spending_category,merch_id,online_payment,transaction_type,currency,merch_x_coord,merch_y_coord
0,11861396,23OCT2014:00:00:00,21:01:13,10.57,RESTORAN,999999,0,AV,TURK_LIRASI,,
1,13667756,12APR2015:00:00:00,15:18:50,82.0,RESTORAN,999999,0,AV,TURK_LIRASI,,
2,11024865,23JUL2014:00:00:00,21:12:12,15.0,RESTORAN,999999,0,AV,TURK_LIRASI,,
3,10922788,06AUG2014:00:00:00,21:54:49,86.0,RESTORAN,999999,0,AV,TURK_LIRASI,,
4,10335987,12APR2015:00:00:00,19:10:09,30.0,RESTORAN,999999,0,AV,TURK_LIRASI,,


In [98]:
df_transactions.trans_spending_category.value_counts(sort = True)

GIDA                                      2718699
RESTORAN                                  1265400
TEKSTÝL                                    657625
AKARYAKIT                                  646976
TELEKOMÜNÝKASYON                           343764
SAÐLIK                                     319314
DÝÐER                                      300104
HÝZMET SEKTÖRLERÝ                          252031
TEKNOLOJÝ                                  189665
NAKÝT AVANS                                148677
MOBÝLYA, DEKORASYON                        125705
KOZMETÝK                                   124840
SÝGORTA-MAIL ORDER                         103323
SEYAHAT ACENTALARI - TAÞIMACILIK           101652
AYAKKABI                                    97174
YAPI MALZ., HIRDAVAT, NALBURÝYE             75576
OTOMOTÝV                                    71630
SPOR GÝYÝM                                  66044
MUZIK MARKET KIRTASÝYE                      60341
HOTEL                                       56372


Grabbing the top 4 spending categories

In [99]:
tran_cat_list = df_transactions.trans_spending_category.value_counts(sort = True)[:4].index
tran_cat_list

Index(['GIDA', 'RESTORAN', 'TEKSTÝL', 'AKARYAKIT'], dtype='object')

In [100]:
df_transactions['trans_spending_category'] = df_transactions['trans_spending_category'].apply(lambda i: i if i in tran_cat_list else 'OTHER')
df_transactions.trans_spending_category.value_counts()

OTHER        4045925
GIDA         2718699
RESTORAN     1265400
TEKSTÝL       657625
AKARYAKIT     646976
Name: trans_spending_category, dtype: int64

The sample dataset was largely transactions in one currency. The others we consider outliers (since so small).

In [101]:
df_transactions=df_transactions[df_transactions.currency=='TURK_LIRASI']

Prep for calculation

In [102]:
# Get the date range of transaction data; used to calculate monthly averages
dates = pd.to_datetime(df_transactions['transaction_date'], format='%d%b%Y:%H:%M:%S')
transaction_range = (dates.max().to_period('M') - dates.min().to_period('M')).n + 1

# Grab relevant columns
df_transactions = df_transactions.get(['customer_id', 'trans_spending_category','transaction_total'])

df_transactions.set_index(['customer_id','trans_spending_category'], inplace = True)
df_transactions.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,transaction_total
customer_id,trans_spending_category,Unnamed: 2_level_1
11861396,RESTORAN,10.57
13667756,RESTORAN,82.0
11024865,RESTORAN,15.0
10922788,RESTORAN,86.0
10335987,RESTORAN,30.0


Average amount in one given transaction and the monthly frequency of transactions

In [103]:
transaction_average_amount = df_transactions.groupby(['customer_id','trans_spending_category'])['transaction_total'].mean()
transaction_average_monthly_frequency = df_transactions.groupby(['customer_id','trans_spending_category'])['transaction_total'].count()/transaction_range
df_transactions = pd.concat([transaction_average_amount, transaction_average_monthly_frequency], axis = 1, keys = ["trans_average_amount","trans_average_monthly_freq"])
df_transactions.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,trans_average_amount,trans_average_monthly_freq
customer_id,trans_spending_category,Unnamed: 2_level_1,Unnamed: 3_level_1
1560993,GIDA,71.275,0.333333
1560993,OTHER,84.0,0.083333
1566513,AKARYAKIT,72.333333,2.5
1566513,GIDA,32.421,5.0
1566513,OTHER,156.60463,4.5


In [104]:
df_transactions = df_transactions.reset_index().pivot(index = 'customer_id', columns = 'trans_spending_category')
df_transactions.columns = df_transactions.columns.map('_'.join)
df_transactions.head()

Unnamed: 0_level_0,trans_average_amount_AKARYAKIT,trans_average_amount_GIDA,trans_average_amount_OTHER,trans_average_amount_RESTORAN,trans_average_amount_TEKSTÝL,trans_average_monthly_freq_AKARYAKIT,trans_average_monthly_freq_GIDA,trans_average_monthly_freq_OTHER,trans_average_monthly_freq_RESTORAN,trans_average_monthly_freq_TEKSTÝL
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1560993,,71.275,84.0,,,,0.333333,0.083333,,
1566513,72.333333,32.421,156.60463,18.125,,2.5,5.0,4.5,0.333333,
1567711,41.702222,35.58,88.682613,27.313659,77.600278,0.75,2.333333,9.25,3.416667,6.0
1569761,,,767.268182,,131.934,,,1.833333,,0.833333
1569938,,33.576667,568.320833,33.0,19.9,,0.25,1.0,0.083333,0.083333


## Statements

In [49]:
df_statement = dfs.get('statement')

df_statement = df_statement.groupby('customer_id')['statement_amount_TL'].mean().to_frame().merge(
    df_statement.groupby('customer_id')['statement_amount_TL'].std().to_frame()
                                                                        , on='customer_id', how='left')
df_statement.columns = ['statement_amount_TL_mean','statement_amount_TL_std']
df_statement.head()

Unnamed: 0_level_0,statement_amount_TL_mean,statement_amount_TL_std
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1560993,179.46,164.244753
1566513,459.045455,601.144372
1567711,3077.044167,1159.815099
1569761,1130.805833,875.263636
1569938,1531.865,1082.873206


## Customer Demographics

Customer demographics contains a lot of data on individual users we'll be keeping, like age, home coordinate, marital status, etc.  There is a set of monthly risk codes that we will be condensing from 12 columns (1 per month) down to 1 (1 per risk code).

First, we establish what the risk codes are.  

In [131]:
df_customer_demog = dfs.get('customer_demog').copy()
df_customer_demog.set_index('customer_id', inplace = True)
df_customer_demog.head()

Unnamed: 0_level_0,customer_segment,branch_id,customer_main_branch_x_coord,customer_main_branch_y_coord,customer_home_x_coord,customer_home_y_coord,customer_work_x_coord,customer_work_x_coord,customer_gender,customer_marital_status,...,KK_RISK_KODU_201409,KK_RISK_KODU_201410,KK_RISK_KODU_201411,KK_RISK_KODU_201412,KK_RISK_KODU_201501,KK_RISK_KODU_201502,KK_RISK_KODU_201503,KK_RISK_KODU_201504,KK_RISK_KODU_201505,KK_RISK_KODU_201506
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25139695,BÝREYSEL,2,36.991388,35.325423,36.971366,35.362328,41.02981,29.15245,E,EVLÝ,...,2)GECIKME 1-15 GUN,1)RISKSIZ,2)GECIKME 1-15 GUN,2)GECIKME 1-15 GUN,1)RISKSIZ,2)GECIKME 1-15 GUN,1)RISKSIZ,2)GECIKME 1-15 GUN,4)GECIKME 30-59 GUN,1)RISKSIZ
21213035,BÝREYSEL,2,36.991388,35.325423,37.01067,35.36954,41.07044,28.98941,E,BÝLINMÝYOR,...,1)RISKSIZ,1)RISKSIZ,1)RISKSIZ,2)GECIKME 1-15 GUN,1)RISKSIZ,2)GECIKME 1-15 GUN,2)GECIKME 1-15 GUN,1)RISKSIZ,2)GECIKME 1-15 GUN,2)GECIKME 1-15 GUN
21242221,BÝREYSEL,2,36.991388,35.325423,40.920454,29.187872,36.99532,35.27417,E,BEKAR,...,4)GECIKME 30-59 GUN,1)RISKSIZ,2)GECIKME 1-15 GUN,4)GECIKME 30-59 GUN,5)GECIKME 60+ GUN,5)GECIKME 60+ GUN,6)TAKIP,6)TAKIP,6)TAKIP,6)TAKIP
23187800,BÝREYSEL,2,36.991388,35.325423,41.100502,28.892116,41.090251,28.896666,E,EVLÝ,...,4)GECIKME 30-59 GUN,1)RISKSIZ,1)RISKSIZ,1)RISKSIZ,1)RISKSIZ,1)RISKSIZ,1)RISKSIZ,1)RISKSIZ,1)RISKSIZ,1)RISKSIZ
16400475,BÝREBÝR,2,36.991388,35.325423,41.00829,29.083431,40.98475,29.16613,E,EVLÝ,...,1)RISKSIZ,1)RISKSIZ,1)RISKSIZ,2)GECIKME 1-15 GUN,4)GECIKME 30-59 GUN,5)GECIKME 60+ GUN,5)GECIKME 60+ GUN,6)TAKIP,6)TAKIP,6)TAKIP


In [132]:
print( df_customer_demog.shape )
print( df_customer_demog['customer_marital_status'].value_counts() )
df_customer_demog = df_customer_demog[df_customer_demog['customer_marital_status']=='EVLÝ']
print( df_customer_demog.shape)
df_customer_demog['customer_marital_status'].value_counts()

(103209, 27)
EVLÝ                65640
BEKAR               30880
BOÞANMIÞ             4832
BÝLINMÝYOR           1397
DUL                   457
EVLÝLÝÐÝN ÝPTALÝ        3
Name: customer_marital_status, dtype: int64
(65640, 27)


EVLÝ    65640
Name: customer_marital_status, dtype: int64

In [133]:
lst_risk_codes = ['1)RISKSIZ','2)GECIKME 1-15 GUN','3)GECIKME 16-29 GUN','4)GECIKME 30-59 GUN','5)GECIKME 60+ GUN','6)TAKIP']

Next, we iterate through our set of risk codes.  For each risk code, we make a copy of our demographics data frame and set all instances of that risk code in our dataset to 1.  Then, we set all non-1 values to 0, and sum each column.  Lastly, we add a new column to our original dataset, and add those aggregated values to the original dataset.

In [134]:
j=0
for i in lst_risk_codes:
    df_risk_code = df_customer_demog.filter(like = 'kk_risk_kodu')
    df_risk_code = df_risk_code.replace(i,1)
    df_risk_code[df_risk_code!=1] = 0
    df_risk_code = df_risk_code.sum(axis='columns')
    df_customer_demog[lst_risk_codes[j]] = df_risk_code
    j+=1

Lastly, we select out only the columns we'll be retaining.  branch_id is converted into a string, so it will be interpreted as categorical.

In [135]:
df_customer_demog.drop(df_customer_demog.filter(like = 'KK_RISK_KODU').columns, axis = 1, inplace = True)
df_customer_demog.drop(['branch_id', 'customer_segment', 'customer_work_x_coord', 'customer_work_x_coord','customer_marital_status'], axis = 1, inplace = True)
df_customer_demog.head()

Unnamed: 0_level_0,customer_main_branch_x_coord,customer_main_branch_y_coord,customer_home_x_coord,customer_home_y_coord,customer_gender,customer_education_level,customer_job_status,customer_income_level,customer_age,akbank_banking_age,1)RISKSIZ,2)GECIKME 1-15 GUN,3)GECIKME 16-29 GUN,4)GECIKME 30-59 GUN,5)GECIKME 60+ GUN,6)TAKIP
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
25139695,36.991388,35.325423,36.971366,35.362328,E,LÝSE,ÜCRETLÝ (ÖZEL),2500.0,55,2.0,0.0,0.0,0.0,0.0,0.0,0.0
23187800,36.991388,35.325423,41.100502,28.892116,E,LÝSE,ÜCRETLÝ (ÖZEL),850.0,39,3.0,0.0,0.0,0.0,0.0,0.0,0.0
16400475,36.991388,35.325423,41.00829,29.083431,E,ÜNÝVERSÝTE,SERBEST MESLEK,2000.0,45,8.0,0.0,0.0,0.0,0.0,0.0,0.0
17303354,36.991388,35.325423,37.017533,35.337688,E,LÝSE,ÜCRETLÝ (ÖZEL),3500.0,35,7.0,0.0,0.0,0.0,0.0,0.0,0.0
15522570,36.991388,35.325423,41.08011,28.98946,E,LÝSE,ÜCRETLÝ (ÖZEL),2000.0,48,8.0,0.0,0.0,0.0,0.0,0.0,0.0


## ATM

# Combine Tables

Transfer doesn't work

In [136]:
df_actions = pd.concat([df_branch, df_payment, df_transactions, df_statement], axis = 1)
df_actions.head()

Unnamed: 0_level_0,BV_DoW_0,BV_DoW_1,BV_DoW_2,BV_DoW_3,BV_DoW_4,BV_DoW_5,BV_DoW_6,BV_very_early,BV_early,BV_later,...,trans_average_amount_OTHER,trans_average_amount_RESTORAN,trans_average_amount_TEKSTÝL,trans_average_monthly_freq_AKARYAKIT,trans_average_monthly_freq_GIDA,trans_average_monthly_freq_OTHER,trans_average_monthly_freq_RESTORAN,trans_average_monthly_freq_TEKSTÝL,statement_amount_TL_mean,statement_amount_TL_std
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1560993,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,84.0,,,,0.333333,0.083333,,,179.46,164.244753
1566513,0.0,0.0,2.0,1.0,1.0,0.0,0.0,1.0,2.0,1.0,...,156.60463,18.125,,2.5,5.0,4.5,0.333333,,459.045455,601.144372
1567711,1.0,2.0,1.0,3.0,0.0,0.0,0.0,4.0,2.0,1.0,...,88.682613,27.313659,77.600278,0.75,2.333333,9.25,3.416667,6.0,3077.044167,1159.815099
1569761,5.0,0.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,4.0,...,767.268182,,131.934,,,1.833333,,0.833333,1130.805833,875.263636
1569938,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,568.320833,33.0,19.9,,0.25,1.0,0.083333,0.083333,1531.865,1082.873206


In [137]:
df_full = df_customer_demog.join(df_actions, how = 'left')
df_full.head()

Unnamed: 0_level_0,customer_main_branch_x_coord,customer_main_branch_y_coord,customer_home_x_coord,customer_home_y_coord,customer_gender,customer_education_level,customer_job_status,customer_income_level,customer_age,akbank_banking_age,...,trans_average_amount_OTHER,trans_average_amount_RESTORAN,trans_average_amount_TEKSTÝL,trans_average_monthly_freq_AKARYAKIT,trans_average_monthly_freq_GIDA,trans_average_monthly_freq_OTHER,trans_average_monthly_freq_RESTORAN,trans_average_monthly_freq_TEKSTÝL,statement_amount_TL_mean,statement_amount_TL_std
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25139695,36.991388,35.325423,36.971366,35.362328,E,LÝSE,ÜCRETLÝ (ÖZEL),2500.0,55,2.0,...,260.16,,,0.5,0.166667,0.416667,,,536.446923,483.135578
23187800,36.991388,35.325423,41.100502,28.892116,E,LÝSE,ÜCRETLÝ (ÖZEL),850.0,39,3.0,...,233.65,,,,2.333333,0.833333,,,940.334167,389.7594
16400475,36.991388,35.325423,41.00829,29.083431,E,ÜNÝVERSÝTE,SERBEST MESLEK,2000.0,45,8.0,...,5344.329,,,,0.166667,0.833333,,,39089.802727,23847.265741
17303354,36.991388,35.325423,37.017533,35.337688,E,LÝSE,ÜCRETLÝ (ÖZEL),3500.0,35,7.0,...,248.151,35.166667,27.5,1.5,2.583333,4.166667,0.25,0.166667,755.028333,1271.491472
15522570,36.991388,35.325423,41.08011,28.98946,E,LÝSE,ÜCRETLÝ (ÖZEL),2000.0,48,8.0,...,138.499167,,47.532667,,5.416667,2.0,,1.25,680.2625,521.931112


**Percentage of missing data in each column**

In [138]:
df_full.iloc[:,df_customer_demog.shape[1]:].isna().sum() / df_full.shape[0] * 100

BV_DoW_0                                19.373857
BV_DoW_1                                19.373857
BV_DoW_2                                19.373857
BV_DoW_3                                19.373857
BV_DoW_4                                19.373857
BV_DoW_5                                19.373857
BV_DoW_6                                19.373857
BV_very_early                           19.373857
BV_early                                19.373857
BV_later                                19.373857
BV_late                                 19.373857
branch_mean_distance                    19.373857
payment_mean                             1.759598
payment_std                              3.957952
payment_monthly_freq                     1.759598
trans_average_amount_AKARYAKIT          40.214808
trans_average_amount_GIDA               13.214503
trans_average_amount_OTHER               1.410725
trans_average_amount_RESTORAN           32.807739
trans_average_amount_TEKSTÝL            31.255332


All of the customers with missing values related to branch visits have no recorded branch visits at all. Therefore, we can safely impute these columns to zero along with all the other columns

In [139]:
df_full[~np.isnan(df_full['branch_mean_distance'])].loc[:,"BV_DoW_0":"branch_mean_distance"].isna().sum() 

BV_DoW_0                0
BV_DoW_1                0
BV_DoW_2                0
BV_DoW_3                0
BV_DoW_4                0
BV_DoW_5                0
BV_DoW_6                0
BV_very_early           0
BV_early                0
BV_later                0
BV_late                 0
branch_mean_distance    0
dtype: int64

In [140]:
df_full.fillna(0).to_csv('../data/aggregated_raw_data.txt', encoding = 'utf-8')