# DATA MERGING AND VALIDATION FOR CREDIT RISK ANALYSIS

We will import and join the datasets and explore some missingness of the data

In [206]:
import pandas as pd
import pickle

In [207]:
trans = pd.read_csv('trans.asc',sep=';')
client = pd.read_csv('client.asc',sep=';')
account = pd.read_csv('account.asc',sep=';')
disp = pd.read_csv('disp.asc',sep=';')
order = pd.read_csv('order.asc',sep=';')
loan = pd.read_csv('loan.asc',sep=';')
card = pd.read_csv('card.asc',sep=';')
district = pd.read_csv('district.asc',sep=';')

### Reading and merging the loan and account -related datasets

In [208]:
df = pd.merge(loan, account,on='account_id', suffixes=['_loan','_acnt'], how='outer')
df = pd.merge(df, disp, on='account_id', how='outer')
df = pd.merge(df, client, on='client_id', how='outer', suffixes = ['_clnt','_acnt'])
df = pd.merge(df, district, left_on='district_id_clnt', right_on='A1', how='outer')
df = pd.merge(df, card, on='disp_id', how='outer', suffixes=['', '_card'])

### Read in and merge transaction data

In [209]:
trans = pd.read_csv('trans.asc', sep=';')

In [210]:
trans = pd.merge(trans, df, on='account_id', suffixes=['_trans', ''], how='left')
#Subset the data set to transactions for accounts with loans:
trans_loans = trans[~pd.isna(trans.loan_id)]

In [211]:
print('Total transactions: ', len(trans),'\nTransactions for accounts with a loan: ', len(trans_loans))

Total transactions:  1262625 
Transactions for accounts with a loan:  233627


### Explore proportional missingness in full data and the loan data

In [212]:
print(np.sum(trans.isna())/len(trans))

trans_id            0.000000
account_id          0.000000
date                0.000000
type_trans          0.000000
operation           0.172902
amount_trans        0.000000
balance             0.000000
k_symbol            0.451296
bank                0.733849
account             0.713619
loan_id             0.814967
date_loan           0.814967
amount              0.814967
duration            0.814967
payments            0.814967
status              0.814967
district_id_clnt    0.000000
frequency           0.000000
date_acnt           0.000000
disp_id             0.000000
client_id           0.000000
type                0.000000
birth_number        0.000000
district_id_acnt    0.000000
A1                  0.000000
A2                  0.000000
A3                  0.000000
A4                  0.000000
A5                  0.000000
A6                  0.000000
A7                  0.000000
A8                  0.000000
A9                  0.000000
A10                 0.000000
A11           

In [213]:
print(np.sum(trans_loans.isna())/len(trans_loans))

trans_id            0.000000
account_id          0.000000
date                0.000000
type_trans          0.000000
operation           0.163277
amount_trans        0.000000
balance             0.000000
k_symbol            0.482050
bank                0.732989
account             0.670102
loan_id             0.000000
date_loan           0.000000
amount              0.000000
duration            0.000000
payments            0.000000
status              0.000000
district_id_clnt    0.000000
frequency           0.000000
date_acnt           0.000000
disp_id             0.000000
client_id           0.000000
type                0.000000
birth_number        0.000000
district_id_acnt    0.000000
A1                  0.000000
A2                  0.000000
A3                  0.000000
A4                  0.000000
A5                  0.000000
A6                  0.000000
A7                  0.000000
A8                  0.000000
A9                  0.000000
A10                 0.000000
A11           

The missingness of credit card data is no issue, as the type and issue date of cards are likely not relevant to the problem. Bank and account columns report the bank and account of a partner in a transaction. As such, the specific bank and of a partner are likely not relevant and missingness should not be an issue.

Because we want to utilize the transactions for predicting credit defaults, transaction types are intuitively important. The operation and k_symbol columns describe the transaction type. We will explore the missing operation data further by examining, whethere there are cases where both operation and k_symbol are missing:

In [214]:
print('Percentage of cases where both operation and k_symbol are missing: ')
print(np.sum(trans.operation.isna() & trans.k_symbol.isna())/len(trans))
print('Percentage of cases where both operation and k_symbol are reported: ')
print(np.sum(~trans.operation.isna() & ~trans.k_symbol.isna())/len(trans))

print('Percentage of cases where both operation and k_symbol are missing: ')
print(np.sum(trans_loans.operation.isna() & trans_loans.k_symbol.isna())/len(trans))
print('Percentage of cases where both operation and k_symbol are reported: ')
print(np.sum(~trans_loans.operation.isna() & ~trans_loans.k_symbol.isna())/len(trans))

Percentage of cases where both operation and k_symbol are missing: 
0.0
Percentage of cases where both operation and k_symbol are reported: 
0.3758019998019998
Percentage of cases where both operation and k_symbol are missing: 
0.0
Percentage of cases where both operation and k_symbol are reported: 
0.06562597762597763


We see that, while operation and k_symbol are missing at times, there are no cases where neither one is present. Thus, missingness should be no issue.

Because missing data is not an issue and we do not need to use the transaction data to predict missing values, we can focus on the subset of the transactions data, where there are loans present.

### Datetime wrangling

Because we're interested in predicting bad loans, we should use transaction data from only prior to giving out the loan.
In order to work with the dates, we will first transform them to datetime format

In [215]:
trans_loans.columns

Index(['trans_id', 'account_id', 'date', 'type_trans', 'operation',
       'amount_trans', 'balance', 'k_symbol', 'bank', 'account', 'loan_id',
       'date_loan', 'amount', 'duration', 'payments', 'status',
       'district_id_clnt', 'frequency', 'date_acnt', 'disp_id', 'client_id',
       'type', 'birth_number', 'district_id_acnt', 'A1', 'A2', 'A3', 'A4',
       'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15',
       'A16', 'card_id', 'type_card', 'issued'],
      dtype='object')

In [216]:
trans_loans.date = pd.to_datetime(trans_loans.date, format='%y%m%d')
trans_loans.date_loan = pd.to_datetime(trans_loans.date_loan, format='%y%m%d')
trans_loans.issued = pd.to_datetime(trans_loans.issued.str[:6], format='%y%m%d')

In [217]:
trans_loans = trans_loans[trans_loans.date <= trans_loans.date_loan]

In [219]:
print('We end up with {} transactions for our final data.'.format(len(trans_loans)))

We end up with 66976 transactions for our final data.


### Dropping unnecessary columns

In [220]:
print('\n'.join(trans_loans.columns))

trans_id
account_id
date
type_trans
operation
amount_trans
balance
k_symbol
bank
account
loan_id
date_loan
amount
duration
payments
status
district_id_clnt
frequency
date_acnt
disp_id
client_id
type
birth_number
district_id_acnt
A1
A2
A3
A4
A5
A6
A7
A8
A9
A10
A11
A12
A13
A14
A15
A16
card_id
type_card
issued


## Export final transactions data



In [None]:
with open('transactions_data','wb') as file:
    pickle.dump(trans_loans, file)

## Some additional data wrangling

Unnamed: 0,trans_id,account_id,date,amount_trans,balance,account,loan_id,date_loan,amount,duration,...,A6,A7,A8,A9,A10,A11,A13,A14,A16,card_id
count,233627.0,233627.0,233627.0,233627.0,233627.0,77073.0,233627.0,233627.0,233627.0,233627.0,...,233627.0,233627.0,233627.0,233627.0,233627.0,233627.0,233627.0,233627.0,233627.0,47127.0
mean,2084692.0,6034.333245,966293.534476,8239.434745,45520.418867,40671350.0,6216.416921,956830.527619,147130.014733,35.513121,...,21.533367,5.469137,1.682785,5.291794,68.555529,9551.048946,3.468643,122.373737,17582.671104,820.616738
std,1115108.0,3233.873384,13622.799197,11740.717669,24765.050048,32721120.0,672.007484,14668.16034,111462.998365,17.255362,...,16.112486,4.601195,1.071525,2.90706,20.480304,1356.102185,2.142688,23.791757,32637.127783,327.342498
min,276.0,2.0,930113.0,0.0,-19310.0,0.0,4959.0,930705.0,4980.0,12.0,...,0.0,0.0,0.0,1.0,33.9,8110.0,0.43,81.0,888.0,16.0
25%,1113868.0,3115.0,960307.0,199.5,27441.45,7603472.0,5611.0,941003.0,67464.0,24.0,...,8.0,2.0,1.0,4.0,52.7,8546.0,1.96,106.0,2252.0,587.0
50%,2279742.0,6148.0,970508.0,3600.0,40934.0,38668640.0,6253.0,960119.0,108144.0,36.0,...,23.0,5.0,1.0,6.0,61.9,8994.0,3.49,117.0,3868.0,890.0
75%,3091876.0,8784.0,980305.0,10530.0,59333.2,69415770.0,6766.0,970518.0,203940.0,48.0,...,33.0,8.0,2.0,7.0,87.7,9897.0,4.72,137.0,6872.0,1081.0
max,3682987.0,11362.0,981231.0,87300.0,209637.0,99994200.0,7308.0,981208.0,590820.0,60.0,...,70.0,20.0,5.0,11.0,100.0,12541.0,9.4,167.0,99107.0,1247.0
