# Applied Machine Learning - Mini Challenge: Cross-Selling of Credit Cards
**Author**: Nils Fahrni

In [117]:
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import sklearn

sns.set_palette("pastel")

sys.path.append('scripts')
from dataloader import DataLoader
from utils import add_prefix_except_id

## Data Preprocessing

### Data Loading

In [118]:
# Single loader instance used by every loading cell below.
data_loader = DataLoader(
    base_path='data',
    translations_name='translation_mappings.json',
)

# Overview of the available datasets and their row counts.
data_loader.list_datasets()

Unnamed: 0,Dataset,Number of Rows
0,loan,682
1,client,5369
2,district,77
3,trans,1056320
4,account,4500
5,card,892
6,order,6471
7,disp,5369


#### Account

In [119]:
# Load the account table, passing a '%y%m%d' format for the `date` column.
account = data_loader.load_csv(
    'account',
    parse_dates={'date': '%y%m%d'},
)

account.sample(n=5)

Mapped frequency:
{
    "POPLATEK MESICNE": "MONTHLY CHARGES",
    "POPLATEK TYDNE": "WEEKLY CHARGES",
    "POPLATEK PO OBRATU": "TRANSACTION CHARGES"
}


Unnamed: 0,account_id,district_id,frequency,date
918,3264,70,MONTHLY CHARGES,1993-10-23
1198,1169,49,TRANSACTION CHARGES,1994-03-02
4264,3657,62,MONTHLY CHARGES,1997-09-19
3460,8094,67,MONTHLY CHARGES,1996-11-24
2974,3543,74,MONTHLY CHARGES,1996-07-21


#### Client

In [120]:
def _decode_birth_number(raw_client, reference_date='1999-12-31'):
    """Derive `gender`, `birth_date` and `age` from the encoded `birth_number`.

    `birth_number` is an integer of the form YYMMDD for men and YY(MM+50)DD
    for women, i.e. the month field also carries the gender.

    Parameters
    ----------
    raw_client : pandas.DataFrame
        Client table containing an integer `birth_number` column.
    reference_date : str or pandas.Timestamp, default '1999-12-31'
        Date at which the age (in completed years) is evaluated.

    Returns
    -------
    pandas.DataFrame
        A new frame without `birth_number` and with the added columns
        `gender` ('FEMALE' / 'MALE'), `birth_date` and `age`.
    """
    birth_number = raw_client['birth_number'].astype('int64')

    # Extract the month field arithmetically instead of slicing the string
    # representation, so a dropped leading zero cannot shift the digits.
    is_female = (birth_number // 100 % 100) > 50

    # Remove the +50 month offset used for women and restore six digits.
    yymmdd = (
        birth_number
        .where(~is_female, birth_number - 5000)
        .astype(str)
        .str.zfill(6)
    )

    # Birth years are interpreted as 19YY. Prefixing the century avoids the
    # `%y` pivot, which would map 00-68 to 2000-2068.
    birth_date = pd.to_datetime('19' + yymmdd, format='%Y%m%d')

    # Age in completed years: subtract one when the birthday has not yet
    # occurred in the reference year. Dividing a day count by 365 ignores
    # leap days and overstates the age of clients born early in January.
    reference = pd.Timestamp(reference_date)
    birthday_reached = (birth_date.dt.month < reference.month) | (
        (birth_date.dt.month == reference.month)
        & (birth_date.dt.day <= reference.day)
    )
    age = reference.year - birth_date.dt.year - (~birthday_reached).astype('int64')

    return (
        raw_client
        .assign(
            gender=is_female.map({True: 'FEMALE', False: 'MALE'}),
            birth_date=birth_date,
            age=age.astype('int64'),
        )
        .drop(columns='birth_number')
    )


client = _decode_birth_number(data_loader.load_csv('client'))

client.sample(5)

Unnamed: 0,client_id,district_id,gender,birth_date,age
4664,5236,25,MALE,1978-06-03,21
3049,3218,49,MALE,1965-12-16,34
1003,1057,52,MALE,1939-02-21,60
3604,3806,54,FEMALE,1967-06-27,32
2169,2294,32,FEMALE,1952-12-27,47


#### Disposition

Disponents are removed because the goal is to advertise only to account owners. A disponent is a secondary user who has been authorized to use an account: they may be allowed to execute transactions on it, but they do not own it.

In [121]:
# Keep every disposition that is not a disponent and drop the `type` column
# afterwards. The chain builds a new frame instead of calling an in-place
# drop on a filtered frame, which can trigger SettingWithCopyWarning.
disp = (
    data_loader.load_csv('disp')
    .loc[lambda df: df['type'] != 'DISPONENT']
    .drop(columns='type')
)

disp.sample(5)

Unnamed: 0,disp_id,client_id,account_id
2318,2445,2445,2013
3012,3179,3179,2634
3624,3830,3830,3167
2588,2735,2735,2259
5181,11189,11497,9337


#### Permanent Order

In [122]:
# Load the permanent orders table.
order = data_loader.load_csv('order')

order.sample(n=5)

Mapped k_symbol:
{
    "POJISTNE": "INSURANCE PAYMENT",
    "SIPO": "HOUSEHOLD",
    "LEASING": "LEASING",
    "UVER": "LOAN PAYMENT"
}


Unnamed: 0,order_id,account_id,bank_to,account_to,amount,k_symbol
1115,30625,843,WX,32156717,128.0,
341,29776,246,YZ,8283996,1970.0,HOUSEHOLD
249,29678,183,AB,5552345,5202.0,HOUSEHOLD
432,29873,314,KL,99187093,13691.0,HOUSEHOLD
3191,32936,2395,IJ,13785899,3696.0,HOUSEHOLD


**Are there Null Values?**

In [123]:
# Number of missing values per column of the order table.
display(order.isna().sum())

# A few example rows where `k_symbol` is missing.
display(order.loc[order['k_symbol'].isna()].sample(n=5))

order_id         0
account_id       0
bank_to          0
account_to       0
amount           0
k_symbol      1379
dtype: int64

Unnamed: 0,order_id,account_id,bank_to,account_to,amount,k_symbol
1038,30542,784,KL,80516755,1215.0,
425,29866,311,WX,80353498,371.0,
2819,32496,2114,CD,42759567,1742.0,
5339,35383,4066,WX,20153117,7246.0,
3768,33560,2813,GH,57591462,4789.0,


In [124]:
# Give orders without a `k_symbol` an explicit 'MISSING' category so they get
# their own column in the pivot below.
order['k_symbol'] = order['k_symbol'].fillna(value='MISSING')

# One row per account, one column per `k_symbol`, cells hold the order count.
order_pivot = order.pivot_table(
    values='amount',
    index='account_id',
    columns='k_symbol',
    aggfunc='count',
    fill_value=0,
)

order_pivot.sample(n=5)

k_symbol,HOUSEHOLD,INSURANCE PAYMENT,LEASING,LOAN PAYMENT,MISSING
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5650,1,0,0,1,0
3770,1,0,0,0,0
712,0,0,1,0,0
3850,1,0,0,0,1
1546,1,0,0,0,0


#### Transaction

TODO: 
- Sign the amount: negative for withdrawals, positive for deposits
- Investigate account number 19 (time series of the account balance, at least per month)
    - this account is volatile
    - the account's balance sometimes goes negative
- What happens to accounts with multiple transactions on the same day?
    - How can the actual end-of-day balance be obtained?
        - net the day's withdrawals and deposits and add the result to the previous day's balance
        - try to vectorize this problem (an R-style anti-join was mentioned as a possible approach)

In [125]:
# Load the transaction table, passing a '%y%m%d' format for the `date` column.
transaction = data_loader.load_csv(
    'trans',
    parse_dates={'date': '%y%m%d'},
)

transaction.sample(n=5)

Mapped type:
{
    "PRIJEM": "CREDIT",
    "VYDAJ": "WITHDRAWAL"
}
Mapped operation:
{
    "VYBER KARTOU": "CREDIT CARD WITHDRAWAL",
    "VKLAD": "CREDIT IN CASH",
    "PREVOD Z UCTU": "COLLECTION FROM ANOTHER BANK",
    "VYBER": "WITHDRAWAL IN CASH",
    "PREVOD NA UCET": "REMITTANCE TO ANOTHER BANK"
}
Mapped k_symbol:
{
    "POJISTNE": "INSURANCE PAYMENT",
    "SLUZBY": "PAYMENT FOR STATEMENT",
    "UROK": "INTEREST CREDITED",
    "SANKC. UROK": "SANCTION INTEREST IF NEGATIVE BALANCE",
    "SIPO": "HOUSEHOLD",
    "DUCHOD": "OLD-AGE PENSION",
    "UVER": "LOAN PAYMENT"
}


Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
1022015,1730827,5868,1998-11-23,WITHDRAWAL,WITHDRAWAL IN CASH,54800.0,62453.9,,,
886478,1047,4,1998-06-11,CREDIT,COLLECTION FROM ANOTHER BANK,5553.0,24328.2,OLD-AGE PENSION,UV,89899795.0
797343,3602773,2199,1998-02-28,CREDIT,,83.8,22092.5,INTEREST CREDITED,,
345076,272242,931,1996-07-05,WITHDRAWAL,REMITTANCE TO ANOTHER BANK,864.0,19402.9,,MN,2668875.0
116843,101115,345,1994-12-30,WITHDRAWAL,WITHDRAWAL IN CASH,2700.0,34274.3,,,


#### Loan

In [126]:
# Load the loan table, passing a '%y%m%d' format for the `date` column.
loan = data_loader.load_csv(
    'loan',
    parse_dates={'date': '%y%m%d'},
)

loan.sample(n=5)

Mapped status:
{
    "A": "contract finished, no problems",
    "B": "contract finished, loan not payed",
    "C": "running contract, OK so far",
    "D": "running contract, client in debt"
}


Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status
319,6011,4981,1996-11-28,179328,48,3736.0,"running contract, OK so far"
504,5651,3329,1997-12-08,19248,12,1604.0,"contract finished, no problems"
201,7243,11042,1995-12-04,217152,36,6032.0,"contract finished, no problems"
374,5210,1252,1997-04-01,148140,36,4115.0,"running contract, OK so far"
285,6461,7136,1996-09-19,74124,36,2059.0,"running contract, OK so far"


**Can an account have multiple loans?**

In [127]:
# Fewer distinct account ids than rows would mean some account has several loans.
print(
    'Are there accounts with multiple loans: '
    f'{loan["account_id"].nunique() < len(loan)}'
)

Are there accounts with multiple loans: False


#### Credit Card

In [128]:
# Load the credit card table, passing a '%y%m%d' format for the `issued` column.
card = data_loader.load_csv(
    'card',
    parse_dates={'issued': '%y%m%d'},
)

card.sample(n=5)

Unnamed: 0,card_id,disp_id,type,issued
329,628,4007,classic,1997-08-11
781,74,453,classic,1998-10-18
357,896,7567,classic,1997-09-18
170,1010,9413,classic,1996-10-27
296,108,597,classic,1997-06-25


#### District

In [129]:
# Load the district table and replace the generic A1..A16 headers with
# descriptive column names.
district = data_loader.load_csv('district').rename(
    columns=dict(
        A1='district_id',
        A2='district_name',
        A3='region',
        A4='population',
        A5='n_municipalities_with_inhabitants_lt_499',
        A6='n_municipalities_with_inhabitants_500_to_1999',
        A7='n_municipalities_with_inhabitants_2000_to_9999',
        A8='n_municipalities_with_inhabitants_gt_10000',
        A9='n_cities',
        A10='ratio_urban_inhabitants',
        A11='average_salary',
        A12='unemployment_rate_95',
        A13='unemployment_rate_96',
        A14='enterpreneurs_per_1000_inhabitants',
        A15='n_commited_crimes_95',
        A16='n_commited_crimes_96',
    )
)

district.sample(n=5)

Unnamed: 0,district_id,district_name,region,population,n_municipalities_with_inhabitants_lt_499,n_municipalities_with_inhabitants_500_to_1999,n_municipalities_with_inhabitants_2000_to_9999,n_municipalities_with_inhabitants_gt_10000,n_cities,ratio_urban_inhabitants,average_salary,unemployment_rate_95,unemployment_rate_96,enterpreneurs_per_1000_inhabitants,n_commited_crimes_95,n_commited_crimes_96
18,19,Prachatice,south Bohemia,51428,50,11,3,1,4,52.7,8402,3.13,3.98,120,999,1099
21,22,Domazlice,west Bohemia,58400,65,16,4,1,6,52.4,8620,1.1,1.25,100,1089,1117
0,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107
71,72,Olomouc,north Moravia,226122,32,50,7,4,4,62.6,8994,3.8,4.79,110,9672,9208
3,4,Kladno,central Bohemia,149893,63,29,6,2,6,67.4,9753,4.64,5.05,109,5244,5892


### Data Merging

In [130]:
# `add_prefix_except_id` is imported in the notebook's import cell.
#
# Prefix the columns of each table with its origin so that they stay
# distinguishable after merging.
# NOTE(review): the source frames are overwritten with their prefixed
# versions, so re-running this cell without re-running the loading cells
# presumably applies the prefix a second time - confirm against the helper.
account = add_prefix_except_id(account, 'account_')
client = add_prefix_except_id(client, 'client_')
order_pivot = add_prefix_except_id(order_pivot, 'ordertype_')
loan = add_prefix_except_id(loan, 'loan_')
card = add_prefix_except_id(card, 'card_')

# Start from the dispositions and left-join every other table, so each
# remaining disposition keeps a row even when it has no order, loan or card.
client_df = (
    disp
    .merge(account, on='account_id', how='left')
    .merge(client, on='client_id', how='left')
    .merge(order_pivot, on='account_id', how='left')
    .merge(loan, on='account_id', how='left')
    .merge(card, on='disp_id', how='left')
)

client_df.sample(5)

Unnamed: 0,loan_id,account_id,loan_date,loan_amount,loan_duration,loan_payments,loan_status
0,5314,1787,1993-07-05,96396,12,8033.0,"contract finished, loan not payed"
1,5316,1801,1993-07-11,165960,36,4610.0,"contract finished, no problems"
2,6863,9188,1993-07-28,127080,60,2118.0,"contract finished, no problems"
3,5325,1843,1993-08-03,105804,36,2939.0,"contract finished, no problems"
4,7240,11013,1993-09-06,274740,60,4579.0,"contract finished, no problems"


### Data Cleaning

## Model Construction

### Identifying Existing Customers

### Identifying Existing Non-Customers

### Generating Event-Based Customer Information

## Feature Engineering

### Deriving New Features