# Applied Machine Learning - Mini Challenge: Cross-Selling of Credit Cards
**Author**: Nils Fahrni

In [147]:
import sklearn
import numpy as np
import sys
import pandas as pd
import seaborn as sns

sns.set_palette("pastel")
pd.set_option('display.max_columns', None)

sys.path.append('scripts')
from dataloader import DataLoader

## Data Preprocessing

### Data Loading

In [148]:
data_loader = DataLoader(base_path='data', translations_name='translation_mappings.json')
data_loader.list_datasets()

Unnamed: 0,Dataset,Number of Rows
0,loan,682
1,client,5369
2,district,77
3,trans,1056320
4,account,4500
5,card,892
6,order,6471
7,disp,5369


#### Account

In [149]:
account = data_loader.load_csv('account', parse_dates={'date': '%y%m%d'})
account.sample(5)

Mapped frequency:
{
    "POPLATEK MESICNE": "MONTHLY CHARGES",
    "POPLATEK TYDNE": "WEEKLY CHARGES",
    "POPLATEK PO OBRATU": "TRANSACTION CHARGES"
}


Unnamed: 0,account_id,district_id,frequency,date
1101,441,27,WEEKLY CHARGES,1993-12-17
2497,3455,34,MONTHLY CHARGES,1996-03-13
3975,10679,1,MONTHLY CHARGES,1997-05-30
4333,1968,16,WEEKLY CHARGES,1997-10-14
4403,2658,27,MONTHLY CHARGES,1997-11-14


#### Client

In [150]:
client = data_loader.load_csv('client')

client = client.assign(gender=client['birth_number'].apply(lambda x: 'FEMALE' if int(str(x)[2:4]) > 50 else 'MALE'))

client = client.assign(birth_number=client.apply(lambda x: x['birth_number'] - 5000 if x['gender'] == 'FEMALE' else x['birth_number'], axis=1))

client['birth_number'] = pd.to_datetime(client['birth_number'], format='%y%m%d')
client['birth_date'] = client['birth_number'].apply(lambda x: x - pd.DateOffset(years=100) if x.year > 1999 else x)
client.drop('birth_number', axis=1, inplace=True)

client['age'] = (pd.to_datetime('1999-12-31') - client['birth_date']).dt.days // 365

client.sample(5)

Unnamed: 0,client_id,district_id,gender,birth_date,age
4537,4797,59,FEMALE,1940-09-03,59
1094,1150,74,FEMALE,1945-06-24,54
4009,4240,46,FEMALE,1960-10-21,39
914,964,10,FEMALE,1953-10-21,46
63,67,29,FEMALE,1932-11-11,67


#### Disposition

Removing disponents as the goal is to only advertise to owners. Disponents may be secondary users that have been authorized to use an account. They may be allowed to execute transactions on that account but they are not the authorized owners.

In [151]:
disp = data_loader.load_csv('disp')

disp = disp[disp['type'] != 'DISPONENT']

disp.drop('type', axis=1, inplace=True)

disp.sample(5)

Unnamed: 0,disp_id,client_id,account_id
3387,3575,3575,2961
4848,6729,6729,5572
3631,3837,3837,3173
3520,3717,3717,3075
3888,4107,4107,3397


#### Permanent Order

In [152]:
order = data_loader.load_csv('order')

order.sample(5)

Mapped k_symbol:
{
    "POJISTNE": "INSURANCE PAYMENT",
    "SIPO": "HOUSEHOLD",
    "LEASING": "LEASING",
    "UVER": "LOAN PAYMENT"
}


Unnamed: 0,order_id,account_id,bank_to,account_to,amount,k_symbol
1428,30965,1075,WX,87197690,3868.0,HOUSEHOLD
4241,34094,3164,UV,53459769,2971.0,HOUSEHOLD
5216,35179,3917,YZ,75628006,789.0,INSURANCE PAYMENT
3652,33435,2728,EF,96346175,11222.0,HOUSEHOLD
3914,33721,2919,ST,15447330,7331.0,HOUSEHOLD


**Are there Null Values?**

In [153]:
display(order.isnull().sum())

display(order[order['k_symbol'].isnull()].sample(5))

order_id         0
account_id       0
bank_to          0
account_to       0
amount           0
k_symbol      1379
dtype: int64

Unnamed: 0,order_id,account_id,bank_to,account_to,amount,k_symbol
5935,40977,7815,ST,1738054,268.0,
3839,33646,2865,OP,25167072,1.0,
1748,31334,1319,YZ,35347509,884.0,
900,30373,666,ST,70089435,2282.0,
3705,33491,2764,ST,93326770,2382.0,


In [154]:
order['k_symbol'] = order['k_symbol'].fillna('MISSING')

order_pivot = order.pivot_table(index='account_id', columns='k_symbol', values='amount', aggfunc='count', fill_value=0)

order_pivot.sample(5)

k_symbol,HOUSEHOLD,INSURANCE PAYMENT,LEASING,LOAN PAYMENT,MISSING
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1792,1,0,0,0,0
3808,1,0,0,0,1
2278,1,0,0,0,1
3116,1,0,0,0,0
8320,1,0,0,1,0


#### Transaction

TODO: 
- amount to negative or positive based on if withdrawal or deposit
- Research account number 19 (time series account balance, at least per month)
    - this acc is volatile
    - account's balance goes negative sometimes
- what happens to accounts with multiple transactions on a day?
    - how to obtain the actual end of day balance?
        - add up withdrawals with deposits and add to balance of day before 
        - try to vectorize this problem (R antijoin mentioned)

In [155]:
transaction = data_loader.load_csv('trans', parse_dates={'date': '%y%m%d'})

transaction.sample(5)

Mapped type:
{
    "PRIJEM": "CREDIT",
    "VYDAJ": "WITHDRAWAL"
}
Mapped operation:
{
    "VYBER KARTOU": "CREDIT CARD WITHDRAWAL",
    "VKLAD": "CREDIT IN CASH",
    "PREVOD Z UCTU": "COLLECTION FROM ANOTHER BANK",
    "VYBER": "WITHDRAWAL IN CASH",
    "PREVOD NA UCET": "REMITTANCE TO ANOTHER BANK"
}
Mapped k_symbol:
{
    "POJISTNE": "INSURANCE PAYMENT",
    "SLUZBY": "PAYMENT FOR STATEMENT",
    "UROK": "INTEREST CREDITED",
    "SANKC. UROK": "SANCTION INTEREST IF NEGATIVE BALANCE",
    "SIPO": "HOUSEHOLD",
    "DUCHOD": "OLD-AGE PENSION",
    "UVER": "LOAN PAYMENT"
}


Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
821503,3448601,3018,1998-03-31,CREDIT,,16.6,3154.0,INTEREST CREDITED,,
113663,1339473,4576,1994-12-12,WITHDRAWAL,REMITTANCE TO ANOTHER BANK,8339.0,46781.4,LOAN PAYMENT,UV,20694566.0
559733,742358,2534,1997-05-31,WITHDRAWAL,WITHDRAWAL IN CASH,14.6,26102.6,PAYMENT FOR STATEMENT,,
673675,3581466,1552,1997-10-31,CREDIT,,269.8,62281.2,INTEREST CREDITED,,
371250,3442178,1360,1996-08-31,CREDIT,,140.0,32199.8,INTEREST CREDITED,,


#### Loan

In [156]:
loan = data_loader.load_csv('loan', parse_dates={'date': '%y%m%d'})

loan.sample(5)

Mapped status:
{
    "A": "contract finished, no problems",
    "B": "contract finished, loan not payed",
    "C": "running contract, OK so far",
    "D": "running contract, client in debt"
}


Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status
528,6237,6075,1998-01-06,407952,48,8499.0,"running contract, client in debt"
114,5450,2345,1994-12-11,199680,60,3328.0,"running contract, OK so far"
56,5918,4576,1994-06-04,300204,36,8339.0,"contract finished, no problems"
599,5313,1776,1998-05-27,43200,24,1800.0,"running contract, OK so far"
655,5318,1808,1998-09-30,385584,48,8033.0,"running contract, OK so far"


**Can an account have multiple loans?**

In [157]:
print(f'Are there accounts with multiple loans: {loan["account_id"].nunique() < loan.shape[0]}')

Are there accounts with multiple loans: False


#### Credit Card

In [158]:
card = data_loader.load_csv('card', parse_dates={'issued': '%y%m%d'})

card.sample(5)

Unnamed: 0,card_id,disp_id,type,issued
525,444,2815,gold,1998-04-03
684,519,3297,classic,1998-08-16
359,279,1791,classic,1997-09-21
175,1014,9452,classic,1996-11-02
547,421,2644,classic,1998-04-27


#### District

In [159]:
district = data_loader.load_csv('district')

district = district.rename(columns={
    'A1': 'district_id',
    'A2': 'district_name',
    'A3': 'region',
    'A4': 'population',
    'A5': 'n_municipalities_with_inhabitants_lt_499',
    'A6': 'n_municipalities_with_inhabitants_500_to_1999',
    'A7': 'n_municipalities_with_inhabitants_2000_to_9999',
    'A8': 'n_municipalities_with_inhabitants_gt_10000',
    'A9': 'n_cities',
    'A10': 'ratio_urban_inhabitants',
    'A11': 'average_salary',
    'A12': 'unemployment_rate_95',
    'A13': 'unemployment_rate_96',
    'A14': 'enterpreneurs_per_1000_inhabitants',
    'A15': 'n_commited_crimes_95',
    'A16': 'n_commited_crimes_96'
})

district.sample(5)

Unnamed: 0,district_id,district_name,region,population,n_municipalities_with_inhabitants_lt_499,n_municipalities_with_inhabitants_500_to_1999,n_municipalities_with_inhabitants_2000_to_9999,n_municipalities_with_inhabitants_gt_10000,n_cities,ratio_urban_inhabitants,average_salary,unemployment_rate_95,unemployment_rate_96,enterpreneurs_per_1000_inhabitants,n_commited_crimes_95,n_commited_crimes_96
40,41,Usti nad Labem,north Bohemia,118650,8,8,5,1,3,85.6,9832,3.32,4.48,114,6445,5471
16,17,Pelhrimov,south Bohemia,74062,99,15,4,2,7,61.4,8114,2.38,2.62,119,1003,1181
10,11,Praha - zapad,central Bohemia,75637,35,36,9,0,7,36.5,9622,0.45,0.59,154,3475,3529
20,21,Tabor,south Bohemia,103347,87,16,7,1,7,67.0,9104,1.51,2.07,123,2299,2354
47,48,Rychnov nad Kneznou,east Bohemia,78955,50,24,8,1,9,59.0,9060,1.78,2.44,124,1655,1717


### Data Merging

In [160]:
from utils import add_prefix_except_id

account = add_prefix_except_id(account, 'account_', id_exceptions=['district_id'])
client_df = disp.merge(account, on='account_id', how='left')

client = add_prefix_except_id(client, 'client_', id_exceptions=['district_id'])
client_df = client_df.merge(client, on='client_id', how='left')

order_pivot = add_prefix_except_id(order_pivot, 'ordertype_')
client_df = client_df.merge(order_pivot, on='account_id', how='left')

loan = add_prefix_except_id(loan, 'loan_')
client_df = client_df.merge(loan, on='account_id', how='left')

card = add_prefix_except_id(card, 'card_')
client_df = client_df.merge(card, on='disp_id', how='left')

client_district = add_prefix_except_id(district, 'client_district_')
client_df = client_df.merge(client_district, left_on='client_district_id', right_on='district_id', how='left')

account_district = add_prefix_except_id(district, 'account_district_')
client_df = client_df.merge(client_district, left_on='account_district_id', right_on='district_id', how='left')

client_df.sample(5)

Unnamed: 0,disp_id,client_id,account_id,account_district_id,account_frequency,account_date,client_district_id,client_gender,client_birth_date,client_age,ordertype_HOUSEHOLD,ordertype_INSURANCE PAYMENT,ordertype_LEASING,ordertype_LOAN PAYMENT,ordertype_MISSING,loan_id,loan_date,loan_amount,loan_duration,loan_payments,loan_status,card_id,card_type,card_issued,district_id_x,client_district_district_name_x,client_district_region_x,client_district_population_x,client_district_n_municipalities_with_inhabitants_lt_499_x,client_district_n_municipalities_with_inhabitants_500_to_1999_x,client_district_n_municipalities_with_inhabitants_2000_to_9999_x,client_district_n_municipalities_with_inhabitants_gt_10000_x,client_district_n_cities_x,client_district_ratio_urban_inhabitants_x,client_district_average_salary_x,client_district_unemployment_rate_95_x,client_district_unemployment_rate_96_x,client_district_enterpreneurs_per_1000_inhabitants_x,client_district_n_commited_crimes_95_x,client_district_n_commited_crimes_96_x,district_id_y,client_district_district_name_y,client_district_region_y,client_district_population_y,client_district_n_municipalities_with_inhabitants_lt_499_y,client_district_n_municipalities_with_inhabitants_500_to_1999_y,client_district_n_municipalities_with_inhabitants_2000_to_9999_y,client_district_n_municipalities_with_inhabitants_gt_10000_y,client_district_n_cities_y,client_district_ratio_urban_inhabitants_y,client_district_average_salary_y,client_district_unemployment_rate_95_y,client_district_unemployment_rate_96_y,client_district_enterpreneurs_per_1000_inhabitants_y,client_district_n_commited_crimes_95_y,client_district_n_commited_crimes_96_y
670,846,846,705,35,MONTHLY CHARGES,1997-07-14,35,MALE,1949-12-01,50,1.0,0.0,0.0,0.0,0.0,,NaT,,,,,,,NaT,35,Jablonec n. Nisou,north Bohemia,88768,10,18,5,1,7,80.5,8867,1.02,1.21,130,3384,3620,35,Jablonec n. Nisou,north Bohemia,88768,10,18,5,1,7,80.5,8867,1.02,1.21,130,3384,3620
2962,3767,3767,3115,14,MONTHLY CHARGES,1996-03-27,14,MALE,1936-09-07,63,1.0,0.0,0.0,1.0,0.0,5611.0,1996-11-15,321180.0,60.0,5353.0,"running contract, OK so far",,,NaT,14,Ceske Budejovice,south Bohemia,177686,69,27,10,1,9,74.8,10045,1.42,1.71,135,6604,6295,14,Ceske Budejovice,south Bohemia,177686,69,27,10,1,9,74.8,10045,1.42,1.71,135,6604,6295
1175,1491,1491,1236,19,MONTHLY CHARGES,1993-05-05,19,MALE,1982-04-21,17,,,,,,,NaT,,,,,,,NaT,19,Prachatice,south Bohemia,51428,50,11,3,1,4,52.7,8402,3.13,3.98,120,999,1099,19,Prachatice,south Bohemia,51428,50,11,3,1,4,52.7,8402,3.13,3.98,120,999,1099
2463,3125,3125,2589,10,MONTHLY CHARGES,1996-12-04,10,FEMALE,1922-10-30,77,2.0,0.0,0.0,0.0,0.0,,NaT,,,,,,,NaT,10,Praha - vychod,central Bohemia,92084,55,29,4,3,5,46.7,10124,0.56,0.54,141,3810,4316,10,Praha - vychod,central Bohemia,92084,55,29,4,3,5,46.7,10124,0.56,0.54,141,3810,4316
4261,9901,10209,8268,59,MONTHLY CHARGES,1996-09-19,59,MALE,1975-10-27,24,1.0,0.0,0.0,1.0,1.0,6688.0,1997-09-26,8616.0,24.0,359.0,"running contract, OK so far",,,NaT,59,Kromeriz,south Moravia,108871,41,29,7,2,6,62.1,8444,3.24,3.47,106,2595,2305,59,Kromeriz,south Moravia,108871,41,29,7,2,6,62.1,8444,3.24,3.47,106,2595,2305


In [164]:
assert client_df['account_id'].nunique() == client_df.shape[0]

### Data Cleaning

#### Removing Junior Cards

In [162]:
# define junior_clients as clients with the card_type junior and age <= 20
junior_clients = client_df[client_df['card_type'] == 'junior']

client_df = client_df[~client_df['account_id'].isin(junior_clients['account_id'])]

transaction = transaction[~transaction['account_id'].isin(junior_clients['account_id'])]

print(f'Number of junior clients: {junior_clients.shape[0]}')
print(f'Number of clients remaining: {client_df.shape[0]}')

Number of junior clients: 145
Number of clients remaining: 4355


## Model Construction

### Processing Transactional Data
- The goal is to predict if a non-card-owner will buy a card or not

The first task is to look if every account in the transactions dataframe has a "first transaction". This would make the calculation of the monthly balance much easier since everything can be summed up without having to worry that there were months without records in the transaction dataframe.

In [173]:
# Find the minimum (first) transaction(s) date for each account
min_dates = transaction.groupby('account_id')['date'].min().reset_index()
min_dates.rename(columns={'date': 'min_date'}, inplace=True)

# Merge the minimum date back to the transactions to identify all transactions on the first day
transactions_with_min_date = pd.merge(transaction, min_dates, on='account_id')

# Filter transactions that are on the first day
first_day_transactions = transactions_with_min_date[transactions_with_min_date['date'] == transactions_with_min_date['min_date']]
first_day_transactions = first_day_transactions.copy()

# Now, for each of these first day transactions, check if any have amount equals balance
first_day_transactions['amount_equals_balance'] = first_day_transactions['amount'] == first_day_transactions['balance']

# Group by account_id and check if any transactions for each account meet the condition
accounts_meeting_condition = first_day_transactions.groupby('account_id')['amount_equals_balance'].any().reset_index()

# Verify if all accounts have at least one transaction on the first day meeting the condition
all_accounts_covered = accounts_meeting_condition['amount_equals_balance'].all()

print("Does every account's first day of transactions include at least one transaction where amount equals balance?", all_accounts_covered)

Does every account's first day of transactions include at least one transaction where amount equals balance? True


Now every accounts balance needs to be calculated per month.

In [200]:
# Extract year and month to create a 'month' column
transaction['month'] = transaction['date'].dt.to_period('M')

# Group by account_id and month to summarize transactions
transactions_monthly = transaction.groupby(['account_id', 'month']).agg(
    volume=('amount', 'sum'),
    credit=('amount', lambda x: x[x > 0].sum()),
    withdrawal=('amount', lambda x: x[x < 0].sum()),
    n_transactions=('amount', 'size')
).reset_index()

In [201]:
# Ensure 'month' is a PeriodIndex for proper handling
transactions_monthly['month'] = pd.PeriodIndex(transactions_monthly['month'])

# Create a date range for each account spanning from the minimum to maximum month
date_ranges = transactions_monthly.groupby('account_id')['month'].agg(['min', 'max'])

def reindex_df(group, account_id):
    idx = pd.period_range(start=group['month'].min(), end=group['month'].max(), freq='M')
    group.set_index('month', inplace=True)
    group = group.reindex(idx, fill_value=0)
    # Since 'account_id' is now explicitly passed, we ensure it's set correctly in the reindexed DataFrame
    group['account_id'] = account_id
    return group

transactions_monthly = (transactions_monthly.groupby('account_id')
                        .apply(lambda x: reindex_df(x, x.name))
                        .reset_index(level=0, drop=True)
                        .reset_index())

# Calculate cumulative balance
transactions_monthly['balance'] = transactions_monthly.groupby('account_id')['volume'].cumsum()

transactions_monthly

  .apply(lambda x: reindex_df(x, x.name))


Unnamed: 0,index,account_id,volume,credit,withdrawal,n_transactions,balance
0,1995-03,1,1000.0,1000.0,0.0,1,1000.0
1,1995-04,1,16298.2,16298.2,0.0,3,17298.2
2,1995-05,1,5858.0,5858.0,0.0,3,23156.2
3,1995-06,1,3979.6,3979.6,0.0,3,27135.8
4,1995-07,1,9087.9,9087.9,0.0,3,36223.7
...,...,...,...,...,...,...,...
179049,1998-08,11382,54569.5,54569.5,0.0,7,2386853.6
179050,1998-09,11382,44120.0,44120.0,0.0,5,2430973.6
179051,1998-10,11382,63262.2,63262.2,0.0,6,2494235.8
179052,1998-11,11382,50165.7,50165.7,0.0,5,2544401.5


### Generating Event-Based Customer Information

## Feature Engineering

### Deriving New Features