# Applied Machine Learning - Mini Challenge: Cross-Selling of Credit Cards
**Author**: Nils Fahrni

In [177]:
import sklearn
import numpy as np
import sys
import pandas as pd
import seaborn as sns

sns.set_palette("pastel")
pd.set_option('display.max_columns', None)

sys.path.append('scripts')
from dataloader import DataLoader

## Data Preprocessing

### Data Loading

In [178]:
data_loader = DataLoader(base_path='data', translations_name='translation_mappings.json')
data_loader.list_datasets()

Unnamed: 0,Dataset,Number of Rows
0,loan,682
1,client,5369
2,district,77
3,trans,1056320
4,account,4500
5,card,892
6,order,6471
7,disp,5369


#### Account

In [179]:
account = data_loader.load_csv('account', parse_dates={'date': '%y%m%d'})
account.sample(5)

Mapped frequency:
{
    "POPLATEK MESICNE": "MONTHLY CHARGES",
    "POPLATEK TYDNE": "WEEKLY CHARGES",
    "POPLATEK PO OBRATU": "TRANSACTION CHARGES"
}


Unnamed: 0,account_id,district_id,frequency,date
2048,1704,70,MONTHLY CHARGES,1995-09-22
3598,1390,69,MONTHLY CHARGES,1996-12-30
3980,3700,1,MONTHLY CHARGES,1997-06-03
3542,4794,72,MONTHLY CHARGES,1996-12-18
1168,6472,61,MONTHLY CHARGES,1994-02-08


#### Client

In [180]:
client = data_loader.load_csv('client')

client = client.assign(gender=client['birth_number'].apply(lambda x: 'FEMALE' if int(str(x)[2:4]) > 50 else 'MALE'))

client = client.assign(birth_number=client.apply(lambda x: x['birth_number'] - 5000 if x['gender'] == 'FEMALE' else x['birth_number'], axis=1))

client['birth_number'] = pd.to_datetime(client['birth_number'], format='%y%m%d')
client['birth_date'] = client['birth_number'].apply(lambda x: x - pd.DateOffset(years=100) if x.year > 1999 else x)
client.drop('birth_number', axis=1, inplace=True)

client['age'] = (pd.to_datetime('1999-12-31') - client['birth_date']).dt.days // 365

client.sample(5)

Unnamed: 0,client_id,district_id,gender,birth_date,age
4670,5246,70,MALE,1982-05-15,17
4160,4401,8,MALE,1968-07-20,31
1770,1878,47,MALE,1963-09-02,36
2120,2238,47,FEMALE,1954-02-19,45
3706,3916,47,FEMALE,1970-08-27,29


#### Disposition

Removing disponents as the goal is to only advertise to owners. Disponents may be secondary users that have been authorized to use an account. They may be allowed to execute transactions on that account but they are not the authorized owners.

In [181]:
disp = data_loader.load_csv('disp')

disp = disp[disp['type'] != 'DISPONENT']

disp.drop('type', axis=1, inplace=True)

disp.sample(5)

Unnamed: 0,disp_id,client_id,account_id
2441,2573,2573,2127
602,632,632,526
5204,11447,11755,9549
3822,4036,4036,3338
2008,2124,2124,1752


#### Permanent Order

In [182]:
order = data_loader.load_csv('order')

order.sample(5)

Mapped k_symbol:
{
    "POJISTNE": "INSURANCE PAYMENT",
    "SIPO": "HOUSEHOLD",
    "LEASING": "LEASING",
    "UVER": "LOAN PAYMENT"
}


Unnamed: 0,order_id,account_id,bank_to,account_to,amount,k_symbol
3917,33724,2923,GH,9396666,2786.0,HOUSEHOLD
3223,32973,2424,EF,92266177,14138.0,HOUSEHOLD
1236,30751,921,QR,13826614,1240.0,HOUSEHOLD
5875,40429,7465,ST,28778660,1060.6,LEASING
4973,34918,3746,KL,7049614,556.0,


**Are there Null Values?**

In [183]:
display(order.isnull().sum())

display(order[order['k_symbol'].isnull()].sample(5))

order_id         0
account_id       0
bank_to          0
account_to       0
amount           0
k_symbol      1379
dtype: int64

Unnamed: 0,order_id,account_id,bank_to,account_to,amount,k_symbol
1324,30846,992,QR,40572267,2450.0,
5050,34998,3795,CD,41867381,6679.0,
2807,32484,2103,EF,60364776,2211.0,
2505,32166,1891,YZ,43767422,172.0,
5391,35670,4259,AB,40486842,2255.0,


In [184]:
order['k_symbol'] = order['k_symbol'].fillna('MISSING')

order_pivot = order.pivot_table(index='account_id', columns='k_symbol', values='amount', aggfunc='count', fill_value=0)

order_pivot.sample(5)

k_symbol,HOUSEHOLD,INSURANCE PAYMENT,LEASING,LOAN PAYMENT,MISSING
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
806,1,0,0,0,0
3219,1,0,0,0,0
1868,1,0,0,0,0
1217,1,0,0,0,1
3789,1,0,0,0,0


#### Transaction

TODO: 
- amount to negative or positive based on if withdrawal or deposit
- Research account number 19 (time series account balance, at least per month)
    - this acc is volatile
    - account's balance goes negative sometimes
- what happens to accounts with multiple transactions on a day?
    - how to obtain the actual end of day balance?
        - add up withdrawals with deposits and add to balance of day before 
        - try to vectorize this problem (R antijoin mentioned)

In [185]:
transaction = data_loader.load_csv('trans', parse_dates={'date': '%y%m%d'})

transaction.sample(5)

Mapped type:
{
    "PRIJEM": "CREDIT",
    "VYDAJ": "WITHDRAWAL"
}
Mapped operation:
{
    "VYBER KARTOU": "CREDIT CARD WITHDRAWAL",
    "VKLAD": "CREDIT IN CASH",
    "PREVOD Z UCTU": "COLLECTION FROM ANOTHER BANK",
    "VYBER": "WITHDRAWAL IN CASH",
    "PREVOD NA UCET": "REMITTANCE TO ANOTHER BANK"
}
Mapped k_symbol:
{
    "POJISTNE": "INSURANCE PAYMENT",
    "SLUZBY": "PAYMENT FOR STATEMENT",
    "UROK": "INTEREST CREDITED",
    "SANKC. UROK": "SANCTION INTEREST IF NEGATIVE BALANCE",
    "SIPO": "HOUSEHOLD",
    "DUCHOD": "OLD-AGE PENSION",
    "UVER": "LOAN PAYMENT"
}


Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
274612,2586344,8547,1996-02-03,CREDIT,CREDIT IN CASH,5413.0,55368.2,,,
465761,493155,1682,1997-01-14,CREDIT,CREDIT IN CASH,1700.0,28998.5,,,
144931,121448,408,1995-03-06,CREDIT,CREDIT IN CASH,13304.0,39171.1,,,
377635,57478,192,1996-09-06,CREDIT,COLLECTION FROM ANOTHER BANK,5859.0,28931.0,OLD-AGE PENSION,GH,15384018.0
664593,850368,2896,1997-10-10,WITHDRAWAL,REMITTANCE TO ANOTHER BANK,295.0,23308.2,,WX,62635773.0


#### Loan

In [186]:
loan = data_loader.load_csv('loan', parse_dates={'date': '%y%m%d'})

loan.sample(5)

Mapped status:
{
    "A": "contract finished, no problems",
    "B": "contract finished, loan not payed",
    "C": "running contract, OK so far",
    "D": "running contract, client in debt"
}


Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status
433,5377,2116,1997-08-04,92400,12,7700.0,"contract finished, no problems"
551,5889,4456,1998-02-09,321240,60,5354.0,"running contract, OK so far"
644,6078,5283,1998-09-09,170832,48,3559.0,"running contract, OK so far"
61,6736,8558,1994-06-14,288360,60,4806.0,"running contract, OK so far"
590,7172,10652,1998-05-07,45720,12,3810.0,"running contract, OK so far"


**Can an account have multiple loans?**

In [187]:
print(f'Are there accounts with multiple loans: {loan["account_id"].nunique() < loan.shape[0]}')

Are there accounts with multiple loans: False


#### Credit Card

In [188]:
card = data_loader.load_csv('card', parse_dates={'issued': '%y%m%d'})

card.sample(5)

Unnamed: 0,card_id,disp_id,type,issued
257,1173,12442,classic,1997-05-08
11,181,1066,classic,1994-08-19
776,957,8506,junior,1998-10-16
513,34,225,classic,1998-03-20
803,333,2128,classic,1998-11-04


#### District

In [189]:
district = data_loader.load_csv('district')

district = district.rename(columns={
    'A1': 'district_id',
    'A2': 'district_name',
    'A3': 'region',
    'A4': 'population',
    'A5': 'n_municipalities_with_inhabitants_lt_499',
    'A6': 'n_municipalities_with_inhabitants_500_to_1999',
    'A7': 'n_municipalities_with_inhabitants_2000_to_9999',
    'A8': 'n_municipalities_with_inhabitants_gt_10000',
    'A9': 'n_cities',
    'A10': 'ratio_urban_inhabitants',
    'A11': 'average_salary',
    'A12': 'unemployment_rate_95',
    'A13': 'unemployment_rate_96',
    'A14': 'enterpreneurs_per_1000_inhabitants',
    'A15': 'n_commited_crimes_95',
    'A16': 'n_commited_crimes_96'
})

district.sample(5)

Unnamed: 0,district_id,district_name,region,population,n_municipalities_with_inhabitants_lt_499,n_municipalities_with_inhabitants_500_to_1999,n_municipalities_with_inhabitants_2000_to_9999,n_municipalities_with_inhabitants_gt_10000,n_cities,ratio_urban_inhabitants,average_salary,unemployment_rate_95,unemployment_rate_96,enterpreneurs_per_1000_inhabitants,n_commited_crimes_95,n_commited_crimes_96
30,31,Tachov,west Bohemia,51313,34,12,3,1,4,59.2,8930,3.52,4.2,108,1328,1452
76,77,Vsetin,north Moravia,148545,8,35,12,3,4,53.5,8909,4.01,5.56,113,3460,3590
10,11,Praha - zapad,central Bohemia,75637,35,36,9,0,7,36.5,9622,0.45,0.59,154,3475,3529
23,24,Karlovy Vary,west Bohemia,122603,25,21,6,2,8,80.0,8991,1.39,2.01,128,5198,5273
7,8,Mlada Boleslav,central Bohemia,112065,95,19,7,1,8,69.4,11277,1.25,1.44,127,5179,4987


### Data Merging

In [190]:
from utils import add_prefix_except_id

account = add_prefix_except_id(account, 'account_', id_exceptions=['district_id'])
client_df = disp.merge(account, on='account_id', how='left')

client = add_prefix_except_id(client, 'client_', id_exceptions=['district_id'])
client_df = client_df.merge(client, on='client_id', how='left')

order_pivot = add_prefix_except_id(order_pivot, 'ordertype_')
client_df = client_df.merge(order_pivot, on='account_id', how='left')

loan = add_prefix_except_id(loan, 'loan_')
client_df = client_df.merge(loan, on='account_id', how='left')

card = add_prefix_except_id(card, 'card_')
client_df = client_df.merge(card, on='disp_id', how='left')

client_district = add_prefix_except_id(district, 'client_district_')
client_df = client_df.merge(client_district, left_on='client_district_id', right_on='district_id', how='left')

account_district = add_prefix_except_id(district, 'account_district_')
client_df = client_df.merge(client_district, left_on='account_district_id', right_on='district_id', how='left')

client_df.sample(5)

Unnamed: 0,disp_id,client_id,account_id,account_district_id,account_frequency,account_date,client_district_id,client_gender,client_birth_date,client_age,ordertype_HOUSEHOLD,ordertype_INSURANCE PAYMENT,ordertype_LEASING,ordertype_LOAN PAYMENT,ordertype_MISSING,loan_id,loan_date,loan_amount,loan_duration,loan_payments,loan_status,card_id,card_type,card_issued,district_id_x,client_district_district_name_x,client_district_region_x,client_district_population_x,client_district_n_municipalities_with_inhabitants_lt_499_x,client_district_n_municipalities_with_inhabitants_500_to_1999_x,client_district_n_municipalities_with_inhabitants_2000_to_9999_x,client_district_n_municipalities_with_inhabitants_gt_10000_x,client_district_n_cities_x,client_district_ratio_urban_inhabitants_x,client_district_average_salary_x,client_district_unemployment_rate_95_x,client_district_unemployment_rate_96_x,client_district_enterpreneurs_per_1000_inhabitants_x,client_district_n_commited_crimes_95_x,client_district_n_commited_crimes_96_x,district_id_y,client_district_district_name_y,client_district_region_y,client_district_population_y,client_district_n_municipalities_with_inhabitants_lt_499_y,client_district_n_municipalities_with_inhabitants_500_to_1999_y,client_district_n_municipalities_with_inhabitants_2000_to_9999_y,client_district_n_municipalities_with_inhabitants_gt_10000_y,client_district_n_cities_y,client_district_ratio_urban_inhabitants_y,client_district_average_salary_y,client_district_unemployment_rate_95_y,client_district_unemployment_rate_96_y,client_district_enterpreneurs_per_1000_inhabitants_y,client_district_n_commited_crimes_95_y,client_district_n_commited_crimes_96_y
4289,10438,10746,8713,53,MONTHLY CHARGES,1997-05-18,53,MALE,1971-08-04,28,0.0,0.0,0.0,1.0,0.0,6755.0,1998-07-24,170700.0,60.0,2845.0,"running contract, OK so far",,,NaT,53,Blansko,south Moravia,107911,88,33,6,2,7,50.9,8240,2.53,3.56,99,1850,1903,53,Blansko,south Moravia,107911,88,33,6,2,7,50.9,8240,2.53,3.56,99,1850,1903
2400,3047,3047,2525,45,MONTHLY CHARGES,1995-05-08,45,MALE,1975-10-01,24,1.0,0.0,0.0,0.0,1.0,,NaT,,,,,,,NaT,45,Jicin,east Bohemia,77917,85,19,6,1,5,53.5,8390,2.28,2.89,132,2080,2122,45,Jicin,east Bohemia,77917,85,19,6,1,5,53.5,8390,2.28,2.89,132,2080,2122
1452,1853,1853,1529,1,MONTHLY CHARGES,1996-04-28,1,MALE,1965-08-18,34,1.0,1.0,0.0,0.0,1.0,,NaT,,,,,,,NaT,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107
3841,4972,4972,4130,70,MONTHLY CHARGES,1997-06-01,70,FEMALE,1977-07-13,22,,,,,,,NaT,,,,,,,NaT,70,Karvina,north Moravia,285387,0,2,8,5,7,89.9,10177,6.63,7.75,81,9878,10108,70,Karvina,north Moravia,285387,0,2,8,5,7,89.9,10177,6.63,7.75,81,9878,10108
2884,3661,3661,3031,51,MONTHLY CHARGES,1997-03-10,51,MALE,1932-11-25,67,1.0,0.0,0.0,0.0,0.0,,NaT,,,,,,,NaT,51,Trutnov,east Bohemia,121947,37,28,7,3,11,70.5,8541,2.51,2.97,131,3496,3839,51,Trutnov,east Bohemia,121947,37,28,7,3,11,70.5,8541,2.51,2.97,131,3496,3839


In [191]:
assert client_df['account_id'].nunique() == client_df.shape[0]

### Data Cleaning

#### Removing Junior Cards

In [192]:
# define junior_clients as clients with the card_type junior and age <= 20
junior_clients = client_df[client_df['card_type'] == 'junior']

client_df = client_df[~client_df['account_id'].isin(junior_clients['account_id'])]

transaction = transaction[~transaction['account_id'].isin(junior_clients['account_id'])]

print(f'Number of junior clients: {junior_clients.shape[0]}')
print(f'Number of clients remaining: {client_df.shape[0]}')

Number of junior clients: 145
Number of clients remaining: 4355


## Model Construction

### Processing Transactional Data
- The goal is to predict if a non-card-owner will buy a card or not

The first task is to look if every account in the transactions dataframe has a "first transaction". This would make the calculation of the monthly balance much easier since everything can be summed up without having to worry that there were months without records in the transaction dataframe.

In [193]:
# Find the minimum (first) transaction(s) date for each account
min_dates = transaction.groupby('account_id')['date'].min().reset_index()
min_dates.rename(columns={'date': 'min_date'}, inplace=True)

# Merge the minimum date back to the transactions to identify all transactions on the first day
transactions_with_min_date = pd.merge(transaction, min_dates, on='account_id')

# Filter transactions that are on the first day
first_day_transactions = transactions_with_min_date[transactions_with_min_date['date'] == transactions_with_min_date['min_date']]
first_day_transactions = first_day_transactions.copy()

# Now, for each of these first day transactions, check if any have amount equals balance
first_day_transactions['amount_equals_balance'] = first_day_transactions['amount'] == first_day_transactions['balance']

# Group by account_id and check if any transactions for each account meet the condition
accounts_meeting_condition = first_day_transactions.groupby('account_id')['amount_equals_balance'].any().reset_index()

# Verify if all accounts have at least one transaction on the first day meeting the condition
all_accounts_covered = accounts_meeting_condition['amount_equals_balance'].all()

print("Does every account's first day of transactions include at least one transaction where amount equals balance?", all_accounts_covered)

Does every account's first day of transactions include at least one transaction where amount equals balance? True


Now every accounts balance needs to be calculated per month.

In [194]:
# Extract year and month to create a 'month' column
transaction['month'] = transaction['date'].dt.to_period('M')

# Group by account_id and month to summarize transactions
transactions_monthly = transaction.groupby(['account_id', 'month']).agg(
    volume=('amount', 'sum'),
    credit=('amount', lambda x: x[x > 0].sum()),
    withdrawal=('amount', lambda x: x[x < 0].sum()),
    n_transactions=('amount', 'size')
).reset_index()

In [195]:
# Ensure 'month' is a PeriodIndex for proper handling
transactions_monthly['month'] = pd.PeriodIndex(transactions_monthly['month'])

# Create a date range for each account spanning from the minimum to maximum month
date_ranges = transactions_monthly.groupby('account_id')['month'].agg(['min', 'max'])

def reindex_df(group, account_id):
    idx = pd.period_range(start=group['month'].min(), end=group['month'].max(), freq='M')
    group.set_index('month', inplace=True)
    group = group.reindex(idx, fill_value=0)
    # Convert the index (which is a PeriodIndex) back into a 'month' column
    group.reset_index(inplace=True)
    group.rename(columns={'index': 'month'}, inplace=True)
    # Since 'account_id' is now explicitly passed, ensure it's set correctly in the reindexed DataFrame
    group['account_id'] = account_id
    return group

transactions_monthly = (transactions_monthly.groupby('account_id')
                        .apply(lambda x: reindex_df(x, x.name))
                        .reset_index(level=0, drop=True))

# After reindexing within each group, the 'month' column is already preserved.
# If you need to reset the index of the whole DataFrame for other reasons but keep 'month' as a column, ensure it's not set as index again.

# Calculate cumulative balance
transactions_monthly['balance'] = transactions_monthly.groupby('account_id')['volume'].cumsum()

  .apply(lambda x: reindex_df(x, x.name))


In [196]:
transactions_monthly

Unnamed: 0,month,account_id,volume,credit,withdrawal,n_transactions,balance
0,1995-03,1,1000.0,1000.0,0.0,1,1000.0
1,1995-04,1,16298.2,16298.2,0.0,3,17298.2
2,1995-05,1,5858.0,5858.0,0.0,3,23156.2
3,1995-06,1,3979.6,3979.6,0.0,3,27135.8
4,1995-07,1,9087.9,9087.9,0.0,3,36223.7
...,...,...,...,...,...,...,...
36,1998-08,11382,54569.5,54569.5,0.0,7,2386853.6
37,1998-09,11382,44120.0,44120.0,0.0,5,2430973.6
38,1998-10,11382,63262.2,63262.2,0.0,6,2494235.8
39,1998-11,11382,50165.7,50165.7,0.0,5,2544401.5


### Defining Roll-Up Windows of Transactions

In [197]:
card_issued = card.groupby('disp_id')['card_issued'].min().reset_index()
card_issued.head(5)

Unnamed: 0,disp_id,card_issued
0,9,1998-10-16
1,19,1998-03-13
2,41,1995-09-03
3,42,1998-11-26
4,51,1995-04-24


In [198]:
transactions_monthly = transactions_monthly.merge(card_issued, left_on='account_id', right_on='disp_id', how='left')

transactions_monthly.head()

Unnamed: 0,month,account_id,volume,credit,withdrawal,n_transactions,balance,disp_id,card_issued
0,1995-03,1,1000.0,1000.0,0.0,1,1000.0,,NaT
1,1995-04,1,16298.2,16298.2,0.0,3,17298.2,,NaT
2,1995-05,1,5858.0,5858.0,0.0,3,23156.2,,NaT
3,1995-06,1,3979.6,3979.6,0.0,3,27135.8,,NaT
4,1995-07,1,9087.9,9087.9,0.0,3,36223.7,,NaT


To see if the cards got joined correctly we can check if all `disp_id` in `card_issued` have a non-null value in the `card_issued` variable inside the `transactions_monthly` dataframe:

In [199]:
transactions_monthly[transactions_monthly['disp_id']
                     .isin(card_issued['disp_id'])]['account_id']\
                     .isin(card_issued['disp_id'])\
                     .all()

True

In [200]:
transactions_monthly

Unnamed: 0,month,account_id,volume,credit,withdrawal,n_transactions,balance,disp_id,card_issued
0,1995-03,1,1000.0,1000.0,0.0,1,1000.0,,NaT
1,1995-04,1,16298.2,16298.2,0.0,3,17298.2,,NaT
2,1995-05,1,5858.0,5858.0,0.0,3,23156.2,,NaT
3,1995-06,1,3979.6,3979.6,0.0,3,27135.8,,NaT
4,1995-07,1,9087.9,9087.9,0.0,3,36223.7,,NaT
...,...,...,...,...,...,...,...,...,...
179049,1998-08,11382,54569.5,54569.5,0.0,7,2386853.6,,NaT
179050,1998-09,11382,44120.0,44120.0,0.0,5,2430973.6,,NaT
179051,1998-10,11382,63262.2,63262.2,0.0,6,2494235.8,,NaT
179052,1998-11,11382,50165.7,50165.7,0.0,5,2544401.5,,NaT


In [201]:
# Ensure 'card_issued' is a datetime column
transactions_monthly['card_issued'] = pd.to_datetime(transactions_monthly['card_issued'])

# Find the date of the last transaction in the dataset
last_transaction_date = transactions_monthly['card_issued'].max()

# Convert 'card_issued' to period for comparison, use last transaction date where 'card_issued' is NaT
transactions_monthly['card_issued_period'] = transactions_monthly['card_issued'].fillna(last_transaction_date).dt.to_period('M')

# Calculate 'month_diff'. For transactions without a 'card_issued' date, 'card_issued_period' will now use the last transaction date
transactions_monthly['month_diff'] = transactions_monthly.apply(lambda row: (row['card_issued_period'] - row['month']).n, axis=1)

# Only keep the rows where 'month_diff' is between 1 and 13
filtered_transactions = transactions_monthly[transactions_monthly['month_diff'].between(1, 13)]

In [207]:
filtered_transactions.sort_values(by=['account_id', 'month_diff'])

Unnamed: 0,month,account_id,volume,credit,withdrawal,n_transactions,balance,disp_id,card_issued,card_issued_period,month_diff
44,1998-11,1,6722.7,6722.7,0.0,5,364490.9,,NaT,1998-12,1
43,1998-10,1,6208.7,6208.7,0.0,4,357768.2,,NaT,1998-12,2
42,1998-09,1,12704.3,12704.3,0.0,5,351559.5,,NaT,1998-12,3
41,1998-08,1,6492.7,6492.7,0.0,5,338855.2,,NaT,1998-12,4
40,1998-07,1,6221.0,6221.0,0.0,4,332362.5,,NaT,1998-12,5
...,...,...,...,...,...,...,...,...,...,...,...
179044,1998-03,11382,42556.1,42556.1,0.0,5,2021453.3,,NaT,1998-12,9
179043,1998-02,11382,49202.4,49202.4,0.0,6,1978897.2,,NaT,1998-12,10
179042,1998-01,11382,103059.9,103059.9,0.0,13,1929694.8,,NaT,1998-12,11
179041,1997-12,11382,75706.9,75706.9,0.0,6,1826634.9,,NaT,1998-12,12


In [208]:
account_summary = pd.DataFrame(filtered_transactions['account_id'].unique(), columns=['account_id'])

# Variables to pivot
variables_to_pivot = ['volume', 'balance', 'withdrawal', 'credit', 'n_transactions']

for variable in variables_to_pivot:
    # Group by 'account_id' and 'month_diff', then summarize the variable
    grouped = filtered_transactions.groupby(['account_id', 'month_diff'])[variable].sum().reset_index()
    
    # Pivot this data to have columns for each 'month_diff'
    pivot = grouped.pivot(index='account_id', columns='month_diff', values=variable).reset_index()
    
    # Rename columns to reflect they are for the current variable and month_diff
    pivot.columns = ['account_id'] + [f'{variable}_month_diff_{int(col)}' if col != 'account_id' else 'account_id' for col in pivot.columns[1:]]
    
    # Merge this pivoted data back into the account summary DataFrame
    account_summary = pd.merge(account_summary, pivot, on='account_id', how='left')

In [209]:
account_summary.head()

Unnamed: 0,account_id,volume_month_diff_1,volume_month_diff_2,volume_month_diff_3,volume_month_diff_4,volume_month_diff_5,volume_month_diff_6,volume_month_diff_7,volume_month_diff_8,volume_month_diff_9,volume_month_diff_10,volume_month_diff_11,volume_month_diff_12,volume_month_diff_13,balance_month_diff_1,balance_month_diff_2,balance_month_diff_3,balance_month_diff_4,balance_month_diff_5,balance_month_diff_6,balance_month_diff_7,balance_month_diff_8,balance_month_diff_9,balance_month_diff_10,balance_month_diff_11,balance_month_diff_12,balance_month_diff_13,withdrawal_month_diff_1,withdrawal_month_diff_2,withdrawal_month_diff_3,withdrawal_month_diff_4,withdrawal_month_diff_5,withdrawal_month_diff_6,withdrawal_month_diff_7,withdrawal_month_diff_8,withdrawal_month_diff_9,withdrawal_month_diff_10,withdrawal_month_diff_11,withdrawal_month_diff_12,withdrawal_month_diff_13,credit_month_diff_1,credit_month_diff_2,credit_month_diff_3,credit_month_diff_4,credit_month_diff_5,credit_month_diff_6,credit_month_diff_7,credit_month_diff_8,credit_month_diff_9,credit_month_diff_10,credit_month_diff_11,credit_month_diff_12,credit_month_diff_13,n_transactions_month_diff_1,n_transactions_month_diff_2,n_transactions_month_diff_3,n_transactions_month_diff_4,n_transactions_month_diff_5,n_transactions_month_diff_6,n_transactions_month_diff_7,n_transactions_month_diff_8,n_transactions_month_diff_9,n_transactions_month_diff_10,n_transactions_month_diff_11,n_transactions_month_diff_12,n_transactions_month_diff_13
0,1,6722.7,6208.7,12704.3,6492.7,6221.0,6667.1,6212.3,7435.5,7018.6,6701.9,9091.5,10907.2,7318.0,364490.9,357768.2,351559.5,338855.2,332362.5,326141.5,319474.4,313262.1,305826.6,298808.0,292106.1,283014.6,272107.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6722.7,6208.7,12704.3,6492.7,6221.0,6667.1,6212.3,7435.5,7018.6,6701.9,9091.5,10907.2,7318.0,5.0,4.0,5.0,5.0,4.0,5.0,4.0,6.0,5.0,5.0,10.0,5.0,5.0
1,2,50299.4,38810.8,39711.1,48309.0,39133.5,49335.2,44570.5,32862.3,52580.1,31656.0,65467.3,59041.9,40711.2,3096502.6,3046203.2,3007392.4,2967681.3,2919372.3,2880238.8,2830903.6,2786333.1,2753470.8,2700890.7,2669234.7,2603767.4,2544725.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50299.4,38810.8,39711.1,48309.0,39133.5,49335.2,44570.5,32862.3,52580.1,31656.0,65467.3,59041.9,40711.2,10.0,6.0,6.0,8.0,6.0,6.0,7.0,6.0,7.0,5.0,11.0,7.0,6.0
2,3,19295.1,14183.9,14564.7,13354.1,13640.8,20775.2,13419.0,16739.3,18205.4,12774.5,29516.3,26041.5,15041.1,275998.2,256703.1,242519.2,227954.5,214600.4,200959.6,180184.4,166765.4,150026.1,131820.7,119046.2,89529.9,63488.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19295.1,14183.9,14564.7,13354.1,13640.8,20775.2,13419.0,16739.3,18205.4,12774.5,29516.3,26041.5,15041.1,8.0,7.0,7.0,8.0,7.0,7.0,7.0,8.0,7.0,7.0,13.0,9.0,4.0
3,4,9060.2,9050.7,9041.2,9031.8,10868.1,10301.1,9016.6,9808.5,9001.2,8991.6,17894.6,13802.6,9659.7,341319.7,332259.5,323208.8,314167.6,305135.8,294267.7,283966.6,274950.0,265141.5,256140.3,247148.7,229254.1,215451.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9060.2,9050.7,9041.2,9031.8,10868.1,10301.1,9016.6,9808.5,9001.2,8991.6,17894.6,13802.6,9659.7,5.0,5.0,5.0,5.0,6.0,7.0,5.0,6.0,5.0,5.0,10.0,8.0,6.0
4,5,7919.1,7809.2,9142.1,12909.6,11918.2,8614.9,10213.3,10012.6,8607.1,7797.2,15720.3,10825.1,7812.5,155004.3,147085.2,139276.0,130133.9,117224.3,105306.1,96691.2,86477.9,76465.3,67858.2,60061.0,44340.7,33515.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7919.1,7809.2,9142.1,12909.6,11918.2,8614.9,10213.3,10012.6,8607.1,7797.2,15720.3,10825.1,7812.5,5.0,4.0,6.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,10.0,6.0,4.0


### Bringing the data together

In [210]:
client_df = client_df.merge(account_summary, on='account_id', how='left')

In [211]:
client_df

Unnamed: 0,disp_id,client_id,account_id,account_district_id,account_frequency,account_date,client_district_id,client_gender,client_birth_date,client_age,ordertype_HOUSEHOLD,ordertype_INSURANCE PAYMENT,ordertype_LEASING,ordertype_LOAN PAYMENT,ordertype_MISSING,loan_id,loan_date,loan_amount,loan_duration,loan_payments,loan_status,card_id,card_type,card_issued,district_id_x,client_district_district_name_x,client_district_region_x,client_district_population_x,client_district_n_municipalities_with_inhabitants_lt_499_x,client_district_n_municipalities_with_inhabitants_500_to_1999_x,client_district_n_municipalities_with_inhabitants_2000_to_9999_x,client_district_n_municipalities_with_inhabitants_gt_10000_x,client_district_n_cities_x,client_district_ratio_urban_inhabitants_x,client_district_average_salary_x,client_district_unemployment_rate_95_x,client_district_unemployment_rate_96_x,client_district_enterpreneurs_per_1000_inhabitants_x,client_district_n_commited_crimes_95_x,client_district_n_commited_crimes_96_x,district_id_y,client_district_district_name_y,client_district_region_y,client_district_population_y,client_district_n_municipalities_with_inhabitants_lt_499_y,client_district_n_municipalities_with_inhabitants_500_to_1999_y,client_district_n_municipalities_with_inhabitants_2000_to_9999_y,client_district_n_municipalities_with_inhabitants_gt_10000_y,client_district_n_cities_y,client_district_ratio_urban_inhabitants_y,client_district_average_salary_y,client_district_unemployment_rate_95_y,client_district_unemployment_rate_96_y,client_district_enterpreneurs_per_1000_inhabitants_y,client_district_n_commited_crimes_95_y,client_district_n_commited_crimes_96_y,volume_month_diff_1,volume_month_diff_2,volume_month_diff_3,volume_month_diff_4,volume_month_diff_5,volume_month_diff_6,volume_month_diff_7,volume_month_diff_8,volume_month_diff_9,volume_month_diff_10,volume_month_diff_11,volume_month_diff_12,volume_month_diff_13,balance_month_diff_1,balance_month_diff_2,balance_month_diff_3,balance_month_diff_4,balance_month_diff_5,balance_month_diff_6,balance_month_diff_7,balance_month_diff_8,balance_month_diff_9,balance_month_diff_10,balance_month_diff_11,balance_month_diff_12,balance_month_diff_13,withdrawal_month_diff_1,withdrawal_month_diff_2,withdrawal_month_diff_3,withdrawal_month_diff_4,withdrawal_month_diff_5,withdrawal_month_diff_6,withdrawal_month_diff_7,withdrawal_month_diff_8,withdrawal_month_diff_9,withdrawal_month_diff_10,withdrawal_month_diff_11,withdrawal_month_diff_12,withdrawal_month_diff_13,credit_month_diff_1,credit_month_diff_2,credit_month_diff_3,credit_month_diff_4,credit_month_diff_5,credit_month_diff_6,credit_month_diff_7,credit_month_diff_8,credit_month_diff_9,credit_month_diff_10,credit_month_diff_11,credit_month_diff_12,credit_month_diff_13,n_transactions_month_diff_1,n_transactions_month_diff_2,n_transactions_month_diff_3,n_transactions_month_diff_4,n_transactions_month_diff_5,n_transactions_month_diff_6,n_transactions_month_diff_7,n_transactions_month_diff_8,n_transactions_month_diff_9,n_transactions_month_diff_10,n_transactions_month_diff_11,n_transactions_month_diff_12,n_transactions_month_diff_13
0,1,1,1,18,MONTHLY CHARGES,1995-03-24,18,FEMALE,1970-12-13,29,1.0,0.0,0.0,0.0,0.0,,NaT,,,,,,,NaT,18,Pisek,south Bohemia,70699,60,13,2,1,4,65.3,8968,2.83,3.35,131,1740,1910,18,Pisek,south Bohemia,70699,60,13,2,1,4,65.3,8968,2.83,3.35,131,1740,1910,6722.7,6208.7,12704.3,6492.7,6221.0,6667.1,6212.3,7435.5,7018.6,6701.9,9091.5,10907.2,7318.0,364490.9,357768.2,351559.5,338855.2,332362.5,326141.5,319474.4,313262.1,305826.6,298808.0,292106.1,283014.6,272107.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6722.7,6208.7,12704.3,6492.7,6221.0,6667.1,6212.3,7435.5,7018.6,6701.9,9091.5,10907.2,7318.0,5.0,4.0,5.0,5.0,4.0,5.0,4.0,6.0,5.0,5.0,10.0,5.0,5.0
1,2,2,2,1,MONTHLY CHARGES,1993-02-26,1,MALE,1945-02-04,54,1.0,0.0,0.0,1.0,0.0,4959.0,1994-01-05,80952.0,24.0,3373.0,"contract finished, no problems",,,NaT,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107,50299.4,38810.8,39711.1,48309.0,39133.5,49335.2,44570.5,32862.3,52580.1,31656.0,65467.3,59041.9,40711.2,3096502.6,3046203.2,3007392.4,2967681.3,2919372.3,2880238.8,2830903.6,2786333.1,2753470.8,2700890.7,2669234.7,2603767.4,2544725.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50299.4,38810.8,39711.1,48309.0,39133.5,49335.2,44570.5,32862.3,52580.1,31656.0,65467.3,59041.9,40711.2,10.0,6.0,6.0,8.0,6.0,6.0,7.0,6.0,7.0,5.0,11.0,7.0,6.0
2,4,4,3,5,MONTHLY CHARGES,1997-07-07,5,MALE,1956-12-01,43,1.0,1.0,0.0,0.0,1.0,,NaT,,,,,,,NaT,5,Kolin,central Bohemia,95616,65,30,4,1,6,51.4,9307,3.85,4.43,118,2616,3040,5,Kolin,central Bohemia,95616,65,30,4,1,6,51.4,9307,3.85,4.43,118,2616,3040,19295.1,14183.9,14564.7,13354.1,13640.8,20775.2,13419.0,16739.3,18205.4,12774.5,29516.3,26041.5,15041.1,275998.2,256703.1,242519.2,227954.5,214600.4,200959.6,180184.4,166765.4,150026.1,131820.7,119046.2,89529.9,63488.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19295.1,14183.9,14564.7,13354.1,13640.8,20775.2,13419.0,16739.3,18205.4,12774.5,29516.3,26041.5,15041.1,8.0,7.0,7.0,8.0,7.0,7.0,7.0,8.0,7.0,7.0,13.0,9.0,4.0
3,6,6,4,12,MONTHLY CHARGES,1996-02-21,12,MALE,1919-09-22,80,2.0,0.0,0.0,0.0,0.0,,NaT,,,,,,,NaT,12,Pribram,central Bohemia,107870,84,29,6,1,6,58.0,8754,3.83,4.31,137,3804,3868,12,Pribram,central Bohemia,107870,84,29,6,1,6,58.0,8754,3.83,4.31,137,3804,3868,9060.2,9050.7,9041.2,9031.8,10868.1,10301.1,9016.6,9808.5,9001.2,8991.6,17894.6,13802.6,9659.7,341319.7,332259.5,323208.8,314167.6,305135.8,294267.7,283966.6,274950.0,265141.5,256140.3,247148.7,229254.1,215451.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9060.2,9050.7,9041.2,9031.8,10868.1,10301.1,9016.6,9808.5,9001.2,8991.6,17894.6,13802.6,9659.7,5.0,5.0,5.0,5.0,6.0,7.0,5.0,6.0,5.0,5.0,10.0,8.0,6.0
4,7,7,5,15,MONTHLY CHARGES,1997-05-30,15,MALE,1929-01-25,70,1.0,0.0,0.0,0.0,0.0,,NaT,,,,,,,NaT,15,Cesky Krumlov,south Bohemia,58796,22,16,7,1,5,51.9,9045,3.13,3.60,124,1845,1879,15,Cesky Krumlov,south Bohemia,58796,22,16,7,1,5,51.9,9045,3.13,3.60,124,1845,1879,7919.1,7809.2,9142.1,12909.6,11918.2,8614.9,10213.3,10012.6,8607.1,7797.2,15720.3,10825.1,7812.5,155004.3,147085.2,139276.0,130133.9,117224.3,105306.1,96691.2,86477.9,76465.3,67858.2,60061.0,44340.7,33515.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7919.1,7809.2,9142.1,12909.6,11918.2,8614.9,10213.3,10012.6,8607.1,7797.2,15720.3,10825.1,7812.5,5.0,4.0,6.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,10.0,6.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4350,13623,13931,11333,8,MONTHLY CHARGES,1994-05-26,8,MALE,1942-01-01,58,0.0,0.0,1.0,0.0,0.0,,NaT,,,,,,,NaT,8,Mlada Boleslav,central Bohemia,112065,95,19,7,1,8,69.4,11277,1.25,1.44,127,5179,4987,8,Mlada Boleslav,central Bohemia,112065,95,19,7,1,8,69.4,11277,1.25,1.44,127,5179,4987,22263.6,38496.6,12148.2,66939.5,21806.4,37591.0,35080.8,24538.6,33297.0,4158.9,109908.8,86277.2,186391.0,3278330.5,3256066.9,3217570.3,3205422.1,3138482.6,3116676.2,3079085.2,3044004.4,3019465.8,2986168.8,2982009.9,2872101.1,2785823.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22263.6,38496.6,12148.2,66939.5,21806.4,37591.0,35080.8,24538.6,33297.0,4158.9,109908.8,86277.2,186391.0,5.0,6.0,5.0,6.0,6.0,6.0,5.0,5.0,6.0,4.0,12.0,7.0,10.0
4351,13647,13955,11349,1,WEEKLY CHARGES,1995-05-26,1,FEMALE,1945-10-30,54,1.0,0.0,0.0,1.0,0.0,7304.0,1995-10-29,419880.0,60.0,6998.0,"running contract, OK so far",,,NaT,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107,71658.7,82384.8,85896.6,104462.0,95354.8,118560.8,76531.5,81367.9,85436.2,81001.7,91717.9,135801.6,92180.9,3828513.2,3756854.5,3674469.7,3588573.1,3484111.1,3388756.3,3270195.5,3193664.0,3112296.1,3026859.9,2945858.2,2854140.3,2718338.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71658.7,82384.8,85896.6,104462.0,95354.8,118560.8,76531.5,81367.9,85436.2,81001.7,91717.9,135801.6,92180.9,7.0,8.0,7.0,7.0,7.0,6.0,6.0,7.0,7.0,6.0,10.0,9.0,7.0
4352,13660,13968,11359,61,MONTHLY CHARGES,1994-10-01,61,MALE,1968-04-13,31,1.0,0.0,0.0,1.0,1.0,7305.0,1996-08-06,54024.0,12.0,4502.0,"contract finished, no problems",1247.0,classic,1995-06-13,61,Trebic,south Moravia,117897,139,28,5,1,6,53.8,8814,4.76,5.74,107,2112,2059,61,Trebic,south Moravia,117897,139,28,5,1,6,53.8,8814,4.76,5.74,107,2112,2059,55846.4,53168.3,58064.5,55174.7,58885.7,70075.1,47536.6,36184.5,60207.8,38727.1,66813.7,65317.0,61699.0,2877905.8,2822059.4,2768891.1,2710826.6,2655651.9,2596766.2,2526691.1,2479154.5,2442970.0,2382762.2,2344035.1,2277221.4,2211904.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55846.4,53168.3,58064.5,55174.7,58885.7,70075.1,47536.6,36184.5,60207.8,38727.1,66813.7,65317.0,61699.0,7.0,8.0,8.0,7.0,7.0,8.0,7.0,7.0,7.0,7.0,14.0,8.0,9.0
4353,13663,13971,11362,67,MONTHLY CHARGES,1995-10-14,67,FEMALE,1962-10-19,37,1.0,1.0,0.0,1.0,2.0,7308.0,1996-12-27,129408.0,24.0,5392.0,"contract finished, no problems",,,NaT,67,Bruntal,north Moravia,106054,38,25,6,2,6,63.1,8110,5.77,6.55,109,3244,3079,67,Bruntal,north Moravia,106054,38,25,6,2,6,63.1,8110,5.77,6.55,109,3244,3079,30671.3,32730.0,34681.3,35061.5,35805.9,38765.5,36667.9,28262.7,32344.0,31004.2,44355.6,41138.0,33835.5,1292068.7,1261397.4,1228667.4,1193986.1,1158924.6,1123118.7,1084353.2,1047685.3,1019422.6,987078.6,956074.4,911718.8,870580.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30671.3,32730.0,34681.3,35061.5,35805.9,38765.5,36667.9,28262.7,32344.0,31004.2,44355.6,41138.0,33835.5,10.0,9.0,9.0,10.0,9.0,9.0,10.0,9.0,10.0,9.0,15.0,10.0,9.0


### Generating Event-Based Customer Information

## Feature Engineering

### Deriving New Features