# Applied Machine Learning - Mini Challenge: Cross-Selling of Credit Cards
**Author**: Nils Fahrni

In [291]:
import sklearn
import numpy as np
import sys
import pandas as pd
import seaborn as sns

sns.set_palette("pastel")
pd.set_option('display.max_columns', None)

sys.path.append('scripts')
from dataloader import DataLoader

## Data Preprocessing

### Data Loading

In [292]:
data_loader = DataLoader(base_path='data', translations_name='translation_mappings.json')
data_loader.list_datasets()

Unnamed: 0,Dataset,Number of Rows
0,loan,682
1,client,5369
2,district,77
3,trans,1056320
4,account,4500
5,card,892
6,order,6471
7,disp,5369


#### Account

In [293]:
account = data_loader.load_csv('account', parse_dates={'date': '%y%m%d'})
account.sample(5)

Mapped frequency:
{
    "POPLATEK MESICNE": "MONTHLY CHARGES",
    "POPLATEK TYDNE": "WEEKLY CHARGES",
    "POPLATEK PO OBRATU": "TRANSACTION CHARGES"
}


Unnamed: 0,account_id,district_id,frequency,date
1173,1470,19,MONTHLY CHARGES,1994-02-10
3255,2483,1,MONTHLY CHARGES,1996-09-29
1498,2924,56,MONTHLY CHARGES,1994-10-16
314,2002,1,MONTHLY CHARGES,1993-04-08
2697,1752,68,MONTHLY CHARGES,1996-05-08


#### Client

In [294]:
client = data_loader.load_csv('client')

client = client.assign(gender=client['birth_number'].apply(lambda x: 'FEMALE' if int(str(x)[2:4]) > 50 else 'MALE'))

client = client.assign(birth_number=client.apply(lambda x: x['birth_number'] - 5000 if x['gender'] == 'FEMALE' else x['birth_number'], axis=1))

client['birth_number'] = pd.to_datetime(client['birth_number'], format='%y%m%d')
client['birth_date'] = client['birth_number'].apply(lambda x: x - pd.DateOffset(years=100) if x.year > 1999 else x)
client.drop('birth_number', axis=1, inplace=True)

client['age'] = (pd.to_datetime('1999-12-31') - client['birth_date']).dt.days // 365

client.sample(5)

Unnamed: 0,client_id,district_id,gender,birth_date,age
1746,1851,31,FEMALE,1975-09-12,24
204,217,59,FEMALE,1974-12-22,25
5205,11768,1,FEMALE,1971-09-12,28
744,780,3,MALE,1924-02-26,75
3881,4100,74,MALE,1955-03-13,44


#### Disposition

Removing disponents as the goal is to only advertise to owners. Disponents may be secondary users that have been authorized to use an account. They may be allowed to execute transactions on that account but they are not the authorized owners.

In [295]:
disp = data_loader.load_csv('disp')

disp = disp[disp['type'] != 'DISPONENT']

disp.drop('type', axis=1, inplace=True)

disp.sample(5)

Unnamed: 0,disp_id,client_id,account_id
2617,2769,2769,2288
2110,2227,2227,1836
3251,3435,3435,2842
3839,4056,4056,3354
2334,2462,2462,2027


#### Permanent Order

In [296]:
order = data_loader.load_csv('order')

order.sample(5)

Mapped k_symbol:
{
    "POJISTNE": "INSURANCE PAYMENT",
    "SIPO": "HOUSEHOLD",
    "LEASING": "LEASING",
    "UVER": "LOAN PAYMENT"
}


Unnamed: 0,order_id,account_id,bank_to,account_to,amount,k_symbol
6334,44840,10404,ST,55139064,3238.0,LOAN PAYMENT
4502,34381,3364,ST,86091345,5491.0,HOUSEHOLD
5923,40827,7721,IJ,35142588,3788.3,LOAN PAYMENT
5853,40169,7240,WX,51813179,2535.7,LOAN PAYMENT
758,30224,549,CD,87506788,3024.0,


**Are there Null Values?**

In [297]:
display(order.isnull().sum())

display(order[order['k_symbol'].isnull()].sample(5))

order_id         0
account_id       0
bank_to          0
account_to       0
amount           0
k_symbol      1379
dtype: int64

Unnamed: 0,order_id,account_id,bank_to,account_to,amount,k_symbol
5318,35286,3998,KL,25096166,1519.0,
1122,30632,848,ST,72638684,1590.0,
605,30057,436,IJ,61715274,72.0,
5514,36347,4727,UV,87424748,1861.0,
5443,35882,4415,OP,67352598,397.0,


In [298]:
order['k_symbol'] = order['k_symbol'].fillna('MISSING')

order_pivot = order.pivot_table(index='account_id', columns='k_symbol', values='amount', aggfunc='count', fill_value=0)

order_pivot.sample(5)

k_symbol,HOUSEHOLD,INSURANCE PAYMENT,LEASING,LOAN PAYMENT,MISSING
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
673,1,0,0,0,0
644,1,0,0,0,1
2645,1,1,1,0,2
3006,1,0,0,0,0
4794,1,1,0,1,2


#### Transaction

TODO: 
- amount to negative or positive based on if withdrawal or deposit
- Research account number 19 (time series account balance, at least per month)
    - this acc is volatile
    - account's balance goes negative sometimes
- what happens to accounts with multiple transactions on a day?
    - how to obtain the actual end of day balance?
        - add up withdrawals with deposits and add to balance of day before 
        - try to vectorize this problem (R antijoin mentioned)

In [299]:
transaction = data_loader.load_csv('trans', parse_dates={'date': '%y%m%d'})

transaction.sample(5)

Mapped type:
{
    "PRIJEM": "CREDIT",
    "VYDAJ": "WITHDRAWAL"
}
Mapped operation:
{
    "VYBER KARTOU": "CREDIT CARD WITHDRAWAL",
    "VKLAD": "CREDIT IN CASH",
    "PREVOD Z UCTU": "COLLECTION FROM ANOTHER BANK",
    "VYBER": "WITHDRAWAL IN CASH",
    "PREVOD NA UCET": "REMITTANCE TO ANOTHER BANK"
}
Mapped k_symbol:
{
    "POJISTNE": "INSURANCE PAYMENT",
    "SLUZBY": "PAYMENT FOR STATEMENT",
    "UROK": "INTEREST CREDITED",
    "SANKC. UROK": "SANCTION INTEREST IF NEGATIVE BALANCE",
    "SIPO": "HOUSEHOLD",
    "DUCHOD": "OLD-AGE PENSION",
    "UVER": "LOAN PAYMENT"
}


Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
1000764,3652611,3700,1998-10-31,CREDIT,,206.1,50057.5,INTEREST CREDITED,,
537656,3660018,3925,1997-04-30,CREDIT,,277.6,84947.6,INTEREST CREDITED,,
516393,172423,577,1997-03-31,WITHDRAWAL,WITHDRAWAL IN CASH,14.6,30320.6,PAYMENT FOR STATEMENT,,
946322,12489,41,1998-08-27,WITHDRAWAL,WITHDRAWAL IN CASH,13600.0,62883.5,,,
740017,1106263,3779,1998-01-06,CREDIT,CREDIT IN CASH,11404.0,54104.5,,,


#### Loan

In [300]:
loan = data_loader.load_csv('loan', parse_dates={'date': '%y%m%d'})

loan.sample(5)

Mapped status:
{
    "A": "contract finished, no problems",
    "B": "contract finished, loan not payed",
    "C": "running contract, OK so far",
    "D": "running contract, client in debt"
}


Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status
31,7262,11135,1994-03-01,182628,36,5073.0,"contract finished, no problems"
28,6650,8073,1994-02-17,49320,12,4110.0,"contract finished, loan not payed"
29,7226,10940,1994-02-23,197748,36,5493.0,"contract finished, no problems"
519,6059,5196,1997-12-28,79824,12,6652.0,"contract finished, no problems"
55,6903,9337,1994-06-04,52128,24,2172.0,"contract finished, no problems"


**Can an account have multiple loans?**

In [301]:
print(f'Are there accounts with multiple loans: {loan["account_id"].nunique() < loan.shape[0]}')

Are there accounts with multiple loans: False


#### Credit Card

In [302]:
card = data_loader.load_csv('card', parse_dates={'issued': '%y%m%d'})

card.sample(5)

Unnamed: 0,card_id,disp_id,type,issued
148,38,242,classic,1996-08-31
247,226,1378,classic,1997-04-18
134,1055,10171,classic,1996-07-24
854,692,4493,classic,1998-12-08
519,784,5461,classic,1998-03-24


#### District

In [303]:
district = data_loader.load_csv('district')

district = district.rename(columns={
    'A1': 'district_id',
    'A2': 'district_name',
    'A3': 'region',
    'A4': 'population',
    'A5': 'n_municipalities_with_inhabitants_lt_499',
    'A6': 'n_municipalities_with_inhabitants_500_to_1999',
    'A7': 'n_municipalities_with_inhabitants_2000_to_9999',
    'A8': 'n_municipalities_with_inhabitants_gt_10000',
    'A9': 'n_cities',
    'A10': 'ratio_urban_inhabitants',
    'A11': 'average_salary',
    'A12': 'unemployment_rate_95',
    'A13': 'unemployment_rate_96',
    'A14': 'enterpreneurs_per_1000_inhabitants',
    'A15': 'n_commited_crimes_95',
    'A16': 'n_commited_crimes_96'
})

district.sample(5)

Unnamed: 0,district_id,district_name,region,population,n_municipalities_with_inhabitants_lt_499,n_municipalities_with_inhabitants_500_to_1999,n_municipalities_with_inhabitants_2000_to_9999,n_municipalities_with_inhabitants_gt_10000,n_cities,ratio_urban_inhabitants,average_salary,unemployment_rate_95,unemployment_rate_96,enterpreneurs_per_1000_inhabitants,n_commited_crimes_95,n_commited_crimes_96
28,29,Rokycany,west Bohemia,45714,52,10,5,1,6,55.6,8843,2.82,3.6,113,818,888
38,39,Most,north Bohemia,119895,17,4,3,2,4,89.9,10446,7.34,9.4,90,4947,4743
18,19,Prachatice,south Bohemia,51428,50,11,3,1,4,52.7,8402,3.13,3.98,120,999,1099
12,13,Rakovnik,central Bohemia,53921,61,22,1,1,2,41.3,8598,2.77,3.26,123,1597,1875
4,5,Kolin,central Bohemia,95616,65,30,4,1,6,51.4,9307,3.85,4.43,118,2616,3040


### Data Merging

In [304]:
from utils import add_prefix_except_id

account = add_prefix_except_id(account, 'account_', id_exceptions=['district_id'])
client_df = disp.merge(account, on='account_id', how='left')

client = add_prefix_except_id(client, 'client_', id_exceptions=['district_id'])
client_df = client_df.merge(client, on='client_id', how='left')

order_pivot = add_prefix_except_id(order_pivot, 'ordertype_')
client_df = client_df.merge(order_pivot, on='account_id', how='left')

loan = add_prefix_except_id(loan, 'loan_')
client_df = client_df.merge(loan, on='account_id', how='left')

card = add_prefix_except_id(card, 'card_')
client_df = client_df.merge(card, on='disp_id', how='left')

client_district = add_prefix_except_id(district, 'client_district_')
client_df = client_df.merge(client_district, left_on='client_district_id', right_on='district_id', how='left')

account_district = add_prefix_except_id(district, 'account_district_')
client_df = client_df.merge(client_district, left_on='account_district_id', right_on='district_id', how='left')

client_df.sample(5)

Unnamed: 0,disp_id,client_id,account_id,account_district_id,account_frequency,account_date,client_district_id,client_gender,client_birth_date,client_age,ordertype_HOUSEHOLD,ordertype_INSURANCE PAYMENT,ordertype_LEASING,ordertype_LOAN PAYMENT,ordertype_MISSING,loan_id,loan_date,loan_amount,loan_duration,loan_payments,loan_status,card_id,card_type,card_issued,district_id_x,client_district_district_name_x,client_district_region_x,client_district_population_x,client_district_n_municipalities_with_inhabitants_lt_499_x,client_district_n_municipalities_with_inhabitants_500_to_1999_x,client_district_n_municipalities_with_inhabitants_2000_to_9999_x,client_district_n_municipalities_with_inhabitants_gt_10000_x,client_district_n_cities_x,client_district_ratio_urban_inhabitants_x,client_district_average_salary_x,client_district_unemployment_rate_95_x,client_district_unemployment_rate_96_x,client_district_enterpreneurs_per_1000_inhabitants_x,client_district_n_commited_crimes_95_x,client_district_n_commited_crimes_96_x,district_id_y,client_district_district_name_y,client_district_region_y,client_district_population_y,client_district_n_municipalities_with_inhabitants_lt_499_y,client_district_n_municipalities_with_inhabitants_500_to_1999_y,client_district_n_municipalities_with_inhabitants_2000_to_9999_y,client_district_n_municipalities_with_inhabitants_gt_10000_y,client_district_n_cities_y,client_district_ratio_urban_inhabitants_y,client_district_average_salary_y,client_district_unemployment_rate_95_y,client_district_unemployment_rate_96_y,client_district_enterpreneurs_per_1000_inhabitants_y,client_district_n_commited_crimes_95_y,client_district_n_commited_crimes_96_y
1168,1482,1482,1229,46,MONTHLY CHARGES,1997-12-10,46,FEMALE,1950-03-28,49,1.0,0.0,0.0,0.0,0.0,,NaT,,,,,,,NaT,46,Nachod,east Bohemia,112709,48,20,7,3,10,73.5,8369,1.79,2.31,117,2854,2618,46,Nachod,east Bohemia,112709,48,20,7,3,10,73.5,8369,1.79,2.31,117,2854,2618
3854,5030,5030,4173,1,MONTHLY CHARGES,1993-11-08,1,MALE,1980-01-15,19,,,,,,,NaT,,,,,,,NaT,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107
221,284,284,235,41,MONTHLY CHARGES,1993-10-06,10,MALE,1974-08-06,25,1.0,0.0,0.0,0.0,1.0,,NaT,,,,,,,NaT,10,Praha - vychod,central Bohemia,92084,55,29,4,3,5,46.7,10124,0.56,0.54,141,3810,4316,41,Usti nad Labem,north Bohemia,118650,8,8,5,1,3,85.6,9832,3.32,4.48,114,6445,5471
1303,1659,1659,1371,1,MONTHLY CHARGES,1994-04-27,1,FEMALE,1937-12-15,62,1.0,0.0,0.0,0.0,0.0,,NaT,,,,,,,NaT,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107
2491,3158,3158,2617,1,MONTHLY CHARGES,1993-09-24,1,FEMALE,1940-11-25,59,1.0,0.0,0.0,0.0,1.0,,NaT,,,,,,,NaT,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107


In [305]:
assert client_df['account_id'].nunique() == client_df.shape[0]

### Data Cleaning

#### Removing Junior Cards

In [306]:
# define junior_clients as clients with the card_type junior and age <= 20
junior_clients = client_df[client_df['card_type'] == 'junior']

client_df = client_df[~client_df['account_id'].isin(junior_clients['account_id'])]

transaction = transaction[~transaction['account_id'].isin(junior_clients['account_id'])]

print(f'Number of junior clients: {junior_clients.shape[0]}')
print(f'Number of clients remaining: {client_df.shape[0]}')

Number of junior clients: 145
Number of clients remaining: 4355


## Model Construction

### Processing Transactional Data
- The goal is to predict if a non-card-owner will buy a card or not

The first task is to look if every account in the transactions dataframe has a "first transaction". This would make the calculation of the monthly balance much easier since everything can be summed up without having to worry that there were months without records in the transaction dataframe.

In [307]:
# Find the minimum (first) transaction(s) date for each account
min_dates = transaction.groupby('account_id')['date'].min().reset_index()
min_dates.rename(columns={'date': 'min_date'}, inplace=True)

# Merge the minimum date back to the transactions to identify all transactions on the first day
transactions_with_min_date = pd.merge(transaction, min_dates, on='account_id')

# Filter transactions that are on the first day
first_day_transactions = transactions_with_min_date[transactions_with_min_date['date'] == transactions_with_min_date['min_date']]
first_day_transactions = first_day_transactions.copy()

# Now, for each of these first day transactions, check if any have amount equals balance
first_day_transactions['amount_equals_balance'] = first_day_transactions['amount'] == first_day_transactions['balance']

# Group by account_id and check if any transactions for each account meet the condition
accounts_meeting_condition = first_day_transactions.groupby('account_id')['amount_equals_balance'].any().reset_index()

# Verify if all accounts have at least one transaction on the first day meeting the condition
all_accounts_covered = accounts_meeting_condition['amount_equals_balance'].all()

print("Does every account's first day of transactions include at least one transaction where amount equals balance?", all_accounts_covered)

Does every account's first day of transactions include at least one transaction where amount equals balance? True


Now every accounts balance needs to be calculated per month.

In [308]:
# Extract year and month to create a 'month' column
transaction['month'] = transaction['date'].dt.to_period('M')

# Group by account_id and month to summarize transactions
transactions_monthly = transaction.groupby(['account_id', 'month']).agg(
    volume=('amount', 'sum'),
    credit=('amount', lambda x: x[x > 0].sum()),
    withdrawal=('amount', lambda x: x[x < 0].sum()),
    n_transactions=('amount', 'size')
).reset_index()

In [309]:
# Ensure 'month' is a PeriodIndex for proper handling
transactions_monthly['month'] = pd.PeriodIndex(transactions_monthly['month'])

# Create a date range for each account spanning from the minimum to maximum month
date_ranges = transactions_monthly.groupby('account_id')['month'].agg(['min', 'max'])

def reindex_df(group, account_id):
    idx = pd.period_range(start=group['month'].min(), end=group['month'].max(), freq='M')
    group.set_index('month', inplace=True)
    group = group.reindex(idx, fill_value=0)
    # Convert the index (which is a PeriodIndex) back into a 'month' column
    group.reset_index(inplace=True)
    group.rename(columns={'index': 'month'}, inplace=True)
    # Since 'account_id' is now explicitly passed, ensure it's set correctly in the reindexed DataFrame
    group['account_id'] = account_id
    return group

transactions_monthly = (transactions_monthly.groupby('account_id')
                        .apply(lambda x: reindex_df(x, x.name))
                        .reset_index(level=0, drop=True))

# After reindexing within each group, the 'month' column is already preserved.
# If you need to reset the index of the whole DataFrame for other reasons but keep 'month' as a column, ensure it's not set as index again.

# Calculate cumulative balance
transactions_monthly['balance'] = transactions_monthly.groupby('account_id')['volume'].cumsum()

  .apply(lambda x: reindex_df(x, x.name))


In [310]:
transactions_monthly

Unnamed: 0,month,account_id,volume,credit,withdrawal,n_transactions,balance
0,1995-03,1,1000.0,1000.0,0.0,1,1000.0
1,1995-04,1,16298.2,16298.2,0.0,3,17298.2
2,1995-05,1,5858.0,5858.0,0.0,3,23156.2
3,1995-06,1,3979.6,3979.6,0.0,3,27135.8
4,1995-07,1,9087.9,9087.9,0.0,3,36223.7
...,...,...,...,...,...,...,...
36,1998-08,11382,54569.5,54569.5,0.0,7,2386853.6
37,1998-09,11382,44120.0,44120.0,0.0,5,2430973.6
38,1998-10,11382,63262.2,63262.2,0.0,6,2494235.8
39,1998-11,11382,50165.7,50165.7,0.0,5,2544401.5


### Defining Roll-Up Windows of Transactions

In [311]:
card_issued = card.groupby('disp_id')['card_issued'].min().reset_index()
card_issued.head(5)

Unnamed: 0,disp_id,card_issued
0,9,1998-10-16
1,19,1998-03-13
2,41,1995-09-03
3,42,1998-11-26
4,51,1995-04-24


In [312]:
transactions_monthly = transactions_monthly.merge(card_issued, left_on='account_id', right_on='disp_id', how='left')

transactions_monthly.head()

Unnamed: 0,month,account_id,volume,credit,withdrawal,n_transactions,balance,disp_id,card_issued
0,1995-03,1,1000.0,1000.0,0.0,1,1000.0,,NaT
1,1995-04,1,16298.2,16298.2,0.0,3,17298.2,,NaT
2,1995-05,1,5858.0,5858.0,0.0,3,23156.2,,NaT
3,1995-06,1,3979.6,3979.6,0.0,3,27135.8,,NaT
4,1995-07,1,9087.9,9087.9,0.0,3,36223.7,,NaT


To see if the cards got joined correctly we can check if all `disp_id` in `card_issued` have a non-null value in the `card_issued` variable inside the `transactions_monthly` dataframe:

In [313]:
transactions_monthly[transactions_monthly['disp_id']
                     .isin(card_issued['disp_id'])]['account_id']\
                     .isin(card_issued['disp_id'])\
                     .all()

True

In [314]:
transactions_monthly

Unnamed: 0,month,account_id,volume,credit,withdrawal,n_transactions,balance,disp_id,card_issued
0,1995-03,1,1000.0,1000.0,0.0,1,1000.0,,NaT
1,1995-04,1,16298.2,16298.2,0.0,3,17298.2,,NaT
2,1995-05,1,5858.0,5858.0,0.0,3,23156.2,,NaT
3,1995-06,1,3979.6,3979.6,0.0,3,27135.8,,NaT
4,1995-07,1,9087.9,9087.9,0.0,3,36223.7,,NaT
...,...,...,...,...,...,...,...,...,...
179049,1998-08,11382,54569.5,54569.5,0.0,7,2386853.6,,NaT
179050,1998-09,11382,44120.0,44120.0,0.0,5,2430973.6,,NaT
179051,1998-10,11382,63262.2,63262.2,0.0,6,2494235.8,,NaT
179052,1998-11,11382,50165.7,50165.7,0.0,5,2544401.5,,NaT


In [315]:
# Ensure 'card_issued' is a datetime column and convert to period for comparison
transactions_monthly['card_issued'] = pd.to_datetime(transactions_monthly['card_issued'])
transactions_monthly['card_issued_period'] = transactions_monthly['card_issued'].dt.to_period('M')

transactions_monthly['month_diff'] = transactions_monthly.apply(lambda row: (row['card_issued_period'] - row['month']).n if pd.notnull(row['card_issued_period']) and pd.notnull(row['month']) else None, axis=1)

# only keep the rows where month_diff is between 1 and 13
filtered_transactions = transactions_monthly[transactions_monthly['month_diff'].between(1, 13)]

In [316]:
filtered_transactions.sort_values(by=['account_id', 'month_diff'])

Unnamed: 0,month,account_id,volume,credit,withdrawal,n_transactions,balance,disp_id,card_issued,card_issued_period,month_diff
376,1998-09,9,20528.5,20528.5,0.0,5,683183.3,9.0,1998-10-16,1998-10,1.0
375,1998-08,9,1372.0,1372.0,0.0,4,662654.8,9.0,1998-10-16,1998-10,2.0
374,1998-07,9,5787.3,5787.3,0.0,4,661282.8,9.0,1998-10-16,1998-10,3.0
373,1998-06,9,3937.2,3937.2,0.0,4,655495.5,9.0,1998-10-16,1998-10,4.0
372,1998-05,9,24111.8,24111.8,0.0,6,651558.3,9.0,1998-10-16,1998-10,5.0
...,...,...,...,...,...,...,...,...,...,...,...
175451,1997-02,10220,36591.6,36591.6,0.0,5,224175.1,10220.0,1997-11-06,1997-11,9.0
175450,1997-01,10220,46074.2,46074.2,0.0,9,187583.5,10220.0,1997-11-06,1997-11,10.0
175449,1996-12,10220,51686.4,51686.4,0.0,4,141509.3,10220.0,1997-11-06,1997-11,11.0
175448,1996-11,10220,36057.6,36057.6,0.0,4,89822.9,10220.0,1997-11-06,1997-11,12.0


In [321]:
account_summary = pd.DataFrame(filtered_transactions['account_id'].unique(), columns=['account_id'])

# Variables to pivot
variables_to_pivot = ['volume', 'balance', 'withdrawal', 'credit', 'n_transactions']

for variable in variables_to_pivot:
    # Group by 'account_id' and 'month_diff', then summarize the variable
    grouped = filtered_transactions.groupby(['account_id', 'month_diff'])[variable].sum().reset_index()
    
    # Pivot this data to have columns for each 'month_diff'
    pivot = grouped.pivot(index='account_id', columns='month_diff', values=variable).reset_index()
    
    # Rename columns to reflect they are for the current variable and month_diff
    pivot.columns = ['account_id'] + [f'{variable}_month_diff_{int(col)}' if col != 'account_id' else 'account_id' for col in pivot.columns[1:]]
    
    # Merge this pivoted data back into the account summary DataFrame
    account_summary = pd.merge(account_summary, pivot, on='account_id', how='left')

In [323]:
account_summary.head()

# print number of columns
print(f'Number of columns: {account_summary.shape[1]}')

Number of columns: 53


### Generating Event-Based Customer Information

## Feature Engineering

### Deriving New Features