# Applied Machine Learning - Mini Challenge: Cross-Selling of Credit Cards
**Author**: Nils Fahrni

In [275]:
import sklearn
import numpy as np
import sys
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio

pio.templates.default = 'ggplot2'
pd.set_option('display.max_columns', None)

sys.path.append('scripts')
from dataloader import DataLoader

## Data Preprocessing

### Data Loading

In [276]:
data_loader = DataLoader(base_path='data', translations_name='translation_mappings.json')
data_loader.list_datasets()

Unnamed: 0,Dataset,Number of Rows
0,loan,682
1,client,5369
2,district,77
3,trans,1056320
4,account,4500
5,card,892
6,order,6471
7,disp,5369


#### Account

In [277]:
account = data_loader.load_csv('account', parse_dates={'date': '%y%m%d'})
account.sample(5)

Mapped frequency:
{
    "POPLATEK MESICNE": "MONTHLY CHARGES",
    "POPLATEK TYDNE": "WEEKLY CHARGES",
    "POPLATEK PO OBRATU": "TRANSACTION CHARGES"
}


Unnamed: 0,account_id,district_id,frequency,date
2767,687,27,MONTHLY CHARGES,1996-06-02
1238,3804,7,MONTHLY CHARGES,1994-04-07
431,1180,20,MONTHLY CHARGES,1993-05-19
2034,994,72,MONTHLY CHARGES,1995-09-15
2961,81,74,MONTHLY CHARGES,1996-07-18


#### Client

In [278]:
client = data_loader.load_csv('client')

client = client.assign(gender=client['birth_number'].apply(lambda x: 'FEMALE' if int(str(x)[2:4]) > 50 else 'MALE'))

client = client.assign(birth_number=client.apply(lambda x: x['birth_number'] - 5000 if x['gender'] == 'FEMALE' else x['birth_number'], axis=1))

client['birth_number'] = pd.to_datetime(client['birth_number'], format='%y%m%d')
client['birth_date'] = client['birth_number'].apply(lambda x: x - pd.DateOffset(years=100) if x.year > 1999 else x)
client.drop('birth_number', axis=1, inplace=True)

client['age'] = (pd.to_datetime('1999-12-31') - client['birth_date']).dt.days // 365

client.sample(5)

Unnamed: 0,client_id,district_id,gender,birth_date,age
5075,10096,1,MALE,1978-01-05,22
4193,4435,64,FEMALE,1965-10-30,34
1126,1182,66,FEMALE,1946-10-16,53
546,572,3,FEMALE,1920-08-04,79
5107,10555,1,MALE,1935-12-15,64


#### Disposition

Removing disponents as the goal is to only advertise to owners. Disponents may be secondary users that have been authorized to use an account. They may be allowed to execute transactions on that account but they are not the authorized owners.

In [279]:
disp = data_loader.load_csv('disp')

disp = disp[disp['type'] != 'DISPONENT']

disp.drop('type', axis=1, inplace=True)

disp.sample(5)

Unnamed: 0,disp_id,client_id,account_id
1636,1734,1734,1430
2572,2716,2716,2243
3394,3582,3582,2967
5154,10944,11252,9140
5148,10828,11136,9041


#### Permanent Order

In [280]:
order = data_loader.load_csv('order')

order.sample(5)

Mapped k_symbol:
{
    "POJISTNE": "INSURANCE PAYMENT",
    "SIPO": "HOUSEHOLD",
    "LEASING": "LEASING",
    "UVER": "LOAN PAYMENT"
}


Unnamed: 0,order_id,account_id,bank_to,account_to,amount,k_symbol
1605,31175,1212,CD,49310952,2805.0,HOUSEHOLD
6182,43294,9377,QR,64104062,4164.0,LOAN PAYMENT
5453,35902,4424,MN,62918054,720.0,LEASING
5884,40467,7487,KL,47784578,4792.0,
1031,30535,777,GH,7662755,6412.0,LOAN PAYMENT


**Are there Null Values?**

In [281]:
display(order.isnull().sum())

display(order[order['k_symbol'].isnull()].sample(5))

order_id         0
account_id       0
bank_to          0
account_to       0
amount           0
k_symbol      1379
dtype: int64

Unnamed: 0,order_id,account_id,bank_to,account_to,amount,k_symbol
4818,34742,3622,YZ,19400376,2286.0,
5706,38200,5952,MN,40625691,1374.0,
1854,31450,1390,EF,43511337,6147.0,
2308,31952,1747,MN,40159267,1118.0,
4328,34190,3229,WX,5108792,6411.0,


In [282]:
order['k_symbol'] = order['k_symbol'].fillna('MISSING')

order_pivot = order.pivot_table(index='account_id', columns='k_symbol', values='amount', aggfunc='count', fill_value=0)

order_pivot.sample(5)

k_symbol,HOUSEHOLD,INSURANCE PAYMENT,LEASING,LOAN PAYMENT,MISSING
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1417,1,0,0,0,0
2631,1,0,0,0,0
84,1,0,1,0,0
1583,0,0,0,1,0
2619,1,0,0,0,0


#### Transaction

TODO: 
- amount to negative or positive based on if withdrawal or deposit
- Research account number 19 (time series account balance, at least per month)
    - this acc is volatile
    - account's balance goes negative sometimes
- what happens to accounts with multiple transactions on a day?
    - how to obtain the actual end of day balance?
        - add up withdrawals with deposits and add to balance of day before 
        - try to vectorize this problem (R antijoin mentioned)

In [283]:
transaction = data_loader.load_csv('trans', parse_dates={'date': '%y%m%d'})

transaction.sample(5)

Mapped type:
{
    "PRIJEM": "CREDIT",
    "VYDAJ": "WITHDRAWAL"
}
Mapped operation:
{
    "VYBER KARTOU": "CREDIT CARD WITHDRAWAL",
    "VKLAD": "CREDIT IN CASH",
    "PREVOD Z UCTU": "COLLECTION FROM ANOTHER BANK",
    "VYBER": "WITHDRAWAL IN CASH",
    "PREVOD NA UCET": "REMITTANCE TO ANOTHER BANK"
}
Mapped k_symbol:
{
    "POJISTNE": "INSURANCE PAYMENT",
    "SLUZBY": "PAYMENT FOR STATEMENT",
    "UROK": "INTEREST CREDITED",
    "SANKC. UROK": "SANCTION INTEREST IF NEGATIVE BALANCE",
    "SIPO": "HOUSEHOLD",
    "DUCHOD": "OLD-AGE PENSION",
    "UVER": "LOAN PAYMENT"
}


Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
533487,576952,1965,1997-04-18,,WITHDRAWAL IN CASH,21329.0,35748.5,,,
197370,1061958,3629,1995-08-11,CREDIT,COLLECTION FROM ANOTHER BANK,15873.0,29405.9,,WX,68042999.0
459491,482927,1644,1997-01-10,WITHDRAWAL,REMITTANCE TO ANOTHER BANK,2534.0,50169.4,HOUSEHOLD,EF,89781009.0
789619,2477303,8173,1998-02-17,WITHDRAWAL,WITHDRAWAL IN CASH,6600.0,41339.6,,,0.0
930501,186745,632,1998-07-31,WITHDRAWAL,WITHDRAWAL IN CASH,14.6,17693.0,PAYMENT FOR STATEMENT,,


#### Loan

In [284]:
loan = data_loader.load_csv('loan', parse_dates={'date': '%y%m%d'})

loan.sample(5)

Mapped status:
{
    "A": "contract finished, no problems",
    "B": "contract finished, loan not payed",
    "C": "running contract, OK so far",
    "D": "running contract, client in debt"
}


Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status
158,5045,347,1995-05-01,187224,24,7801.0,"contract finished, loan not payed"
232,5128,808,1996-04-11,215616,48,4492.0,"running contract, client in debt"
424,5041,319,1997-07-11,369000,60,6150.0,"running contract, OK so far"
503,6502,7424,1997-12-08,30276,12,2523.0,"contract finished, no problems"
265,5337,1886,1996-07-31,162468,36,4513.0,"running contract, OK so far"


**Can an account have multiple loans?**

In [285]:
print(f'Are there accounts with multiple loans: {loan["account_id"].nunique() < loan.shape[0]}')

Are there accounts with multiple loans: False


#### Credit Card

In [286]:
card = data_loader.load_csv('card', parse_dates={'issued': '%y%m%d'})

card.sample(5)

Unnamed: 0,card_id,disp_id,type,issued
255,564,3590,classic,1997-05-06
11,181,1066,classic,1994-08-19
353,767,5204,classic,1997-09-14
159,854,6753,classic,1996-10-07
88,106,592,classic,1996-01-14


#### District

In [287]:
district = data_loader.load_csv('district')

district = district.rename(columns={
    'A1': 'district_id',
    'A2': 'district_name',
    'A3': 'region',
    'A4': 'population',
    'A5': 'n_municipalities_with_inhabitants_lt_499',
    'A6': 'n_municipalities_with_inhabitants_500_to_1999',
    'A7': 'n_municipalities_with_inhabitants_2000_to_9999',
    'A8': 'n_municipalities_with_inhabitants_gt_10000',
    'A9': 'n_cities',
    'A10': 'ratio_urban_inhabitants',
    'A11': 'average_salary',
    'A12': 'unemployment_rate_95',
    'A13': 'unemployment_rate_96',
    'A14': 'enterpreneurs_per_1000_inhabitants',
    'A15': 'n_commited_crimes_95',
    'A16': 'n_commited_crimes_96'
})

district.sample(5)

Unnamed: 0,district_id,district_name,region,population,n_municipalities_with_inhabitants_lt_499,n_municipalities_with_inhabitants_500_to_1999,n_municipalities_with_inhabitants_2000_to_9999,n_municipalities_with_inhabitants_gt_10000,n_cities,ratio_urban_inhabitants,average_salary,unemployment_rate_95,unemployment_rate_96,enterpreneurs_per_1000_inhabitants,n_commited_crimes_95,n_commited_crimes_96
32,33,Decin,north Bohemia,133777,24,17,7,3,11,84.7,8705,5.75,7.61,116,4650,4859
57,58,Jihlava,south Moravia,109164,98,16,6,1,4,63.6,8757,3.38,3.95,96,2212,2471
44,45,Jicin,east Bohemia,77917,85,19,6,1,5,53.5,8390,2.28,2.89,132,2080,2122
51,52,Usti nad Orlici,east Bohemia,139012,59,41,8,3,10,61.9,8363,2.52,3.49,108,2564,2799
19,20,Strakonice,south Bohemia,70646,94,14,3,1,4,58.4,8547,2.65,3.64,120,1563,1542


### Data Merging

In [288]:
from utils import add_prefix_except_id

account = add_prefix_except_id(account, 'account_', id_exceptions=['district_id'])
client_df = disp.merge(account, on='account_id', how='left')

client = add_prefix_except_id(client, 'client_', id_exceptions=['district_id'])
client_df = client_df.merge(client, on='client_id', how='left')

order_pivot = add_prefix_except_id(order_pivot, 'ordertype_')
client_df = client_df.merge(order_pivot, on='account_id', how='left')

loan = add_prefix_except_id(loan, 'loan_')
client_df = client_df.merge(loan, on='account_id', how='left')

card = add_prefix_except_id(card, 'card_')
client_df = client_df.merge(card, on='disp_id', how='left')

client_district = add_prefix_except_id(district, 'client_district_')
client_df = client_df.merge(client_district, left_on='client_district_id', right_on='district_id', how='left')

account_district = add_prefix_except_id(district, 'account_district_')
client_df = client_df.merge(account_district, left_on='account_district_id', right_on='district_id', how='left')

client_df.sample(5)

n_merged_base_client = client_df.shape[0]

In [289]:
assert client_df['account_id'].nunique() == client_df.shape[0]

### Data Cleaning

#### Removing Junior Cards

In [290]:
junior_clients = client_df[client_df['card_type'] == 'junior']

client_df = client_df[~client_df['account_id'].isin(junior_clients['account_id'])]

transaction = transaction[~transaction['account_id'].isin(junior_clients['account_id'])]

print(f'Number of junior clients: {junior_clients.shape[0]}')
print(f'Number of clients remaining: {client_df.shape[0]}')

Number of junior clients: 145
Number of clients remaining: 4355


## Model Construction

### Processing Transactional Data
- The goal is to predict if a non-card-owner will buy a card or not

The first task is to look if every account in the transactions dataframe has a "first transaction". This would make the calculation of the monthly balance much easier since everything can be summed up without having to worry that there were months without records in the transaction dataframe.

In [291]:
# Find the minimum (first) transaction(s) date for each account
min_dates = transaction.groupby('account_id')['date'].min().reset_index()
min_dates.rename(columns={'date': 'min_date'}, inplace=True)

# Merge the minimum date back to the transactions to identify all transactions on the first day
transactions_with_min_date = pd.merge(transaction, min_dates, on='account_id')

# Filter transactions that are on the first day
first_day_transactions = transactions_with_min_date[transactions_with_min_date['date'] == transactions_with_min_date['min_date']]
first_day_transactions = first_day_transactions.copy()

# Now, for each of these first day transactions, check if any have amount equals balance
first_day_transactions['amount_equals_balance'] = first_day_transactions['amount'] == first_day_transactions['balance']

# Group by account_id and check if any transactions for each account meet the condition
accounts_meeting_condition = first_day_transactions.groupby('account_id')['amount_equals_balance'].any().reset_index()

# Verify if all accounts have at least one transaction on the first day meeting the condition
all_accounts_covered = accounts_meeting_condition['amount_equals_balance'].all()

print("Does every account's first day of transactions include at least one transaction where amount equals balance?", all_accounts_covered)

Does every account's first day of transactions include at least one transaction where amount equals balance? True


Now every accounts balance needs to be calculated per month.

In [292]:
transaction['month'] = transaction['date'].dt.to_period('M')

transactions_monthly = transaction.groupby(['account_id', 'month']).agg(
    volume=('amount', 'sum'),
    credit=('amount', lambda x: x[x > 0].sum()),
    withdrawal=('amount', lambda x: x[x < 0].sum()),
    n_transactions=('amount', 'size')
).reset_index()

In [293]:
transactions_monthly['month'] = pd.PeriodIndex(transactions_monthly['month'])

date_ranges = transactions_monthly.groupby('account_id')['month'].agg(['min', 'max'])

def reindex_df(group, account_id):
    idx = pd.period_range(start=group['month'].min(), end=group['month'].max(), freq='M')
    group.set_index('month', inplace=True)
    group = group.reindex(idx, fill_value=0)
    group.reset_index(inplace=True)
    group.rename(columns={'index': 'month'}, inplace=True)
    group['account_id'] = account_id
    return group

transactions_monthly = (transactions_monthly.groupby('account_id')
                        .apply(lambda x: reindex_df(x, x.name))
                        .reset_index(level=0, drop=True))

# Calculate cumulative balance
transactions_monthly['balance'] = transactions_monthly.groupby('account_id')['volume'].cumsum()





In [294]:
transactions_monthly

Unnamed: 0,month,account_id,volume,credit,withdrawal,n_transactions,balance
0,1995-03,1,1000.0,1000.0,0.0,1,1000.0
1,1995-04,1,16298.2,16298.2,0.0,3,17298.2
2,1995-05,1,5858.0,5858.0,0.0,3,23156.2
3,1995-06,1,3979.6,3979.6,0.0,3,27135.8
4,1995-07,1,9087.9,9087.9,0.0,3,36223.7
...,...,...,...,...,...,...,...
36,1998-08,11382,54569.5,54569.5,0.0,7,2386853.6
37,1998-09,11382,44120.0,44120.0,0.0,5,2430973.6
38,1998-10,11382,63262.2,63262.2,0.0,6,2494235.8
39,1998-11,11382,50165.7,50165.7,0.0,5,2544401.5


### Defining Roll-Up Windows of Transactions

Before we can continue to filter out customers that have at least 13 Months of transaction history it is also needed to give non-customers a fictional `card_issued` date so we can build negative samples that also contain a 13-Month rollup window.

In [295]:
clients_with_cards = client_df[~client_df['card_issued'].isnull()]
time_between_creation_and_issue = (clients_with_cards['card_issued']-clients_with_cards['account_date']).dt.days

fig = go.Figure()

fig.add_trace(go.Histogram(x=time_between_creation_and_issue, histnorm='percent', nbinsx=50))

fig.update_traces(marker_line_width=1, marker_line_color="white")

fig.update_layout(
    title='Distribution of Card Issuance Dates',
    xaxis_title='Days',
    yaxis_title='Percentage of Accounts'
)

fig.show()

An observation we can make when looking at the distribution of when cards usually get issued after the account creation is that there don't seem to be any issuances before day 200 of the account.

In [296]:
np.random.seed(1337)

sampled_deltas = np.random.choice(time_between_creation_and_issue, size=len(client_df[client_df['card_issued'].isnull()]))

fig = go.Figure()

fig.add_trace(go.Histogram(x=sampled_deltas, histnorm='percent', name='Sampled Deltas', nbinsx=50))
fig.add_trace(go.Histogram(x=time_between_creation_and_issue, histnorm='percent', name='Original Deltas', nbinsx=50))

fig.update_traces(marker_line_width=1, marker_line_color="white")

fig.update_layout(
    title='Distribution of Card Issuance Dates',
    xaxis_title='Days',
    yaxis_title='Percentage of Accounts'
)

fig.show()

In [297]:
# fill the clients with missing card_issued dates with the sampled deltas
if len(client_df[client_df['card_issued'].isnull()]) > 0:
    client_df.loc[client_df['card_issued'].isnull(), 'card_issued'] = client_df.loc[client_df['card_issued'].isnull(), 'account_date'] + pd.to_timedelta(sampled_deltas, unit='D')
    
print(f'Number of NaT/NaN values in card_issued: {client_df['card_issued'].isnull().sum()}')

Number of NaT/NaN values in card_issued: 0


In [298]:
card_issued = card.groupby('disp_id')['card_issued'].min().reset_index()
card_issued.head(5)

Unnamed: 0,disp_id,card_issued
0,9,1998-10-16
1,19,1998-03-13
2,41,1995-09-03
3,42,1998-11-26
4,51,1995-04-24


In [299]:
client_df[client_df['disp_id'].isnull()].shape[0]

transactions_monthly.info()

<class 'pandas.core.frame.DataFrame'>
Index: 179054 entries, 0 to 40
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype    
---  ------          --------------   -----    
 0   month           179054 non-null  period[M]
 1   account_id      179054 non-null  int64    
 2   volume          179054 non-null  float64  
 3   credit          179054 non-null  float64  
 4   withdrawal      179054 non-null  float64  
 5   n_transactions  179054 non-null  int64    
 6   balance         179054 non-null  float64  
dtypes: float64(4), int64(2), period[M](1)
memory usage: 10.9 MB


In [300]:
# join in the card_issued date from client_date to the transactions_monthly
transactions_monthly = transactions_monthly.merge(client_df[['account_id', 'card_issued']], left_on='account_id', right_on='account_id', how='left')

transactions_monthly.head()

Unnamed: 0,month,account_id,volume,credit,withdrawal,n_transactions,balance,card_issued
0,1995-03,1,1000.0,1000.0,0.0,1,1000.0,1998-09-18
1,1995-04,1,16298.2,16298.2,0.0,3,17298.2,1998-09-18
2,1995-05,1,5858.0,5858.0,0.0,3,23156.2,1998-09-18
3,1995-06,1,3979.6,3979.6,0.0,3,27135.8,1998-09-18
4,1995-07,1,9087.9,9087.9,0.0,3,36223.7,1998-09-18


Now, let's see if the join worked correctly and we don't have any transactions without a `card_issued` date anymore.

In [301]:
assert transactions_monthly['card_issued'].isnull().sum() == 0

### Finding a way to impute card_issued date for non-buyers

In [302]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

REFERENCE_DATE = client_df['card_issued'].dropna().mean()

class DateFeaturesExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, reference_date):
        self.reference_date = reference_date

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        for col in X.columns:
            X[col] = (pd.to_datetime(self.reference_date) - pd.to_datetime(X[col])).dt.days
        return X

datetime_cols = ['client_birth_date']
categorical_cols = ['client_gender', 'client_district_region', 'account_district_region']
numerical_cols = [col for col in client_df.columns if client_df[col].dtype in ['int64', 'float64']
                  and not col.endswith('_id')
                  and col not in ['ordertype_HOUSEHOLD', 'ordertype_INSURANCE PAYMENT', 'ordertype_LEASING', 'ordertype_LOAN PAYMENT', 'ordertype_MISSING']
                  and col not in datetime_cols]

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

date_transformer = DateFeaturesExtractor(reference_date=REFERENCE_DATE)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        ('date', date_transformer, datetime_cols)
    ])

model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('model', KNeighborsRegressor(n_neighbors=10))
                       ])

X = client_df.drop(columns=['card_issued'])
y = client_df['card_issued'].fillna(pd.Timestamp(REFERENCE_DATE))

y = (pd.to_datetime(REFERENCE_DATE) - pd.to_datetime(y)).dt.days

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model.fit(X_train, y_train)

In [303]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

y_pred = model.predict(X_test)
predicted_dates = pd.to_datetime(REFERENCE_DATE) - pd.to_timedelta(y_pred, unit='d')
true_dates = pd.to_datetime(REFERENCE_DATE) - pd.to_timedelta(y_test, unit='d')
errors = (predicted_dates - true_dates).dt.days.abs()

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
average_error_days = errors.mean()

print(f'Mean Absolute Error (MAE): {mae:.2f} days')
print(f'Mean Squared Error (MSE): {mse:.2f} days^2')
print(f'Average error in days: {average_error_days:.2f} days')
print(f'Error distribution quantiles: {np.quantile(errors, [0.25, 0.5, 0.75])}')

Mean Absolute Error (MAE): 569.70 days
Mean Squared Error (MSE): 525794.88 days^2
Average error in days: 569.71 days
Error distribution quantiles: [198.5 469.  822. ]


In [304]:
transactions_monthly['card_issued'] = pd.to_datetime(transactions_monthly['card_issued'])
transactions_monthly['card_issued_period'] = transactions_monthly['card_issued'].dt.to_period('M')

transactions_monthly['month_diff'] = transactions_monthly.apply(lambda row: (row['card_issued_period'] - row['month']).n if pd.notnull(row['card_issued_period']) and pd.notnull(row['month']) else None, axis=1)

filtered_transactions = transactions_monthly[transactions_monthly['month_diff'].between(1, 13)]

In [305]:
transactions_monthly

Unnamed: 0,month,account_id,volume,credit,withdrawal,n_transactions,balance,card_issued,card_issued_period,month_diff
0,1995-03,1,1000.0,1000.0,0.0,1,1000.0,1998-09-18,1998-09,42
1,1995-04,1,16298.2,16298.2,0.0,3,17298.2,1998-09-18,1998-09,41
2,1995-05,1,5858.0,5858.0,0.0,3,23156.2,1998-09-18,1998-09,40
3,1995-06,1,3979.6,3979.6,0.0,3,27135.8,1998-09-18,1998-09,39
4,1995-07,1,9087.9,9087.9,0.0,3,36223.7,1998-09-18,1998-09,38
...,...,...,...,...,...,...,...,...,...,...
179049,1998-08,11382,54569.5,54569.5,0.0,7,2386853.6,1996-09-30,1996-09,-23
179050,1998-09,11382,44120.0,44120.0,0.0,5,2430973.6,1996-09-30,1996-09,-24
179051,1998-10,11382,63262.2,63262.2,0.0,6,2494235.8,1996-09-30,1996-09,-25
179052,1998-11,11382,50165.7,50165.7,0.0,5,2544401.5,1996-09-30,1996-09,-26


In [306]:
filtered_transactions.sort_values(by=['account_id', 'month_diff'])

Unnamed: 0,month,account_id,volume,credit,withdrawal,n_transactions,balance,card_issued,card_issued_period,month_diff
41,1998-08,1,6492.7,6492.7,0.0,5,338855.2,1998-09-18,1998-09,1
40,1998-07,1,6221.0,6221.0,0.0,4,332362.5,1998-09-18,1998-09,2
39,1998-06,1,6667.1,6667.1,0.0,5,326141.5,1998-09-18,1998-09,3
38,1998-05,1,6212.3,6212.3,0.0,4,319474.4,1998-09-18,1998-09,4
37,1998-04,1,7435.5,7435.5,0.0,6,313262.1,1998-09-18,1998-09,5
...,...,...,...,...,...,...,...,...,...,...
179017,1995-12,11382,82563.6,82563.6,0.0,4,221665.9,1996-09-30,1996-09,9
179016,1995-11,11382,51280.6,51280.6,0.0,4,139102.3,1996-09-30,1996-09,10
179015,1995-10,11382,49912.3,49912.3,0.0,4,87821.7,1996-09-30,1996-09,11
179014,1995-09,11382,37709.4,37709.4,0.0,4,37909.4,1996-09-30,1996-09,12


An issue at this point can be that an account may have months without any transactions, so there needs to be a more thorough process to interpolate the data:
- **Volume, Withdrawal, Credit and Number of Transactions**: If there are missing months in these variables we can just set `0` as their value as there has not been any activity if there were no recorded months.
- **Balance**: The balance will get recursively set to the last preceding recorded month. So if there are consecutive "missing" months in transactions the balance will always be set to the last recorded month.

In [307]:
account_summary = pd.DataFrame(filtered_transactions['account_id'].unique(), columns=['account_id'])

variables_to_pivot = ['volume', 'withdrawal', 'credit', 'n_transactions', 'balance']

for variable in variables_to_pivot:
    grouped = filtered_transactions.groupby(['account_id', 'month_diff'])[variable].sum().reset_index()
    pivot = grouped.pivot(index='account_id', columns='month_diff', values=variable).reset_index()
    pivot.columns = ['account_id'] + [f'{variable}_month_diff_{int(col)}' if col != 'account_id' else 'account_id' for col in pivot.columns[1:]]
    account_summary = pd.merge(account_summary, pivot, on='account_id', how='left')

for variable in ['volume', 'withdrawal', 'credit', 'n_transactions']:
    account_summary.update(account_summary.filter(regex=f'^{variable}_').fillna(0))

def find_last_balance(account_id, starting_month_diff):
    higher_month_diff = transactions_monthly[
        (transactions_monthly['account_id'] == account_id) & 
        (transactions_monthly['month_diff'] >= starting_month_diff)
    ].sort_values('month_diff')
    
    last_balance_row = higher_month_diff[higher_month_diff['balance'].notna()].head(1)
    
    if not last_balance_row.empty:
        return last_balance_row.iloc[0]['balance']
    else:
        return 0

balance_columns = [col for col in account_summary.columns if 'balance_month_diff_' in col]
for idx, row in account_summary.iterrows():
    for col in balance_columns:
        if pd.isna(row[col]):
            month_diff = int(col.split('_')[-1])
            last_balance = find_last_balance(row['account_id'], month_diff + 1)
            account_summary.at[idx, col] = last_balance

In [308]:
account_summary.head()

Unnamed: 0,account_id,volume_month_diff_1,volume_month_diff_2,volume_month_diff_3,volume_month_diff_4,volume_month_diff_5,volume_month_diff_6,volume_month_diff_7,volume_month_diff_8,volume_month_diff_9,volume_month_diff_10,volume_month_diff_11,volume_month_diff_12,volume_month_diff_13,withdrawal_month_diff_1,withdrawal_month_diff_2,withdrawal_month_diff_3,withdrawal_month_diff_4,withdrawal_month_diff_5,withdrawal_month_diff_6,withdrawal_month_diff_7,withdrawal_month_diff_8,withdrawal_month_diff_9,withdrawal_month_diff_10,withdrawal_month_diff_11,withdrawal_month_diff_12,withdrawal_month_diff_13,credit_month_diff_1,credit_month_diff_2,credit_month_diff_3,credit_month_diff_4,credit_month_diff_5,credit_month_diff_6,credit_month_diff_7,credit_month_diff_8,credit_month_diff_9,credit_month_diff_10,credit_month_diff_11,credit_month_diff_12,credit_month_diff_13,n_transactions_month_diff_1,n_transactions_month_diff_2,n_transactions_month_diff_3,n_transactions_month_diff_4,n_transactions_month_diff_5,n_transactions_month_diff_6,n_transactions_month_diff_7,n_transactions_month_diff_8,n_transactions_month_diff_9,n_transactions_month_diff_10,n_transactions_month_diff_11,n_transactions_month_diff_12,n_transactions_month_diff_13,balance_month_diff_1,balance_month_diff_2,balance_month_diff_3,balance_month_diff_4,balance_month_diff_5,balance_month_diff_6,balance_month_diff_7,balance_month_diff_8,balance_month_diff_9,balance_month_diff_10,balance_month_diff_11,balance_month_diff_12,balance_month_diff_13
0,1,6492.7,6221.0,6667.1,6212.3,7435.5,7018.6,6701.9,9091.5,10907.2,7318.0,6218.0,6600.6,8282.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6492.7,6221.0,6667.1,6212.3,7435.5,7018.6,6701.9,9091.5,10907.2,7318.0,6218.0,6600.6,8282.7,5.0,4.0,5.0,4.0,6.0,5.0,5.0,10.0,5.0,5.0,4.0,5.0,6.0,338855.2,332362.5,326141.5,319474.4,313262.1,305826.6,298808.0,292106.1,283014.6,272107.4,264789.4,258571.4,251970.8
1,2,34617.6,45943.4,52856.1,47098.6,52913.9,37980.7,31345.5,23949.5,1100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34617.6,45943.4,52856.1,47098.6,52913.9,37980.7,31345.5,23949.5,1100.0,0.0,0.0,0.0,0.0,6.0,7.0,7.0,6.0,3.0,3.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,327805.3,293187.7,247244.3,194388.2,147289.6,94375.7,56395.0,25049.5,1100.0,0.0,0.0,0.0,0.0
2,4,10917.6,12013.8,9011.7,14254.5,11343.0,9015.0,10769.2,9003.4,20249.4,13529.7,9040.6,9031.2,12329.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10917.6,12013.8,9011.7,14254.5,11343.0,9015.0,10769.2,9003.4,20249.4,13529.7,9040.6,9031.2,12329.5,6.0,6.0,5.0,6.0,7.0,5.0,6.0,5.0,12.0,7.0,5.0,5.0,6.0,195475.7,184558.1,172544.3,163532.6,149278.1,137935.1,128920.1,118150.9,109147.5,88898.1,75368.4,66327.8,57296.6
3,5,8607.1,7797.2,15720.3,10825.1,7812.5,5035.1,5017.0,5017.0,5017.0,5017.0,600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8607.1,7797.2,15720.3,10825.1,7812.5,5035.1,5017.0,5017.0,5017.0,5017.0,600.0,0.0,0.0,5.0,4.0,10.0,6.0,4.0,3.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,76465.3,67858.2,60061.0,44340.7,33515.6,25703.1,20668.0,15651.0,10634.0,5617.0,600.0,0.0,0.0
4,6,14333.0,10810.0,10798.3,11989.9,20888.8,13055.1,10807.0,13039.2,10792.4,23857.9,11865.2,10815.6,10803.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14333.0,10810.0,10798.3,11989.9,20888.8,13055.1,10807.0,13039.2,10792.4,23857.9,11865.2,10815.6,10803.9,5.0,4.0,4.0,5.0,6.0,5.0,4.0,5.0,4.0,11.0,6.0,4.0,4.0,625481.9,611148.9,600338.9,589540.6,577550.7,556661.9,543606.8,532799.8,519760.6,508968.2,485110.3,473245.1,462429.5


### Bringing the data together

In [309]:
client_df = client_df.merge(account_summary, on='account_id', how='right')

In [314]:
clients_with_card = client_df[~client_df['card_id'].isnull()]

In [322]:
n_lt_13_month_hist = transactions_monthly['account_id'].nunique()-filtered_transactions['account_id'].nunique()

print(n_merged_base_client)

preprocessing_summary = [
        n_merged_base_client,
         -len(junior_clients),
         -n_lt_13_month_hist,
         len(clients_with_card)
]

fig = go.Figure(go.Waterfall(
        name = "20", orientation = "v",
        measure = ["absolute", "relative", "relative", "total"],
    x = ["Base Client List", "Junior Clients", "Clients with less than 13 Months of Transaction History", "Remaining Clients"],
    textposition = "outside",
    y = preprocessing_summary,
    text = [str(x) for x in preprocessing_summary],
    connector = {"line":{"color":"rgb(63, 63, 63)"}},
))

fig.update_layout(
        title = "Profit and loss statement 2018",
        showlegend = True
)

fig.show()

4500


### Generating Event-Based Customer Information

## Feature Engineering

### Deriving New Features