# Neo Bank Data Preparation Notebook

The purpose of this notebook is to clean the dataset and create the features used in the train/test datasets

In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from datetime import timedelta

### Load historical (2008-2023) dataset used to train the model

In [2]:
# Read one parquet file per year (2008-2023) and stack them into a single
# training frame. `pd` comes from the import cell at the top of the notebook.
dataframes = []
failed_paths = []  # files that could not be read; summarised after the loop

for year in range(2008, 2024):
    file_path = f"../data/train_{year}.parquet"
    try:
        dataframes.append(pd.read_parquet(file_path))
    except Exception as e:
        # Best-effort load: one unreadable year must not abort the other years.
        failed_paths.append(file_path)
        print(f"Failed to read file {file_path}: {e}")

if not dataframes:
    # Stop here with a clear message; otherwise combined_train_raw_df is never
    # defined and the next cell fails with an unrelated-looking NameError.
    raise RuntimeError("No files were successfully read.")

combined_train_raw_df = pd.concat(dataframes, ignore_index=True)
print(f"Total rows in the combined DataFrame: {len(combined_train_raw_df)}")

if failed_paths:
    # Make a partial load hard to miss: the training history is incomplete.
    print(f"WARNING: {len(failed_paths)} yearly file(s) were skipped: {failed_paths}")

Total rows in the combined DataFrame: 3926058


### Load test dataset (2024)

In [3]:
# The hold-out data lives in a single parquet file.
file_path = "../data/test.parquet"
test_raw_df = pd.read_parquet(file_path)

# Row count of the raw test frame.
print(len(test_raw_df))

1360472


### Create the training dataset (customer_id level data)

Churn definition:
- No interactions with the bank for at least 18 months before the max date in the dataset (2023-12-31 in train)

In [None]:
# Parse the interaction dates so they can be compared and subtracted.
combined_train_raw_df['date'] = pd.to_datetime(combined_train_raw_df['date'])

# Series.max() is vectorised and skips NaT. The builtin max() walks the rows in
# Python and is not NaT-aware (every comparison against NaT is False).
max_date_train = combined_train_raw_df['date'].max()
print(max_date_train)

2023-12-31 00:00:00


In [5]:
# Parse the interaction dates so they can be compared and subtracted.
test_raw_df['date'] = pd.to_datetime(test_raw_df['date'])

# Series.max() is vectorised and skips NaT. The builtin max() walks the rows in
# Python and is not NaT-aware (every comparison against NaT is False).
max_date_test = test_raw_df['date'].max()
print(max_date_test)

2026-12-31 00:00:00


In [6]:
# Most recent interaction date for every customer in the training history.
last_interaction = combined_train_raw_df.groupby('customer_id')['date'].max()

# A customer is labelled as churned when their last interaction is earlier
# than 18 months before the final date in the training data.
churn_date = pd.Timestamp(max_date_train)
cutoff_date = churn_date - pd.DateOffset(months=18)
churn_status = last_interaction.lt(cutoff_date)

# One row per customer_id with the boolean label.
df_train = churn_status.rename('churn_status').reset_index()

churned_share = df_train['churn_status'].sum() / len(df_train)
churn_percentage = churned_share * 100
print(f"{churn_percentage:.2f}% of distinct customers since 2008-2023 have churned under our definition")

df_train.head()

19.94% of distinct customers since 2008-2023 have churned under our definition


Unnamed: 0,customer_id,churn_status
0,1,False
1,2,True
2,3,True
3,4,True
4,5,False


In [7]:
# Most recent interaction date for every customer in the test data.
last_interaction = test_raw_df.groupby('customer_id')['date'].max()

# A customer is labelled as churned when their last interaction is earlier
# than 18 months before the final date in the test data.
churn_date = pd.Timestamp(max_date_test)
cutoff_date = churn_date - pd.DateOffset(months=18)
churn_status = last_interaction.lt(cutoff_date)

# One row per customer_id with the boolean label.
df_test = churn_status.rename('churn_status').reset_index()

churned_share = df_test['churn_status'].sum() / len(df_test)
churn_percentage = churned_share * 100
print(f"{churn_percentage:.2f}% of distinct customers in 2024 have churned under our definition")

df_test.head()

14.96% of distinct customers in 2024 have churned under our definition


Unnamed: 0,customer_id,churn_status
0,1,False
1,2,False
2,3,True
3,4,True
4,5,True


## Features


### Customer Age

In [8]:
def _completed_years(born, as_of):
    """Whole years elapsed between two datetime Series (calendar-accurate).

    Compares month/day instead of dividing a day count by 365, so leap days
    do not push the result up by one shortly before a birthday. Rows where
    either date is missing yield NaN.
    """
    birthday_pending = (as_of.dt.month < born.dt.month) | (
        (as_of.dt.month == born.dt.month) & (as_of.dt.day < born.dt.day)
    )
    return as_of.dt.year - born.dt.year - birthday_pending.astype(int)


def _add_customer_age(raw_df, customer_df):
    """Attach each customer's age at their last interaction.

    Parameters
    ----------
    raw_df : interaction-level frame with `customer_id`, `date` and
        `date_of_birth` columns.
    customer_df : one-row-per-customer frame keyed by `customer_id`.

    Returns
    -------
    (raw_df, customer_df) : the raw frame with `date_of_birth` parsed and
        `last_interaction_date` / `age` columns set, and the customer frame
        with an `age` column merged in.
    """
    # transform('max') writes the per-customer last date straight onto the
    # rows. Unlike merging it back, re-running the cell just overwrites the
    # column instead of producing *_x / *_y duplicates.
    raw_df = raw_df.assign(
        date_of_birth=pd.to_datetime(raw_df['date_of_birth']),
        last_interaction_date=raw_df.groupby('customer_id')['date'].transform('max'),
    )
    raw_df['age'] = _completed_years(raw_df['date_of_birth'], raw_df['last_interaction_date'])

    customer_age = raw_df.groupby('customer_id')['age'].first().reset_index()

    # Drop any age column left by a previous run so the merge stays clean.
    customer_df = customer_df.drop(columns='age', errors='ignore').merge(
        customer_age, on='customer_id', how='left'
    )
    return raw_df, customer_df


combined_train_raw_df, df_train = _add_customer_age(combined_train_raw_df, df_train)
test_raw_df, df_test = _add_customer_age(test_raw_df, df_test)

### Volume Transfer diffs

In [9]:
# Order the raw interactions chronologically within each customer.
combined_train_raw_df['date'] = pd.to_datetime(combined_train_raw_df['date'])
combined_train_raw_df = combined_train_raw_df.sort_values(by=['customer_id', 'date'])

# Flag the rows that fall in the 30 days leading up to (and including) each
# customer's most recent interaction.
combined_train_raw_df['max_date'] = combined_train_raw_df.groupby('customer_id')['date'].transform('max')
window_start = combined_train_raw_df['max_date'] - pd.Timedelta(days=30)
combined_train_raw_df['in_window'] = combined_train_raw_df['date'] > window_start

# Multiplying by the boolean flag zeroes the volumes outside the window.
in_window_flag = combined_train_raw_df['in_window']
combined_train_raw_df['window_in'] = combined_train_raw_df['bank_transfer_in_volume'] * in_window_flag
combined_train_raw_df['window_out'] = combined_train_raw_df['bank_transfer_out_volume'] * in_window_flag

# Per-customer totals of the windowed inbound and outbound volumes.
sums = (
    combined_train_raw_df
    .groupby('customer_id', sort=False)[['window_in', 'window_out']]
    .sum()
    .rename(columns={'window_in': 'total_bank_in', 'window_out': 'total_bank_out'})
    .reset_index()
)

# Net transfer volume: inbound minus outbound.
sums['bank_diff'] = sums['total_bank_in'].sub(sums['total_bank_out'])

df_train = df_train.merge(sums[['customer_id', 'bank_diff']], on='customer_id', how='left')

Unnamed: 0,customer_id,churn_status,age,bank_diff
0,1,False,68,257.42149
1,2,True,63,276.999433
2,3,True,19,23274.169749
3,4,True,35,1840.653045
4,5,False,16,15641.464698


In [None]:
# Order the raw interactions chronologically within each customer.
test_raw_df['date'] = pd.to_datetime(test_raw_df['date'])
test_raw_df = test_raw_df.sort_values(by=['customer_id', 'date'])

# Flag the rows that fall in the 30 days leading up to (and including) each
# customer's most recent interaction.
test_raw_df['max_date'] = test_raw_df.groupby('customer_id')['date'].transform('max')
window_start = test_raw_df['max_date'] - pd.Timedelta(days=30)
test_raw_df['in_window'] = test_raw_df['date'] > window_start

# Multiplying by the boolean flag zeroes the volumes outside the window.
in_window_flag = test_raw_df['in_window']
test_raw_df['window_in'] = test_raw_df['bank_transfer_in_volume'] * in_window_flag
test_raw_df['window_out'] = test_raw_df['bank_transfer_out_volume'] * in_window_flag

# Per-customer totals of the windowed inbound and outbound volumes.
sums = (
    test_raw_df
    .groupby('customer_id', sort=False)[['window_in', 'window_out']]
    .sum()
    .rename(columns={'window_in': 'total_bank_in', 'window_out': 'total_bank_out'})
    .reset_index()
)

# Net transfer volume: inbound minus outbound.
sums['bank_diff'] = sums['total_bank_in'].sub(sums['total_bank_out'])

df_test = df_test.merge(sums[['customer_id', 'bank_diff']], on='customer_id', how='left')

Unnamed: 0,customer_id,churn_status,age,bank_diff
0,1,False,71,734.663785
1,2,False,78,2.824867
2,3,True,21,40025.197105
3,4,True,51,365.017325
4,5,True,18,2698.584398


### Complaints

In [11]:
def calculate_30d_complaints(df, window_days=30):
    """Sum each customer's complaints in the window ending at their last interaction.

    Parameters
    ----------
    df : DataFrame with `customer_id`, `date` (datetime) and `complaints` columns.
    window_days : length of the look-back window in days (default 30).

    Returns
    -------
    DataFrame with columns `customer_id` and `total_complaints_30d`. The output
    column keeps that name for any `window_days` so existing merges still work.

    The window runs from `window_days` before the customer's last interaction
    date up to and including that date. The last day used to be excluded by a
    strict `<` comparison, which dropped any complaint logged on it and left
    the feature empty for customers with no other row in the window.
    """
    # Per-row copy of the customer's last interaction date (no merge needed).
    date_last = df.groupby('customer_id')['date'].transform('max')

    # No upper bound is required: no row can be later than its own group's max.
    in_window = df['date'] >= date_last - pd.Timedelta(days=window_days)

    complaints_30d = (
        df.loc[in_window]
        .groupby('customer_id')['complaints']
        .sum()
        .reset_index(name='total_complaints_30d')
    )

    return complaints_30d

# Build the complaint feature separately for each dataset.
complaints_30d_train = calculate_30d_complaints(combined_train_raw_df)
complaints_30d_test = calculate_30d_complaints(test_raw_df)

# Customers absent from the feature table had no rows in the window: count as 0.
df_train = (
    df_train
    .merge(complaints_30d_train, on='customer_id', how='left')
    .fillna({'total_complaints_30d': 0})
)

df_test = (
    df_test
    .merge(complaints_30d_test, on='customer_id', how='left')
    .fillna({'total_complaints_30d': 0})
)

### Longest gap

In [13]:
def _longest_gap_days(raw_df):
    """Longest gap, in days, between consecutive interactions of each customer.

    Parameters
    ----------
    raw_df : interaction-level frame with `customer_id` and `date` columns.

    Returns
    -------
    DataFrame with columns `customer_id` and `longest_gap`. Customers with a
    single interaction have no gap and get NaN here.
    """
    ordered = raw_df.sort_values(by=['customer_id', 'date'])
    # diff() within each customer: time since that customer's previous row.
    gaps = ordered.groupby('customer_id')['date'].diff()
    longest = gaps.groupby(ordered['customer_id']).max()
    return longest.dt.days.rename('longest_gap').reset_index()


# Fill only the new column. A frame-wide fillna(0) would also overwrite
# missing values in every feature merged earlier (e.g. turn a missing age
# into 0), hiding data-quality problems.
df_train = (
    df_train
    .merge(_longest_gap_days(combined_train_raw_df), on='customer_id', how='left')
    .fillna({'longest_gap': 0})
)

df_test = (
    df_test
    .merge(_longest_gap_days(test_raw_df), on='customer_id', how='left')
    .fillna({'longest_gap': 0})
)

### Tenure

In [14]:
# Last recorded tenure per customer. "Last" follows row order, so this relies
# on combined_train_raw_df already being sorted by customer_id and date
# (done in the transfer-volume cell above).
latest_tenure = combined_train_raw_df.groupby('customer_id')['tenure'].last().reset_index()

# NOTE(review): tenure_bucket is computed but never merged into df_train; only
# the raw tenure is used below. Confirm whether the bucket was meant to be a
# feature. The bin edges suggest tenure is in days - TODO confirm. The bins
# are right-closed, so a tenure of exactly 0 falls outside them and gets NaN.
latest_tenure['tenure_bucket'] = pd.cut(latest_tenure['tenure'],
                                       bins=[0, 365, 1095, np.inf],
                                       labels=[0, 1, 2]) #'<1 year' = 0, '1-3 years' = 1, '>3 years' = 2

# Default only the tenure column. A frame-wide fillna(1) would write 1 into
# any missing value of every other feature as well.
df_train = df_train.merge(
    latest_tenure[['customer_id', 'tenure']], on='customer_id', how='left'
).fillna({'tenure': 1})

In [15]:
# Last recorded tenure per customer. "Last" follows row order, so this relies
# on test_raw_df already being sorted by customer_id and date
# (done in the transfer-volume cell above).
latest_tenure = test_raw_df.groupby('customer_id')['tenure'].last().reset_index()

# NOTE(review): tenure_bucket is computed but never merged into df_test; only
# the raw tenure is used below. Confirm whether the bucket was meant to be a
# feature. The bins are right-closed, so a tenure of exactly 0 gets NaN.
latest_tenure['tenure_bucket'] = pd.cut(latest_tenure['tenure'],
                                       bins=[0, 365, 1095, np.inf],
                                       labels=[0, 1, 2])

# Default only the tenure column. A frame-wide fillna(1) would write 1 into
# any missing value of every other feature as well.
df_test = df_test.merge(
    latest_tenure[['customer_id', 'tenure']], on='customer_id', how='left'
).fillna({'tenure': 1})

### Unemployment Flag

In [16]:
def _latest_job_flags(raw_df):
    """Return one row per customer with their latest job and an unemployment flag.

    Rows are ordered newest-first within each customer, and the first job
    value per customer is taken (groupby `first()` skips missing values).
    `is_unemployed` is 1 when that job, trimmed and lower-cased, equals
    'unemployed', else 0.
    """
    newest_first = raw_df.sort_values(['customer_id', 'date'], ascending=[True, False])
    jobs = newest_first.groupby('customer_id')['job'].first().reset_index()

    normalised_job = jobs['job'].str.strip().str.lower()
    jobs['is_unemployed'] = (normalised_job == 'unemployed').astype(int)
    return jobs


# Training set.
latest_jobs = _latest_job_flags(combined_train_raw_df)
df_train = df_train.merge(
    latest_jobs[['customer_id', 'is_unemployed']],
    on='customer_id',
    how='left',
)
df_train = df_train.fillna({'is_unemployed': 0})
print(df_train['is_unemployed'].value_counts())

# Test set.
latest_jobs = _latest_job_flags(test_raw_df)
df_test = df_test.merge(
    latest_jobs[['customer_id', 'is_unemployed']],
    on='customer_id',
    how='left',
)
df_test = df_test.fillna({'is_unemployed': 0})
print(df_test['is_unemployed'].value_counts())

is_unemployed
0    94780
1    11399
Name: count, dtype: int64
is_unemployed
0    135846
1     29309
Name: count, dtype: int64


### Export Train/Test sets to CSV

In [17]:
# Preview the first five rows of the finished training set before export.
df_train.head(n=5)

Unnamed: 0,customer_id,churn_status,age,bank_diff,total_complaints_30d,longest_gap,tenure,is_unemployed
0,1,False,68,257.42149,0.0,384.0,5398,0
1,2,True,63,276.999433,0.0,397.0,887,0
2,3,True,19,23274.169749,0.0,401.0,5217,0
3,4,True,35,1840.653045,0.0,383.0,448,0
4,5,False,16,15641.464698,0.0,398.0,5469,0


In [18]:
# Preview the first five rows of the finished test set before export.
df_test.head(n=5)

Unnamed: 0,customer_id,churn_status,age,bank_diff,total_complaints_30d,longest_gap,tenure,is_unemployed
0,1,False,71,734.663785,0.0,0.0,5398,0
1,2,False,78,2.824867,0.0,0.0,887,0
2,3,True,21,40025.197105,0.0,0.0,5217,0
3,4,True,51,365.017325,0.0,0.0,448,0
4,5,True,18,2698.584398,0.0,375.0,6235,0


In [19]:
# Timestamp the export so earlier training sets are never overwritten.
export_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
filename = f"../data/train_{export_stamp}.csv"

df_train.to_csv(filename, index=False)
print(f"Trainset saved as: {filename}")

Trainset saved as: ../data/train_2025-02-04_17-10-32.csv


In [20]:
# Timestamp the export so earlier test sets are never overwritten.
export_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
filename = f"../data/test_{export_stamp}.csv"

df_test.to_csv(filename, index=False)
print(f"Testset saved as: {filename}")

Testset saved as: ../data/test_2025-02-04_17-10-33.csv
