# Neo Bank Data Preparation Notebook

The purpose of this notebook is to clean the dataset and create the features used in the train/test datasets

In [99]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from datetime import timedelta

### Load historical (2008-2023) dataset used to train the model

In [100]:
import pandas as pd

dataframes = []

for year in range(2008, 2023):
    file_path = f"../data/train_{year}.parquet"
    try:
        df = pd.read_parquet(file_path)
        dataframes.append(df)
    except Exception as e:
        print(f"Failed to read file {file_path}: {e}")
if dataframes:
    combined_train_raw_df = pd.concat(dataframes, ignore_index=True)
    print(f"Total rows in the combined DataFrame: {len(combined_train_raw_df)}")
else:
    print("No files were successfully read.")

Total rows in the combined DataFrame: 3518453


### Load test dataset (2024)

In [102]:
file_path = f"../data/test.parquet"
test_raw_df = pd.read_parquet(file_path)
print(len(test_raw_df))

1360472


### Data Cleaning

In [103]:
# drop unrealistic records of atm transfer out

# current justification: it's not realisitc that someone can transfer money out of an ATM thousands of times in single day also it accounts for tiny 
# TODO: find a source to backup the reasoning for removing this data

initial_count = len(combined_train_raw_df)

# drop records with more than 10 atm transfer outs
filtered_df = combined_train_raw_df.drop(combined_train_raw_df[combined_train_raw_df['atm_transfer_out'] > 10].index)

# Calculate dropped records
dropped_count = initial_count - len(filtered_df)
percentage_dropped = (dropped_count / initial_count) * 100

print(f"Records dropped: {dropped_count}")
print(f"Percentage of dataset removed: {percentage_dropped:.2f}%")

combined_train_raw_df = filtered_df

Records dropped: 1373
Percentage of dataset removed: 0.04%


In [104]:
# drop unrealistic records of atm transfer out

# current justification: it's not realisitc that someone can transfer money out of an ATM thousands of times in single day also it accounts for tiny 
# TODO: find a source to backup the reasoning for removing this data

initial_count = len(test_raw_df)

# drop records with more than 10 atm transfer outs
filtered_df = test_raw_df.drop(test_raw_df[test_raw_df['atm_transfer_out'] > 10].index)

# Calculate dropped records
dropped_count = initial_count - len(filtered_df)
percentage_dropped = (dropped_count / initial_count) * 100

print(f"Records dropped: {dropped_count}")
print(f"Percentage of dataset removed: {percentage_dropped:.2f}%")

test_raw_df = filtered_df

Records dropped: 316
Percentage of dataset removed: 0.02%


In [105]:
# drop unrealistic records of atm transfer in

# current justification: it's not realisitc that someone can transfer money out of an ATM thousands of times in single day also it accounts for tiny 
# TODO: find a source to backup the reasoning for removing this data

# drop records with more than 10 atm transfer ins
filtered_df = combined_train_raw_df.drop(combined_train_raw_df[combined_train_raw_df['atm_transfer_in'] > 10].index)

# Calculate dropped records
dropped_count = initial_count - len(filtered_df)
percentage_dropped = (dropped_count / initial_count) * 100

print(f"Records dropped: {dropped_count}")
print(f"Percentage of dataset removed: {percentage_dropped:.2f}%")

combined_train_raw_df = filtered_df

Records dropped: -2155451
Percentage of dataset removed: -158.43%


In [106]:
# drop unrealistic records of atm transfer in

# current justification: it's not realisitc that someone can transfer money out of an ATM thousands of times in single day also it accounts for tiny 
# TODO: find a source to backup the reasoning for removing this data

# drop records with more than 10 atm transfer ins
filtered_df = test_raw_df.drop(test_raw_df[test_raw_df['atm_transfer_in'] > 10].index)

# Calculate dropped records
dropped_count = initial_count - len(filtered_df)
percentage_dropped = (dropped_count / initial_count) * 100

print(f"Records dropped: {dropped_count}")
print(f"Percentage of dataset removed: {percentage_dropped:.2f}%")

test_raw_df = filtered_df

Records dropped: 680
Percentage of dataset removed: 0.05%


### Create the training dataset (customer_id level data)

Churn definition:
- No interactions with the bank for at 18 or more months since the max date in the dataset (2023-12-31)

In [107]:
# Ensure the 'date' column is in datetime format
combined_train_raw_df['date'] = pd.to_datetime(combined_train_raw_df['date'])

max_date = max(combined_train_raw_df['date'])
print(max_date)

2022-12-31 00:00:00


In [108]:
# Ensure the 'date' column is in datetime format
test_raw_df['date'] = pd.to_datetime(test_raw_df['date'])

max_date = max(test_raw_df['date'])
print(max_date)

2026-12-31 00:00:00


In [109]:
# Get the last interaction date for each customer
last_interaction = combined_train_raw_df.groupby('customer_id')['date'].max()

# Define the churn threshold
churn_date = pd.Timestamp(max_date) # Training the model up to the 
cutoff_date = churn_date - pd.DateOffset(months=18)

# Determine churn status (True if last interaction was before cutoff_date)
churn_status = last_interaction < cutoff_date

# Convert to DataFrame
df_train = churn_status.reset_index(name='churn_status')

churn_percentage = (df_train['churn_status'].sum() / len(df_train)) * 100
print(f"{churn_percentage:.2f}% of distinct customers since 2008-2023 have churned under our definition")

df_train.head()

100.00% of distinct customers since 2008-2023 have churned under our definition


Unnamed: 0,customer_id,churn_status
0,1,True
1,2,True
2,3,True
3,4,True
4,5,True


In [110]:
# Get the last interaction date for each customer
last_interaction = test_raw_df.groupby('customer_id')['date'].max()

# Define the churn threshold
churn_date = pd.Timestamp(max_date) # Training the model up to the 
cutoff_date = churn_date - pd.DateOffset(months=18)

# Determine churn status (True if last interaction was before cutoff_date)
churn_status = last_interaction < cutoff_date

# Convert to DataFrame
df_test = churn_status.reset_index(name='churn_status')

churn_percentage = (df_test['churn_status'].sum() / len(df_test)) * 100
print(f"{churn_percentage:.2f}% of distinct customers in 2024 have churned under our definition")

df_test.head()

14.97% of distinct customers in 2024 have churned under our definition


Unnamed: 0,customer_id,churn_status
0,1,False
1,2,False
2,3,True
3,4,True
4,5,True


## Feature Engineering

Features:
- customer age
- country
- account age
- customer job category

In [111]:
# This code compares our definition of churn to the instances of churn due to fraud (since we can assume that these are ACTUAL cases where a customer churned)

# Merge dataframes on customer_id
merged_df = combined_train_raw_df.merge(df_train, on="customer_id", how="inner")

# Categorizing based on churn_due_to_fraud and churn_status
conditions = [
    (merged_df["churn_due_to_fraud"] == True) & (merged_df["churn_status"] == True),
    (merged_df["churn_due_to_fraud"] == False) & (merged_df["churn_status"] == False),
    (merged_df["churn_due_to_fraud"] == True) & (merged_df["churn_status"] == False),
    (merged_df["churn_due_to_fraud"] == False) & (merged_df["churn_status"] == True)
]

categories = ["Both True", "Both False", "Fraud True, Churn False", "Fraud False, Churn True"]

merged_df["Category"] = np.select(conditions, categories, default="Unknown")
category_counts = merged_df["Category"].value_counts()

print(category_counts)

Category
Fraud False, Churn True    3515900
Both True                       23
Name: count, dtype: int64


In [112]:
# This code compares our definition of churn to the instances of churn due to fraud (since we can assume that these are ACTUAL cases where a customer churned)

# Merge dataframes on customer_id
merged_df = test_raw_df.merge(df_test, on="customer_id", how="inner")

# Categorizing based on churn_due_to_fraud and churn_status
conditions = [
    (merged_df["churn_due_to_fraud"] == True) & (merged_df["churn_status"] == True),
    (merged_df["churn_due_to_fraud"] == False) & (merged_df["churn_status"] == False),
    (merged_df["churn_due_to_fraud"] == True) & (merged_df["churn_status"] == False),
    (merged_df["churn_due_to_fraud"] == False) & (merged_df["churn_status"] == True)
]

categories = ["Both True", "Both False", "Fraud True, Churn False", "Fraud False, Churn True"]

merged_df["Category"] = np.select(conditions, categories, default="Unknown")
category_counts = merged_df["Category"].value_counts()

print(category_counts)

Category
Both False                 1283577
Fraud False, Churn True      76160
Fraud True, Churn False         48
Both True                        7
Name: count, dtype: int64


#### Age

In [113]:
# Compute age as of 2023-12-31
combined_train_raw_df['date_of_birth'] = pd.to_datetime(combined_train_raw_df['date_of_birth'])
age = (pd.Timestamp('2023-12-31') - combined_train_raw_df.groupby('customer_id')['date_of_birth'].min()).dt.days // 365

# Convert to DataFrame and merge with df_train
age_df = age.reset_index(name='age')
df_train = df_train.merge(age_df, on='customer_id')

In [114]:
# Compute age as of 2023-12-31
test_raw_df['date_of_birth'] = pd.to_datetime(test_raw_df['date_of_birth'])
age = (pd.Timestamp('2023-12-31') - test_raw_df.groupby('customer_id')['date_of_birth'].min()).dt.days // 365

# Convert to DataFrame and merge with df_test
age_df = age.reset_index(name='age')
df_test = df_test.merge(age_df, on='customer_id')

#### Country

In [115]:
# One-hot encode the 'country' column
country_one_hot = pd.get_dummies(combined_train_raw_df['country'], prefix='country')

# Merge one-hot encoded 'country' columns into df_train
df_train = pd.merge(df_train, combined_train_raw_df[['customer_id']].drop_duplicates().merge(country_one_hot, left_index=True, right_index=True), on='customer_id')

In [116]:
# One-hot encode the 'country' column
country_one_hot = pd.get_dummies(test_raw_df['country'], prefix='country')

# Merge one-hot encoded 'country' columns into df_test
df_test = pd.merge(df_test, test_raw_df[['customer_id']].drop_duplicates().merge(country_one_hot, left_index=True, right_index=True), on='customer_id')

### Export Train/Test sets to CSV

In [117]:
# Display final df_train
df_train.head()

Unnamed: 0,customer_id,churn_status,age,country_Austria,country_Belgium,country_Bulgaria,country_Czech Republic,country_Denmark,country_Estonia,country_Finland,...,country_Netherlands,country_Norway,country_Poland,country_Portugal,country_Romania,country_Slovakia,country_Spain,country_Sweden,country_Switzerland,country_USA
0,1,True,69,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,True,76,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,True,21,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,4,True,50,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,5,True,17,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [118]:
df_test.head()

Unnamed: 0,customer_id,churn_status,age,country_Austria,country_Belgium,country_Bulgaria,country_Czech Republic,country_Denmark,country_Estonia,country_Finland,...,country_Netherlands,country_Norway,country_Poland,country_Portugal,country_Romania,country_Slovakia,country_Spain,country_Sweden,country_Switzerland,country_USA
0,1,False,69,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,False,76,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,True,21,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,4,True,50,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,5,True,17,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [119]:
filename = f"../data/train_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv"
df_train.to_csv(filename, index=False)
print(f"Trainset saved as: {filename}")

Trainset saved as: ../data/train_2025-01-30_16-47-04.csv


In [120]:
filename = f"../data/test_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv"
df_test.to_csv(filename, index=False)
print(f"Trainset saved as: {filename}")

Trainset saved as: ../data/test_2025-01-30_16-47-05.csv
