# Neo Bank Data Preparation Notebook

The purpose of this notebook is to clean the dataset and create the features used in the train/test datasets

In [78]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from datetime import timedelta

### Load historical (2008-2023) dataset used to train the model

In [79]:
import pandas as pd

dataframes = []

for year in range(2008, 2024):
    file_path = f"../data/train_{year}.parquet"
    try:
        df = pd.read_parquet(file_path)
        dataframes.append(df)
    except Exception as e:
        print(f"Failed to read file {file_path}: {e}")
if dataframes:
    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"Total rows in the combined DataFrame: {len(combined_df)}")
else:
    print("No files were successfully read.")

Total rows in the combined DataFrame: 3926058


In [80]:
combined_df.head()

Unnamed: 0,Id,customer_id,interest_rate,name,country,date_of_birth,address,date,atm_transfer_in,atm_transfer_out,...,crypto_in_volume,crypto_out_volume,complaints,touchpoints,csat_scores,tenure,from_competitor,job,churn_due_to_fraud,model_predicted_fraud
0,1,1,3.5,Yolanda Parker,Lithuania,1954-07-10,"1929 Erin Lights Suite 709\nLake Michaelburgh,...",2008-01-17,0,0,...,393.14501,220.900654,0,[],"{'appointment': None, 'email': None, 'phone': ...",0,False,Amenity horticulturist,False,False
1,6,1,3.5,Yolanda Parker,Lithuania,1954-07-10,"1929 Erin Lights Suite 709\nLake Michaelburgh,...",2008-01-18,0,0,...,390.005729,221.147296,0,[],"{'appointment': None, 'email': None, 'phone': ...",1,False,Amenity horticulturist,False,False
2,16,1,3.5,Yolanda Parker,Lithuania,1954-07-10,"1929 Erin Lights Suite 709\nLake Michaelburgh,...",2008-01-19,0,0,...,393.209108,260.510535,0,[],"{'appointment': None, 'email': None, 'phone': ...",2,False,Amenity horticulturist,False,False
3,31,1,3.5,Yolanda Parker,Lithuania,1954-07-10,"1929 Erin Lights Suite 709\nLake Michaelburgh,...",2008-01-20,0,0,...,394.078294,203.214128,0,[],"{'appointment': None, 'email': None, 'phone': ...",3,False,Amenity horticulturist,False,False
4,50,1,3.5,Yolanda Parker,Lithuania,1954-07-10,"1929 Erin Lights Suite 709\nLake Michaelburgh,...",2008-01-21,0,0,...,395.174604,207.894467,0,"[whatsapp, email, email]","{'appointment': None, 'email': None, 'phone': ...",4,False,Amenity horticulturist,False,False


### Historical Data Cleaning

In [81]:
# drop unrealistic records of atm transfer out

# current justification: it's not realisitc that someone can transfer money out of an ATM thousands of times in single day also it accounts for tiny 
# TODO: find a source to backup the reasoning for removing this data

initial_count = len(combined_df)

# drop records with more than 10 atm transfer outs
filtered_df = combined_df.drop(combined_df[combined_df['atm_transfer_out'] > 10].index)

# Calculate dropped records
dropped_count = initial_count - len(filtered_df)
percentage_dropped = (dropped_count / initial_count) * 100

print(f"Records dropped: {dropped_count}")
print(f"Percentage of dataset removed: {percentage_dropped:.2f}%")

combined_df = filtered_df

Records dropped: 1442
Percentage of dataset removed: 0.04%


In [82]:
# drop unrealistic records of atm transfer in

# current justification: it's not realisitc that someone can transfer money out of an ATM thousands of times in single day also it accounts for tiny 
# TODO: find a source to backup the reasoning for removing this data

# drop records with more than 10 atm transfer ins
filtered_df = combined_df.drop(combined_df[combined_df['atm_transfer_in'] > 10].index)

# Calculate dropped records
dropped_count = initial_count - len(filtered_df)
percentage_dropped = (dropped_count / initial_count) * 100

print(f"Records dropped: {dropped_count}")
print(f"Percentage of dataset removed: {percentage_dropped:.2f}%")

combined_df = filtered_df

Records dropped: 2680
Percentage of dataset removed: 0.07%


### Create the training dataset (customer_id level data)

Churn definition:
- No interactions with the bank for at 18 or more months since the max date in the dataset (2023-12-31)

In [83]:
# Ensure the 'date' column is in datetime format
combined_df['date'] = pd.to_datetime(combined_df['date'])

max_date = max(combined_df['date'])
print(max_date)

2023-12-31 00:00:00


In [84]:
# Get the last interaction date for each customer
last_interaction = combined_df.groupby('customer_id')['date'].max()

# Define the churn threshold
churn_date = pd.Timestamp(max_date) # Training the model up to the 
cutoff_date = churn_date - pd.DateOffset(months=18)

# Determine churn status (True if last interaction was before cutoff_date)
churn_status = last_interaction < cutoff_date

# Convert to DataFrame
df_train = churn_status.reset_index(name='churn_status')

churn_percentage = (df_train['churn_status'].sum() / len(df_train)) * 100
print(f"{churn_percentage:.2f}% of distinct customers since 2008-2023 have churned under our definition")

df_train.head()

19.95% of distinct customers since 2008-2023 have churned under our definition


Unnamed: 0,customer_id,churn_status
0,1,False
1,2,True
2,3,True
3,4,True
4,5,False


## Feature Engineering

Features:
- customer age
- country
- account age
- customer job category

In [85]:
# This code compares our definition of churn to the instances of churn due to fraud (since we can assume that these are ACTUAL cases where a customer churned)

# Merge dataframes on customer_id
merged_df = combined_df.merge(df_train, on="customer_id", how="inner")

# Categorizing based on churn_due_to_fraud and churn_status
conditions = [
    (merged_df["churn_due_to_fraud"] == True) & (merged_df["churn_status"] == True),
    (merged_df["churn_due_to_fraud"] == False) & (merged_df["churn_status"] == False),
    (merged_df["churn_due_to_fraud"] == True) & (merged_df["churn_status"] == False),
    (merged_df["churn_due_to_fraud"] == False) & (merged_df["churn_status"] == True)
]

categories = ["Both True", "Both False", "Fraud True, Churn False", "Fraud False, Churn True"]

merged_df["Category"] = np.select(conditions, categories, default="Unknown")
category_counts = merged_df["Category"].value_counts()

print(category_counts)

Category
Both False                 2653054
Fraud False, Churn True    1270297
Both True                       23
Fraud True, Churn False          4
Name: count, dtype: int64


#### Age

In [86]:
# Compute age as of 2023-12-31
combined_df['date_of_birth'] = pd.to_datetime(combined_df['date_of_birth'])
age = (pd.Timestamp('2023-12-31') - combined_df.groupby('customer_id')['date_of_birth'].min()).dt.days // 365

# Convert to DataFrame and merge with df_train
age_df = age.reset_index(name='age')
df_train = df_train.merge(age_df, on='customer_id')

#### Country

In [87]:
# One-hot encode the 'country' column
country_one_hot = pd.get_dummies(combined_df['country'], prefix='country')

# Merge one-hot encoded 'country' columns into df_train
df_train = pd.merge(df_train, combined_df[['customer_id']].drop_duplicates().merge(country_one_hot, left_index=True, right_index=True), on='customer_id')

### Export Train/Test sets to CSV

In [88]:
# Display final df_train
df_train.head()

Unnamed: 0,customer_id,churn_status,age,country_Austria,country_Belgium,country_Bulgaria,country_Czech Republic,country_Denmark,country_Estonia,country_Finland,...,country_Netherlands,country_Norway,country_Poland,country_Portugal,country_Romania,country_Slovakia,country_Spain,country_Sweden,country_Switzerland,country_USA
0,1,False,69,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,True,76,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,True,21,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,4,True,50,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,5,False,17,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [94]:
filename = f"../data/train_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv"
df_train.to_csv(filename, index=False)
print(f"Trainset saved as: {filename}")

Trainset saved as: ../data/train_2025-01-30_14-25-29.csv
