In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime

## 1.Load and read basic information of data ##

In [5]:
customer_df=pd.read_csv('../dataset/project-dataset/customer.csv')
customer_df_info = customer_df.info()
customer_df_head = customer_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 939470 entries, 0 to 939469
Data columns (total 22 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   customer_id             939470 non-null  int64  
 1   residence_country       939470 non-null  object 
 2   gender                  939470 non-null  int64  
 3   age                     939470 non-null  int64  
 4   first_join_date         939470 non-null  object 
 5   residence_index         939470 non-null  object 
 6   channel_entrace         939470 non-null  object 
 7   activity_status         939470 non-null  int64  
 8   household_gross_income  939470 non-null  float64
 9   saving_account          939470 non-null  int64  
 10  guarantees              939470 non-null  int64  
 11  junior_account          939470 non-null  int64  
 12  loans                   939470 non-null  int64  
 13  credit_card             939470 non-null  int64  
 14  pension             

In [6]:
transaction_df=pd.read_csv('../dataset/project-dataset/transactions.csv')
transaction_df_info = transaction_df.info()
transaction_df_head = transaction_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13094522 entries, 0 to 13094521
Data columns (total 10 columns):
 #   Column            Dtype  
---  ------            -----  
 0   transaction_id    int64  
 1   date              object 
 2   card_id           int64  
 3   amount            float64
 4   transaction_type  object 
 5   merchant_id       int64  
 6   mcc               int64  
 7   card_type         object 
 8   currency          object 
 9   customer_id       int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 999.0+ MB


## 2. tax bracket
Band	             Taxable income	        Tax rate
Personal Allowance	 Up to £12,570          0%
Basic rate	         £12,571 to £50,270	    20%
Higher rate	         £50,271 to £125,140    40%
Additional rate	     over £125,140	        45%  

In [None]:
def assign_tax_bracket(personal_income):
    if personal_income <= 1047.5:
        return '0%'
    elif personal_income <= 4189.2:
        return '20%'
    elif personal_income <= 10428.3:
        return '40%'
    else:
        return '45%'

customer_df['tax_rate'] = customer_df['personal_income'].apply(assign_tax_bracket)

In [None]:
customer_df.head()

Unnamed: 0,customer_id,residence_country,gender,age,first_join_date,residence_index,channel_entrace,activity_status,household_gross_income,saving_account,...,personal_income,number_of_children,employment_status,current_loan_amount,credit_score,customer_segment,min_balance,max_balance,avg_balance,tax_rate
0,1375586,ES,0,35,2020-01-12,Y,KHL,1,50887.44,1,...,31711.45,3,1,0.0,766,0-1 year,8503.77,15655.16,11412.3,45%
1,1050611,ES,0,23,2017-08-10,Y,KHE,1,30619.38,1,...,27281.84,0,0,0.0,717,2-4 years,2914.84,5609.51,4173.71,45%
2,1050612,ES,0,23,2017-08-10,Y,KHE,1,57420.17,0,...,28891.38,0,1,19164.84,514,2-4 years,6385.79,17200.45,11109.5,45%
3,1050613,ES,1,22,2017-08-10,Y,KHD,1,115661.59,0,...,87510.26,3,1,30865.72,761,2-4 years,12905.81,76382.09,49543.16,45%
4,1050614,ES,0,23,2017-08-10,Y,KHE,1,28358.36,0,...,24310.19,1,1,2491.76,713,2-4 years,3266.44,7240.37,5317.39,45%


## 3. income stability  ##

In [None]:
def calculate_income_regularity_from_negative_amounts(transactions):
    df = transaction_df.copy()
    df['date'] = pd.to_datetime(df['date'])
    df['month'] = df['date'].dt.to_period('M')
    df['week'] = df['date'].dt.to_period('W')
    df['day'] = df['date'].dt.date

    income_df = df[df['transaction_type'] == 'Deposit'].copy()

    # 1. Average days of income per month
    daily_income = income_df.groupby(['customer_id', 'month', 'day'])['amount'].sum().reset_index()
    days_per_month = daily_income.groupby(['customer_id', 'month']).size().reset_index(name='income_days')
    avg_days = days_per_month.groupby('customer_id')['income_days'].mean().reset_index(name='avg_income_days_per_month')

    # 2. CV(Coefficient of Variation) of monthly income amounts
    monthly_total = income_df.groupby(['customer_id', 'month'])['amount'].sum().reset_index()
    monthly_cv = monthly_total.groupby('customer_id')['amount'].agg(['mean', 'std']).reset_index()
    monthly_cv['income_amount_cv'] = monthly_cv['std'] / monthly_cv['mean']

    # 3. merge
    result = avg_days.merge(monthly_cv[['customer_id', 'income_amount_cv']], on='customer_id', how='left')
    return result

In [None]:
income_regularity_df = calculate_income_regularity_from_negative_amounts(transaction_df)
customer_df = pd.merge(customer_df, income_regularity_df, on='customer_id', how='left')
customer_df.head()

Unnamed: 0,customer_id,residence_country,gender,age,first_join_date,residence_index,channel_entrace,activity_status,household_gross_income,saving_account,...,employment_status,current_loan_amount,credit_score,customer_segment,min_balance,max_balance,avg_balance,tax_rate,avg_income_days_per_month,income_amount_cv
0,1375586,ES,0,35,2020-01-12,Y,KHL,1,50887.44,1,...,1,0.0,766,0-1 year,8503.77,15655.16,11412.3,45%,1.0,
1,1050611,ES,0,23,2017-08-10,Y,KHE,1,30619.38,1,...,0,0.0,717,2-4 years,2914.84,5609.51,4173.71,45%,,
2,1050612,ES,0,23,2017-08-10,Y,KHE,1,57420.17,0,...,1,19164.84,514,2-4 years,6385.79,17200.45,11109.5,45%,,
3,1050613,ES,1,22,2017-08-10,Y,KHD,1,115661.59,0,...,1,30865.72,761,2-4 years,12905.81,76382.09,49543.16,45%,1.0,
4,1050614,ES,0,23,2017-08-10,Y,KHE,1,28358.36,0,...,1,2491.76,713,2-4 years,3266.44,7240.37,5317.39,45%,,


## 4. expense regularity ##

In [None]:
def calculate_expense_regularity_from_negative_amounts(transactions):
    df = transaction_df.copy()
    df['date'] = pd.to_datetime(df['date'])
    df['month'] = df['date'].dt.to_period('M')
    df['week'] = df['date'].dt.to_period('W')
    df['day'] = df['date'].dt.date

    expense_df = df[df['transaction_type'] != 'Deposit'].copy()
    expense_df['amount'] = expense_df['amount'].abs()

    # 1. Average days of expenditure per month
    daily_expense = expense_df.groupby(['customer_id', 'month', 'day'])['amount'].sum().reset_index()
    days_per_month = daily_expense.groupby(['customer_id', 'month']).size().reset_index(name='expense_days')
    avg_days = days_per_month.groupby('customer_id')['expense_days'].mean().reset_index(name='avg_expense_days_per_month')

    # 2. CV(Coefficient of Variation) of monthly expenditure amounts
    monthly_total = expense_df.groupby(['customer_id', 'month'])['amount'].sum().reset_index()
    monthly_cv = monthly_total.groupby('customer_id')['amount'].agg(['mean', 'std']).reset_index()
    monthly_cv['expense_amount_cv'] = monthly_cv['std'] / monthly_cv['mean']

    # 3. merge
    result = avg_days.merge(monthly_cv[['customer_id', 'expense_amount_cv']], on='customer_id', how='left')

    return result

In [None]:
expense_regularity_df = calculate_expense_regularity_from_negative_amounts(transaction_df)
customer_df = pd.merge(customer_df, expense_regularity_df, on='customer_id', how='left')
customer_df.head()

Unnamed: 0,customer_id,residence_country,gender,age,first_join_date,residence_index,channel_entrace,activity_status,household_gross_income,saving_account,...,credit_score,customer_segment,min_balance,max_balance,avg_balance,tax_rate,avg_income_days_per_month,income_amount_cv,avg_expense_days_per_month,expense_amount_cv
0,1375586,ES,0,35,2020-01-12,Y,KHL,1,50887.44,1,...,766,0-1 year,8503.77,15655.16,11412.3,45%,1.0,,1.272727,0.939871
1,1050611,ES,0,23,2017-08-10,Y,KHE,1,30619.38,1,...,717,2-4 years,2914.84,5609.51,4173.71,45%,,,1.285714,1.348381
2,1050612,ES,0,23,2017-08-10,Y,KHE,1,57420.17,0,...,514,2-4 years,6385.79,17200.45,11109.5,45%,,,1.0,0.732798
3,1050613,ES,1,22,2017-08-10,Y,KHD,1,115661.59,0,...,761,2-4 years,12905.81,76382.09,49543.16,45%,1.0,,1.266667,0.709893
4,1050614,ES,0,23,2017-08-10,Y,KHE,1,28358.36,0,...,713,2-4 years,3266.44,7240.37,5317.39,45%,,,1.125,0.871065


## 5. transaction volume ##

In [None]:
df = transaction_df.copy()
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.to_period('M')

# number of transactions per month
monthly_tx = df.groupby(['customer_id', 'month']).size().reset_index(name='tx_count')

# mean & std
volume_stats = monthly_tx.groupby('customer_id')['tx_count'].agg(['mean', 'std']).reset_index()
volume_stats.rename(columns={
    'mean': 'avg_transactions_per_month',
    'std': 'monthly_transaction_std'
}, inplace=True)

In [None]:
customer_df = pd.merge(customer_df, volume_stats, on='customer_id', how='left')
customer_df.head()

Unnamed: 0,customer_id,residence_country,gender,age,first_join_date,residence_index,channel_entrace,activity_status,household_gross_income,saving_account,...,min_balance,max_balance,avg_balance,tax_rate,avg_income_days_per_month,income_amount_cv,avg_expense_days_per_month,expense_amount_cv,avg_transactions_per_month,monthly_transaction_std
0,1375586,ES,0,35,2020-01-12,Y,KHL,1,50887.44,1,...,8503.77,15655.16,11412.3,45%,1.0,,1.272727,0.939871,1.25,0.452267
1,1050611,ES,0,23,2017-08-10,Y,KHE,1,30619.38,1,...,2914.84,5609.51,4173.71,45%,,,1.285714,1.348381,1.357143,0.633324
2,1050612,ES,0,23,2017-08-10,Y,KHE,1,57420.17,0,...,6385.79,17200.45,11109.5,45%,,,1.0,0.732798,1.0,0.0
3,1050613,ES,1,22,2017-08-10,Y,KHD,1,115661.59,0,...,12905.81,76382.09,49543.16,45%,1.0,,1.266667,0.709893,1.3125,0.60208
4,1050614,ES,0,23,2017-08-10,Y,KHE,1,28358.36,0,...,3266.44,7240.37,5317.39,45%,,,1.125,0.871065,1.125,0.353553


In [None]:
df = transaction_df.copy()
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['year_month'] = df['date'].dt.to_period('M')

# calculate the number of transactions per month
monthly_counts = df.groupby(['customer_id', 'year_month']).size().reset_index(name='monthly_tx_count')

# group by customer
customer_tx_stats = monthly_counts.groupby('customer_id').agg(
    total_transactions=('monthly_tx_count', 'sum'),
    active_months=('year_month', 'nunique')
).reset_index()

# calculate average number of transaction per month
customer_tx_stats['avg_monthly_transaction_count'] = (
    customer_tx_stats['total_transactions'] / customer_tx_stats['active_months']
).round(2)

customer_df = pd.merge(customer_df, customer_tx_stats, on='customer_id', how='left')
customer_df.head()

Unnamed: 0,customer_id,residence_country,gender,age,first_join_date,residence_index,channel_entrace,activity_status,household_gross_income,saving_account,...,tax_rate,avg_income_days_per_month,income_amount_cv,avg_expense_days_per_month,expense_amount_cv,avg_transactions_per_month,monthly_transaction_std,total_transactions,active_months,avg_monthly_transaction_count
0,1375586,ES,0,35,2020-01-12,Y,KHL,1,50887.44,1,...,45%,1.0,,1.272727,0.939871,1.25,0.452267,15.0,12.0,1.25
1,1050611,ES,0,23,2017-08-10,Y,KHE,1,30619.38,1,...,45%,,,1.285714,1.348381,1.357143,0.633324,19.0,14.0,1.36
2,1050612,ES,0,23,2017-08-10,Y,KHE,1,57420.17,0,...,45%,,,1.0,0.732798,1.0,0.0,10.0,10.0,1.0
3,1050613,ES,1,22,2017-08-10,Y,KHD,1,115661.59,0,...,45%,1.0,,1.266667,0.709893,1.3125,0.60208,21.0,16.0,1.31
4,1050614,ES,0,23,2017-08-10,Y,KHE,1,28358.36,0,...,45%,,,1.125,0.871065,1.125,0.353553,9.0,8.0,1.12


## 6. Savings Propensity Score ##

In [None]:
customer_df = customer_df.copy()
customer_df['first_join_date'] = pd.to_datetime(customer_df['first_join_date'])
customer_df['customer_tenure'] = (pd.to_datetime("today") - customer_df['first_join_date']).dt.days / 30

scaler = MinMaxScaler()
customer_df[['norm_age', 'norm_income']] = scaler.fit_transform(customer_df[['age', 'household_gross_income']])

customer_df['employment_status_factor'] = customer_df['employment_status'].apply(lambda x: 1 if x == 'Employed' else 0)

customer_df['demographic_score'] = (
    customer_df['norm_age'] +
    customer_df['norm_income'] +
    customer_df['employment_status_factor']
) / 3

customer_df[['norm_avg_balance', 'norm_customer_tenure']] = scaler.fit_transform(
    customer_df[['avg_balance', 'customer_tenure']]
)


In [None]:
deposit_df = transaction_df[transaction_df['transaction_type'] == 'Deposit'].copy()
withdrawal_df = transaction_df[transaction_df['transaction_type'] == 'Withdrawal'].copy()

net_cash_flow = (
    deposit_df.groupby('customer_id')['amount'].sum() -
    withdrawal_df.groupby('customer_id')['amount'].sum()
).reset_index(name='net_cash_flow').fillna(0)

deposit_freq = deposit_df.groupby('customer_id').size().reset_index(name='deposit_freq')
withdrawal_freq = withdrawal_df.groupby('customer_id').size().reset_index(name='withdrawal_freq')

txn_metrics = net_cash_flow.merge(deposit_freq, on='customer_id', how='outer') \
                           .merge(withdrawal_freq, on='customer_id', how='outer') \
                           .fillna(0)

txn_metrics[['norm_net_cash_flow', 'norm_deposit_freq', 'norm_withdrawal_freq']] = scaler.fit_transform(
    txn_metrics[['net_cash_flow', 'deposit_freq', 'withdrawal_freq']]
)

In [None]:
full_df = pd.merge(customer_df, txn_metrics, on='customer_id', how='left').fillna(0)
w1, w2, w3, w4, w5, w6 = 0.2, 0.2, 0.2, 0.2, 0.1, 0.1
full_df['SPS'] = (
    w1 * full_df['norm_avg_balance'] +
    w2 * full_df['norm_net_cash_flow'] +
    w3 * full_df['norm_deposit_freq'] -
    w4 * full_df['norm_withdrawal_freq'] +
    w5 * full_df['norm_customer_tenure'] +
    w6 * full_df['demographic_score']
)

In [None]:
sps_df = full_df[['customer_id', 'SPS']]

customer_df = customer_df.merge(sps_df, on='customer_id', how='left')

In [None]:
customer_df.head()

Unnamed: 0,customer_id,residence_country,gender,age,first_join_date,residence_index,channel_entrace,activity_status,household_gross_income,saving_account,...,active_months,avg_monthly_transaction_count,customer_tenure,norm_age,norm_income,employment_status_factor,demographic_score,norm_avg_balance,norm_customer_tenure,SPS
0,1375586,ES,0,35,2020-01-12,Y,KHL,1,50887.44,1,...,12.0,1.25,64.266667,0.2,0.034753,0,0.078251,0.025798,0.064694,0.122274
1,1050611,ES,0,23,2017-08-10,Y,KHE,1,30619.38,1,...,14.0,1.36,93.766667,0.05,0.020713,0,0.023571,0.009286,0.178068,0.087452
2,1050612,ES,0,23,2017-08-10,Y,KHE,1,57420.17,0,...,10.0,1.0,93.766667,0.05,0.039278,0,0.029759,0.025107,0.178068,0.142663
3,1050613,ES,1,22,2017-08-10,Y,KHD,1,115661.59,0,...,16.0,1.31,93.766667,0.0375,0.079622,0,0.039041,0.112776,0.178068,0.111355
4,1050614,ES,0,23,2017-08-10,Y,KHE,1,28358.36,0,...,8.0,1.12,93.766667,0.05,0.019147,0,0.023049,0.011895,0.178068,0.145064


In [None]:
columns_to_drop = [
    'customer_tenure',
    'norm_age',
    'norm_income',
    'employment_status_factor',
    'demographic_score',
    'norm_avg_balance',
    'norm_customer_tenure'
]

customer_df = customer_df.drop(columns=columns_to_drop)

In [None]:
customer_df.head()

Unnamed: 0,customer_id,residence_country,gender,age,first_join_date,residence_index,channel_entrace,activity_status,household_gross_income,saving_account,...,avg_income_days_per_month,income_amount_cv,avg_expense_days_per_month,expense_amount_cv,avg_transactions_per_month,monthly_transaction_std,total_transactions,active_months,avg_monthly_transaction_count,SPS
0,1375586,ES,0,35,2020-01-12,Y,KHL,1,50887.44,1,...,1.0,,1.272727,0.939871,1.25,0.452267,15.0,12.0,1.25,0.122274
1,1050611,ES,0,23,2017-08-10,Y,KHE,1,30619.38,1,...,,,1.285714,1.348381,1.357143,0.633324,19.0,14.0,1.36,0.087452
2,1050612,ES,0,23,2017-08-10,Y,KHE,1,57420.17,0,...,,,1.0,0.732798,1.0,0.0,10.0,10.0,1.0,0.142663
3,1050613,ES,1,22,2017-08-10,Y,KHD,1,115661.59,0,...,1.0,,1.266667,0.709893,1.3125,0.60208,21.0,16.0,1.31,0.111355
4,1050614,ES,0,23,2017-08-10,Y,KHE,1,28358.36,0,...,,,1.125,0.871065,1.125,0.353553,9.0,8.0,1.12,0.145064


## 7. TSI ##

In [None]:
df = transaction_df.copy()
df['amount'] = df['amount'].astype(float)

deposits = df[df['transaction_type'] == 'Deposit'].copy()
withdrawals = df[df['transaction_type'] == 'Withdrawal'].copy()

# calculate std and mean by customer id
deposit_stats = deposits.groupby('customer_id')['amount'].agg(['std', 'mean']).reset_index()
deposit_stats.columns = ['customer_id', 'std_deposit', 'mean_deposit']

withdrawal_stats = withdrawals.groupby('customer_id')['amount'].agg(['std', 'mean']).reset_index()
withdrawal_stats.columns = ['customer_id', 'std_withdrawal', 'mean_withdrawal']

# merge
tsi_df = pd.merge(deposit_stats, withdrawal_stats, on='customer_id', how='outer')

# calculate CV
tsi_df['cv_deposit'] = tsi_df['std_deposit'] / tsi_df['mean_deposit']
tsi_df['cv_withdrawal'] = tsi_df['std_withdrawal'] / tsi_df['mean_withdrawal'].abs()

# calculate TSI
tsi_df['TSI'] = 1 - ((tsi_df['cv_deposit'].fillna(0) + tsi_df['cv_withdrawal'].fillna(0)) / 2)
tsi_df['TSI'] = tsi_df['TSI'].clip(lower=0)

tsi_df = tsi_df[['customer_id', 'TSI']]
customer_df = pd.merge(customer_df, tsi_df, on='customer_id', how='left')
customer_df.head()

Unnamed: 0,customer_id,residence_country,gender,age,first_join_date,residence_index,channel_entrace,activity_status,household_gross_income,saving_account,...,income_amount_cv,avg_expense_days_per_month,expense_amount_cv,avg_transactions_per_month,monthly_transaction_std,total_transactions,active_months,avg_monthly_transaction_count,SPS,TSI
0,1375586,ES,0,35,2020-01-12,Y,KHL,1,50887.44,1,...,,1.272727,0.939871,1.25,0.452267,15.0,12.0,1.25,0.122274,0.485454
1,1050611,ES,0,23,2017-08-10,Y,KHE,1,30619.38,1,...,,1.285714,1.348381,1.357143,0.633324,19.0,14.0,1.36,0.087452,0.201175
2,1050612,ES,0,23,2017-08-10,Y,KHE,1,57420.17,0,...,,1.0,0.732798,1.0,0.0,10.0,10.0,1.0,0.142663,0.633601
3,1050613,ES,1,22,2017-08-10,Y,KHD,1,115661.59,0,...,,1.266667,0.709893,1.3125,0.60208,21.0,16.0,1.31,0.111355,0.541213
4,1050614,ES,0,23,2017-08-10,Y,KHE,1,28358.36,0,...,,1.125,0.871065,1.125,0.353553,9.0,8.0,1.12,0.145064,0.539062


In [None]:
customer_df = customer_df.fillna(0)
customer_df.isna().sum()

customer_id                      0
residence_country                0
gender                           0
age                              0
first_join_date                  0
residence_index                  0
channel_entrace                  0
activity_status                  0
household_gross_income           0
saving_account                   0
guarantees                       0
junior_account                   0
loans                            0
credit_card                      0
pension                          0
direct_debit                     0
personal_income                  0
number_of_children               0
employment_status                0
current_loan_amount              0
credit_score                     0
customer_segment                 0
min_balance                      0
max_balance                      0
avg_balance                      0
tax_rate                         0
avg_income_days_per_month        0
income_amount_cv                 0
avg_expense_days_per

## 8. demographic_score = (normalized_age + normalized_income + employment_status_factor + normalized_customer_tenure) / 4 ##

In [None]:
def normalize(series):
    return (series - series.min()) / (series.max() - series.min())

# tenure factor
tenure_score_map = {
    "0-1 year": 0.3,
    "2-4 years": 0.6,
    "More than 5 years": 1.0
}

df = customer_df.copy()
df['tenure_factor'] = customer_df['customer_segment'].map(tenure_score_map)
df['normalized_age'] = normalize(df['age'])
df['normalized_income'] = normalize(df['personal_income'])
df['demographic_score'] = (
    df['normalized_age'] +
    df['normalized_income'] +
    df['employment_status'] +
    df['tenure_factor']
) / 4


In [None]:
customer_df = pd.merge(customer_df, df, on='customer_id', how='left')
customer_df.head()

Unnamed: 0,customer_id,residence_country_x,gender_x,age_x,first_join_date_x,residence_index_x,channel_entrace_x,activity_status_x,household_gross_income_x,saving_account_x,...,monthly_transaction_std_y,total_transactions_y,active_months_y,avg_monthly_transaction_count_y,SPS_y,TSI_y,tenure_factor,normalized_age,normalized_income,demographic_score
0,1375586,ES,0,35,2020-01-12,Y,KHL,1,50887.44,1,...,0.452267,15.0,12.0,1.25,0.122274,0.485454,0.3,0.2,0.027192,0.381798
1,1050611,ES,0,23,2017-08-10,Y,KHE,1,30619.38,1,...,0.633324,19.0,14.0,1.36,0.087452,0.201175,0.6,0.05,0.023328,0.168332
2,1050612,ES,0,23,2017-08-10,Y,KHE,1,57420.17,0,...,0.0,10.0,10.0,1.0,0.142663,0.633601,0.6,0.05,0.024732,0.418683
3,1050613,ES,1,22,2017-08-10,Y,KHD,1,115661.59,0,...,0.60208,21.0,16.0,1.31,0.111355,0.541213,0.6,0.0375,0.07586,0.42834
4,1050614,ES,0,23,2017-08-10,Y,KHE,1,28358.36,0,...,0.353553,9.0,8.0,1.12,0.145064,0.539062,0.6,0.05,0.020736,0.417684


In [None]:
columns_to_drop = [
    'tenure_factor',
    'normalized_age',
    'normalized_income'
]

customer_df = customer_df.drop(columns=columns_to_drop)

In [None]:
# solve the problem of duplicate columns by removing suffixes
x_columns = [col for col in customer_df.columns if col.endswith('_x')]
no_suffix_columns = [col for col in customer_df.columns if not (col.endswith('_x') or col.endswith('_y'))]

# combine the columns with suffixes
clean_columns = ['customer_id'] + x_columns + [col for col in no_suffix_columns if col != 'customer_id']
clean_df = customer_df[clean_columns]

# remove the suffixes from the column names
rename_dict = {col: col[:-2] if col.endswith('_x') else col for col in clean_columns}
clean_df = clean_df.rename(columns=rename_dict)

# save the clean data to a new CSV file
clean_df.to_csv('clean_customer_dataNEW.csv', index=False)

In [None]:
clean_df.head()

Unnamed: 0,customer_id,residence_country,gender,age,first_join_date,residence_index,channel_entrace,activity_status,household_gross_income,saving_account,...,avg_expense_days_per_month,expense_amount_cv,avg_transactions_per_month,monthly_transaction_std,total_transactions,active_months,avg_monthly_transaction_count,SPS,TSI,demographic_score
0,1375586,ES,0,35,2020-01-12,Y,KHL,1,50887.44,1,...,1.272727,0.939871,1.25,0.452267,15.0,12.0,1.25,0.122274,0.485454,0.381798
1,1050611,ES,0,23,2017-08-10,Y,KHE,1,30619.38,1,...,1.285714,1.348381,1.357143,0.633324,19.0,14.0,1.36,0.087452,0.201175,0.168332
2,1050612,ES,0,23,2017-08-10,Y,KHE,1,57420.17,0,...,1.0,0.732798,1.0,0.0,10.0,10.0,1.0,0.142663,0.633601,0.418683
3,1050613,ES,1,22,2017-08-10,Y,KHD,1,115661.59,0,...,1.266667,0.709893,1.3125,0.60208,21.0,16.0,1.31,0.111355,0.541213,0.42834
4,1050614,ES,0,23,2017-08-10,Y,KHE,1,28358.36,0,...,1.125,0.871065,1.125,0.353553,9.0,8.0,1.12,0.145064,0.539062,0.417684
