### **Importing modules**

In [3]:
import pandas as pd # type: ignore
import numpy as np # type: ignore
from datetime import datetime
import scipy.stats as stats # type: ignore
import matplotlib.pyplot as plt # type: ignore
import os # type: ignore

### **Loading Preprocessed Dataset**

In [5]:
# Read the preprocessed card and user tables from the shared data folder.
df_cards, df_users = (
    pd.read_csv('../data/cards_data.csv'),
    pd.read_csv('../data/users_data.csv'),
)

### **Performing Feature Engineering**

**1. Added a new column containing categorical variable - Retired, Not Retired**

In [7]:
# Label each user by comparing current age with retirement age:
# 'Retired' once current_age has reached retirement_age, otherwise 'Not Retired'.
reached_retirement_age = df_users['current_age'] >= df_users['retirement_age']
df_users['retirement_status'] = np.where(reached_retirement_age, 'Retired', 'Not Retired')

**2. Divided age into four groups and created a new column `age_group` holding the four categories**

In [9]:
# Categorize Age Groups
def categorize_age(age):
    """Map a numeric age to one of four age-group labels.

    Parameters
    ----------
    age : int or float
        Age in years. May be missing (None / NaN).

    Returns
    -------
    str or float
        "17-30" when age <= 30 (this also covers any age below 17),
        "31-45" when 30 < age <= 45,
        "46-60" when 45 < age <= 60,
        "60+"   when age > 60,
        and ``np.nan`` when ``age`` is missing, so a missing age stays
        missing instead of being reported as "60+".
    """
    # Every comparison against NaN is False, so without this guard a missing
    # age would fall through all branches and be labelled "60+".
    if pd.isna(age):
        return np.nan
    if age <= 30:
        return "17-30"
    elif age <= 45:
        return "31-45"
    elif age <= 60:
        return "46-60"
    else:
        return "60+"

# Apply the age bucketing element-wise to every user's current age.
df_users['age_group'] = df_users['current_age'].map(categorize_age)

### **Although no direct analysis is possible with this variable (it is mainly used to gauge fraud likelihood), we still include it in the analysis**
**3. Flag if PIN Change is Due**

In [10]:
# Flag cards whose PIN change is due. A PIN should have been changed within
# the last PIN_MAX_AGE_YEARS, but the data was last updated DATA_LAG_YEARS
# ago, so the cutoff is (current year - 2 - 5) = current year - 7.
# NOTE(review): the cutoff follows the date the notebook is run, so the flags
# drift as calendar years pass - consider pinning the dataset's reference year.
PIN_MAX_AGE_YEARS = 2
DATA_LAG_YEARS = 5
pin_cutoff_year = datetime.today().year - (PIN_MAX_AGE_YEARS + DATA_LAG_YEARS)

# Vectorised comparison: the cutoff is computed once instead of once per row.
df_cards['PIN_Change_Due'] = np.where(
    df_cards['year_pin_last_changed'] < pin_cutoff_year,
    'Yes',
    'No',
)

**4. Total debt is only meaningful relative to income. Suppose Person A has a yearly income of \$500k and a debt of \$20k, while Person B has a yearly income of \$20k and a debt of \$10k. Person A has more debt in absolute terms, but the debt-to-income ratio shows the actual significance: 0.04 (4% of income) for Person A versus 0.5 (50% of income) for Person B.**

In [11]:
# Debt-to-income ratio: total debt expressed as a fraction of yearly income.
# NOTE(review): a yearly_income of 0 would yield inf/NaN here - confirm the
# preprocessed data contains no zero incomes.
df_users['Debt_to_Income_Ratio'] = df_users['total_debt'].div(df_users['yearly_income'])

### **Merging Both Tables**

In [12]:
# Left join: keep every user row and attach the cards whose client_id matches
# the user's id (users without cards are kept with empty card columns).
df_merged = pd.merge(
    left=df_users,
    right=df_cards,
    how="left",
    left_on="id",
    right_on="client_id",
)

### **Updating the csv files**

In [13]:
# Write the merged table to <data_dir>/merged_data.csv, omitting the row index.
data_dir = '../data'
file_path_merged = os.path.join(data_dir, 'merged_data.csv')
df_merged.to_csv(path_or_buf=file_path_merged, index=False)