In [198]:
import pandas as pd
from helpers.dslabs_functions import get_variable_types
import numpy as np
from math import pi, sin, cos
import re


In [199]:
# Load the raw credit-score data and discard the identifier columns right away.
identifier_columns = ["ID", "Customer_ID", "SSN", "Name"]
df = pd.read_csv('../datasets/class_credit_score.csv').drop(columns=identifier_columns)
df.head()

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,NumofLoan,Type_of_Loan,...,CreditMix,OutstandingDebt,CreditUtilizationRatio,Credit_History_Age,Payment_of_Min_Amount,TotalEMIpermonth,Amountinvestedmonthly,Payment_Behaviour,MonthlyBalance,Credit_Score
0,January,23,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",...,,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.415295,High_spent_Small_value_payments,312.494089,Good
1,February,23,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",...,Good,809.98,31.94496,,No,49.574949,118.280222,Low_spent_Large_value_payments,284.629163,Good
2,March,-500,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521,Low_spent_Medium_value_payments,331.209863,Good
3,April,23,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.458074,Low_spent_Small_value_payments,223.45131,Good
4,May,23,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153,High_spent_Medium_value_payments,341.489231,Good


In [200]:
# Classify the columns with the project helper (presumably a dict mapping a
# variable kind to a list of column names — confirm in helpers.dslabs_functions).
variable_types = get_variable_types(df)

# Display the columns the helper reported under the "binary" key.
variable_types["binary"]

['Credit_Score']

In [201]:
# Frequency of each raw Type_of_Loan string (each value is a comma-separated list of loans).
df["Type_of_Loan"].value_counts()

Type_of_Loan
Not Specified                                                                                                                         1408
Credit-Builder Loan                                                                                                                   1280
Personal Loan                                                                                                                         1272
Debt Consolidation Loan                                                                                                               1264
Student Loan                                                                                                                          1240
                                                                                                                                      ... 
Not Specified, Mortgage Loan, Auto Loan, and Payday Loan                                                                                 8
Payday Loan, M

In [202]:
# Distinct raw Type_of_Loan strings, to inspect the separator and "and" formatting.
df["Type_of_Loan"].unique()

array(['Auto Loan, Credit-Builder Loan, Personal Loan, and Home Equity Loan',
       'Credit-Builder Loan', 'Auto Loan, Auto Loan, and Not Specified',
       ..., 'Home Equity Loan, Auto Loan, Auto Loan, and Auto Loan',
       'Payday Loan, Student Loan, Mortgage Loan, and Not Specified',
       'Personal Loan, Auto Loan, Mortgage Loan, Student Loan, and Student Loan'],
      dtype=object)

In [203]:
# Function to create binary encoding for loan types
def expand_loans(row, all_loans):
    """Return one 0/1 flag per entry of ``all_loans`` for a single record.

    Args:
        row: Mapping-like record (dict or DataFrame row) with a
            ``'Type_of_Loan'`` entry holding a comma-separated list such as
            ``"Auto Loan, Personal Loan, and Student Loan"``.
        all_loans: Iterable of clean loan names; the result follows its
            iteration order.

    Returns:
        list[int]: 1 where the record holds that loan, 0 otherwise. A
        missing (non-string) ``'Type_of_Loan'`` yields all zeros.
    """
    raw = row['Type_of_Loan']
    if not isinstance(raw, str):
        # NaN/None: the record lists no loans.
        return [0 for _ in all_loans]

    held = set()
    for part in raw.split(','):
        # Splitting on "," leaves a leading space on every item but the first,
        # and the last item carries the list conjunction "and ".
        name = part.strip()
        if name.startswith('and '):
            name = name[len('and '):].strip()
        held.add(name)
    return [1 if loan in held else 0 for loan in all_loans]

def clean_loan_type(loan_type):
    """Normalise one item taken from a comma-separated loan list.

    Surrounding whitespace is removed, and so is a leading ``"and "`` (the
    conjunction that precedes the last item of a list). Only the leading
    occurrence is removed, so a name that merely contains ``"and "`` is
    left intact.

    Args:
        loan_type: Raw item, e.g. ``" and Home Equity Loan"``.

    Returns:
        str: The cleaned loan name, e.g. ``"Home Equity Loan"``.
    """
    name = loan_type.strip()
    if name.startswith('and '):
        name = name[len('and '):].strip()
    return name

# Vocabulary of distinct loan names found anywhere in the column.
all_loans = set(clean_loan_type(loan)
                for loan_list in df['Type_of_Loan'].dropna()
                for loan in loan_list.split(','))

# Fix the column order: iterating a set of strings is not stable across
# interpreter sessions, which would reshuffle the columns on every kernel restart.
loan_columns = sorted(all_loans)

# One 0/1 row per record; a missing Type_of_Loan yields all zeros.
expanded_rows = []
for loan_list in df['Type_of_Loan']:
    loans = loan_list.split(',') if pd.notna(loan_list) else []
    cleaned_loans = {clean_loan_type(loan) for loan in loans}
    expanded_rows.append([1 if loan in cleaned_loans else 0 for loan in loan_columns])

# Indicator frame, one column per loan name.
expanded_df = pd.DataFrame(expanded_rows, columns=loan_columns)

# Both sides get a fresh 0..n-1 index so the rows line up positionally.
df = pd.concat([df.reset_index(drop=True), expanded_df.reset_index(drop=True)], axis=1)

In [204]:
def transform_bools(df, keyword):
    """Cast every column whose name starts with ``keyword`` to ``int``.

    The frame is modified in place; the same object is returned for
    convenience.

    Args:
        df: DataFrame to update.
        keyword: Column-name prefix selecting the columns to cast.

    Returns:
        The same DataFrame, with the selected columns cast to ``int``.
    """
    selected = (name for name in df.columns if name.startswith(keyword))
    for name in selected:
        df[name] = df[name].astype(int)
    return df

# One-hot encode Occupation (no indicator column for missing values), then
# store the resulting Occupation-prefixed flags as 0/1 integers.
df = transform_bools(
    pd.get_dummies(df, columns=['Occupation'], dummy_na=False),
    "Occupation",
)

In [205]:
def clean_and_convert(s):
    """Parse a noisy numeric string (e.g. ``"28_"`` or ``"-500"``) into a number.

    Every character other than digits and ``.`` is discarded, but a leading
    minus sign is honoured so that negative entries are not silently turned
    into positive ones.

    Args:
        s: Raw cell value. Missing values and the literal string ``"nan"``
            are treated as missing; values that are already numeric are
            returned unchanged.

    Returns:
        int when the cleaned text is a whole number, float otherwise, and
        ``np.nan`` for missing input or text with no parseable number.
    """
    if pd.isna(s) or s == "nan":
        return np.nan
    if isinstance(s, (int, float, np.integer, np.floating)):
        # Already numeric (e.g. the column was parsed as numbers): nothing to clean.
        return s

    text = str(s).strip()
    is_negative = text.startswith('-')
    cleaned = re.sub(r'[^0-9.]+', '', text)  # Remove non-numeric characters
    try:
        # Convert to integer if possible, otherwise float
        magnitude = int(cleaned) if cleaned.isdigit() else float(cleaned)
    except ValueError:
        # Nothing numeric left (e.g. "__") or malformed text such as "1.2.3".
        return np.nan
    return -magnitude if is_negative else magnitude

# Parse every raw Age entry into a number (missing entries become NaN).
df['Age'] = df['Age'].map(clean_and_convert)

In [206]:
# Angle (in radians) of each month on a 12-month circle: k * 2*pi/12 = k * pi/6.
# All twelve months are listed so the mapping also covers data beyond August.
month_names = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]
month_encoding = {name: k * pi / 6 for k, name in enumerate(month_names)}


encoding = {
    "Month": month_encoding,
}

def encode_cyclic_variables(df, vars, period=None):
    """Add ``<var>_sin`` / ``<var>_cos`` columns for each cyclic variable.

    Each value ``x`` is placed on the unit circle at angle
    ``2 * pi * x / period`` and its sine and cosine, rounded to 3 decimals,
    are stored in two new columns. The frame is modified in place and also
    returned.

    Args:
        df: DataFrame holding the numeric cyclic columns.
        vars: Names of the columns to encode.
        period: Length of one full cycle, in the same unit as the column
            values. When ``None`` the largest value found in the column is
            used as the period; in that case the values 0 and max land on
            the same point of the circle.

    Returns:
        The same DataFrame with the extra columns.
    """
    for v in vars:
        cycle: float | int = max(df[v]) if period is None else period
        df[v + "_sin"] = df[v].apply(lambda x: round(sin(2 * pi * x / cycle), 3))
        df[v + "_cos"] = df[v].apply(lambda x: round(cos(2 * pi * x / cycle), 3))
    return df

df = df.replace(encoding)
# The month codes are already angles, so one full cycle is 2*pi. Using the
# column maximum (August = 7*pi/6) as the period would give January and August
# the same (sin, cos) pair.
df = encode_cyclic_variables(df, ["Month"], period=2 * pi)

In [207]:
def calulate_age(age_categories):
    """Convert labels like ``"22 Years and 1 Months"`` to fractional years.

    Args:
        age_categories: Iterable of strings whose first whitespace-separated
            token is the number of years and whose fourth is the number of
            months.

    Returns:
        list[float]: ``years + months / 12`` for each label, in input order.
    """
    def to_years(label):
        tokens = label.split()
        return int(tokens[0]) + int(tokens[3]) / 12

    return [to_years(label) for label in age_categories]

# Map every distinct Credit_History_Age label to its length in fractional years.
age_value_counts = df["Credit_History_Age"].value_counts()
mean_ages = calulate_age(age_value_counts.index)
credit_history_age_encoding = dict(zip(age_value_counts.index, mean_ages))
# The literal string "nan" is mapped to a real missing value.
credit_history_age_encoding["nan"] = np.nan

In [208]:
# Integer codes for the remaining categorical columns. In each mapping the
# literal string "nan" is turned into a real missing value.
credit_mix_encoding = dict(Good=2, Standard=1, Bad=0, nan=np.nan)
payment_of_min_amount_encoding = dict(No=0, NM=1, Yes=2, nan=np.nan)
payment_behaviour_encoding = dict(
    High_spent_Small_value_payments=5,
    Low_spent_Large_value_payments=0,
    Low_spent_Medium_value_payments=1,
    Low_spent_Small_value_payments=2,
    High_spent_Medium_value_payments=4,
    nan=np.nan,
    High_spent_Large_value_payments=3,
)
credit_score_encoding = dict(Good=1, Poor=0)

# Column name -> value mapping, applied to the whole frame in a single pass.
encoding = {
    "CreditMix": credit_mix_encoding,
    "Payment_of_Min_Amount": payment_of_min_amount_encoding,
    "Payment_Behaviour": payment_behaviour_encoding,
    "Credit_Score": credit_score_encoding,
    "Credit_History_Age": credit_history_age_encoding,
}
df = df.replace(encoding)


In [211]:
# Month is now carried by Month_sin / Month_cos and Type_of_Loan by the
# per-loan indicator columns, so the source columns can go.
# Rebinding with errors="ignore" keeps the cell re-runnable: an in-place drop
# raises KeyError the second time because the columns are already gone.
df = df.drop(columns=["Month", "Type_of_Loan"], errors="ignore")
# NOTE(review): the saved output of this cell still shows "Good"/"Poor" strings
# although the encoding cell maps Credit_Score to 1/0, and the execution counts
# are not consecutive — re-run from a fresh kernel to confirm the result.
df["Credit_Score"]

0        Good
1        Good
2        Good
3        Good
4        Good
         ... 
99995    Poor
99996    Poor
99997    Poor
99998    Good
99999    Poor
Name: Credit_Score, Length: 100000, dtype: object