In [13]:
import pandas as pd
from pandas import DataFrame

# Default schema used when the caller does not supply its own column lists.
_FIDELITY_NUMERIC_COLUMNS = (
    "age", "credit_amount", "duration", "monthly_rent_or_mortgage",
    "installment_rate", "residence_since", "existing_credits",
    "liable_people", "default",
)

_FIDELITY_CATEGORICAL_COLUMNS = (
    "sex", "marital_status", "job", "credit_hist", "purpose", "debtors",
    "property", "installment_other", "housing", "tel", "online_banking",
    "foreign",
)


def _convert_employment(val):
    """Map one raw employment_since value to a float, or NaN if it cannot be parsed."""
    text = str(val).strip().lower()
    if text == 'unemployed':
        return -1.0
    if text == '<1 year':
        return 0.0
    try:
        return float(text)
    except ValueError:
        # NaN (rather than None) keeps the resulting column purely float.
        return float('nan')


def _clean_checking_account(val):
    """Return the 'no checking account' label, a float balance, or None if invalid."""
    text = str(val).strip().lower()
    if text == 'no checking account':
        return text  # deliberately kept as a string next to numeric balances
    try:
        return float(text)
    except ValueError:
        return None


def prepare_dataset_for_fidelity(df: pd.DataFrame,
                                 numeric_columns=None,
                                 categorical_columns=None) -> pd.DataFrame:
    """
    Prepares the dataset for fidelity comparison.

    Steps, applied to a copy of ``df`` (the input frame is not modified):

    1. Numeric columns are converted with ``pd.to_numeric(errors='coerce')``
       and rows with an unparseable/missing value in any of them are dropped.
    2. Categorical columns are converted to text, stripped and lower-cased.
       Missing values stay missing instead of becoming the literal text "nan".
    3. ``employment_since`` is encoded as a float: 'unemployed' -> -1,
       '<1 year' -> 0, anything else is parsed with ``float()``. Rows that
       cannot be parsed are dropped.
    4. ``checking_account`` keeps the label 'no checking account' as a string
       and parses every other value with ``float()``, so the column mixes
       floats with that one string. Rows that cannot be parsed are dropped.

    The original index labels of the surviving rows are preserved.

    Parameters
    ----------
    df : pd.DataFrame
        Raw records. Must contain every numeric and categorical column in use,
        plus ``employment_since`` and ``checking_account``.
    numeric_columns : iterable of str, optional
        Columns to coerce to numbers. Defaults to the built-in schema.
    categorical_columns : iterable of str, optional
        Columns to normalise as text. Defaults to the built-in schema.

    Returns
    -------
    pd.DataFrame
        The cleaned copy, restricted to rows that survived every step.

    Raises
    ------
    KeyError
        If any required column is absent from ``df``.
    """
    numeric_columns = list(
        _FIDELITY_NUMERIC_COLUMNS if numeric_columns is None else numeric_columns
    )
    categorical_columns = list(
        _FIDELITY_CATEGORICAL_COLUMNS if categorical_columns is None else categorical_columns
    )

    # Fail early with every missing column named, instead of a bare KeyError
    # raised halfway through the cleaning steps.
    required = [*numeric_columns, *categorical_columns,
                "employment_since", "checking_account"]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Input DataFrame is missing required column(s): {missing}")

    df = df.copy()

    # 1. Clean and convert numeric columns, then drop invalid rows in one pass
    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    if numeric_columns:
        df = df.dropna(subset=numeric_columns)

    # 2. Clean categorical columns (standardize text)
    for col in categorical_columns:
        raw = df[col]
        normalized = raw.astype(str).str.strip().str.lower()
        # astype(str) may render missing values as the text "nan"; restore them
        # as real missing values so they are not counted as a category.
        df[col] = normalized.where(raw.notna())

    # 3. Custom handling for employment_since
    df["employment_since"] = df["employment_since"].apply(_convert_employment)
    df = df[df["employment_since"].notna()]

    # 4. Handling mixed-type 'checking_account'
    df["checking_account"] = df["checking_account"].apply(_clean_checking_account)
    df = df[df["checking_account"].notna()]

    return df

In [20]:
# Read the original and the modified records, then run both through the same
# preparation step so the two frames are directly comparable.
df, df_modified = (
    pd.read_csv(csv_path)
    for csv_path in ('Financial_Records.csv', 'Cleaned_Financial_Records_2.csv')
)
prepared_df, prepared_df_modified = map(
    prepare_dataset_for_fidelity, (df, df_modified)
)

In [17]:
# Preview the first 10 rows of the frame prepared from Financial_Records.csv.
prepared_df.head(10)

Unnamed: 0,PID,age,sex,marital_status,job,employment_since,credit_hist,credit_amount,purpose,duration,...,housing,existing_credits,liable_people,monthly_rent_or_mortgage,tel,online_banking,foreign,checking_account,savings,default
0,406827,33,male,single,unskilled - resident,3.0,delay,7378,radio/television,24,...,own,2,1,200,registered,yes,no,124.0,17,1
1,13257437,39,female,single,unskilled - resident,0.0,existing credits paid,3331,car (new),30,...,own,2,1,204,registered,yes,yes,486.0,21,2
2,14303477,54,male,divorced/separated,management/self-employed/highly qualified,2.0,delay,804,car (new),61,...,own,1,1,187,none,yes,yes,132.0,229,1
3,16201589,54,female,single,management/self-employed/highly qualified,4.0,existing credits paid,1818,furniture/equipment,14,...,own,1,1,205,none,no,yes,176.0,45,1
4,11855923,59,male,single,skilled employee / official,3.0,critical account,4670,car (new),27,...,own,1,1,317,none,yes,yes,88.0,74,2
5,16281211,35,male,single,skilled employee / official,4.0,existing credits paid,5389,car (new),8,...,own,1,1,916,registered,yes,yes,88.0,unknown,2
6,13833815,49,male,single,management/self-employed/highly qualified,1.0,delay,3162,car (used),16,...,rent,1,1,181,registered,yes,yes,no checking account,unknown,1
7,5465472,34,male,single,unskilled - resident,8.0,existing credits paid,2930,car (new),14,...,own,2,1,193,none,yes,yes,178.0,98,2
8,5211207,40,male,single,skilled employee / official,17.0,critical account,14012,car (new),23,...,own,1,2,215,none,no,yes,-161.0,29,1
9,10615437,72,male,married/widowed,skilled employee / official,2.0,critical account,2819,furniture/equipment,41,...,for free,1,1,227,none,yes,yes,-223.0,48,2


In [21]:
# Preview the first 10 rows of the frame prepared from Cleaned_Financial_Records_2.csv.
prepared_df_modified.head(10)

Unnamed: 0,PID,age,sex,marital_status,job,employment_since,credit_hist,credit_amount,purpose,duration,...,housing,existing_credits,liable_people,monthly_rent_or_mortgage,tel,online_banking,foreign,checking_account,savings,default
0,PID000000,33,male,single,unskilled - resident,16.0,critical account,7378,radio/television,24,...,own,2,1,200,registered,yes,yes,124.0,17,1
1,PID000001,39,male,single,skilled employee / official,-1.0,existing credits paid,3331,car (used),30,...,own,2,1,204,none,yes,yes,486.0,21,2
2,PID000002,54,male,single,management/self-employed/highly qualified,2.0,critical account,804,car (new),61,...,own,1,1,187,none,yes,yes,132.0,229,1
3,PID000003,54,female,single,skilled employee / official,0.0,existing credits paid,1818,furniture/equipment,14,...,own,1,1,205,none,no,yes,176.0,45,1
4,PID000004,59,male,single,skilled employee / official,3.0,critical account,4670,car (new),27,...,own,1,1,317,registered,yes,yes,no checking account,74,2
5,PID000005,35,female,single,unskilled - resident,7.0,existing credits paid,5389,car (new),8,...,own,1,1,916,none,yes,no,88.0,unknown,2
6,PID000006,49,male,single,skilled employee / official,7.0,critical account,3162,car (new),16,...,own,1,1,181,none,yes,yes,no checking account,unknown,1
7,PID000007,34,male,single,skilled employee / official,8.0,critical account,2930,car (new),14,...,rent,2,1,193,registered,yes,yes,no checking account,19,2
8,PID000008,40,male,single,unskilled - resident,-1.0,critical account,14012,car (new),23,...,own,1,2,215,none,no,yes,no checking account,29,1
9,PID000009,72,female,married/widowed,management/self-employed/highly qualified,1.0,existing credits paid,2819,furniture/equipment,41,...,own,1,1,227,none,no,yes,-223.0,47,2
