In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [11]:
# Read the raw training and test splits from disk.
train_df = pd.read_csv('data/train_updated.csv')
test_df = pd.read_csv('data/test_updated.csv')

print(f"Original Training Data Shape: {train_df.shape}")
print(f"Original Test Data Shape: {test_df.shape}")

# Set the identifier and target columns aside so they can be re-attached
# once preprocessing is finished.
train_profile_id, train_target = train_df['ProfileID'], train_df['RiskFlag']
test_profile_id = test_df['ProfileID']

# Feature-only frames used by the processing steps below
# (no ProfileID in either split, no RiskFlag in train).
train_df_proc = train_df.drop(['ProfileID', 'RiskFlag'], axis='columns')
test_df_proc = test_df.drop('ProfileID', axis='columns')

Original Training Data Shape: (204277, 18)
Original Test Data Shape: (51070, 17)


In [12]:
def feature_engineering(df):
    """Return a new DataFrame with two derived feature columns appended.

    Added columns:
        SumToEarningsRatio: ``RequestedSum / AnnualEarnings``.
        DebtLoad: ``RequestedSum * DebtFactor``.

    The input frame is not modified, so callers do not need to pass a
    defensive copy.

    Args:
        df: DataFrame containing the ``RequestedSum``, ``AnnualEarnings``
            and ``DebtFactor`` columns.

    Returns:
        A new DataFrame holding every original column followed by the two
        derived columns (existing columns with those names are overwritten
        in place).

    Raises:
        KeyError: if any of the required columns is missing.
    """
    requested = df['RequestedSum']
    # NOTE: a zero in AnnualEarnings is not special-cased; pandas division
    # yields inf (or NaN for 0/0) for such rows.
    return df.assign(
        SumToEarningsRatio=requested / df['AnnualEarnings'],
        DebtLoad=requested * df['DebtFactor'],
    )

# Add the engineered features to both splits; each split is copied first so
# the frames bound before this cell are never written to.
train_df_proc, test_df_proc = (
    feature_engineering(split.copy()) for split in (train_df_proc, test_df_proc)
)

# Names of every int64/float64 column in the training split
# (this includes the engineered features added just above).
numerical_cols = list(
    train_df_proc.select_dtypes(include=['int64', 'float64']).columns
)

In [13]:
def cap_outliers_iqr(df, column, lower_bound, upper_bound):
    """Clamp ``df[column]`` to the closed range [lower_bound, upper_bound].

    Values below ``lower_bound`` are raised to it and values above
    ``upper_bound`` are lowered to it; NaN entries compare False on both
    tests and are therefore left unchanged.

    The column is overwritten on ``df`` itself (in-place mutation), and the
    same ``df`` object is returned for convenience.
    """
    values = df[column].to_numpy()
    # Floor first, then cap the floored result.
    floored = np.where(values < lower_bound, lower_bound, values)
    df[column] = np.where(floored > upper_bound, upper_bound, floored)
    return df

def _iqr_fences(series, whisker=1.5):
    """Return (lower, upper) = (Q1 - whisker*IQR, Q3 + whisker*IQR) for ``series``."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    spread = q3 - q1
    return (q1 - whisker * spread, q3 + whisker * spread)

# Bounds are derived from the training split ONLY, so no statistic of the
# test split leaks into preprocessing.
bounds_train = {col: _iqr_fences(train_df_proc[col]) for col in numerical_cols}

# Cap both splits with the training-derived bounds.
for col, (lower_bound, upper_bound) in bounds_train.items():
    train_df_proc = cap_outliers_iqr(train_df_proc, col, lower_bound, upper_bound)
    test_df_proc = cap_outliers_iqr(test_df_proc, col, lower_bound, upper_bound)

In [14]:
# Object-dtype columns of the training split are treated as categorical.
categorical_cols = train_df_proc.select_dtypes(include='object').columns.tolist()

# Remember where the training rows end in the stacked frame. The length is
# taken from the frame that is actually concatenated (not the raw train_df),
# so the split below stays correct even if rows are ever filtered during
# preprocessing.
n_train = len(train_df_proc)

# Concatenate so that one-hot encoding yields identical dummy columns for
# train and test, regardless of which categories occur in which split.
combined_df = pd.concat([train_df_proc, test_df_proc], ignore_index=True)
combined_df_encoded = pd.get_dummies(combined_df, columns=categorical_cols, drop_first=True)

# Separate back into processed train and test sets (copies, so later
# column assignments do not write into combined_df_encoded).
train_processed_df = combined_df_encoded.iloc[:n_train].copy()
test_processed_df = combined_df_encoded.iloc[n_train:].copy()

In [15]:
# Learn the scaling statistics from the training split's numerical columns
# only; the test split never influences the fit.
scaler = StandardScaler().fit(train_processed_df[numerical_cols])

# Apply the one fitted scaler to each split in turn.
train_processed_df[numerical_cols] = scaler.transform(train_processed_df[numerical_cols])
test_processed_df[numerical_cols] = scaler.transform(test_processed_df[numerical_cols])

In [16]:
# Remove any ID/target columns left over from a previous execution of this
# cell: DataFrame.insert raises ValueError if the column already exists, which
# would make the cell fail when re-run.
train_processed_df = train_processed_df.drop(columns=['ProfileID', 'RiskFlag'], errors='ignore')

# Attach by position (.to_numpy()) rather than by index label, so a length
# mismatch raises immediately instead of silently producing NaN.
train_processed_df.insert(0, 'ProfileID', train_profile_id.to_numpy())
train_processed_df['RiskFlag'] = train_target.to_numpy()

# The test rows still carry their offsets from the combined frame; reset the
# index so the saved/test frame starts at 0 again before re-attaching the IDs.
test_processed_df = (
    test_processed_df
    .drop(columns=['ProfileID'], errors='ignore')
    .reset_index(drop=True)
)
test_processed_df.insert(0, 'ProfileID', test_profile_id.to_numpy())

# Save the final processed DataFrames
train_processed_df.to_csv('train_processed_parv.csv', index=False)
test_processed_df.to_csv('test_processed_parv.csv', index=False)