In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer, RobustScaler, OrdinalEncoder


In [2]:
# Read the train and test CSVs (train first), using each file's `id` column
# as the DataFrame index.
df, df_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('../raw_data/train.csv', '../raw_data/test.csv')
)

In [3]:
# Split the training frame: every column except `loan_paid_back` goes to
# X_train, and the `loan_paid_back` column alone becomes y_train.
X_train = df.drop('loan_paid_back', axis='columns')
y_train = df.loc[:, 'loan_paid_back']

In [3]:
# ---------------------------------------------------------------------------
# Column groups: each list names the columns routed to one transformer branch.
# ---------------------------------------------------------------------------

# Categorical columns -> one-hot encoded
categorical_features = [
    'education_level',
    'gender',
    'marital_status',
    'employment_status',
    'loan_purpose',
]

# Ordinal column -> integer codes that follow `grade_order`
ordinal_features = ['grade_subgrade']

# Category order for grade_subgrade, listed from best to worst.
# The outer list holds one category list per ordinal column (only one here).
grade_order = [[
    'A1', 'A2', 'A3', 'A4', 'A5',
    'B1', 'B2', 'B3', 'B4', 'B5',
    'C1', 'C2', 'C3', 'C4', 'C5',
    'D1', 'D2', 'D3', 'D4', 'D5',
    'E1', 'E2', 'E3', 'E4', 'E5',
    'F1', 'F2', 'F3', 'F4', 'F5',
]]

# Numerical columns, grouped by the transformer applied to them
robust_features = ['annual_income']          # -> RobustScaler
standard_features = [                        # -> StandardScaler
    'debt_to_income_ratio',
    'credit_score',
    'interest_rate',
]
monetary_features = ['loan_amount']          # -> PowerTransformer (Yeo-Johnson)

# ---------------------------------------------------------------------------
# Preprocessor: one (name, transformer, columns) triple per column group.
# ---------------------------------------------------------------------------
preprocessor = ColumnTransformer(
    [
        (
            'cat',
            OneHotEncoder(drop='first', sparse_output=False),
            categorical_features,
        ),
        (
            'ordinal',
            OrdinalEncoder(categories=grade_order),
            ordinal_features,
        ),
        ('robust', RobustScaler(), robust_features),
        ('standard', StandardScaler(), standard_features),
        (
            'monetary',
            PowerTransformer(method="yeo-johnson"),
            monetary_features,
        ),
    ],
    remainder='drop',  # columns not named in any group above are discarded
)

# The pipeline currently wraps the preprocessor as its only step.
pipeline_robust_standard = make_pipeline(preprocessor)

# Fit on the training features and transform them in one pass.
# Requires X_train from the feature/target split cell to be defined.
X_train_preprocessed = pipeline_robust_standard.fit_transform(X_train)

# Recover the generated column names from the fitted ColumnTransformer step
# (make_pipeline names the step 'columntransformer'), then rebuild a
# DataFrame that keeps the original row index.
feature_names = pipeline_robust_standard['columntransformer'].get_feature_names_out()
X_train_preprocessed = pd.DataFrame(
    data=X_train_preprocessed,
    index=X_train.index,
    columns=feature_names,
)

# Preview the first rows of the preprocessed data
X_train_preprocessed.head()

NameError: name 'X_train' is not defined