In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler, OrdinalEncoder, PowerTransformer
from sklearn.svm import SVC

In [2]:
# Read the training table and the separate test table from the raw data folder.
data, data_test = (
    pd.read_csv("../raw_data/train.csv"),
    pd.read_csv("../raw_data/test.csv"),
)

In [3]:
# Preview the first rows of the training table to check columns and dtypes.
data.head()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [4]:
# Split the training table into the target (`loan_paid_back`) and the predictors.
# `id` is dropped together with the target so it is not used as a feature.
y = data["loan_paid_back"]
X = data.drop(columns=["loan_paid_back", "id"])

In [5]:
# Partition the predictor columns by dtype: numeric columns on one side,
# every remaining column (treated as categorical) on the other.
numeric_cols = X.select_dtypes(include="number").columns
numeric_features = numeric_cols.tolist()
categorical_features = [name for name in X.columns if name not in numeric_cols]

def _make_preprocessor(num_step, cat_step):
    """Return a ColumnTransformer applying `num_step` to `numeric_features`
    and `cat_step` to `categorical_features` (either may be "passthrough")."""
    return ColumnTransformer([
        ("num", num_step, numeric_features),
        ("cat", cat_step, categorical_features),
    ])


def _one_hot():
    """Return a fresh one-hot encoder that ignores categories unseen at fit time."""
    return OneHotEncoder(handle_unknown="ignore")


# Tree models do not need scaling: numeric columns untouched, categoricals one-hot.
preprocessor_onehot = _make_preprocessor("passthrough", _one_hot())

# No transformation at all (for CatBoost).
preprocessor_notransf = _make_preprocessor("passthrough", "passthrough")

# Numeric scaling only, categoricals passed through unchanged
# (for CatBoost, XGBoost and LightGBM).
preprocessor_standard = _make_preprocessor(StandardScaler(), "passthrough")

# Standardised numerics + one-hot categoricals.
# Use for: LogisticRegression, LinearSVC, SVM (rbf), MLPClassifier, etc.
preprocessor_standard_onehot = _make_preprocessor(StandardScaler(), _one_hot())

# Min-max scaled numerics + one-hot categoricals.
# Could be better for KNN and maybe some neural nets.
preprocessor_minmax_onehot = _make_preprocessor(MinMaxScaler(), _one_hot())

# Robust-scaled numerics + one-hot categoricals.
# Use for: LogisticRegression / SVM when outliers are a problem.
preprocessor_robust_onehot = _make_preprocessor(RobustScaler(), _one_hot())

#Ordinal encoding:
ordinal_features = ["education_level", "grade_subgrade"]
nominal_features = [col for col in categorical_features if col not in ordinal_features]

ordinal_categories = [
    ["Other", "High School", "Bachelor's", "Master's", "PhD"], # education
    ["A1","A2","A3","A4","A5",
     "B1","B2","B3","B4","B5",
     "C1","C2","C3","C4","C5",
     "D1","D2","D3","D4","D5",
     "E1","E2","E3","E4","E5",
     "F1","F2","F3","F4","F5",
     "G1","G2","G3","G4","G5"]                                # grade
]

preprocessor_standard_ordinal = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    ("ord", OrdinalEncoder(categories=ordinal_categories), ordinal_categories),
    ("nom", OneHotEncoder(handle_unknown='ignore'), nominal_features)
])

### Mixing encodings: a different transformer per group of columns.

# Numerical columns, grouped by the scaler applied to each group.
_monetary_features = ["loan_amount"]
_standard_features = ["debt_to_income_ratio", "credit_score", "interest_rate"]
_robust_features = ["annual_income"]

# Categorical columns that are one-hot encoded.
_categorical_features = [
    "education_level",
    "gender",
    "marital_status",
    "employment_status",
    "loan_purpose",
]

# Single ordinal column and its category order (from best to worst).
# NOTE(review): this order stops at F5, whereas `ordinal_categories` also lists
# G1-G5. OrdinalEncoder raises on categories missing from its list by default,
# so confirm that no G grades occur in the data.
_ordinal_features = ["grade_subgrade"]
_grade_order = [
    [
        "A1", "A2", "A3", "A4", "A5",
        "B1", "B2", "B3", "B4", "B5",
        "C1", "C2", "C3", "C4", "C5",
        "D1", "D2", "D3", "D4", "D5",
        "E1", "E2", "E3", "E4", "E5",
        "F1", "F2", "F3", "F4", "F5",
    ]
]

_preprocessor = ColumnTransformer(
    transformers=[
        # Dense one-hot output, dropping the first level of each column.
        ("cat", OneHotEncoder(drop="first", sparse_output=False), _categorical_features),
        ("ordinal", OrdinalEncoder(categories=_grade_order), _ordinal_features),
        ("robust", RobustScaler(), _robust_features),
        ("standard", StandardScaler(), _standard_features),
        ("monetary", PowerTransformer(method="yeo-johnson"), _monetary_features),
    ],
    remainder="drop",  # columns not listed above are discarded
)

# Pipeline holding only the preprocessor (no FunctionTransformer step needed).
_pipeline_robust_standard = make_pipeline(_preprocessor)

# Model: SVC

In [None]:
# An earlier variant of this cell used SVC(probability=True) and scored with
# predict_proba / model.score, but it took too long to run. This version skips
# probability estimation and ranks the test rows by decision_function scores,
# which roc_auc_score accepts directly.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

# Standardised numerics + one-hot categoricals feeding a default SVC.
model = Pipeline(steps=[
    ("preprocess", preprocessor_standard_onehot),
    ("svc", SVC(probability=False)),
])
model.fit(X_train, y_train)

# Hold-out ROC AUC from the signed decision scores.
y_scores = model.decision_function(X_test)
auc = roc_auc_score(y_test, y_scores)
print("ROC AUC:", auc)

In [None]:
# Grid search over the SVC hyperparameters of `model`, using 3-fold
# cross-validation on the training split and ROC AUC as the selection metric.
#
# Keys address the "svc" step of the pipeline with scikit-learn's
# "<step>__<param>" convention (double underscore); a single underscore is
# rejected as an unknown pipeline parameter.
param_grid = {
    "svc__C": [0.1, 1, 10],
    "svc__kernel": ["rbf", "poly", "linear"],
    "svc__gamma": ["scale", "auto"],
}

grid = GridSearchCV(
    model,
    param_grid,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1,  # use all available cores
)

grid.fit(X_train, y_train)
print(grid.best_params_)