In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE



In [5]:
# Load the CSV files
app_df = pd.read_csv("application_record.csv")
credit_df = pd.read_csv("credit_record.csv")


  credit_df = pd.read_csv("credit_record.csv")


In [6]:
# Mark applicants with any STATUS of 1–5 as defaulters
defaulters = credit_df[credit_df['STATUS'].isin(['1', '2', '3', '4', '5'])]['ID'].unique()

# Add target column to application data
app_df['DEFAULT'] = app_df['ID'].apply(lambda x: 1 if x in defaulters else 0)

# View balance of target variable
print(app_df['DEFAULT'].value_counts())




DEFAULT
0    41447
1      911
Name: count, dtype: int64


In [7]:
# Drop ID column (not needed for prediction)
data = app_df.drop(columns=['ID'])


In [8]:
# Check missing values
print(data.isnull().sum())

# Fill missing occupation with "Unknown"
data['OCCUPATION_TYPE'].fillna("Unknown", inplace=True)


CODE_GENDER                0
FLAG_OWN_CAR               0
FLAG_OWN_REALTY            0
CNT_CHILDREN               0
AMT_INCOME_TOTAL           0
NAME_INCOME_TYPE           0
NAME_EDUCATION_TYPE        0
NAME_FAMILY_STATUS         0
NAME_HOUSING_TYPE          1
DAYS_BIRTH                 1
DAYS_EMPLOYED              1
FLAG_MOBIL                 1
FLAG_WORK_PHONE            1
FLAG_PHONE                 1
FLAG_EMAIL                 1
OCCUPATION_TYPE        13211
CNT_FAM_MEMBERS            1
DEFAULT                    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['OCCUPATION_TYPE'].fillna("Unknown", inplace=True)


In [9]:
# Encode binary variables
binary_cols = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
for col in binary_cols:
    data[col] = LabelEncoder().fit_transform(data[col])

# One-hot encode other categoricals
data = pd.get_dummies(data, drop_first=True)


In [10]:
# Separate features and target
X = data.drop('DEFAULT', axis=1)
y = data['DEFAULT']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [14]:
from sklearn.impute import SimpleImputer

# Impute missing values before scaling
imputer = SimpleImputer(strategy="median")   # ya strategy="most_frequent" for categoricals
X_imputed = imputer.fit_transform(X)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

print("Before SMOTE:", y.value_counts())
print("After SMOTE:", pd.Series(y_resampled).value_counts())


Before SMOTE: DEFAULT
0    41447
1      911
Name: count, dtype: int64
After SMOTE: DEFAULT
1    41447
0    41447
Name: count, dtype: int64


In [15]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

print("Before SMOTE:", y.value_counts())
print("After SMOTE:", pd.Series(y_resampled).value_counts())


Before SMOTE: DEFAULT
0    41447
1      911
Name: count, dtype: int64
After SMOTE: DEFAULT
1    41447
0    41447
Name: count, dtype: int64


In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)


In [17]:
# Define model and parameter grid
model = LogisticRegression(max_iter=1000)
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs']
}

grid = GridSearchCV(model, param_grid, cv=5, scoring='f1', verbose=1)
grid.fit(X_train, y_train)

# Best model
best_model = grid.best_estimator_
print("Best Params:", grid.best_params_)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Params: {'C': 1, 'solver': 'lbfgs'}


In [18]:
# Predict
y_pred = best_model.predict(X_test)

# Metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[4554 3736]
 [3385 4904]]
              precision    recall  f1-score   support

           0       0.57      0.55      0.56      8290
           1       0.57      0.59      0.58      8289

    accuracy                           0.57     16579
   macro avg       0.57      0.57      0.57     16579
weighted avg       0.57      0.57      0.57     16579

