In [42]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

dataset_repository = "D:\\manualCDmanagement\\codes\\Projects\\VMs\\skl algorithms\\Logistic Regression\\00_datasets\\Loan.1"
dataset_name = "loan.csv"
dataset_file = os.path.join(dataset_repository, dataset_name)
df = pd.read_csv(dataset_file)

df['gender'], gender_uniques = pd.factorize(df.gender)
df['occupation'], occupation_uniques = pd.factorize(df['occupation'])
df['education_level'], educ_level_uniques = pd.factorize(df['education_level'])
df.marital_status, marital_status_uniques = pd.factorize(df.marital_status)
df.loan_status, loan_status_uniques = pd.factorize(df.loan_status)


In [13]:
X = df.drop('loan_status', axis=1)
y = df['loan_status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)


smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


scaler = StandardScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [68]:

model = LogisticRegression(max_iter=1000, random_state=168121, class_weight='balanced')
model.fit(X_train_resampled, y_train_resampled)

y_prob = model.predict_proba(X_test_scaled)[:, 1]


fpr, tpr, thresholds = roc_curve(y_test, y_prob)
j_scores = tpr - fpr


optimal_idx = np.argmax(j_scores)
optimal_threshold = thresholds[optimal_idx]
# optimal_threshold = 0.8 - for  testing purposes (I thought I got 100 accuracy I was wrong if I put the threshold down)
y_pred_adjusted = (y_prob >= optimal_threshold).astype(int)


In [67]:
#lower threshold
conf_matrx = confusion_matrix(y_test, y_pred_adjusted)
print(conf_matrx)

print(classification_report(y_test, y_pred_adjusted))

cross_val_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
print("Cross-Validation ROC-AUC scores:", cross_val_scores)
print("Mean Cross-Validation ROC-AUC score:", cross_val_scores.mean())

[[20  1]
 [ 0  4]]
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        21
           1       0.80      1.00      0.89         4

    accuracy                           0.96        25
   macro avg       0.90      0.98      0.93        25
weighted avg       0.97      0.96      0.96        25

Cross-Validation ROC-AUC scores: [1. 1. 1. 1. 1.]
Mean Cross-Validation ROC-AUC score: 1.0


In [63]:
conf_matrx = confusion_matrix(y_test, y_pred_adjusted)
print(conf_matrx)

print(classification_report(y_test, y_pred_adjusted))

cross_val_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
print("Cross-Validation ROC-AUC scores:", cross_val_scores)
print("Mean Cross-Validation ROC-AUC score:", cross_val_scores.mean())

[[21  0]
 [ 0  4]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        21
           1       1.00      1.00      1.00         4

    accuracy                           1.00        25
   macro avg       1.00      1.00      1.00        25
weighted avg       1.00      1.00      1.00        25

Cross-Validation ROC-AUC scores: [1. 1. 1. 1. 1.]
Mean Cross-Validation ROC-AUC score: 1.0
