In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the loan-prediction training CSV directly from GitHub into `df`,
# then preview its first five rows.
url = "https://raw.githubusercontent.com/souravroy0708/ML_LoanPrediction/master/Data/train_u6lujuX_CVtuZ9i.csv"
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Summary statistics; with default arguments describe() reports only the
# numeric columns of a mixed-dtype frame.
df.describe()

In [None]:
# Number of missing values in each column, before any imputation.
df.isna().sum()

In [None]:
# Missing genders are given their own explicit placeholder category.
gender_placeholder = 'Unknown'
df['Gender'] = df['Gender'].fillna(gender_placeholder)

In [None]:
# Fill missing marital status with the column's first mode
# (its most frequent value).
married_mode = df['Married'].mode()[0]
df['Married'] = df['Married'].fillna(married_mode)

In [None]:
# Fill missing dependents counts with the column's first mode
# (its most frequent value).
dependents_mode = df['Dependents'].mode()[0]
df['Dependents'] = df['Dependents'].fillna(dependents_mode)

In [None]:
# Frequency of each Self_Employed category (value_counts() skips NaN by
# default); used to choose the fill value in the next cell.
df['Self_Employed'].value_counts()

In [None]:
# Missing self-employment flags are filled with the fixed category 'No'.
self_employed_fill = 'No'
df['Self_Employed'] = df['Self_Employed'].fillna(self_employed_fill)

In [None]:
# Frequency of each distinct loan amount (value_counts() skips NaN by default).
df['LoanAmount'].value_counts()

In [None]:
# Fill missing loan amounts with the column median.
loan_amount_median = df['LoanAmount'].median()
df['LoanAmount'] = df['LoanAmount'].fillna(loan_amount_median)

In [None]:
# Frequency of each distinct loan term (value_counts() skips NaN by default).
df['Loan_Amount_Term'].value_counts()

In [None]:
# Fill missing loan terms with the column's first mode
# (its most frequent value).
loan_term_mode = df['Loan_Amount_Term'].mode()[0]
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(loan_term_mode)

In [None]:
# Fill missing credit-history flags with the column's first mode
# (its most frequent value).
credit_history_mode = df['Credit_History'].mode()[0]
df['Credit_History'] = df['Credit_History'].fillna(credit_history_mode)

In [None]:
# Re-check the per-column missing-value counts after the fills above.
df.isna().sum()

In [None]:
# Replace every object-dtype (string) column with integer codes.
#
# Each column gets its OWN fitted LabelEncoder, stored in `encoders`, so the
# label <-> code mapping stays available afterwards, e.g.
# `encoders['Loan_Status'].classes_` or `encoders[col].inverse_transform(...)`.
# Refitting a single shared encoder per column would overwrite that mapping.
# After the loop `le` is the encoder fitted on the last column processed.
cat_cols = df.select_dtypes(include=['object']).columns
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [None]:
# Preview the frame now that the string columns hold integer codes.
df.head()

In [None]:
# One box plot per int64/float64 column, laid out on a 3-wide grid, to
# eyeball each column's spread and outliers.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
n_grid_cols = 3
# Ceiling division: just enough rows to hold every plot. (`len // 3 + 1`
# would add an empty row whenever the column count is a multiple of 3.)
n_grid_rows = -(-len(num_cols) // n_grid_cols)
plt.figure(figsize=(15, 10))
for i, col in enumerate(num_cols, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    sns.boxplot(x=df[col])
    plt.title(col)

plt.tight_layout()
plt.show()

In [None]:
# Build `df_scaled`: a copy of `df` whose numeric feature columns are
# standardized. `df` itself is left untouched.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
# The encoded target (Loan_Status) and the encoded row identifier (Loan_ID)
# are not features; standardizing them would turn class labels / IDs into
# meaningless z-scores, so they are left as-is when present.
scale_cols = [c for c in num_cols if c not in ('Loan_ID', 'Loan_Status')]
df_scaled = df.copy()
scaler = StandardScaler()
df_scaled[scale_cols] = scaler.fit_transform(df_scaled[scale_cols])

In [None]:
# Preview the standardized copy built in the previous cell (`df` itself is
# unchanged by the scaling, so showing it again would reveal nothing new).
df_scaled.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Feature matrix and target vector.
# Besides the target, Loan_ID is removed from the features: it is a row
# identifier (label-encoded to arbitrary integers earlier), so it carries no
# signal and only adds noise. `errors='ignore'` keeps this cell working if
# the column has already been removed.
X = df.drop(columns='Loan_Status').drop(columns='Loan_ID', errors='ignore')
y = df['Loan_Status']

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Standardize the features. The scaler learns its statistics from the
# training split only and then applies them to both splits, so nothing from
# the test split influences the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Baseline: logistic regression trained on the standardized training split.
# max_iter is raised to 1000 iterations.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model

In [None]:
# Evaluate the baseline model on the held-out test split.
y_pred = model.predict(X_test)
# ROC-AUC must be computed from continuous scores, not hard 0/1 predictions:
# with hard labels the ROC curve collapses to a single operating point and
# the reported value is not the model's ranking quality. Column 1 of
# predict_proba is the probability of `model.classes_[1]`, the positive class.
y_score = model.predict_proba(X_test)[:, 1]
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_score))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# --- 1) Apply SMOTE ---
# Oversample the training split only; the test split is left untouched.
# NOTE(review): the features include label-encoded categoricals; plain SMOTE
# may synthesize in-between values for them — consider SMOTENC. TODO confirm.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X=X_train, y=y_train)

In [None]:
# Per-class row counts before and after resampling.
class_counts_before = y_train.value_counts().to_dict()
class_counts_after = y_resampled.value_counts().to_dict()
print("Original dataset shape:", class_counts_before)
print("Resampled dataset shape:", class_counts_after)

In [None]:
# --- 2) Logistic Regression ---
# Train on the SMOTE-resampled training data, predict on the original test split.
log_reg = LogisticRegression(max_iter=1000, random_state=42).fit(X_resampled, y_resampled)
y_pred_lr = log_reg.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the logistic regression trained
# after SMOTE.
report_lr = classification_report(y_test, y_pred_lr)
print("\n🔹 Logistic Regression (after SMOTE)")
print(report_lr)

In [None]:
# --- 3) Decision Tree ---
# Train a decision tree on the SMOTE-resampled training data and predict on
# the original test split.
dt = DecisionTreeClassifier(random_state=42).fit(X_resampled, y_resampled)
y_pred_dt = dt.predict(X_test)

# Per-class precision / recall / F1 on the test split.
report_dt = classification_report(y_test, y_pred_dt)
print("\n🔹 Decision Tree (after SMOTE)")
print(report_dt)