# In [None]:
# -------------------------------
# Step 1: Import Libraries
# -------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Apply seaborn's "whitegrid" theme to every plot drawn below.
# set_theme() is the preferred entry point; sns.set() is only an alias for it.
sns.set_theme(style="whitegrid")  # nicer plots

# -------------------------------
# Step 2: Load Dataset
# -------------------------------
import io

from google.colab import files

# Expected name of the dataset file.
DATA_FILE = "loan_dataset_20000.csv"

# files.upload() opens the Colab upload widget and returns a dict that maps
# each uploaded file name to that file's raw bytes.
uploaded = files.upload()  # Upload your loan_dataset_20000.csv

# Read the CSV file.
# Parse the bytes that were just uploaded instead of a hard-coded path, so a
# file uploaded under a different name (for example a re-upload that Colab
# stored under a suffixed name) is the one that actually gets loaded.
if uploaded:
    uploaded_name = DATA_FILE if DATA_FILE in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    # Nothing was uploaded: fall back to a copy already present on disk.
    df = pd.read_csv(DATA_FILE)
print("Dataset Loaded Successfully!\n")
print("First 5 rows:")
display(df.head())

# -------------------------------
# Step 3: Understand the Dataset
# -------------------------------
# Print the (rows, columns) shape and the column index, one labelled line each.
for label, value in (("\nDataset Shape:", df.shape), ("\nColumns:", df.columns)):
    print(label, value)

# df.info() writes its own summary, so only the heading is printed here.
print("\nInfo:")
df.info()

# Count of missing entries in every column, before any cleaning.
missing_per_column = df.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)

# -------------------------------
# Step 4: Handle Missing Values
# -------------------------------
# The filled column is assigned back to df[col] rather than calling
# fillna(inplace=True) on df[col]: the inplace form operates on an
# intermediate object, which pandas 2.x warns about and which does not update
# df at all once Copy-on-Write is enabled.

# Fill numeric columns with mean
numeric_cols = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'ApplicantIncome', 'CoapplicantIncome']
for col in numeric_cols:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].mean())

# Fill categorical columns with mode
# NOTE(review): Loan_Status is the prediction target further down; imputing
# it with the mode invents labels — consider dropping those rows instead.
categorical_cols = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status']
for col in categorical_cols:
    if col in df.columns:
        modes = df[col].mode()
        # mode() is empty when every value in the column is missing; there is
        # nothing to fill with in that case, so leave the column untouched.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

print("\nMissing values after cleaning:")
print(df.isnull().sum())

# -------------------------------
# Step 5: Exploratory Data Analysis (EDA)
# -------------------------------
# Each entry pairs a plot title with a zero-argument callable that draws the
# chart onto the current figure. The callables read df only when invoked.
eda_plots = [
    # Histogram of Loan Amount (30 bins, with a KDE curve)
    ("Loan Amount Distribution",
     lambda: sns.histplot(df['LoanAmount'], bins=30, kde=True)),
    # Boxplot of Applicant Income vs Loan Status
    ("Applicant Income vs Loan Status",
     lambda: sns.boxplot(x='Loan_Status', y='ApplicantIncome', data=df)),
    # Countplot: Education vs Loan Status
    ("Education vs Loan Status",
     lambda: sns.countplot(x='Education', hue='Loan_Status', data=df)),
    # Countplot: Property Area vs Loan Status
    ("Property Area vs Loan Status",
     lambda: sns.countplot(x='Property_Area', hue='Loan_Status', data=df)),
]

# Same four steps for every chart: new 6x4 figure, draw, title, show.
for plot_title, draw_plot in eda_plots:
    plt.figure(figsize=(6,4))
    draw_plot()
    plt.title(plot_title)
    plt.show()

# -------------------------------
# Step 6: Encode Categorical Features
# -------------------------------
# Every object-dtype column is replaced by integer codes. A separate encoder
# is fitted per column and kept in `encoders`, keyed by column name, so each
# mapping (including the one for Loan_Status) can be inspected or reversed
# later with encoders[col].classes_ / encoders[col].inverse_transform(...).
# Refitting one shared encoder would keep only the last column's mapping.
encoders = {}
for col in df.select_dtypes(include='object'):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

print("\nDataset after encoding:")
display(df.head())

# -------------------------------
# Step 7: Split Dataset into Train & Test
# -------------------------------
# Target is the Loan_Status column; features are every remaining column.
y = df['Loan_Status']
X = df.drop(columns=['Loan_Status'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# -------------------------------
# Step 8: Train Logistic Regression Model
# -------------------------------
# fit() returns the estimator itself, so construction and training chain
# into a single expression; the solver may run up to 1000 iterations.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# -------------------------------
# Step 9: Make Predictions
# -------------------------------
# Predicted class labels for the held-out test rows.
y_pred = model.predict(X_test)

# -------------------------------
# Step 10: Evaluate Model
# -------------------------------
# Fraction of test rows predicted correctly, reported as a percentage
# rounded to two decimals.
accuracy = accuracy_score(y_test, y_pred)
accuracy_pct = round(accuracy*100, 2)
print("\nModel Accuracy:", accuracy_pct, "%")

# Confusion matrix drawn as an annotated heatmap on an explicit Axes.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

# -------------------------------
# Step 11: Conclusion
# -------------------------------
# Closing summary of the steps performed above, printed verbatim.
conclusion_text = """
Conclusion:
- We successfully trained a Logistic Regression model to predict credit risk.
- The model predicts whether a loan applicant will be approved (Loan_Status) based on historical data.
- We handled missing values, visualized key features, encoded categorical columns, trained the model, and evaluated it using accuracy and confusion matrix.
"""
print(conclusion_text)