In [None]:
pip install pandas

In [None]:
pip install imbalanced-learn

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the dataset; the relative path is resolved against the kernel's
# current working directory.
df = pd.read_csv('logistic_regression.csv')

In [None]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

### Problem Statement

LoanTap seeks to develop an underwriting system to assess the creditworthiness of individuals applying for a **Personal Loan**. Given a set of applicant attributes (demographics, employment, financial history, and credit behavior), the system should address two key objectives:
1. **Loan Decisioning** – Determine whether the loan application should be **approved or rejected** based on the applicant’s creditworthiness.  
2. **Repayment Recommendation** – For approved applicants, recommend **suitable loan terms** (loan amount, tenure, and interest rate) that balance:  
   - **Customer affordability**  
   - **LoanTap’s risk management objectives**  


In [None]:
# Dimensions of the frame as (rows, columns).
df.shape

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Summary statistics; with default arguments describe() covers the numeric
# columns only.
df.describe()

#### ==========================
### Data Analysis
#### ==========================

#### Shape of Data

In [None]:
# Report the dataset dimensions as (rows, columns).
print("Shape of dataset:", df.shape)

#### Data Types

In [None]:
# List the dtype of every column.
print("\nData Types:\n", df.dtypes)

#### Missing Values

In [None]:
# Count the missing entries in each column.
print("\nMissing Values:\n", df.isnull().sum())

In [None]:
# Cast the string-typed columns to pandas' 'category' dtype.
# Four object columns are deliberately left alone: the two date-like fields
# (issue_d, earliest_cr_line) plus address and emp_length.
excluded_cols = ['issue_d', 'earliest_cr_line', 'address', 'emp_length']
categorical_cols = [
    name
    for name in df.select_dtypes(include=['object']).columns
    if name not in excluded_cols
]

for name in categorical_cols:
    df[name] = df[name].astype('category')

print("\nCategorical columns converted to 'category':", categorical_cols)

#### Statistical Summary (Numerical Attributes)

In [None]:
# Numeric summary of the frame; with default arguments describe() skips the
# non-numeric columns.
print("\nStatistical Summary:\n")
df.describe()

#### Missing Value Treatment

In [None]:
# Tabulate only the columns that still contain missing values.
print("\nMissing Values:\n",)
null_counts = df.isnull().sum()
null_counts[null_counts > 0].reset_index()

In [None]:
# Drop rows with missing values, but do not let a missing `title` cost a row:
# that column is removed outright a few cells further down, so filtering on it
# only throws away otherwise complete records.
required_cols = df.columns.difference(['title'])
df = df.dropna(subset=required_cols)

In [None]:
# Re-check which columns, if any, still contain missing values.
print("\nMissing Values:\n",)
null_counts = df.isnull().sum()
null_counts[null_counts > 0].reset_index()

In [None]:
# Remove the `title` column from the working frame.
# NOTE(review): re-running this cell on its own raises KeyError, because
# drop() rejects labels that are no longer present.
df = df.drop(columns=['title'])

#### Outlier Treatment

In [None]:
import pandas as pd

# Columns to check for outliers
num_cols = ['open_acc', 'pub_rec', 'revol_bal', 'total_acc', 'pub_rec_bankruptcies', 'dti', 'loan_amnt', 'installment', 'annual_inc']


def remove_iqr_outliers(frame, cols, k=1.5):
    """Return a copy of ``frame`` without rows outside the IQR fences of ``cols``.

    Columns are processed in the given order and the quartiles of each column
    are computed on the rows that survived the previous columns. A row is kept
    when its value lies within ``[Q1 - k*IQR, Q3 + k*IQR]`` (bounds inclusive).

    A column whose IQR is 0 is skipped: its fence would collapse to the single
    value Q1 == Q3, so filtering would keep only rows equal to that value and
    leave the column constant.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; not modified.
    cols : iterable of column labels
        Numeric columns to filter on.
    k : float, default 1.5
        Fence multiplier applied to the IQR.

    Returns
    -------
    pandas.DataFrame
        The filtered copy.
    """
    kept = frame.copy()
    for col in cols:
        q1 = kept[col].quantile(0.25)
        q3 = kept[col].quantile(0.75)
        iqr = q3 - q1
        if iqr == 0:
            # Degenerate fence; presumably hit by zero-inflated columns such as
            # pub_rec / pub_rec_bankruptcies, which are binarised later anyway
            # (verify against the describe() output).
            print(f"Skipping IQR filter for '{col}': IQR is 0")
            continue
        kept = kept[kept[col].between(q1 - k * iqr, q3 + k * iqr)]
    return kept


# Work on a filtered copy; `df` itself stays untouched.
outlier_removed_df = remove_iqr_outliers(df, num_cols)

# Check the resulting dataframe
outlier_removed_df.describe()

#### Feature Engineering

In [None]:
import pandas as pd

def binarize_features(df, cols):
    """Return a copy of ``df`` with every column in ``cols`` reduced to 0/1.

    A value equal to 0 becomes 0; every other value becomes 1. NaN does not
    compare equal to 0 and therefore also becomes 1. The input frame is left
    unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    cols : iterable of column labels
        Columns to binarise.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns replaced by integer flags.
    """
    df_copy = df.copy()
    for col in cols:
        # Vectorised comparison instead of a Python lambda called once per row.
        df_copy[col] = df_copy[col].ne(0).astype('int64')
    return df_copy

# Columns to collapse into 0 / non-zero flags.
binary_cols = ['pub_rec', 'mort_acc', 'pub_rec_bankruptcies']

# Binarise on top of the outlier-filtered frame; binarize_features returns a
# new frame, so outlier_removed_df keeps its original values.
binarized_df = binarize_features(outlier_removed_df, binary_cols)

In [None]:
import re
# Ordered (category, pattern) rules: the first pattern found anywhere in the
# lower-cased title wins, so the order below is part of the behaviour.
# Short abbreviations (rn, lpn, cna, it, hr) are wrapped in \b so they match
# only as whole words; unanchored, "rn" fires inside "attorney" and "it"
# inside "security" or "auditor".
# NOTE(review): "controller" (Finance) and "technician" (Skilled Trade) are
# shadowed by the earlier Management / Technology rules and never decide.
_JOB_CATEGORY_RULES = [
    (category, re.compile(pattern))
    for category, pattern in [
        ("Education", r"teacher|professor|instructor|principal|educator|tutor"),
        ("Healthcare", r"nurse|\b(?:rn|lpn|cna)\b|healthcare|physician|doctor|pharmacist|medical|clinic|hospital"),
        ("Transport", r"driver|truck|transport|operator|delivery|chauffeur"),
        ("Technology", r"engineer|developer|programmer|coder|software|technician|network|\bit\b|data|systems"),
        ("Management", r"manager|supervisor|director|vp|president|executive|superintendent|controller|coordinator|administrator|lead"),
        ("Finance", r"accountant|auditor|finance|analyst|advisor|bookkeeper|underwriter|controller|cfo|financial|treasurer"),
        ("Sales/Marketing", r"sales|marketing|customer|associate|representative|clerk|cashier|retail|merchandiser|store"),
        ("Legal", r"legal|attorney|paralegal|lawyer|counsel|judge"),
        ("Military", r"military|us army|usaf|navy|air force|marine|coast guard|soldier|veteran"),
        ("Public Safety", r"police|officer|firefighter|security|sheriff|corrections"),
        ("Skilled Trade", r"mechanic|electrician|machinist|construction|foreman|plumber|welder|carpenter|technician|hvac"),
        ("Real Estate", r"real estate|realtor|broker|property|leasing"),
        ("Banking", r"bank|loan|mortgage|credit union"),
        ("HR/Recruitment", r"\bhr\b|human resources|recruiter|talent|staffing"),
        ("Hospitality", r"server|waiter|waitress|bartender|cook|chef|hospitality|hotel|restaurant"),
        ("Government", r"government|federal|state|county|city|public works|council"),
    ]
]


def categorize_job(title):
    """Map a free-text job title to a coarse employment category.

    Parameters
    ----------
    title : str or missing value
        Raw job title. Non-string values are converted with ``str``.

    Returns
    -------
    str
        "Unknown" when the title is null or blank, the category of the first
        matching rule otherwise, and "Other" when no rule matches.
    """
    if pd.isnull(title) or str(title).strip() == '':
        return "Unknown"

    title = str(title).lower()

    for category, pattern in _JOB_CATEGORY_RULES:
        if pattern.search(title):
            return category
    return "Other"

# Start the feature-engineering frame from a copy so binarized_df is preserved.
engg_df = binarized_df.copy()
# Derive a coarse job category from the free-text employer title.
# NOTE(review): emp_category is dropped again before modelling, so at present
# it only feeds the distribution printed below.
engg_df['emp_category'] = engg_df['emp_title'].apply(categorize_job)

# Check distribution
print(engg_df['emp_category'].value_counts())

In [None]:
# Show the raw labels before recoding. As a bare expression in the middle of
# the cell this result was computed and then silently discarded.
print(engg_df['emp_length'].value_counts())

# Employment-length label -> whole years; '10+ years' is capped at 10 and
# '< 1 year' counts as 0.
emp_length_map = {
    '10+ years': 10,
    '9 years': 9,
    '8 years': 8,
    '7 years': 7,
    '6 years': 6,
    '5 years': 5,
    '4 years': 4,
    '3 years': 3,
    '2 years': 2,
    '1 year': 1,
    '< 1 year': 0
}
# Replace with mapping. Skipped when the column is already numeric: mapping
# the numbers through the string-keyed dict again would turn every value into
# NaN if the cell is re-run.
if not pd.api.types.is_numeric_dtype(engg_df['emp_length']):
    engg_df['emp_length'] = engg_df['emp_length'].map(emp_length_map)

In [None]:
# Summary (count, unique, top, freq) restricted to the object- and
# category-typed columns.
engg_df.describe(include=[object, 'category'])

#### Univariate Analysis

In [None]:
# Column dtypes and non-null counts after feature engineering.
engg_df.info()

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# sns.set(style="whitegrid", palette="muted")
# plt.rcParams["figure.figsize"] = (10, 5)
# plt.rcParams['font.family'] = 'DejaVu Sans' 
# plt.rcParams['text.usetex'] = False
# plt.rcParams['axes.unicode_minus'] = False

# # Continuous Variables
# num_cols = engg_df.select_dtypes(include=['float64', 'int64']).columns

# for col in num_cols:
#     plt.figure()
#     sns.histplot(engg_df[col].dropna(), kde=True, bins=30)
#     plt.title(f"Distribution of {col}")
#     plt.xlabel(col)
#     plt.ylabel("Frequency")
#     plt.show()

# categorical_cols = ['grade','sub_grade','home_ownership','verification_status','purpose','emp_category','initial_list_status']

# # Categorical Variables
# for col in categorical_cols:
#     plt.figure()
#     sns.countplot(data=engg_df, x=col, order=engg_df[col].value_counts().index)
#     plt.title(f"Countplot of {col}")
#     plt.xticks(rotation=45)
#     plt.show()

#### Bivariate Analysis

In [None]:
# target = 'loan_status'

# # Numerical vs Target
# for col in num_cols:
#     plt.figure()
#     sns.boxplot(data=engg_df, x=target, y=col)
#     plt.title(f"{col} vs {target}")
#     plt.show()

# # Categorical vs Target
# for col in categorical_cols:
#     plt.figure()
#     sns.countplot(data=engg_df, x=col, hue=target, order=engg_df[col].value_counts().index)
#     plt.title(f"{col} vs {target}")
#     plt.xticks(rotation=45)
#     plt.show()


#### Data Preparation for Model Training

In [None]:
# Re-inspect dtypes and non-null counts before encoding for the model.
engg_df.info()

In [None]:
# Survey every object/category column: its cardinality and up to 20 of its
# distinct values, to help decide how each one should be encoded.
categorical_cols = engg_df.select_dtypes(include=['object', 'category']).columns

for col in categorical_cols:
    unique_vals = engg_df[col].unique()
    print(
        f"Column: {col}",
        f"Number of unique values: {len(unique_vals)}",
        f"Unique values (sample up to 20): {unique_vals[:20]}\n",
        sep="\n",
    )

In [None]:
# Recode the loan-term label to its length in months. The keys include a
# leading space; any value that does not match a key exactly maps to NaN, so
# re-running this cell after the recode would blank the column.
engg_df['term'] = engg_df['term'].map({' 36 months': 36, ' 60 months': 60})

In [None]:
# engg_df['state'] = engg_df['address'].apply(lambda x: x.split()[-2] if pd.notnull(x) else 'Unknown')
import re

def extract_zip(address):
    """Return the last 5-digit run in ``address`` that no later digit follows.

    Parameters
    ----------
    address : str
        Free-text address, possibly spanning several lines.

    Returns
    -------
    str or None
        The 5-digit code, or None when ``address`` is not a string or holds
        no such run.
    """
    if not isinstance(address, str):
        # Missing values (NaN/None) would otherwise make re.search raise.
        return None
    # DOTALL lets the look-ahead see past line breaks. Without it, a 5-digit
    # house number on the first line of a multi-line address satisfies
    # "no digit follows" and is returned instead of the trailing ZIP code.
    match = re.search(r'(\d{5})(?!.*\d)', address, flags=re.DOTALL)
    return match.group(1) if match else None

# Derive a `zipcode` column by applying extract_zip to every address.
engg_df['zipcode'] = engg_df['address'].apply(extract_zip)

# # Check result
# print(engg_df[['address', 'zipcode']].head(20))

In [None]:
# Columns to one-hot encode.
categorical_cols = ['purpose', 'verification_status', 
                    'home_ownership', 'grade','zipcode']
# 'sub_grade', 'emp_category' and 'state' are left out for now.
# Encode all listed columns in one call; drop_first=True omits the first
# level of each column so the dummies are not perfectly collinear.
engg_df = pd.get_dummies(engg_df, columns=categorical_cols, drop_first=True)

# Collect the boolean-typed columns; this cell only lists them.
bool_cols = engg_df.select_dtypes(include='bool').columns

print(bool_cols)


In [None]:
# Cast the boolean columns collected in bool_cols to integer 0/1.
engg_df[bool_cols] = engg_df[bool_cols].astype(int)

In [None]:
# Encode the listing status as a flag: 'f' -> 0, 'w' -> 1. Any other value
# maps to NaN.
list_status_codes = {'f': 0, 'w': 1}
engg_df['initial_list_status'] = engg_df['initial_list_status'].map(list_status_codes)

In [None]:
# mapping = {'INDIVIDUAL': 0, 'JOINT': 1, 'DIRECT_PAY': 2}
# outlier_trtd_df['application_type_encoded'] = outlier_trtd_df['application_type'].map(mapping)

In [None]:
# Convert earliest credit line to datetime (Removed for now)
# engg_df['earliest_cr_line'] = pd.to_datetime(engg_df['earliest_cr_line'], format='%b-%Y', errors='coerce')

# # Create credit history in years (continuous)
# engg_df['credit_history_years'] = (pd.to_datetime('today') - engg_df['earliest_cr_line']).dt.days / 365

In [None]:
# Columns kept out of the model matrix.
cols_to_drop = [
    'earliest_cr_line',
    'address',
    'application_type',
    'issue_d',
    'emp_title',
    'installment',
    'sub_grade',
    'emp_category',
    'emp_length',
]
encoded_df = engg_df.drop(columns=cols_to_drop)

In [None]:
# Lift the column display cap so head() renders every column. This is a
# global pandas option and stays in effect for the rest of the session.
pd.options.display.max_columns = None
encoded_df.head()

In [None]:
# bool_cols = encoded_df.select_dtypes(include='bool').columns
# print(bool_cols)
# encoded_df[bool_cols] = encoded_df[bool_cols].astype(int)
# encoded_df[bool_cols].head()

In [None]:
# Encode the target: 'Charged Off' -> 1, 'Fully Paid' -> 0. Any other label
# maps to NaN.
encoded_df['loan_status'] = encoded_df['loan_status'].map({'Charged Off': 1, 'Fully Paid': 0})

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Separate the target from the feature matrix.
y = encoded_df['loan_status']                 # Target
X = encoded_df.drop(columns=['loan_status'])  # Features
feature_names = X.columns

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training fold only, then apply the same transform to
# the test fold so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# string_cols = X_train.select_dtypes(include=['object', 'string']).columns

# print("String columns in X_train:")
# print(list(string_cols))

In [None]:
# The scaler returned bare NumPy arrays. Restore the column labels on BOTH
# splits: a model fitted on a labelled frame otherwise predicts on an
# unlabelled array, which scikit-learn flags with a feature-name warning.
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)
X_train.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
# Baseline model: plain logistic regression on the scaled training data.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Evaluate on the hold-out split at the default 0.5 decision threshold.
y_pred = clf.predict(X_test)

print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
# import pandas as pd
# import numpy as np

# # Get feature importance from logistic regression
# feature_importance = pd.DataFrame({
#     'Feature': feature_names,
#     'Coefficient': log_reg.coef_[0],
#     'Abs_Coefficient': np.abs(log_reg.coef_[0])
# }).sort_values(by='Abs_Coefficient', ascending=False)

# print(feature_importance.head(20))  # Top 20 most important features

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Basic SMOTE (works when all features are numeric). Only the training fold
# is oversampled; the test fold keeps its natural class ratio.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:\n", y_train.value_counts())
print("After SMOTE:\n", y_train_res.value_counts())

# Train Logistic Regression on resampled data.
# max_iter matches the baseline model so both fits get the same iteration
# budget; the library default of 100 is more likely to stop before
# convergence on the enlarged, resampled set.
# NOTE(review): with SMOTE's default strategy the classes are already
# equalised, so class_weight="balanced" should yield equal weights - confirm.
log_reg = LogisticRegression(class_weight="balanced", max_iter=1000)
log_reg.fit(X_train_res, y_train_res)

# Predictions
y_pred = log_reg.predict(X_test)

# Evaluation
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# # Check for NaN values column-wise
# nan_counts = pd.DataFrame({
#     "missing_count": X_train_res.isna().sum(),
#     "missing_ratio": X_train_res.isna().mean()
# })

# print("Total NaN values in resampled data:", X_train_res.isna().sum().sum())
# print(nan_counts[nan_counts["missing_count"] > 0])

In [None]:
# # Evaluation
# cm = confusion_matrix(y_test, y_pred)
# print("Confusion Matrix:\n", cm)
# print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# from sklearn.metrics import (
#     roc_curve, roc_auc_score,
#     precision_recall_curve, average_precision_score,
#     classification_report, confusion_matrix
# )
# # === 2. ROC Curve & AUC ===
# y_prob = log_reg.predict_proba(X_test)[:, 1]
# #
# fpr, tpr, thresholds = roc_curve(y_test, y_prob)
# roc_auc = roc_auc_score(y_test, y_prob)

# plt.figure(figsize=(6, 5))
# plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
# plt.plot([0, 1], [0, 1], 'k--', label="Random Guess")
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate (Recall)")
# plt.title("ROC Curve")
# plt.legend(loc="lower right")
# plt.show()

In [None]:
# # === 3. Precision-Recall Curve ===
# precision, recall, pr_thresholds = precision_recall_curve(y_test, y_prob)
# ap_score = average_precision_score(y_test, y_prob)

# plt.figure(figsize=(6, 5))
# plt.plot(recall, precision, label=f'PR Curve (AP = {ap_score:.2f})')
# plt.xlabel("Recall")
# plt.ylabel("Precision")
# plt.title("Precision-Recall Curve")
# plt.legend(loc="upper right")
# plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import (
    roc_curve, roc_auc_score,
    precision_recall_curve, average_precision_score,
    classification_report, confusion_matrix
)
# Score every test row by its predicted probability of default.
# loan_status is encoded earlier as 'Charged Off' -> 1 and 'Fully Paid' -> 0,
# so the defaulter class is 1, not 0. Its predict_proba column is looked up
# through classes_ instead of being assumed.
defaulter_col = list(log_reg.classes_).index(1)
y_scores = log_reg.predict_proba(X_test)[:, defaulter_col]

# ----------------------------
# 1. PR Curve Method
# ----------------------------
precision, recall, thresholds_pr = precision_recall_curve(y_test, y_scores, pos_label=1)
# The small epsilon avoids 0/0 where precision and recall are both zero.
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
# precision/recall end with one extra point (precision=1, recall=0) that has
# no threshold; leave it out so the chosen index is always valid for
# thresholds_pr.
best_idx_pr = np.argmax(f1_scores[:-1])
threshold_pr = thresholds_pr[best_idx_pr]

print("Optimal Threshold (PR Curve, max F1) for defaulters:", threshold_pr)
print("Precision:", precision[best_idx_pr])
print("Recall:", recall[best_idx_pr])
print("F1:", f1_scores[best_idx_pr])

# ----------------------------
# 2. ROC Curve Method
# ----------------------------
fpr, tpr, thresholds_roc = roc_curve(y_test, y_scores, pos_label=1)
# Youden's J statistic: vertical distance of the ROC curve above the diagonal.
j_scores = tpr - fpr
best_idx_roc = np.argmax(j_scores)
threshold_roc = thresholds_roc[best_idx_roc]

print("\nOptimal Threshold (ROC Curve, Youden's J) for defaulters:", threshold_roc)
print("TPR (Recall):", tpr[best_idx_roc])
print("FPR:", fpr[best_idx_roc])

# ----------------------------
# 3. Compare thresholds visually
# ----------------------------
fig, (ax_pr, ax_roc) = plt.subplots(1, 2, figsize=(12, 5))

# PR Curve
ax_pr.plot(recall, precision, label='PR Curve')
ax_pr.scatter(recall[best_idx_pr], precision[best_idx_pr], color='red', label='Optimal F1')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve')
ax_pr.legend()

# ROC Curve
ax_roc.plot(fpr, tpr, label='ROC Curve')
ax_roc.scatter(fpr[best_idx_roc], tpr[best_idx_roc], color='red', label="Optimal J")
ax_roc.plot([0, 1], [0, 1], '--', color='gray')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate (Recall)')
ax_roc.set_title('ROC Curve')
ax_roc.legend()

plt.show()

# ----------------------------
# 4. Predictions using PR optimal threshold
# ----------------------------
# y_scores is P(default), so a score at or above the threshold is labelled 1
# (Charged Off), which matches the encoding of y_test.
y_pred_optimal = (y_scores >= threshold_pr).astype(int)
cm = confusion_matrix(y_test, y_pred_optimal)
print("\nConfusion Matrix (PR Threshold):\n", cm)
print("\nClassification Report (PR Threshold):\n", classification_report(y_test, y_pred_optimal))