# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Reading the Data

In [None]:
df= pd.read_csv('/content/Loan_default.csv')

# Data Exploration and Preprocessing

In [None]:
df.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


## Dropping the column - 'LoanID'

In [None]:
df = df.drop(['LoanID'], axis=1)

## Checking Data Types

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255347 entries, 0 to 255346
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Age             255347 non-null  int64  
 1   Income          255347 non-null  int64  
 2   LoanAmount      255347 non-null  int64  
 3   CreditScore     255347 non-null  int64  
 4   MonthsEmployed  255347 non-null  int64  
 5   NumCreditLines  255347 non-null  int64  
 6   InterestRate    255347 non-null  float64
 7   LoanTerm        255347 non-null  int64  
 8   DTIRatio        255347 non-null  float64
 9   Education       255347 non-null  object 
 10  EmploymentType  255347 non-null  object 
 11  MaritalStatus   255347 non-null  object 
 12  HasMortgage     255347 non-null  object 
 13  HasDependents   255347 non-null  object 
 14  LoanPurpose     255347 non-null  object 
 15  HasCoSigner     255347 non-null  object 
 16  Default         255347 non-null  int64  
dtypes: float64

## Checking for null values in dataset

In [None]:
df.isnull().sum()

Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64

In [None]:
df.describe()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Default
count,255347.0,255347.0,255347.0,255347.0,255347.0,255347.0,255347.0,255347.0,255347.0,255347.0
mean,43.498306,82499.304597,127578.865512,574.264346,59.541976,2.501036,13.492773,36.025894,0.500212,0.116128
std,14.990258,38963.013729,70840.706142,158.903867,34.643376,1.117018,6.636443,16.96933,0.230917,0.320379
min,18.0,15000.0,5000.0,300.0,0.0,1.0,2.0,12.0,0.1,0.0
25%,31.0,48825.5,66156.0,437.0,30.0,2.0,7.77,24.0,0.3,0.0
50%,43.0,82466.0,127556.0,574.0,60.0,2.0,13.46,36.0,0.5,0.0
75%,56.0,116219.0,188985.0,712.0,90.0,3.0,19.25,48.0,0.7,0.0
max,69.0,149999.0,249999.0,849.0,119.0,4.0,25.0,60.0,0.9,1.0


## Checking values in categorical columns

In [None]:
df['Education'].value_counts()

Education
Bachelor's     64366
High School    63903
Master's       63541
PhD            63537
Name: count, dtype: int64

In [None]:
df['EmploymentType'].value_counts()

EmploymentType
Part-time        64161
Unemployed       63824
Self-employed    63706
Full-time        63656
Name: count, dtype: int64

In [None]:
df['MaritalStatus'].value_counts()

MaritalStatus
Married     85302
Divorced    85033
Single      85012
Name: count, dtype: int64

In [None]:
df['HasMortgage'].value_counts()

HasMortgage
Yes    127677
No     127670
Name: count, dtype: int64

In [None]:
df['HasDependents'].value_counts()

HasDependents
Yes    127742
No     127605
Name: count, dtype: int64

In [None]:
df['LoanPurpose'].value_counts()

LoanPurpose
Business     51298
Home         51286
Education    51005
Other        50914
Auto         50844
Name: count, dtype: int64

In [None]:
df['HasCoSigner'].value_counts()

HasCoSigner
Yes    127701
No     127646
Name: count, dtype: int64

In [None]:
df['Default'].value_counts()

Default
0    225694
1     29653
Name: count, dtype: int64

## Converting categorical columns into numerical data

In [None]:
from sklearn.preprocessing import LabelEncoder

# List of categorical columns to be label encoded
categorical_columns = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Iterate through each categorical column and apply label encoding
for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])

## Checking relation between columns in dataset

In [None]:
correlation_with_default = df.corr()['Default'].sort_values(ascending=False)

print(correlation_with_default)

Default           1.000000
InterestRate      0.131273
LoanAmount        0.086659
EmploymentType    0.041010
NumCreditLines    0.028330
DTIRatio          0.019236
LoanTerm          0.000545
MaritalStatus    -0.007902
LoanPurpose      -0.010096
Education        -0.022835
HasMortgage      -0.022856
CreditScore      -0.034166
HasDependents    -0.034678
HasCoSigner      -0.039109
MonthsEmployed   -0.097374
Income           -0.099119
Age              -0.167783
Name: Default, dtype: float64


## Checking for collinearity in data

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Create a DataFrame containing only the independent variables (features)
X = df.drop(columns=['Default'])

# Add a constant to the independent variables matrix for intercept calculation
X = add_constant(X)

# Calculate VIF for each independent variable
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Print VIF values
print(vif_data)


           feature        VIF
0            const  61.778423
1              Age   1.000051
2           Income   1.000070
3       LoanAmount   1.000049
4      CreditScore   1.000049
5   MonthsEmployed   1.000034
6   NumCreditLines   1.000026
7     InterestRate   1.000066
8         LoanTerm   1.000045
9         DTIRatio   1.000074
10       Education   1.000076
11  EmploymentType   1.000066
12   MaritalStatus   1.000097
13     HasMortgage   1.000031
14   HasDependents   1.000050
15     LoanPurpose   1.000077
16     HasCoSigner   1.000077


## Correcting imbalance in dataset and removing outliers from data

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import IsolationForest

# Step 1: Balance the dataset using SMOTE
X = df.drop('Default', axis=1)  # Features
y = df['Default']  # Target

# Instantiate SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Convert back to DataFrame
df_resampled = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled, columns=['Default'])], axis=1)

# Step 2: Remove outliers using Isolation Forest
# Assuming your dataset is already scaled appropriately

# Instantiate Isolation Forest
isolation_forest = IsolationForest(contamination=0.1, random_state=42)

# Fit Isolation Forest
outlier_preds = isolation_forest.fit_predict(df_resampled.drop('Default', axis=1))

# Filter outliers
df_no_outliers = df_resampled[outlier_preds != -1]

# Separate target variable from features
X_no_outliers = df_no_outliers.drop('Default', axis=1)
y_no_outliers = df_no_outliers['Default']

# Concatenate features and target variable
df_final = pd.concat([X_no_outliers, y_no_outliers], axis=1)

# Now, df_final contains your balanced dataset without outliers, including the target variable




In [None]:
df_final['Default'].value_counts()

Default
1    220277
0    185972
Name: count, dtype: int64

## Normalizing the dataset

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to be standardized
columns_to_standardize = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate',
                          'LoanTerm', 'DTIRatio']

# Instantiate StandardScaler
scaler = StandardScaler()

# Standardize selected columns
df_final[columns_to_standardize] = scaler.fit_transform(df_final[columns_to_standardize])

# Now, df_final contains standardized numerical columns


## Dividing the data

In [None]:
from sklearn.model_selection import train_test_split

# Splitting into features (X) and target variable (y)
X = df_final.drop('Default', axis=1)
y = df_final['Default']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# Training the model

In [None]:
class LogisticRegressionWithRegularization:
    def __init__(self, learning_rate=0.01, num_iterations=1000, lambda_val=0.01):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.lambda_val = lambda_val
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        num_samples, num_features = X.shape
        self.weights = np.zeros(num_features)
        self.bias = 0

        # gradient descent
        for _ in range(self.num_iterations):
            # linear model
            linear_model = np.dot(X, self.weights) + self.bias
            # sigmoid function
            y_predicted = self.sigmoid(linear_model)

            # compute gradients with regularization
            dw = (1 / num_samples) * (np.dot(X.T, (y_predicted - y)) + 2 * self.lambda_val * self.weights)
            db = (1 / num_samples) * np.sum(y_predicted - y)

            # update parameters
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self.sigmoid(linear_model)
        y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
        return y_predicted_cls


In [None]:
log_reg = LogisticRegressionWithRegularization()
log_reg.fit(X_train, y_train)

# Hypothesis Testing

In [None]:
# Define hypothesis testing function
def wald_test(model, X, y):
    # Get coefficient estimates and their standard errors
    coef = model.weights
    num_samples, num_features = X.shape
    y_predicted = model.predict(X)
    residuals = y_predicted - y
    sigma_squared = np.dot(residuals, residuals) / (num_samples - num_features - 1)
    cov_matrix = np.linalg.inv(np.dot(X.T, X)) * sigma_squared

    # Compute z-statistics
    z_stat = coef / np.sqrt(np.diag(cov_matrix))

    # Compute Wald statistic
    wald_stat = z_stat ** 2

    # Compute p-values
    p_values = 1 - chi2.cdf(wald_stat, df=1)

    return {'Coefficient': coef.flatten(), 'Standard Error': np.sqrt(np.diag(cov_matrix)), 'Z-Statistic': z_stat, 'Wald Statistic': wald_stat, 'P-Value': p_values}


# Perform hypothesis testing
results = wald_test(log_reg, X_train, y_train)

# Print results
print("Hypothesis Testing Results:")
print("{:<20} {:<20} {:<20} {:<20} {:<20}".format('Feature', 'Coefficient', 'Standard Error', 'Z-Statistic', 'P-Value'))
for i in range(len(log_reg.weights)):
    print("{:<20} {:<20} {:<20} {:<20} {:<20}".format(f'Feature {i}', results['Coefficient'][i], results['Standard Error'][i], results['Z-Statistic'][i], results['P-Value'][i]))

Hypothesis Testing Results:
Feature              Coefficient          Standard Error       Z-Statistic          P-Value             
Feature 0            -0.5187685149565371  0.0009061655042566831 -572.4876002448104   0.0                 
Feature 1            -0.2555224420988639  0.0009056316536288992 -282.1483117059525   0.0                 
Feature 2            0.2416889413902621   0.0009056276262973068 266.8745236697522    0.0                 
Feature 3            -0.10951510974593666 0.0009027582325195428 -121.3116710553685   0.0                 
Feature 4            -0.2932393391639206  0.0009040534118266929 -324.36063547552277  0.0                 
Feature 5            -0.1565406309757568  0.0009042322793414578 -173.1199322919146   0.0                 
Feature 6            0.37896648503735314  0.0009045623594996714 418.9500934429395    0.0                 
Feature 7            -0.012348490417606191 0.000902751879310508 -13.678720255932959  0.0                 
Feature 8          

# Testing effectiveness of the model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

# Predict on test data
y_pred = log_reg.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate precision
precision = precision_score(y_test, y_pred)
print("Precision:", precision)

# Calculate recall
recall = recall_score(y_test, y_pred)
print("Recall:", recall)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)


# Generate classification report
class_report = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:")
print(class_report)

Accuracy: 0.7355323076923077
Precision: 0.7625525716031109
Recall: 0.7403563200036499
F1 Score: 0.7512905391328504
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.73      0.72     37413
           1       0.76      0.74      0.75     43837

    accuracy                           0.74     81250
   macro avg       0.73      0.74      0.73     81250
weighted avg       0.74      0.74      0.74     81250



# Exploring top 3 reasons for prediction along with its values

In [None]:
# Define feature names (replace these with your actual feature names)
feature_names = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']

# Initialize an empty DataFrame to store top reasons
top_reasons_df = pd.DataFrame(columns=['Prediction', 'Top Reason 1', 'Value 1', 'Top Reason 2', 'Value 2', 'Top Reason 3', 'Value 3'])

# Iterate through the last 10 predictions and extract top three reasons for each
for i, prediction in enumerate(y_pred[-10:], start=len(y_pred)-10):
    # Get coefficients from the logistic regression model for this prediction
    coefficients = log_reg.weights

    # Create a dictionary to map feature names to coefficients
    feature_coefficients = dict(zip(feature_names, coefficients))

    # Sort the features based on their coefficients
    sorted_features = sorted(feature_coefficients.items(), key=lambda x: abs(x[1]), reverse=True)

    # Extract top three reasons
    top_three_reasons = sorted_features[:3]

    # Extract top three reasons and their corresponding values
    top_three_reasons_values = [(feature, coefficient, df_final.iloc[i][feature]) for feature, coefficient in top_three_reasons]

    # Store top three reasons in DataFrame
    row_values = [prediction]
    for j in range(3):
        if j < len(top_three_reasons_values):
            reason, coefficient, value = top_three_reasons_values[j]
            row_values.extend([reason, value])
        else:
            row_values.extend(['', ''])
    top_reasons_df.loc[i] = row_values

# Print or use top_reasons_df as needed
print(top_reasons_df)


       Prediction Top Reason 1   Value 1  Top Reason 2   Value 2 Top Reason 3  \
81240           0          Age  1.073523  InterestRate -1.696946  HasCoSigner   
81241           1          Age  1.073523  InterestRate -0.252594  HasCoSigner   
81242           0          Age -0.858206  InterestRate -1.980086  HasCoSigner   
81243           0          Age -0.338125  InterestRate  1.481640  HasCoSigner   
81244           1          Age -0.189531  InterestRate  0.677724  HasCoSigner   
81245           0          Age -1.526882  InterestRate -1.157631  HasCoSigner   
81246           1          Age  0.107659  InterestRate  0.820979  HasCoSigner   
81247           1          Age -0.486720  InterestRate -1.555376  HasCoSigner   
81248           1          Age  2.039388  InterestRate -1.312684  HasCoSigner   
81249           0          Age  1.147821  InterestRate -1.130665  HasCoSigner   

       Value 3  
81240      0.0  
81241      0.0  
81242      0.0  
81243      0.0  
81244      1.0  
81245 