<a href="https://colab.research.google.com/github/najmanmna/loan-predict/blob/main/Logistic_Regression_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the loan-sanction training data from CSV
df = pd.read_csv('/content/sample_data/loan_sanction_train.csv')

# Discard the identifier column, then keep only fully populated rows
df = df.drop(columns=['Loan_ID'])
df = df.dropna()

# One-hot encode every column whose dtype is 'object' or 'category'
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
df_encoded = pd.get_dummies(df, columns=categorical_cols)

# Cast every column (dummy indicators included) to int
df_encoded = df_encoded.astype(int)

# 'Loan_Status_N' is removed; 'Loan_Status_Y' is the target and every
# remaining column is a feature
df_encoded = df_encoded.drop(columns='Loan_Status_N')
y = df_encoded['Loan_Status_Y'].values
X = df_encoded.drop(columns='Loan_Status_Y').values

# Splitting the dataset into training and testing sets
def train_test_split(X, y, test_size=0.3, random_state=42):
    """Shuffle the rows of ``X`` and ``y`` together and split them in two.

    Args:
        X: Array-like of samples, indexed along its first axis.
        y: Array-like of targets with the same length as ``X``.
        test_size: Fraction of the samples placed in the test partition;
            the test count is ``int(len(X) * test_size)``.
        random_state: Seed for the shuffle. The same seed always produces
            the same split.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.

    Raises:
        ValueError: If ``X`` and ``y`` have different lengths.
    """
    # Accept plain lists as well as arrays; index-array selection below
    # requires NumPy arrays.
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )

    # A private generator yields the same permutation as seeding the global
    # one would, without reseeding NumPy's global RNG as a side effect.
    rng = np.random.RandomState(random_state)
    indices = rng.permutation(len(X))

    n_test = int(len(X) * test_size)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]
    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

# Hold out a 0.3 fraction of the rows for testing (seeded shuffle)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply
# that same transformation to both partitions
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Sigmoid function with overflow prevention
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

    Args:
        z: Scalar or array-like of real values.

    Returns:
        NumPy array with the broadcast shape of ``z`` and values in [0, 1].
    """
    z = np.asarray(z)
    # np.where evaluates both of its branches for every element, so each
    # branch must itself be overflow-free. Exponentiating -|z| keeps the
    # argument non-positive, hence exp() never exceeds 1.
    decay = np.exp(-np.abs(z))
    # For z >= 0, decay == exp(-z); for z < 0, decay == exp(z).
    return np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

# Logistic Regression model class
class LogisticRegressionScratch:
    """Binary logistic regression trained with full-batch gradient descent.

    After ``fit`` the instance exposes:
        theta: weight vector, one entry per feature.
        bias: scalar intercept.
        m, n: number of training samples and features.
    """

    def __init__(self, learning_rate=0.01, epochs=1000):
        """Store the gradient-descent step size and the number of passes."""
        self.learning_rate = learning_rate
        self.epochs = epochs

    def fit(self, X, y):
        """Learn ``theta`` and ``bias`` from the training data.

        Args:
            X: Array-like of shape (m, n) holding the features.
            y: Array-like of length m holding the 0/1 targets.

        Returns:
            The fitted instance, so calls can be chained.

        Raises:
            ValueError: If ``X`` is not 2-D, is empty, or ``y`` does not
                hold exactly one target per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D, got {X.ndim} dimension(s)")
        if X.shape[0] == 0:
            raise ValueError("cannot fit on an empty dataset")
        if y.shape != (X.shape[0],):
            raise ValueError(
                f"y must have shape ({X.shape[0]},), got {y.shape}"
            )

        self.m, self.n = X.shape
        self.theta = np.zeros(self.n)
        self.bias = 0.0

        for _ in range(self.epochs):
            y_predicted = sigmoid(np.dot(X, self.theta) + self.bias)

            # The residual feeds both gradients; compute it once per epoch.
            error = y_predicted - y
            d_theta = (1 / self.m) * np.dot(X.T, error)
            d_bias = (1 / self.m) * np.sum(error)

            self.theta -= self.learning_rate * d_theta
            self.bias -= self.learning_rate * d_bias

        return self

    def predict_prob(self, X):
        """Return the sigmoid of ``X @ theta + bias`` for each row of ``X``."""
        linear = np.dot(np.asarray(X), self.theta) + self.bias
        return sigmoid(linear)

    def predict(self, X, threshold=0.5):
        """Return a list of 0/1 labels, one per row of ``X``.

        A row is labelled 1 when its predicted probability is strictly
        greater than ``threshold``.
        """
        above = self.predict_prob(X) > threshold
        return above.astype(int).tolist()

# Fit the scratch model on the standardized training data
model = LogisticRegressionScratch(learning_rate=0.01, epochs=1000)
model.fit(X_train, y_train)

# Score the test rows, then label as 1 every probability strictly above
# the cut-off
threshold = 0.4
y_prob = model.predict_prob(X_test)
y_pred = [int(p > threshold) for p in y_prob]

# Accuracy is the fraction of test rows whose label matches the target
accuracy = np.mean(np.asarray(y_pred) == y_test)

print(f'Accuracy at threshold {threshold}: {accuracy}')



# Define confusion matrix function
def confusion_matrix(y_true, y_pred):
    """Count how often each true class was assigned each predicted class.

    Args:
        y_true: Array-like of non-negative integer class labels.
        y_pred: Array-like of predicted labels, same length as ``y_true``.

    Returns:
        Float array of shape (K, K) where entry ``[i, j]`` is the number of
        samples with true label ``i`` and predicted label ``j``. ``K`` is
        one more than the largest label found in either input; two empty
        inputs give a (0, 0) array.

    Raises:
        ValueError: If the inputs differ in length or hold a negative label.
    """
    y_true = np.asarray(y_true).astype(int).ravel()
    y_pred = np.asarray(y_pred).astype(int).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.size} and {y_pred.size}"
        )
    if y_true.size == 0:
        return np.zeros((0, 0))
    if min(y_true.min(), y_pred.min()) < 0:
        raise ValueError("class labels must be non-negative integers")

    # Size the matrix from BOTH label sets: a class may be predicted without
    # ever occurring in y_true, and must still have a row and a column.
    K = int(max(y_true.max(), y_pred.max())) + 1
    result = np.zeros((K, K))
    # Unbuffered add, so repeated (true, pred) pairs are each counted.
    np.add.at(result, (y_true, y_pred), 1)
    return result

# Define classification report function
def classification_report(y_true, y_pred):
    """Compute per-class precision, recall, F1-score and support.

    Args:
        y_true: Array-like of true integer class labels.
        y_pred: Array-like of predicted labels, same length as ``y_true``.

    Returns:
        Dict with keys ``'precision'``, ``'recall'``, ``'f1-score'`` and
        ``'support'``; each value is an array indexed by class label. A
        metric whose denominator is zero is reported as 0.0.
    """
    cm = confusion_matrix(y_true, y_pred)
    tp = np.diag(cm)
    predicted_totals = np.sum(cm, axis=0)  # column sums: times predicted
    support = np.sum(cm, axis=1)           # row sums: times actually true

    # With ``where=`` and no ``out=``, np.divide leaves the masked-out
    # entries uninitialized; a zero-filled ``out`` makes them a defined 0.0.
    precision = np.divide(
        tp,
        predicted_totals,
        out=np.zeros_like(tp, dtype=float),
        where=predicted_totals != 0,
    )
    recall = np.divide(
        tp,
        support,
        out=np.zeros_like(tp, dtype=float),
        where=support != 0,
    )
    pr_sum = precision + recall
    f1 = 2 * np.divide(
        precision * recall,
        pr_sum,
        out=np.zeros_like(tp, dtype=float),
        where=pr_sum != 0,
    )
    report = {
        'precision': precision,
        'recall': recall,
        'f1-score': f1,
        'support': support
    }
    return report

# Hard 0/1 labels for the test rows at the 0.4 probability cut-off
y_pred = model.predict(X_test, 0.4)

# Per-class metrics and the raw confusion counts for those labels
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')


Accuracy at threshold 0.4: 0.7986111111111112
Confusion Matrix:
[[ 15.  29.]
 [  0. 100.]]
Classification Report:
{'precision': array([1.       , 0.7751938]), 'recall': array([0.34090909, 1.        ]), 'f1-score': array([0.50847458, 0.87336245]), 'support': array([ 44., 100.])}
