In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
import pandas as pd
import numpy as np
import sys
import random

# Task 1: Implement Logistic Regression From Scratch

Code implementation of the Logistic Regression model

In [None]:
# Load the training inputs: the raw training file and its TF-IDF feature table.

train_raw = pd.read_csv("train.csv")
train_data = pd.read_csv("train_tfidf_features.csv")

# Labels come from the "label" column; the features are every column from the
# one named '0' through the last column of the table.
train_label = train_data["label"]
train_features = train_data.loc[:, '0':]

In [None]:
# Functions

def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated element-wise.

    Uses the algebraically equivalent form 0.5 * (1 + tanh(z / 2)). Unlike the
    direct formula it never evaluates exp() of a large positive number, so very
    negative inputs yield 0.0 without an overflow RuntimeWarning, and
    sigmoid(0) is exactly 0.5.

    Args:
        z: NumPy scalar/array (or any object supporting NumPy ufuncs) of real
            values.

    Returns:
        Values in [0, 1] with the same shape as ``z``. Results smaller than
        roughly 1e-16 are rounded to 0.0.
    """
    return 0.5 * (1 + np.tanh(0.5 * z))

def loss(y, y_hat):
    """Mean binary cross-entropy of probabilities ``y_hat`` against labels ``y``.

    A small constant is added inside each logarithm so that a probability of
    exactly 0 or 1 does not evaluate log(0).

    Args:
        y: Array of 0/1 labels; its first dimension is the sample count.
        y_hat: Array of predicted probabilities, same shape as ``y``.

    Returns:
        The negated sum of per-sample log-likelihoods divided by the sample count.
    """
    tiny = 1e-15
    sample_count = y.shape[0]
    positive_term = y * np.log(y_hat + tiny)
    negative_term = (1 - y) * np.log(1 - y_hat + tiny)
    log_likelihood = np.sum(positive_term + negative_term)
    return -(1 / sample_count) * log_likelihood

def gradients(X, y, y_hat):
    """Gradients of the mean cross-entropy loss for one batch.

    Args:
        X: Batch feature matrix; its first dimension is the number of samples.
        y: True labels for the batch.
        y_hat: Predicted probabilities for the batch.

    Returns:
        Tuple ``(dw, db)``: the gradient with respect to the weights and the
        gradient with respect to the bias, both averaged over the batch.
    """
    # Bring the labels to the shape of the predictions when the two differ.
    if y.shape != y_hat.shape:
        y = y.reshape(y_hat.shape)

    batch_len = X.shape[0]
    scale = 1 / batch_len
    residual = y_hat - y

    grad_w = scale * np.dot(X.T, residual)
    grad_b = scale * np.sum(residual)
    return grad_w, grad_b

def train(X, y, bs, epochs, lr, seed=None, verbose=True):
    """Fit logistic-regression weights with mini-batch gradient descent.

    Every epoch visits each training row exactly once: the rows are shuffled
    with a single permutation and then consumed in consecutive slices of
    ``bs`` rows (the final slice may be shorter). A ``bs`` larger than the
    number of rows therefore trains on one full batch per epoch rather than
    performing no updates at all.

    Args:
        X: Feature matrix (DataFrame or 2-D array), one row per sample.
        y: Labels (Series or 1-D array) aligned with the rows of ``X``.
        bs: Mini-batch size; must be at least 1.
        epochs: Number of passes over the data.
        lr: Learning rate applied to both the weight and bias updates.
        seed: Optional seed for the shuffling. When ``None`` the shuffles are
            drawn from NumPy's global random state, so a prior
            ``np.random.seed(...)`` call still controls them.
        verbose: When true, print the mean training loss once per epoch.

    Returns:
        Tuple ``(w, b)``: the weight vector (one entry per column of ``X``)
        and the scalar bias.

    Raises:
        ValueError: If ``bs`` is smaller than 1.
    """
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, pd.Series):
        y = y.values

    if bs < 1:
        raise ValueError("bs must be a positive integer")

    rows, columns = X.shape
    w = np.zeros(columns)
    b = 0.0

    # A dedicated generator when a seed is given; otherwise the global state.
    rng = np.random.default_rng(seed) if seed is not None else np.random

    for epoch in range(epochs):
        # One permutation per epoch: every row is used once and the shuffle
        # costs O(rows) per epoch instead of O(rows) per batch.
        order = rng.permutation(rows)
        loss_total = 0.0

        for start in range(0, rows, bs):
            batch_indices = order[start:start + bs]
            X_batch = X[batch_indices, :]
            y_batch = y[batch_indices]

            y_hat = sigmoid(np.dot(X_batch, w) + b)

            # Weight each batch loss by its size so the epoch figure is a
            # per-sample mean even when the last batch is short.
            loss_total += loss(y_batch, y_hat) * len(batch_indices)

            dw, db = gradients(X_batch, y_batch, y_hat)

            w -= lr * dw
            b -= lr * db

        if verbose and rows:
            print(f"epoch {epoch + 1}/{epochs} - mean loss: {loss_total / rows:.6f}")

    return w, b

def predict(X, w, b):
    """Predict a 0/1 class for every row of ``X``.

    Args:
        X: Feature matrix, one row per sample.
        w: Weight vector with one entry per feature column.
        b: Scalar bias.

    Returns:
        Integer array holding 1 where the predicted probability is at least
        0.5 and 0 elsewhere.
    """
    probabilities = sigmoid(np.dot(X, w) + b)
    is_positive = probabilities >= 0.5  # decision threshold for the positive class
    return is_positive.astype(int)

def accuracy(y_true, y_pred):
    """Fraction of predictions that equal the corresponding true label.

    Both inputs are converted to flat NumPy arrays first, so plain Python
    lists are compared element by element (rather than as whole objects) and
    a column vector compared with a 1-D vector is matched position by
    position instead of being broadcast into a matrix.

    Args:
        y_true: True labels (list, array or Series).
        y_pred: Predicted labels (list, array or Series).

    Returns:
        Number of matching positions divided by the number of labels.

    Raises:
        ValueError: If the inputs contain different numbers of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size != y_pred.size:
        raise ValueError("y_true and y_pred must contain the same number of elements")

    correct = np.sum(y_true == y_pred)
    return correct / y_true.size

def f1_calc(y_true, y_pred):
    """F1 score of the positive class (label 1) for binary 0/1 labels.

    Computed as tp / (tp + 0.5 * (fp + fn)), where a false positive is a
    true 0 predicted as 1 and a false negative is a true 1 predicted as 0.

    Args:
        y_true: True 0/1 labels (list, array or Series).
        y_pred: Predicted 0/1 labels (list, array or Series).

    Returns:
        The F1 score, or 0.0 when there are no true positives, false
        positives or false negatives at all (the ratio would otherwise be 0/0).
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))

    denominator = tp + 0.5 * (fp + fn)
    if denominator == 0:
        return 0.0
    return tp / denominator

In [None]:
# Edit values here:

batch_size = 8
epochs = 50
learning_rate = 0.05
random_seed = 42  # fixed so that re-running the notebook draws the same mini-batches

# Training the model

# train() draws its mini-batches from NumPy's global random state, so seeding
# it here makes the fitted parameters and the scores below reproducible.
np.random.seed(random_seed)

w, b = train(train_features, train_label, batch_size, epochs, learning_rate)

print(w,b)

# Evaluate on the training set itself (F1 first, then accuracy).
train_predictions  = predict(train_features, w, b)

f1_scores = f1_calc(train_label, train_predictions)
print(f1_scores)

acc = accuracy(train_label, train_predictions)

print(acc)

## Predictions Made With Our Logistic Regression Implementation

In [None]:
# Preparing the testing data

test_data = pd.read_csv("test_tfidf_features.csv")

# Select exactly the columns the model was trained on, in the same order, so
# that every weight in `w` lines up with its feature. A feature column missing
# from the test file raises a KeyError here instead of misaligning silently.
test_features = test_data.loc[:, train_features.columns]

# Applying our model to the Testing data

test_predictions = predict(test_features, w, b)
test_id = test_data["id"]

# Saving data to LogRed_Prediction.csv

submission = pd.DataFrame({
        'id': test_id,
        'label': test_predictions
    })

submission.to_csv('LogRed_Prediction.csv', index=False)

### Performance Comparison

| Metric   | SKLearn | Our Model |
|----------|---------|-----------|
| F1 Score | 0.68927 | 0.69116   |
| Accuracy | 0.74351 | 0.75686   |

We tuned the number of **epochs**, the **batch size**, and the **learning rate** until our F1 score and accuracy were marginally better than those of scikit-learn's `LogisticRegression` model. We deliberately stopped tuning at that point rather than pushing the scores higher, with the aim of avoiding overfitting to the training data and obtaining more reliable performance on unseen data.
