In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
import pandas as pd
import numpy as np
import sys
import random

# Task 1: Implement Logistic Regression From Scratch

Code implementation of the Logistic Regression model

In [None]:
# Load the pre-computed TF-IDF feature table and the raw training file.

train_data = pd.read_csv("train_tfidf_features.csv")
train_raw = pd.read_csv("train.csv")

# The target is the "label" column; the features are every column from the
# one labelled '0' through the last column of the table.
train_label = train_data["label"]
train_features = train_data.loc[:, '0':]

In [3]:
# Functions

def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s); converted to a float NumPy array internally.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid of ``z``. A scalar input yields a scalar.
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) always lies in [0, 1], so it cannot overflow the way exp(-z)
    # does for large negative z.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- same value.
    out = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with an empty tuple unwraps a 0-d result into a NumPy scalar
    # and leaves n-d arrays as arrays.
    return out[()]

def loss(y, y_hat):
    """Mean binary cross-entropy between labels ``y`` and probabilities ``y_hat``.

    A tiny constant (1e-15) is added inside both logarithms so that a
    prediction of exactly 0 or 1 never evaluates ``log(0)``.

    Returns the negated sum of per-sample log-likelihoods divided by
    ``y.shape[0]``.
    """
    tiny = 1e-15
    sample_count = y.shape[0]
    positive_term = y * np.log(y_hat + tiny)
    negative_term = (1 - y) * np.log(1 - y_hat + tiny)
    log_likelihood = np.sum(positive_term + negative_term)
    return -(1 / sample_count) * log_likelihood

def gradients(X, y, y_hat):
    """Return ``(dw, db)`` for one batch.

    ``dw`` is ``X.T @ (y_hat - y)`` divided by the number of rows of ``X``;
    ``db`` is the sum of ``y_hat - y`` divided by the same count.  ``y`` is
    reshaped to the shape of ``y_hat`` first whenever the two shapes differ.
    """
    n_samples = X.shape[0]  # rows of X are the samples of the batch

    # Align the labels with the predictions before subtracting.
    if y.shape != y_hat.shape:
        y = y.reshape(y_hat.shape)

    residual = y_hat - y
    scale = 1 / n_samples

    weight_grad = scale * np.dot(X.T, residual)
    bias_grad = scale * np.sum(residual)
    return weight_grad, bias_grad

def train(X, y, bs, epochs, lr, seed=None, verbose=True):
    """Fit logistic-regression parameters with mini-batch gradient descent.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Feature matrix of shape (rows, columns).
    y : pandas.Series or numpy.ndarray
        Labels, one per row of ``X``.
    bs : int
        Mini-batch size; must satisfy ``1 <= bs <= rows``.
    epochs : int
        Number of passes; every pass performs ``rows // bs`` update steps.
    lr : float
        Learning rate applied to both the weight and the bias update.
    seed : int, optional
        When given, batches are drawn from a private
        ``numpy.random.RandomState(seed)`` so the run is reproducible.
        When ``None`` (default) NumPy's global random state is used.
    verbose : bool, default True
        Print the loss of each mini-batch, computed before its update.

    Returns
    -------
    tuple
        ``(w, b)``: the weight vector of length ``columns`` and the bias.

    Raises
    ------
    ValueError
        If ``y`` does not have one entry per row of ``X`` or if ``bs`` is
        outside ``1..rows`` (which would otherwise divide by zero or skip
        every update and silently return all-zero weights).
    """
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, pd.Series):
        y = y.values

    rows, columns = X.shape
    if len(y) != rows:
        raise ValueError(
            f"X has {rows} rows but y has {len(y)} labels"
        )
    if not 1 <= bs <= rows:
        raise ValueError(
            f"batch size must be between 1 and {rows} (the number of rows), got {bs}"
        )

    # Both the numpy.random module and a RandomState instance expose .choice().
    sampler = np.random if seed is None else np.random.RandomState(seed)

    w = np.zeros(columns)
    b = 0.0
    steps_per_epoch = rows // bs

    for _ in range(epochs):
        for _ in range(steps_per_epoch):
            # Each step draws a fresh batch of distinct rows; batches of the
            # same pass are sampled independently of one another.
            batch_indices = sampler.choice(rows, bs, replace=False)
            X_batch = X[batch_indices, :]
            y_batch = y[batch_indices]

            y_hat = sigmoid(np.dot(X_batch, w) + b)

            if verbose:
                print(loss(y_batch, y_hat))

            dw, db = gradients(X_batch, y_batch, y_hat)

            w -= lr * dw
            b -= lr * db
    return w, b

def predict(X, w, b):
    """Return a 0/1 prediction for every row of ``X`` given weights ``w`` and bias ``b``."""
    logits = np.dot(X, w) + b
    probabilities = sigmoid(logits)
    # A probability of at least 0.5 is classified as 1, anything lower as 0.
    return (probabilities >= 0.5).astype(int)

def accuracy(y_true, y_pred):
    """Fraction of entries where ``y_pred`` equals ``y_true``."""
    hits = y_true == y_pred
    return np.sum(hits) / len(y_true)

def f1_calc(y_true, y_pred):
    """F1 score for binary labels, with 1 as the positive class.

    Parameters
    ----------
    y_true, y_pred : array-like of 0/1
        Ground-truth and predicted labels of equal length.

    Returns
    -------
    float
        ``tp / (tp + 0.5 * (fp + fn))``, or ``0.0`` when there are no true
        positives, false positives or false negatives at all (the ratio
        would otherwise be 0/0).
    """
    # Coerce to arrays so lists compare element-wise and pandas index labels
    # play no part in the comparison.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))  # predicted 1, actually 0
    fn = np.sum((y_true == 1) & (y_pred == 0))  # predicted 0, actually 1

    denominator = tp + 0.5 * (fp + fn)
    if denominator == 0:
        return 0.0
    return tp / denominator

In [4]:
# Edit values here:

batch_size = 8
epochs = 50
learning_rate = 0.05
random_seed = 42  # fixes the mini-batch sampling so a re-run reproduces the same model

# Training the model

# train() samples its mini-batches from NumPy's global random state, so that
# state is seeded right before the call.
np.random.seed(random_seed)
w, b = train(train_features, train_label, batch_size, epochs, learning_rate)

print(w,b)

# Scoring the fitted model on the data it was trained on

train_predictions  = predict(train_features, w, b)

f1_scores = f1_calc(train_label, train_predictions)
print(f1_scores)

acc = accuracy(train_label, train_predictions)

print(acc)

0.6931471805599433
0.6923870694320104
0.6916324654911883
0.6907099471927383
0.6962376387841145
0.6886297306118514
0.7008693715822796
0.6931425590893823
0.6888299166663976
0.6931980762041827
0.6896296852578438
0.6973475181189163
0.6967300587429148
0.6930233710969522
0.6850730377218461
0.6882502148222558
0.6873736045545457
0.6996464161460716
0.6930301773655517
0.6934403543770549
0.6826904664985426
0.693005319919162
0.6802103096548966
0.6683147909507936
0.6941985348641149
0.6938423219127292
0.7041986718631952
0.6749684225694272
0.6936002370278085
0.6618889224418077
0.6684232735278912
0.6804005298979121
0.6796965668873236
0.6792494412495905
0.6791782103884199
0.6949315605271607
0.6459196494998627
0.6764535042697796
0.6958651184740181
0.6571031996222632
0.7168847248808803
0.7149949123396393
0.6954431341903093
0.7142794921539787
0.6781538384772146
0.6774788984389293
0.6957427931875129
0.6601520951288151
0.6572189111779027
0.6755926662883669
0.6749537731554001
0.6101524652390864
0.67316419509

## Prediction Made Based On Logistic Regression Code

In [5]:
# Preparing the testing data

test_data = pd.read_csv("test_tfidf_features.csv")
# Select exactly the columns used for training, in the same order, so that
# every entry of w is multiplied with the feature it was fitted on.
test_features = test_data[train_features.columns]

# Applying our model to the Testing data

test_predictions = predict(test_features, w, b)
test_id = test_data["id"]

# Saving data to LogRed_Prediction.csv

submission = pd.DataFrame({
        'id': test_id,
        'label': test_predictions
    })

submission.to_csv('LogRed_Prediction.csv', index=False)