## CSE595 Homework 1 : Logistic Regression Implementation

In [2]:
import pandas
import re
from collections import Counter
import numpy as np
from scipy import sparse
import torch
import torch.nn as nn

In [99]:
%pip install torch

Collecting torch
  Downloading torch-2.8.0-cp310-cp310-win_amd64.whl.metadata (30 kB)
Collecting filelock (from torch)
  Downloading filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch)
  Using cached MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl.metadata (4.1 kB)
Downloading torch-2.8.0-cp310-cp310-win_amd64.whl (241.4 MB)
   ---------------------------------------- 0.0/241.4 MB ? eta -:--:--
   ---------------------------------------- 2.9/241.4 

In [26]:
# Function to load the stop words (list taken from the NLTK GitHub repo) that get filtered out of the tokens
def load_stopwords():
    """Read the stop-word list from ``stopwords.txt`` in the current working directory.

    Each line of the file is treated as one stop word; surrounding whitespace
    (including the trailing newline) is stripped and blank lines are ignored.

    Returns:
        set[str]: the distinct stop words found in the file.

    Raises:
        OSError: if ``stopwords.txt`` cannot be opened.
    """
    # `with` closes the handle even if reading fails; the explicit encoding keeps
    # the result independent of the platform's default code page.
    with open("stopwords.txt", encoding="utf-8") as handle:
        stripped_lines = (line.strip() for line in handle)
        return {word for word in stripped_lines if word}

### Part 1 : Representing Text Data

In [56]:
# baseline tokenizer: whitespace is the only delimiter
def tokenize (sentence : str):
    """Split ``sentence`` on runs of whitespace and return the pieces as a list.

    Case and punctuation are left untouched, so "cat!" stays a single token.
    """
    return sentence.split()

In [61]:
# tokenizing by ignoring capitalization and removing stop words; punctuation marks become their own tokens
def better_tokenize(text : str, stopwords=None):
    """Lower-case ``text``, split it into word/punctuation tokens and drop stop words.

    Args:
        text: the raw document.
        stopwords: optional collection of tokens to discard. When omitted, the
            list is read with ``load_stopwords()``. Callers tokenizing many
            documents can load it once and pass it in to avoid re-reading the
            file for every document.

    Returns:
        list[str]: the surviving tokens in their original order. Duplicates are
        kept, because downstream code counts term frequencies.
    """
    if stopwords is None:
        stopwords = load_stopwords()

    # A token is either a run of word characters or ONE of the listed punctuation
    # / line-break characters; any other character is skipped and acts as a separator.
    words = re.findall(r"\w+|[,.!?#\r\n$%;:()\"']", text.lower())

    return [word for word in words if word not in stopwords]

In [96]:
# build the vocabulary (tokens seen at least `min_freq` times) along with a token -> column mapping
def build_vocab(token_docs: list, min_freq: int = 1):
    """Collect the vocabulary of a tokenized corpus and assign each token a column.

    Args:
        token_docs: list of documents, each document being a list of tokens.
        min_freq: minimum number of occurrences across ALL documents for a token
            to be kept. The default of 1 keeps every token; pass e.g. 250 to
            prune rare words on the full training set.

    Returns:
        tuple[set, dict]: the set of kept tokens, and a dict mapping each kept
        token to a column index. Indices are contiguous from 0, in order of the
        token's first appearance in the corpus.
    """
    # Count document by document instead of first concatenating every token
    # into one big list.
    frequencies = Counter()
    for doc in token_docs:
        frequencies.update(doc)

    vocab_indices = {}
    for token, count in frequencies.items():
        if count >= min_freq:
            # len() of the mapping is the next free column, so the indices stay
            # gap-free even when some tokens are filtered out.
            vocab_indices[token] = len(vocab_indices)

    return set(vocab_indices), vocab_indices

# create term document matrix
def create_sparse_matrix(documents : list):
    """Tokenize ``documents`` and build their term-document count matrix.

    Args:
        documents: list of raw document strings.

    Returns:
        tuple: ``(matrix, indices)`` where ``matrix`` is a
        ``scipy.sparse.csr_matrix`` of dtype int64 with one row per document and
        one column per vocabulary token (entry = number of occurrences), and
        ``indices`` is the token -> column mapping returned by ``build_vocab``.
    """
    tokenized_docs = [better_tokenize(doc) for doc in documents]
    vocab, indices = build_vocab(tokenized_docs)

    # Assemble the three CSR arrays directly. Going through a dense
    # len(documents) x len(vocab) list first would allocate every zero cell,
    # which defeats the purpose of a sparse matrix on a real corpus.
    data, col_ids, row_ptr = [], [], [0]
    for doc in tokenized_docs:
        counts = Counter(indices[token] for token in doc if token in vocab)
        for col in sorted(counts):  # sorted columns keep the CSR in canonical form
            col_ids.append(col)
            data.append(counts[col])
        row_ptr.append(len(col_ids))  # marks where this document's row ends

    # Explicit shape and dtype keep the result well-formed even when the
    # vocabulary is empty or trailing columns/rows contain no counts.
    sparse_td_matrix = sparse.csr_matrix(
        (
            np.asarray(data, dtype=np.int64),
            np.asarray(col_ids, dtype=np.int64),
            np.asarray(row_ptr, dtype=np.int64),
        ),
        shape=(len(tokenized_docs), len(vocab)),
        dtype=np.int64,
    )

    return sparse_td_matrix, indices

#### Testing Tokenization

In [58]:
# Smoke-test the tokenizer: read train.csv and tokenize the first entry of its 'generation' column.
# The bare last expression is what the notebook displays as the cell output.
df = pandas.read_csv("train.csv")
better_tokenize(df['generation'][0])

['"',
 'conjuring',
 '"',
 'one',
 'thrilling',
 'horror',
 'film',
 'come',
 'recent',
 'times',
 '.',
 '"',
 'sinister',
 '"',
 'one',
 'scariest',
 'films',
 'watched',
 '.',
 'since',
 "'",
 'based',
 '"',
 'true',
 'events',
 '"',
 ',',
 'makes',
 'even',
 'interesting',
 '.',
 "'",
 'quite',
 'give',
 'full',
 '10',
 'star',
 'rating',
 'quite',
 'jump',
 'scares',
 ',',
 'least',
 'good',
 'portion',
 'jump',
 'scares',
 'add',
 'atmosphere',
 'movie',
 '.',
 'first',
 'time',
 'watcher',
 'movie',
 ',',
 'certainly',
 'disturb',
 '.',
 'doesn',
 "'",
 'stick',
 'like',
 '"',
 'sinister',
 '"',
 'long',
 ',',
 'conjuring',
 'fun',
 'watch',
 'always',
 'love',
 'coming',
 'back',
 'rewatch',
 'every',
 '.']

In [92]:
# Build the term-document matrix for three toy sentences, then print the matrix
# and the token -> column mapping that create_sparse_matrix returns with it.
documents_test = ["I like the cat!", "the cat eats.", "i saw this cat and it was adorable i like it so much"]
matrix, indices = create_sparse_matrix(documents_test)

print(matrix)
print(indices)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 0 stored elements and shape (3, 0)>
{}


### Part 2 : Logistic Regression with NumPy

In [1]:
# Defining sigmoid function that takes in a scalar or np array/vector
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``, applied element-wise.

    Args:
        x: a scalar or array-like of real numbers.

    Returns:
        A float scalar for scalar input, otherwise a float ndarray with the
        same shape as ``x``; every value lies in [0, 1].
    """
    z = np.asarray(x, dtype=float)
    # exp(-|z|) is always in (0, 1], so it cannot overflow the way exp(-z)
    # does for large negative z.
    decay = np.exp(-np.abs(z))
    # z >= 0:  1 / (1 + e^-z)        z < 0:  e^z / (1 + e^z)   (same value, safe form)
    result = np.where(z >= 0, 1.0, decay) / (1.0 + decay)
    # Indexing with () turns a 0-d array back into a plain scalar and leaves
    # n-d arrays unchanged.
    return result[()]

# weights is our beta coefficients, x is the feature vector of a document and y is the ground truth of a document
def log_likelihood(x, y, weights):
    """Log-likelihood of a label under the logistic model: ``y * (w.x) - log(1 + exp(w.x))``.

    Args:
        x: 1-D feature vector of one document. A 2-D array with one document
            per row also works and yields one value per row.
        y: ground-truth label(s), 0 or 1.
        weights: 1-D coefficient vector with one entry per feature.

    Returns:
        A scalar for a single document, or a 1-D array of per-document values
        when ``x`` is 2-D.
    """
    # `*` between numpy arrays multiplies element-wise; the model needs the
    # inner product of the features with the coefficients.
    score = np.dot(x, weights)
    # logaddexp(0, s) == log(e^0 + e^s) == log(1 + e^s), computed without overflow.
    return y * score - np.logaddexp(0.0, score)

# compute the gradient for a specific x vector, beta coefficients and ground truth y
def compute_gradient(x, y, weights):
    """Gradient of the negative log-likelihood with respect to ``weights``: ``(sigmoid(w.x) - y) * x``.

    Args:
        x: 1-D feature vector of one document. A 2-D array with one document
            per row also works, in which case the per-document gradients are summed.
        y: ground-truth label(s), 0 or 1.
        weights: 1-D coefficient vector with one entry per feature.

    Returns:
        1-D array with one entry per feature.
    """
    # The prediction is the sigmoid of the inner product w.x; an element-wise
    # `weights * x` would give one "prediction" per feature instead.
    error = sigmoid(np.dot(x, weights)) - y
    # For 1-D x this is error * x; for 2-D x it is x^T @ error, i.e. the sum of
    # the per-document gradients.
    return np.dot(np.transpose(x), error)

In [None]:
# logistic regression with X matrix each row is a feature vector, Y vector, rate of learning and number of steps
def logistic_regression(X, Y, learning_rate, num_step):
    """Fit logistic-regression coefficients with full-batch gradient descent.

    Args:
        X: 2-D feature matrix, one row per document and one column per feature.
            Either array-like or a scipy sparse matrix. No intercept is added;
            append a constant column to X if a bias term is wanted.
        Y: labels (0 or 1), one per row of X.
        learning_rate: step size applied to the summed gradient.
        num_step: number of gradient-descent updates to perform.

    Returns:
        numpy.ndarray: 1-D coefficient vector with one entry per column of X.
    """
    if not sparse.issparse(X):
        X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float).ravel()

    # One coefficient per FEATURE (column of X), not per document.
    weights = np.zeros(X.shape[1])

    for _ in range(num_step):
        # Prediction error of every document under the current weights.
        error = sigmoid(np.asarray(X @ weights).ravel()) - Y
        # X^T @ error sums (sigmoid(w.x) - y) * x over all documents, which is
        # the gradient of the negative log-likelihood; step against it.
        gradient = np.asarray(X.T @ error).ravel()
        weights = weights - learning_rate * gradient

    return weights


### Part 3 : Logistic Regression with PyTorch

In [107]:
# takes in sparse scipy matrix and converts it into a sparse tensor
def to_sparse_tensor(sparse_scipy):
    """Convert a scipy sparse matrix into a float32 ``torch`` sparse COO tensor of the same shape.

    Args:
        sparse_scipy: a 2-D scipy sparse matrix.

    Returns:
        torch.Tensor: sparse COO tensor with ``size == sparse_scipy.shape``.
    """
    # row and col indices of the non-zero entries, along with their values
    row_ind, col_ind, values = sparse.find(sparse_scipy)

    # sparse_coo_tensor takes the coordinates as ONE (2, nnz) index tensor:
    # first row = row indices, second row = column indices. Cast to int64
    # explicitly because scipy may hand back int32 indices.
    coords = torch.as_tensor(np.stack((row_ind, col_ind)), dtype=torch.long)
    entries = torch.as_tensor(values, dtype=torch.float32)

    # Pass the size explicitly. Without it torch infers the shape from the
    # largest index present, so trailing all-zero rows/columns would be dropped
    # and the tensor would no longer match the matrix (or the vocabulary size).
    return torch.sparse_coo_tensor(coords, entries, size=tuple(sparse_scipy.shape))

In [None]:
# Logistic Regression Class extending nn.Module
class LogisticRegression(nn.Module):
    """Binary logistic regression: one linear layer followed by a sigmoid."""

    def __init__(self, vocab_size) -> None:
        """Create the model.

        Args:
            vocab_size: number of input features, i.e. the size of the vocabulary.
        """
        # call the base class initialization
        super().__init__()
        # singular output feature for binary classification - is it LLM generated or not?
        # input features would be the size of the vocabulary - how many features should we train on?
        self.linear = nn.Linear(in_features = vocab_size, out_features=1)

    # forward takes in the document-feature matrix - one row per document, one column per feature
    def forward(self, x):
        """Return the predicted probability for each row of ``x``.

        Args:
            x: tensor whose last dimension has ``vocab_size`` entries.

        Returns:
            torch.Tensor: probabilities in [0, 1], with a last dimension of size 1.
        """
        # nn.Sigmoid is a module CLASS: nn.Sigmoid(tensor) would try to construct
        # a layer rather than apply the function. torch.sigmoid applies it directly.
        return torch.sigmoid(self.linear(x))
        

In [106]:
# Rebuild the toy term-document matrix, print it, and convert it to a torch sparse tensor.
# The bare last expression is what the notebook displays as the cell output.
documents_test = ["I like the cat!", "the cat eats.", "i saw this cat and it was adorable i like it so much"]
matrix, indices = create_sparse_matrix(documents_test)

print(matrix)
to_sparse_tensor(matrix)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 11 stored elements and shape (3, 8)>
  Coords	Values
  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (1, 1)	1
  (1, 3)	1
  (1, 4)	1
  (2, 0)	1
  (2, 1)	1
  (2, 5)	1
  (2, 6)	1
  (2, 7)	1
tensor(indices=tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2],
                       [0, 1, 2, 1, 3, 4, 0, 1, 5, 6, 7]]),
       values=tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
       size=(3, 8), nnz=11, layout=torch.sparse_coo)
