## CSE595 Homework 1 : Logistic Regression Implementation

In [85]:
import pandas
import re
from collections import Counter
import numpy as np
from scipy import sparse

In [26]:
# Function to load stop words from NLTK Github Repo to be removed 
def load_stopwords():
    f = open("stopwords.txt")
    words = f.readlines()
    stopwords = set()
    for i in words:
        i = i.strip()
        stopwords.add(i)
    return stopwords

### Part 1 : Representing Text Data

In [56]:
# tokenizes purely by using whitespace
def tokenize (sentence : str):
    tokens = sentence.split()
    return tokens

In [61]:
# tokenizing by ignoring capitalization, removing stop words as well as all punctuations
def better_tokenize(text : str):
    # store unique tokens in tokens
    tokens = []
    # convert entire text into lowercase
    lowercase = text.lower()
    # find all possible words and all possible punctuations as their own tokens 
    words = re.findall(r"\w+|[,.!?#\r\n$%;:()\"']", lowercase)
    # load in stop words
    stopwords = load_stopwords()
    
    for word in words:
        if word not in stopwords:
            tokens.append(word)
    return tokens

In [91]:
# build the vocabulary with a minimum word frequency of 250 along with a column mapping
def build_vocab(token_docs: list):
    vocab, all_tokens = [], []
    vocab_indices = {}
    for doc in token_docs:
        all_tokens.extend(doc)

    # returns frequences of tokens across all documents
    frequencies = Counter(all_tokens)
    
    i = 0
    for freq in frequencies:
    #   if above certain threshold, add to vocab
        if frequencies[freq] >= 250:
            vocab.append(freq)
            vocab_indices[freq] = i
            i += 1
     
    return set(vocab), vocab_indices    

# create term document matrix
def create_sparse_matrix(documents : list):
    tokenized_docs = []
    for doc in documents:
        tokens = better_tokenize(doc)
        tokenized_docs.append(tokens)
    vocab, indices = build_vocab(tokenized_docs)

    dense_matrix = []
    for doc in tokenized_docs:
        row = [0] * len(vocab)
        for token in doc:
            if token in vocab:
                row[indices[token]] += 1
        dense_matrix.append(row)
    np_arr = np.array(dense_matrix)
    sparse_td_matrix = sparse.csr_matrix(np_arr)
    
    return sparse_td_matrix, indices

#### Testing Tokenization

In [58]:
df = pandas.read_csv("train.csv")
better_tokenize(df['generation'][0])

['"',
 'conjuring',
 '"',
 'one',
 'thrilling',
 'horror',
 'film',
 'come',
 'recent',
 'times',
 '.',
 '"',
 'sinister',
 '"',
 'one',
 'scariest',
 'films',
 'watched',
 '.',
 'since',
 "'",
 'based',
 '"',
 'true',
 'events',
 '"',
 ',',
 'makes',
 'even',
 'interesting',
 '.',
 "'",
 'quite',
 'give',
 'full',
 '10',
 'star',
 'rating',
 'quite',
 'jump',
 'scares',
 ',',
 'least',
 'good',
 'portion',
 'jump',
 'scares',
 'add',
 'atmosphere',
 'movie',
 '.',
 'first',
 'time',
 'watcher',
 'movie',
 ',',
 'certainly',
 'disturb',
 '.',
 'doesn',
 "'",
 'stick',
 'like',
 '"',
 'sinister',
 '"',
 'long',
 ',',
 'conjuring',
 'fun',
 'watch',
 'always',
 'love',
 'coming',
 'back',
 'rewatch',
 'every',
 '.']

In [92]:
documents_test = ["I like the cat!", "the cat eats.", "i saw this cat and it was adorable i like it so much"]
matrix, indices = create_sparse_matrix(documents_test)

print(matrix)
print(indices)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 0 stored elements and shape (3, 0)>
{}
