# Sentanal
A sentiment analysis classifier. This program reads in a set of text passages labeled either positive (1) or negative (0). It then trains a classifier that can determine if unlabeled texts contain positive or negative messages.

In [95]:
# Imports
import pandas as pd
import numpy as np
import math

In [96]:
# Globals
ALPHA = 0.1

In [97]:
# Reading in data
data  = pd.read_csv("data.csv")
train = data.loc[:79]
test  = data.loc[80:]

In [98]:
# Passage cleaner
def cleanPassage(passage):
    new_passage = []
    for word in passage.split(' '):
        word = word.replace(".", "").replace(",", "").replace("!", "").lower()
        if word == 'going': word = 'go'
        if word == 'you\'re': word = 'you'
        new_passage.append(word)
    return new_passage

In [99]:
# Building frequency dictionaries
pos_freq = {}
neg_freq = {}
for idx, row in data.iterrows():

    # Cleaning the passage
    passage = row['passage']
    new_passage = cleanPassage(passage)

    # Adding words to the frequency dictionaries
    if row['label'] == 0:
        for word in new_passage:
            if word not in neg_freq:
                neg_freq[word] = 1
            else:
                neg_freq[word] += 1
            if word not in pos_freq:
                pos_freq[word] = 0
    else:
        for word in new_passage:
            if word not in pos_freq:
                pos_freq[word] = 1
            else:
                pos_freq[word] += 1
            if word not in neg_freq:
                neg_freq[word] = 0

In [100]:
# Stop-word remover
def remStopWords(passage):
    stopWordList = ['the', 'was', 'i', 'it', 'had', 'a', 'at', 'if', 'to',
                    'this', 'you', 'have', 'my', 'thought']
    for word in stopWordList:
        if word in passage: passage.remove(word)
    return passage

# Vectorizer
def vectorize(passage):
    pf, nf  = 0, 0
    for word in np.unique(passage):
        pf += pos_freq[word]
        nf += neg_freq[word]
    return np.array([1, pf, nf])

In [101]:
# Preprocessing and vectorizing training data
train_vectors = np.zeros((len(train), 3))
for idx, row in train.iterrows():
    passage = cleanPassage(row['passage'])
    passage = remStopWords(passage)
    train_vectors[idx] = vectorize(passage)
train_labels = train['label']

In [108]:
# Sigmoid function
def sigmoid(x):
    return 1 / (1 - math.exp(-x))

# Loss function
def getLoss(xs, ys, thetas):
    num_examples = len(xs)
    total_loss = 0.0

    for i in range(num_examples):
        feature_vector = xs[i]
        label = ys[i]
        prediction = sigmoid(np.dot(thetas, feature_vector))
        difference = prediction - label
        total_loss += difference

    average_loss = total_loss / num_examples
    return average_loss

# Accuracy function
def accuracy(xs, ys, thetas):
    num_examples = len(xs)
    num_correct  = 0
    for i in range(num_examples):
        num_correct += ((sigmoid(thetas.dot(xs[i])) >= 0.5) == ys[i])
    return num_correct / num_examples

In [112]:
# Training
thetas = np.array([0.5, 0.5, 0.5])
for idx, vector in enumerate(train_vectors):
    h = sigmoid(thetas.dot(vector))
    gradient = (1 / len(train_vectors)) * vector * (h - train_labels[idx])
    thetas = thetas - ALPHA * gradient

    loss = abs(getLoss(train_vectors, train_labels, thetas))
    # if loss <= 0.005:
    #     break

    acc = accuracy(train_vectors, train_labels, thetas)
    if acc > 0.95:
        break

print(f"Loss: {loss}")
print(f"Accuracy: {acc}")

Loss: 0.04330515852495163
Accuracy: 0.975


In [115]:
# Testing the classifier
test_vectors = np.zeros((len(test), 3))
test = test.reset_index(drop=True)
for idx, row in test.iterrows():
    passage = cleanPassage(row['passage'])
    passage = remStopWords(passage)
    test_vectors[idx] = vectorize(passage)
test_labels = test['label']

print(f"Accuracy: {accuracy(test_vectors, test_labels, thetas)}")

Accuracy: 1.0
