# Sentiment Analysis on the Sentiment140 dataset

In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from math import e
from tqdm import tqdm_notebook as tqdm
# ^ this was used to track progress but commented out for final notebook since it does not save state after the
# kernel shuts down

### Preprocessing

Define the data directory and convert the columns of the DataFrame that we want into a numpy matrix. Note that we replace the '4' label with a '1' in order to use standard one-class classification

In [2]:
data_dir = './data/'

In [3]:
df = pd.read_csv(data_dir + 'training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1', \
                names=['sentiment', 'id', 'date', 'flag', 'user', 'text'], \
                 dtype={'sentiment': int, 'text': str})
df['sentiment'].replace({4:1}, inplace=True)

In [4]:
matrix = df[['sentiment', 'text']].values
np.random.shuffle(matrix)

Tokenize the tweets and remove stop words

In [5]:
stopset = set(stopwords.words('english'))
tokenizer = TweetTokenizer()

for i in range(len(matrix)):
    tokens = tokenizer.tokenize(matrix[i][1])
    matrix[i][1] = np.array([x for x in tokens if x not in stopset])

In order to keep track of the weight index that each unique word corresponds to, we need to store a map from word to index

In [6]:
unique = set()
words = {}

for row in matrix:
    for word in row[1]:
        unique.add(word)

for i, word in enumerate(unique):
    words[word] = i

### Modeling

Split the data into training, validation, and test sets

In [7]:
num_samples = len(matrix)
training_data = matrix[:int(num_samples * .6)]
val_data = matrix[int(num_samples * .6):int(num_samples* .9)]
test_data = matrix[int(num_samples * .9):]

Define hyperparameters

In [8]:
learning_rate = 0.01
reg = 0.001
epochs = 2

Run logistic regression algorithm with ridge regularization on the training set

In [9]:
w = np.array([0.0 for _ in unique])
b = 0.0
#pbar = tqdm(total=epochs * len(training_data))

for _ in range(epochs):
    np.random.shuffle(training_data)
    for row in training_data:
        y = row[0]
        odds = b
        for word in row[1]:
            i = words[word]
            odds += w[i]
        
        p = 1 / (1 + e**(-odds))
        for word in row[1]:
            i = words[word]
            w[i] = w[i] - learning_rate * ((p - y) + reg * w[i])
        b = b - learning_rate * ((p - y) + reg * b)
        #pbar.update(1)

### Validation

Define a function to test model accuracy and compute confusion matrix for any further analysis. Will be used along with the validation set to find appropriate hyperparameters

In [10]:
def calc_conf_matrix(data):
    conf = np.array([[0, 0], [0, 0]])
    #pbar = tqdm(total=len(val_data))
    for row in data:
        y = row[0]
        odds = b
        for word in row[1]:
            i = words[word]
            odds += w[i]
        
        y_hat = int(round(1 / (1 + e**(-odds))))
        conf[y][y_hat] += 1
        #pbar.update(1)
    
    print('model accuracy was: ' + str((conf[0][0] + conf[1][1]) / len(data)))
    return conf

In [11]:
calc_conf_matrix(val_data);

model accuracy was: 0.7768770833333334


### Testing

Run the 'calc_conf_matrix' function once more on the test data. Report the accuracy of the model

In [12]:
calc_conf_matrix(test_data);

model accuracy was: 0.77898125


Now to report top and bottom 5 words by weight as well as bias:

In [13]:
indices = dict((reversed(item) for item in words.items()))
k = 5

weights = np.copy(w)
largest_indices = np.argpartition(weights, -k)[-k:]
largest_weights = [indices[index] for index in largest_indices]
print('the top 5 weighed words were: ' + str(largest_weights))

weights = np.copy(w)
smallest_indices = np.argpartition(weights, k)[:k]
smallest_weights = [indices[index] for index in smallest_indices]
print('the bottom 5 weighed words were: ' + str(smallest_weights))
print('the value of the bias is: ' + str(b))

the top 5 weighed words were: ['Thanks', 'thank', 'smile', 'Thank', 'welcome']
the bottom 5 weighed words were: ['sad', 'Poor', 'Sad', 'unfortunately', 'sadly']
the value of the bias is: 0.2183862494586355
