Imports

In [411]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from gensim.models import Word2Vec

from nltk.corpus import stopwords
import string

from gensim.models import Word2Vec

import torch
import torch.nn as nn
import torch.optim as optim


In [412]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /Users/jacob/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jacob/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jacob/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/jacob/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Load Dataset

In [413]:
path_to_features = './data/stanfordSentimentTreebank/datasetSentences.txt'
path_to_labels = './data/stanfordSentimentTreebank/sentiment_labels.txt'

features_df = pd.read_csv(path_to_features, sep='\t')
labels_df = pd.read_csv(path_to_labels, sep='|', index_col=0)
features_df.drop('sentence_index', axis=1, inplace=True)

first_1000_f = features_df[:1000]
first_1000_l = labels_df[:1000]

data_df = pd.concat([first_1000_f, first_1000_l], axis=1)

In [414]:
data_df.head()

Unnamed: 0,sentence,sentiment values
0,The Rock is destined to be the 21st Century 's...,0.5
1,The gorgeously elaborate continuation of `` Th...,0.5
2,Effective but too-tepid biopic,0.44444
3,If you sometimes like to go to the movies to h...,0.5
4,"Emerges as something rare , an issue movie tha...",0.42708


Tokenizer

In [415]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def tokenize_and_preprocess(sentence):
    tokens = word_tokenize(sentence)
    tokens = [w for w in tokens if w.lower() not in stop_words and w not in string.punctuation]
    nltk_tagged = nltk.pos_tag(tokens)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_tokens = []
    for word, tag in wordnet_tagged:
        if tag is None:
            lemmatized_tokens.append(word)
        else:        
            lemmatized_tokens.append(lemmatizer.lemmatize(word, tag))
    return lemmatized_tokens


In [416]:
def sentence_to_avg_vector(sentence, model):
    tokens = tokenize_and_preprocess(sentence)
    vector = np.zeros(model.vector_size)
    for token in tokens:
        if token in model.wv:
            vector += model.wv[token]
    if len(tokens) > 0:
        vector /= len(tokens)
    return vector

In [417]:
tokens_list = data_df['sentence'].apply(tokenize_and_preprocess).tolist()

word2vec_model = Word2Vec(sentences=tokens_list, vector_size=100, window=5, min_count=1, workers=4)

data_df['sentence_vector'] = data_df['sentence'].apply(lambda x: sentence_to_avg_vector(x, word2vec_model))


In [418]:
data_df.head()

Unnamed: 0,sentence,sentiment values,sentence_vector
0,The Rock is destined to be the 21st Century 's...,0.5,"[-0.0011338543486503609, 0.0006192612848032943..."
1,The gorgeously elaborate continuation of `` Th...,0.5,"[0.0014520163885722666, 0.0011679331015013463,..."
2,Effective but too-tepid biopic,0.44444,"[0.0061636255122721195, 0.0019347511309509475,..."
3,If you sometimes like to go to the movies to h...,0.5,"[2.7271192973583107e-05, 0.002349162603624993,..."
4,"Emerges as something rare , an issue movie tha...",0.42708,"[-0.00048030953845367406, 0.000448660732497676..."


In [419]:
from sklearn.model_selection import train_test_split

In [420]:
X_train, X_test, y_train, y_test, sentences_train, sentences_test = train_test_split(
    data_df['sentence_vector'].tolist(), 
    data_df['sentiment values'].tolist(), 
    data_df['sentence'].tolist(),
    test_size=0.2, 
    random_state=42
)

Model

In [421]:
class SentimentNN(nn.Module):
    def __init__(self):
        super(SentimentNN, self).__init__()
        self.fc1 = nn.Linear(100, 50) 
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(50, 1) 
        
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [422]:
X = torch.tensor(np.vstack(X_train), dtype=torch.float)
y = torch.tensor(y_train, dtype=torch.float).view(-1, 1)

# Instantiate the model, define loss function and optimizer
model = SentimentNN()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 500
# Training loop
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X)
    loss = criterion(outputs, y)
    loss.backward()
    optimizer.step()
    
    if epoch % 100 == 0:
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')

Epoch 1, Loss: 0.17938360571861267
Epoch 101, Loss: 0.02870471030473709
Epoch 201, Loss: 0.028493061661720276
Epoch 301, Loss: 0.02828666940331459
Epoch 401, Loss: 0.028082385659217834


In [423]:
X_test = torch.tensor(np.vstack(X_test), dtype=torch.float)
y_test = torch.tensor(y_test, dtype=torch.float).view(-1, 1)

In [424]:
with torch.no_grad():
    model.eval()
    predictions = model(X_test)
    test_loss = criterion(predictions, y_test) 

print(f'Test Loss: {test_loss.item()}')

Test Loss: 0.024493910372257233


In [425]:
index_to_check = 0

with torch.no_grad():
    model.eval()
    predictions = model(X_test)

specific_test_example = X_test[index_to_check]
predicted_value = predictions[index_to_check]
actual_value = y_test[index_to_check]

print("Original Sentence:", sentences_test[index_to_check])
print("Predicted Sentiment Value:", predicted_value.item()) 
print("Actual Sentiment Value:", actual_value.item())

Original Sentence: One scarcely needs the subtitles to enjoy this colorful action farce .
Predicted Sentiment Value: 0.5326350927352905
Actual Sentiment Value: 0.5138900279998779
