In [1]:
#Importing the necessary libraries
import csv
import torch
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# Location of the CSV file holding the raw tweets.
INPUTFILE_PATH = "Tweets.csv"

# Text-processing helpers used by the tokenizer.
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()

# Containers filled while the CSV file is read.
tweets, tweet_sent_class = [], []  # tokenized tweets / their sentiment labels
sentiment_class = set()            # distinct sentiment labels seen in the file

# Train/test partitions of `tweets`; they start out empty.
train_tweets, test_tweets = [], []
def tokenizer(sentence):
    """Split a sentence into lower-cased, stemmed tokens without stop words.

    The sentence is split on runs of whitespace, so repeated spaces, tabs
    and newlines never produce empty tokens or stay glued to a word.

    Args:
        sentence: the raw text to tokenize.

    Returns:
        A list of Porter-stemmed tokens in their original order, with the
        entries of ``stop_words`` removed.
    """
    stemmed_tokens = []
    for raw_token in sentence.split():
        lowered = raw_token.lower()
        # Stop words are matched on the lower-cased form, before stemming.
        if lowered not in stop_words:
            stemmed_tokens.append(porter.stem(lowered))
    return stemmed_tokens
# Column positions inside each CSV row.
TEXT_COLUMN = 10       # column handed to the tokenizer
SENTIMENT_COLUMN = 1   # column holding the sentiment label

# newline='' lets the csv module handle line endings itself, which matters
# for quoted fields that span several lines.
with open(INPUTFILE_PATH, 'r', encoding="utf8", newline='') as csvfile:
    tweetreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    # Skip the header line; the default keeps an empty file from raising.
    next(tweetreader, None)
    for row in tweetreader:
        if not row:
            # csv.reader yields an empty list for a blank line
            continue
        # tweets will contain the tokenized tweet text
        tweets.append(tokenizer(row[TEXT_COLUMN]))
        tweet_sent_class.append(row[SENTIMENT_COLUMN])
        sentiment_class.add(row[SENTIMENT_COLUMN])

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Give every sentiment label an integer id, following the set's iteration order.
class_dict = {class_name: index for index, class_name in enumerate(sentiment_class)}

# Give every distinct token an integer id in order of first appearance.
vocab = {}
for tokens in tweets:
    for token in tokens:
        # A token seen for the first time receives the current vocabulary size.
        vocab.setdefault(token, len(vocab))
vocab_index = len(vocab)  # next unused id

# train test split
train_tweets, test_tweets = tweets[:9000], tweets[9000:]
def map_word_vocab(sentence):
    """Translate a token sequence into a tensor of vocabulary ids.

    Every token is looked up in the module-level ``vocab`` mapping; a token
    that is missing from it raises ``KeyError``.

    Returns:
        A one-dimensional ``torch.long`` tensor with one id per token.
    """
    return torch.tensor([vocab[token] for token in sentence], dtype=torch.long)
def map_class(sentiment):
    """Encode a sentiment label as a one-element ``torch.long`` tensor.

    The label is looked up in the module-level ``class_dict`` mapping.
    """
    class_index = class_dict[sentiment]
    return torch.tensor([class_index], dtype=torch.long)
def prepare_sequence(sentence):
    """Build the model input for a tokenized sentence.

    Thin wrapper around ``map_word_vocab``: returns the tensor of
    vocabulary ids for ``sentence``.
    """
    return map_word_vocab(sentence)
# Dimension of each word embedding (handed to RNN as input_size).
EMBEDDING_DIM = 50
# Dimension of the recurrent hidden state (handed to RNN as hidden_size).
HIDDEN_DIM = 10

In [3]:
class RNN(nn.Module):
    """Single-step recurrent classifier over word indices.

    Each call to ``forward`` consumes one word index together with the
    previous hidden state and yields log-probabilities over the output
    classes plus the next hidden state.
    """

    def __init__(self, input_size, hidden_size, vocab_size, output_size):
        """Create the embedding table and the two linear projections.

        Args:
            input_size: dimension of each word embedding.
            hidden_size: dimension of the hidden state.
            vocab_size: number of rows in the embedding table.
            output_size: number of output classes.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.word_embeddings = nn.Embedding(vocab_size, input_size)
        # Both projections read the embedding concatenated with the hidden state.
        joint_size = input_size + hidden_size
        self.i2h = nn.Linear(joint_size, hidden_size)
        self.i2o = nn.Linear(joint_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, word, hidden):
        """Advance the network by one word.

        Args:
            word: index tensor for a single word.
            hidden: previous hidden state of shape (1, hidden_size).

        Returns:
            A pair ``(output, hidden)``: log-softmax class scores of shape
            (1, output_size) and the new hidden state.
        """
        word_vector = self.word_embeddings(word).view(1, -1)
        joint = torch.cat([word_vector, hidden], dim=1)
        next_hidden = self.i2h(joint)
        log_probs = self.softmax(self.i2o(joint))
        return log_probs, next_hidden

    def init_hidden(self):
        """Return an all-zero initial hidden state of shape (1, hidden_size)."""
        return torch.zeros(1, self.hidden_size)
# Seed PyTorch's RNG so the randomly initialised weights are the same on every run.
SEED = 0
torch.manual_seed(SEED)
# creating an instance of RNN
rnn = RNN(
    input_size=EMBEDDING_DIM,
    hidden_size=HIDDEN_DIM,
    vocab_size=len(vocab),
    output_size=len(sentiment_class),
)
# Setting the loss function and optimizer: NLLLoss expects log-probabilities,
# which is what RNN.forward returns from its LogSoftmax layer.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(rnn.parameters(), lr=0.001)

In [4]:
# Number of passes over the training set.
NUM_EPOCHS = 1
for epoch in range(NUM_EPOCHS):
    if epoch % 5 == 0:
        # Progress is measured against the real epoch count.
        print(f"Starting epoch {epoch + 1}/{NUM_EPOCHS} "
              f"({epoch / NUM_EPOCHS * 100:.1f}% of training done)")
    # train_tweets is a prefix of tweets, so zipping it with the full label
    # list pairs every training tweet with its own label.
    for sentence, sent_class in zip(train_tweets, tweet_sent_class):
        # Step 1. Get our inputs ready for the network, that is, turn them
        # into tensors of word indices.
        sentence_in = prepare_sequence(sentence)
        if len(sentence_in) == 0:
            # No token survived tokenization: the network would produce no
            # scores for this tweet, so there is nothing to train on.
            continue
        target_class = map_class(sent_class)

        # Step 2. PyTorch accumulates gradients, so clear them before each
        # instance, and start every tweet from a fresh zero hidden state.
        rnn.zero_grad()
        hidden = rnn.init_hidden()

        # Step 3. Run the forward pass one word at a time; the scores after
        # the last word are the prediction for the whole tweet.
        for word in sentence_in:
            class_scores, hidden = rnn(word, hidden)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss = loss_function(class_scores, target_class)
        loss.backward()
        optimizer.step()

Finnished epoch 0.0%


In [None]:
# Convert the sentiment_class from set to list
sentiment_class = list(sentiment_class)
# Decode predictions through class_dict, the same mapping that encoded the
# training targets.
index_to_class = {index: class_name for class_name, index in class_dict.items()}

y_pred = []
y_actual = []
# The labels of the test tweets start right after those of the training tweets.
test_labels = tweet_sent_class[len(train_tweets):]
with torch.no_grad():
    for sentence, sent_class in zip(test_tweets, test_labels):
        inputs = prepare_sequence(sentence)
        if len(inputs) == 0:
            # No tokens means no scores for this tweet; skip it in both lists
            # so y_pred and y_actual stay aligned.
            continue
        hidden = rnn.init_hidden()
        for word in inputs:
            class_scores, hidden = rnn(word, hidden)
        # The predicted class is the one with the maximum score after the last word.
        predicted_index = class_scores.max(dim=1)[1].item()
        y_pred.append(index_to_class[predicted_index])
        y_actual.append(str(sent_class))