# Mounting content from Google Drive.

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Part 1: Loading and preprocessing the data

In [2]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import seaborn as sns
from sklearn.model_selection import train_test_split
import sys
import os
import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

tweet_path = "/content/gdrive/My Drive/SentimentTweets.csv"

#Creating the dataframe and converting every uppercase character to lowercase using the str.lower() function.
tweet_df = pd.read_csv(tweet_path).apply(lambda x: x.astype(str).str.lower())

#We will substitute every unwanted character with ' '. Here we remove the URLs.
tweet_df['text'] = tweet_df['text'].apply(lambda y: re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', " ", y , flags=re.MULTILINE) )

#Here we remove escape characters such as \n, \x and \u.
tweet_df['text'] = tweet_df['text'].apply(lambda z: re.sub(r'\\n', " ", z , flags=re.MULTILINE) )
tweet_df['text'] = tweet_df['text'].apply(lambda z: re.sub(r'\\x..', " ", z , flags=re.MULTILINE) )
tweet_df['text'] = tweet_df['text'].apply(lambda z: re.sub(r'\\u....', " ", z , flags=re.MULTILINE) )
#tweet_df['text'] = tweet_df['text'].apply(lambda z: re.sub(r'@\w+', ' ', z , flags=re.MULTILINE) )

#And finally we remove any other remaining symbols by removing every non-alphabetic character.
tweet_df['text'] = tweet_df['text'].apply(lambda k: re.sub("[^a-z]+", " ", k, flags=re.MULTILINE) )

#Now we remove the stopwords.
stop = stopwords.words('english')
tweet_df['text'] = tweet_df['text'].apply(lambda s: ' '.join([item for item in s.split() if item not in stop]))

#Converting the labels from 0-4 to Y-N.
tweet_df['target'] = tweet_df['target'].apply(lambda k: re.sub("4", "y", k, flags=re.MULTILINE) )
tweet_df['target'] = tweet_df['target'].apply(lambda k: re.sub("0", "n", k, flags=re.MULTILINE) )

#Converting the labels to numeric format.
ltoi = {l: i for i, l in enumerate(tweet_df['target'].unique())}
tweet_df['target'] = tweet_df['target'].apply(lambda y: ltoi[y])

#Since the dataset is too large, a subset of the data will be used for the GloVe model. (Otherwise the session crashes from too much RAM usage)
less_size = (int)(tweet_df.shape[0]/1.5)
tweet_df = tweet_df[:less_size]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Part 2: Stemming and lemmatization 

In [3]:
#Showing the 'text' column after the preprocessing.
tweet_df['text']

0                                            brokenpromises
1         david carradine sad thai law sure fowl play ma...
2                                   b tell bro say congrats
3                                    littlefluffycat indeed
4         completed race life mins girlies work fun bloo...
                                ...                        
853328    think diversity deserved win susan boyle succe...
853329    starting collection sock yarn seems must knit ...
853330                            ryan comes back afternoon
853331    bundacp leave jakrta friday back monday night ...
853332                         hello app reviewers apple hq
Name: text, Length: 853333, dtype: object

# Stemming

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import warnings
warnings.filterwarnings('ignore') 

import nltk

#Objects needed for the stemming.
from nltk.stem import PorterStemmer, WordNetLemmatizer
porter_stemmer = PorterStemmer()

#Defining a stem_sentences function that will stem our text and return it in string format.
def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

#Stemming the text.
tweet_df['stemmed_text'] = tweet_df['text'].apply(stem_sentences)

#Showing the 'text' column after stemming.
tweet_df['stemmed_text']

0                                              brokenpromis
1         david carradin sad thai law sure fowl play man...
2                                    b tell bro say congrat
3                                      littlefluffycat inde
4         complet race life min girli work fun bloodi ho...
                                ...                        
853328    think divers deserv win susan boyl success reg...
853329    start collect sock yarn seem must knit often h...
853330                             ryan come back afternoon
853331    bundacp leav jakrta friday back monday night w...
853332                             hello app review appl hq
Name: stemmed_text, Length: 853333, dtype: object

# Lemmatization

In [5]:
nltk.download('wordnet')

#Objects needed for the lemmatization.
tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

#Defining a lemmatize function that will lemmatize our text and return it in string format.
def lemmatize(text):
    string_list = [lemmatizer.lemmatize(word) for word in tokenizer.tokenize(text)]
    list_to_str = ' '.join([str(element) for element in string_list])
    return list_to_str

#Performing lemmatization on the stemmed text.
tweet_df['lemmatized_text'] = tweet_df['stemmed_text'].apply(lemmatize)

#Showing the lemmatized text.
tweet_df['lemmatized_text']

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


0                                              brokenpromis
1         david carradin sad thai law sure fowl play man...
2                                    b tell bro say congrat
3                                      littlefluffycat inde
4         complet race life min girli work fun bloodi ho...
                                ...                        
853328    think diver deserv win susan boyl success rega...
853329    start collect sock yarn seem must knit often h...
853330                             ryan come back afternoon
853331    bundacp leav jakrta friday back monday night w...
853332                             hello app review appl hq
Name: lemmatized_text, Length: 853333, dtype: object

# Part 3: Preparing the data using TorchText

# Initializing the data Fields

In [6]:
import torch
from torchtext import data

#Contrary to the TF-IDF model, torchtext was used in order to prepare the data for the model.

#Here we initialize two fields. One for the text (our features) and one for the labels.
text_field = data.Field(tokenize='spacy', lower=True, fix_length=40) #Fix length is 40 so every batch can have the same length (needed for this model).
label_field = data.Field(sequential=False, use_vocab=False)

# Splitting the dataframe and getting csv format

In [7]:
#Splitting the dataframe to train and test sets.
train_set, test_set = train_test_split(tweet_df, test_size=0.2, random_state=42)

#Keeping the columns we need.
train_set = train_set[['lemmatized_text','target']]
test_set = test_set[['lemmatized_text','target']]

#Creating two new csv files (needed for the following function).
train_set.to_csv('/content/gdrive/My Drive/train_file.csv')
test_set.to_csv('/content/gdrive/My Drive/test_file.csv')

# Splitting again with TorchText

In [8]:
#Splitting again with the TabularDataset module. This will allow us to also tokenize the text.
train_data, test_data = data.TabularDataset.splits(
                                        path='/content/gdrive/My Drive/',
                                        train = 'train_file.csv',
                                        test = 'train_file.csv',
                                        format = 'csv',
                                        skip_header = True,
                                        fields=[(None, None), ('lemmatized_text', text_field), ('target', label_field)])

#Printing the first entry to make sure the tokenization was succesful.
print(vars(train_data.examples[0]))

{'lemmatized_text': ['can', 'not', 'upload', 'profil', 'pic', 'whywhi'], 'target': '0'}


# Building the GloVe vocabulary

In [9]:
#We'll have to use a fixed size for the the vocabulary.
#The vocab size without any restrictions is too large, and as a result it makes the training take way too much time. 
#On the 'bright' side, those 25000 words are the most frequent from the text.
MAX_VOCAB_SIZE = 25_000

#Building the GloVe vocabulary (using a Twitter embedding for obvious reasons...)
text_field.build_vocab(
    train_data,
    max_size = MAX_VOCAB_SIZE,
    vectors='glove.twitter.27B.25d'
)

#Getting the vocab instance.
vocab = text_field.vocab

.vector_cache/glove.twitter.27B.zip: 1.52GB [11:43, 2.16MB/s]                           
100%|█████████▉| 1193268/1193514 [00:28<00:00, 43846.28it/s]

# Creating batches

In [10]:
#Since the dataset is rather large we'll have to use an equally large size for the batches.
BATCH_SIZE = 1000

#Splitting with BucketIterator (which will also give us the iterator in tensor format).
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data), 
    batch_size = BATCH_SIZE,
    sort=False)

#Checking how many batches we have.
len(test_iterator)

683

# Part 4: Creating the Model

In [12]:
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
import torch.nn.functional as F

#Defining the Feed Forward GloVe Model. Commentary about the choices of the dimensions, activation functions etc can be seen in the ReadMe file.
class FeedForwardGloVeModel(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        #Embedding layer
        self.embedding = nn.Embedding(input_dim,embedding_dim)

        #Linear layer
        self.fc = nn.Linear(hidden_dim*embedding_dim,output_dim)
        
    def forward(self, input):
        
        #Embedding layer (also resized for the output function).
        embed = self.embedding(input).view(input.size()[0], -1)    
        
        #Non-linearity.
        act = F.relu(embed)

        #Linear layer
        output = self.fc(embed)

        return output


INPUT_DIM = len(text_field.vocab) #Input size is the length of the GloVe vocab.
EMBEDDING_DIM = 300 #'Random' number for the embedding dimensions.
HIDDEN_DIM = 40 #Hidden dimension must be equal to the batch length for this model.
OUTPUT_DIM = 2

#Initializing the model.
model = FeedForwardGloVeModel(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

# Part 5: Training the Model and getting scores

In [13]:
import torch
from torchtext import data
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
train_losses = []
test_losses = []
test_accuracies = []
prec_scores = []
rec_scores = []
f1_scores = []

#Initializing the Loss Function and the Optimizer.
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adamax(model.parameters(), lr=0.02)

epochs = 5
for epoch in range(epochs): #Training the model for 5 epochs. 
    for batch in train_iterator: #Epochs are way less than the ones in TF-IDF, but in this case we'll have to iterate every batch which needs more time.
      
        optimizer.zero_grad()
        prediction = model(batch.lemmatized_text.T)
        loss = loss_function(prediction, batch.target)
        
        loss.backward()
        optimizer.step()
        
        train_losses.append(loss.item())
    
    for batch in test_iterator:
        with torch.no_grad(): #Turn off gradients for test validation, saves memory and computations.
            optimizer.zero_grad()
            prediction = model(batch.lemmatized_text.T)
            loss = loss_function(prediction, batch.target)
            
            test_losses.append(loss.item())

            ps = torch.exp(prediction)
            top_p, top_class = ps.topk(1, dim=1)
            equals = top_class == batch.target.view(*top_class.shape)
            
            #Getting precision, recall and f1 scores.
            prec_score_test = precision_score(top_class, batch.target, average = 'weighted')
            prec_scores.append(prec_score_test)
            
            rec_score_test = recall_score(top_class, batch.target, average = 'weighted')
            rec_scores.append(rec_score_test)
            
            f1_score_test = f1_score(top_class, batch.target, average = 'weighted')
            f1_scores.append(f1_score_test)

            test_accuracy = torch.mean(equals.float())
            test_accuracies.append(test_accuracy)
    
    print(f"Epoch: {epoch+1}/{epochs}.. ",
          f"Training Loss: {np.mean(train_losses):.3f}.. ",
          f"Test Loss: {np.mean(test_losses):.3f}.. ",
          f"Test Accuracy: {np.mean(test_accuracies):.3f}")

prec_score_test = np.mean(prec_scores)
rec_score_test = np.mean(rec_scores)
f1_score_test = np.mean(f1_scores)

#More discussion about the scores and comparisons with the other models can be seen in the ReadMe file.
print()
print("Printing scores for Precision, Recall and F1-Measure")
print("Average accuracy using Precision : {}%".format(round(prec_score_test*100,2)))
print("Average accuracy using Recall : {}%".format(round(rec_score_test*100,2)))
print("Average accuracy using F1-Measure: {}%".format(round(f1_score_test*100,2)))

Epoch: 1/5..  Training Loss: 2.580..  Test Loss: 0.897..  Test Accuracy: 0.705
Epoch: 2/5..  Training Loss: 1.779..  Test Loss: 0.796..  Test Accuracy: 0.721
Epoch: 3/5..  Training Loss: 1.427..  Test Loss: 0.763..  Test Accuracy: 0.713
Epoch: 4/5..  Training Loss: 1.230..  Test Loss: 0.714..  Test Accuracy: 0.723
Epoch: 5/5..  Training Loss: 1.102..  Test Loss: 0.683..  Test Accuracy: 0.732

Printing scores for Precision, Recall and F1-Measure
Average accuracy using Precision : 77.58%
Average accuracy using Recall : 73.16%
Average accuracy using F1-Measure: 73.83%
