In [1]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn 
import torch.nn.functional as F 
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from torchmetrics.classification import ConfusionMatrix,F1Score
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/natedrake7/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/natedrake7/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
train_data = pd.read_csv('./Data/train_set.csv')
test_data = pd.read_csv('./Data/test_set.csv')

#Create Dataframes
train_set = pd.DataFrame(train_data)
test_set = pd.DataFrame(test_data)

Tokenize Text and create W2V Model

In [3]:
train_set['Tokenized_Text'] = train_set['Text'].apply(lambda x: word_tokenize(x))

# Create Word2Vec model
model = Word2Vec(sentences=train_set['Tokenized_Text'], vector_size=200, window=5, min_count=1, workers=-1)

# Save the model
model.save("./Data/word2vec.model")

Load Word2Vec Model

In [4]:
word2vec_model = Word2Vec.load('./Data/word2vec.model')

Encode Labels

In [6]:
encoder  = LabelEncoder() #initialize label encoder so we can set integer values to the label unique values

X_train_labels = encoder.fit_transform(train_set['Sentiment'])

print(encoder.classes_)
print(encoder.transform(encoder.classes_))
labels = {0 : 'NEGATIVE',1 : 'NEUTRAL',2 : 'POSITIVE'}

['NEGATIVE' 'NEUTRAL' 'POSITIVE']
[0 1 2]


Create Validation Set

In [7]:
X_train, X_val, Y_train, Y_val = train_test_split(train_set['Tokenized_Text'], X_train_labels, test_size=0.2 ,random_state=42)

In [13]:
# Apply Word2Vec embeddings to each sentence
train_embedded_sentences = [word2vec_model.wv[words] for words in X_train if words]

train_sentence_embeddings = [np.mean(embeddings, axis=0) for embeddings in train_embedded_sentences]

val_embedded_sentences = [word2vec_model.wv[words] for words in X_val if words]

val_sentence_embeddings = [np.mean(embeddings, axis=0) for embeddings in val_embedded_sentences]

Create Tensorflow tensors

In [15]:
batch_size = 16

#Train set
X_train_tensor = torch.tensor(train_sentence_embeddings,dtype=torch.float)
Labels_train_tensor = torch.tensor(Y_train,dtype=torch.long) #create a torch tensor for the labels from the numpy array
Dataset = TensorDataset(X_train_tensor,Labels_train_tensor) #Create a dataset
Train_set = DataLoader(Dataset,batch_size=batch_size,shuffle=True) #Create the train set


#Val set
X_val_tensor = torch.tensor(val_sentence_embeddings,dtype=torch.float)
Labels_val_tensor = torch.tensor(Y_val,dtype=torch.long) #create a torch tensor for the labels from the numpy array
Dataset = TensorDataset(X_val_tensor,Labels_val_tensor) #Create a dataset
Validation_set = DataLoader(Dataset,batch_size=batch_size,shuffle=True) #Create the validation set

Set Gpu Or Cpu

In [16]:
if torch.cuda.is_available(): #if nvidia gpu is available
   device="cuda" #set devide to cuda since GPUs are much faster at deep learning
else: 
   device="cpu" #else set CPU
print("Device =",device)

Device = cuda


FeedForward Neural Network

In [48]:
class FeedForwardNeuralNetwork(nn.Module):
    def __init__(self): #initialize feedforward network
        super(FeedForwardNeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential( #Initialize the linear layers
            nn.Linear(200,128),
            nn.ReLU(),
            nn.Linear(128,32),
            nn.ReLU(),
            nn.Linear(32,1), #output is 3 since we have 3 sentiment classes
        )
    def forward(self, x):
        logits = self.linear_relu_stack(x) #pass the mfccs value through the network
        return logits #return the output
    
model = FeedForwardNeuralNetwork().to(device) #Create a feedforward network instance

Train Function

In [49]:
def Train(dataLoader,model,loss_fn,optimizer,scheduler1 = None,scheduler2 = None):
    size = len(dataLoader.dataset) #Get the size of the dataset
    for batch, (X,y) in enumerate(dataLoader): #iterate all the dataset

        X = X.to(device) #Load variables to GPU
        y = y.to(device, dtype=torch.float) #Load variables to GPU
        
        pred = model(X) #predict the label

        y = y.view(-1, 1)

        loss = loss_fn(pred,y) #find the loss between the prediction and the true label
        #BackPropagation
        optimizer.zero_grad() #reset all the gradients

        loss.backward()
        
        optimizer.step() #perform a step

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            
    if scheduler1: #if there is a scheduler available
        scheduler1.step() #get the next learning rate
    if scheduler2: #if there are chained schedulers
        scheduler2.step() #get the next learning rate

Validation Function

In [53]:
def Validation(dataloader,model):
    size = len(dataloader.dataset) #get the size of the dataset
    f1 = 0 #declare f1 score to 0
    f1_score = F1Score(task='multiclass',num_classes=3,average='macro').to(device)

    with torch.no_grad(): #disable gradient caclulation since we are validating not training
        for X,y in dataloader:

            X = X.to(device)#Load to GPU
            y = y.to(device, dtype=torch.float) #Load variables to GPU
        
            pred = model(X) #predict the label

            y = y.view(-1, 1)
            
            f1 += f1_score(pred,y) #calculate the f1 score
    f1 /= size #find the average f1 score
    return f1

Test Function

In [79]:
def Test(dataloader,model,loss_fn):
    size = len(dataloader.dataset) #get the size of the dataset
    test_loss,correct,f1 = 0,0,0 #declare variables
    f1_score = F1Score(task='multiclass',num_classes=3,average='macro').to(device) 

    with torch.no_grad(): #disable gradient caclulation since we are testing not training
        for X,y in dataloader:

            X = X.to(device) #Load to GPU
            y = y.to(device, dtype=torch.float) #Load to GPU

            pred = model(X) #predict the label

            old_y = y
            y = y.view(-1, 1)

            test_loss += loss_fn(pred,y).item() #find the loss_fn
            
            correct += (pred.argmax(1) == old_y).type(torch.float).sum().item() #find the accuracy
            f1 += f1_score(pred,y) #find the f1 score
    
    test_loss /= size #compute the average loss
    correct /= size #average accuracy
    f1 /= size #average f1 score
    confmat = ConfusionMatrix('multiclass',num_classes=3).to(device)
    confusion_matrix = confmat(pred,y) #find the confusion matrix
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f},Avg F1-Score: {f1:>8f}\n") #print statistics
    print(f"Confusion Matrix \n: {confusion_matrix}\n") #print confusion matrix

Training-Validation

In [78]:
loss_fn = nn.MSELoss(reduction='sum')
best_f1 = 0 #initialize variables
learning_rate = 0.0001

#optimizer = torch.optim.SGD(model.parameters(),lr=learning_rate) #set optimizer
optimizer = torch.optim.AdamW(model.parameters(),lr=learning_rate) #set optimizer

epochs_count = 30 #set epochs count

for t in range(epochs_count): #iterate over the entire dataset for the number of epochs specified
    print(f"Epoch {t+1}\n-------------------------------")
    Train(Train_set,model,loss_fn,optimizer) #train the model
    f1 = Validation(Validation_set,model) #find the f1 score for the validation set
    if f1 >= best_f1: #if the f1 score of the validation set is the best one
        best_f1 = f1 #set it to the best f1 variable
        best_model = model #set best model equal to the current model
#print(f"Best F1 Score: {best_f1:>8f}\n")
Test(Validation_set,best_model,loss_fn)#test based on the best model found by the validation set

Epoch 1
-------------------------------
loss: 11.994993  [    0/29304]
loss: 9.288376  [ 1600/29304]
loss: 7.744741  [ 3200/29304]
loss: 11.507072  [ 4800/29304]
loss: 13.474834  [ 6400/29304]
loss: 9.917650  [ 8000/29304]
loss: 12.322187  [ 9600/29304]
loss: 12.983179  [11200/29304]
loss: 13.710409  [12800/29304]
loss: 9.998266  [14400/29304]
loss: 10.536527  [16000/29304]
loss: 10.124269  [17600/29304]
loss: 11.076577  [19200/29304]
loss: 11.924091  [20800/29304]
loss: 8.035734  [22400/29304]
loss: 11.134641  [24000/29304]
loss: 12.265488  [25600/29304]
loss: 9.693432  [27200/29304]
loss: 12.145502  [28800/29304]
Epoch 2
-------------------------------
loss: 11.115925  [    0/29304]
loss: 9.012704  [ 1600/29304]
loss: 10.167179  [ 3200/29304]
loss: 8.542454  [ 4800/29304]
loss: 12.006057  [ 6400/29304]
loss: 6.932095  [ 8000/29304]
loss: 7.849139  [ 9600/29304]
loss: 9.414249  [11200/29304]
loss: 10.791187  [12800/29304]
loss: 12.124752  [14400/29304]
loss: 10.437691  [16000/29304]
l

0.010197
Best F1 Score: 0.010684

Best F1 Score: 0.017475 AdamW
