# Mounting Content from Google Drive

In [None]:
from google.colab import drive

# Make Google Drive available inside the runtime under this directory.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Import necessary libraries

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Import datasets


Labels in dataset:
*   label 0: neutral
*   label 1: anti-vax
*   label 2: pro-vax







In [None]:
# train set
Train_set_Location = r'/content/vaccine_train_set.csv'

# validation set
Validation_set_Location = r'/content/vaccine_validation_set.csv'

# The first CSV column is used as the DataFrame index.
df_train = pd.read_csv(Train_set_Location, index_col=0)
df_test = pd.read_csv(Validation_set_Location, index_col=0)   # the validation set is used as the test set below

# Report each shape under the name of the file it was actually read from.
print(f"vaccine_train_set: {df_train.shape}")
print(f"vaccine_validation_set: {df_test.shape}")

df_train.head(10) # check the train dataframe

vaccine_validation_set: (15976, 2)
vaccine_train_set: (2282, 2)


Unnamed: 0,tweet,label
0,Sip N Shop Come thru right now #Marjais #Popul...,0
1,I don't know about you but My family and I wil...,1
2,@MSignorile Immunizations should be mandatory....,2
3,President Obama spoke in favor of vaccination ...,0
4,"""@myfoxla: Arizona monitoring hundreds for mea...",0
5,Why did I get my whooping cough vaccine the sa...,2
6,Prosecutor Ken Ervin: I want to know how reckl...,0
7,"""@UberFacts: On average, people who complain l...",0
8,The legacy of @JennyMcCarthy will be she took ...,2
9,"“@UberFacts: On average, people who complain l...",0


# Clean up the data


In [None]:
# Drop the rows where at least one element is missing.
# dropna() returns a new DataFrame, so the result has to be assigned back;
# the index is reset so the remaining rows are numbered 0..n-1 again.
df_train = df_train.dropna().reset_index(drop=True)
df_test = df_test.dropna().reset_index(drop=True)

df_train.head(5)

Unnamed: 0,tweet,label
0,Sip N Shop Come thru right now #Marjais #Popul...,0
1,I don't know about you but My family and I wil...,1
2,@MSignorile Immunizations should be mandatory....,2
3,President Obama spoke in favor of vaccination ...,0
4,"""@myfoxla: Arizona monitoring hundreds for mea...",0


# Creating GloVe vocabulary 

## Download pre-trained model
https://nlp.stanford.edu/projects/glove/

In [None]:
!wget https://nlp.stanford.edu/data/glove.twitter.27B.zip
!unzip glove.twitter.27B.zip

--2021-12-17 21:46:53--  https://nlp.stanford.edu/data/glove.twitter.27B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip [following]
--2021-12-17 21:46:53--  http://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1520408563 (1.4G) [application/zip]
Saving to: ‘glove.twitter.27B.zip’


2021-12-17 21:51:40 (5.07 MB/s) - ‘glove.twitter.27B.zip’ saved [1520408563/1520408563]

Archive:  glove.twitter.27B.zip
  inflating: glove.twitter.27B.25d.txt  
  inflating: glove.twitter.27B.50d.txt  
  inflating: glove.twitter.27B

## Import glove.twitter file

In [None]:
# Using the vocabulary with 25 dimensions
glove_twitter_Location = '/content/glove.twitter.27B.25d.txt' # change this if you want
# The file name ends in "<D>d.txt": take the second dot-separated field counted from the
# right (so dots in directory names cannot shift it) and strip the trailing "d".
D = int(glove_twitter_Location.rsplit('.', 2)[-2][:-1])       # get the dimensions from the vectors in file

In [None]:
# Getting the vocabulary from the file
glove_words = dict()   # key : word, value : index of its vector in glove_vectors_list / gloves_vectors
glove_vectors_list = []
index_vector = 0

# The context manager closes the file even if parsing fails part-way through.
with open(glove_twitter_Location, "r", encoding="utf-8") as f:
  # for each line in glove file get the word in a dict and the vector in a list
  for x in f:
    fields = x.split()           # split once: first field is the word, the rest is the vector
    if len(fields) != D + 1:     # skip blank lines and entries whose word is an unreadable character
      continue
    word = fields[0]
    vector = [float(n) for n in fields[1:]]  # convert each string to float
    glove_words[word] = index_vector   # so a word has its vector at position index_vector
    glove_vectors_list.append(vector)
    index_vector = index_vector + 1

gloves_vectors = np.asarray(glove_vectors_list)  # convert list to array

## Convert tweets to vectors 

In [None]:
# for train set build one D-dimensional vector per tweet
all_vectors_train = []

for tweet in df_train['tweet']:
  tokens = tweet.split()
  n_words = len(tokens)
  # Start every tweet from zeros: a tweet without any known word then yields a zero vector
  # instead of re-using the previous tweet's sum (or failing on the very first tweet).
  tweet_vectors = np.zeros(D)

  # NOTE(review): words are matched case-sensitively - confirm this fits the GloVe vocabulary's casing.
  for word in tokens:
    if word in glove_words:   # words that are not in the GloVe vocabulary contribute nothing
      index_vector = glove_words[word]
      # Add all vectors in tweet, so at the end we will have a vector with D dimensions for each tweet
      tweet_vectors = tweet_vectors + np.array(glove_vectors_list[index_vector])

  # divide by the number of words in the tweet (known or not); max() guards against empty tweets
  all_vectors_train.append(tweet_vectors / max(n_words, 1))

all_vectors_train = np.array(all_vectors_train)

len(all_vectors_train)  # should be equal with the number of tweets from train set

15976

In [None]:
# Same for validation set: one D-dimensional vector per tweet
all_vectors_test = []

for tweet in df_test['tweet']:
  tokens = tweet.split()
  n_words = len(tokens)
  # Start every tweet from zeros: a tweet without any known word then yields a zero vector
  # instead of re-using the previous tweet's sum (or failing on the very first tweet).
  tweet_vectors = np.zeros(D)

  for word in tokens:
    if word in glove_words:   # words that are not in the GloVe vocabulary contribute nothing
      index_vector = glove_words[word]  # get the vector index for the word
      # Add all vectors in tweet, so at the end we will have a vector with D dimensions for each tweet
      tweet_vectors = tweet_vectors + np.array(glove_vectors_list[index_vector])

  # divide by the number of words in the tweet (known or not); max() guards against empty tweets
  all_vectors_test.append(tweet_vectors / max(n_words, 1))

all_vectors_test = np.array(all_vectors_test)

len(all_vectors_test)  # should be equal with the number of tweets from test set

2282

In [None]:
# prepare train set
import torch

# Saving in tensors
# all_vectors_train is already one 2-D array, so it is converted in a single call
# instead of building one tensor per row and stacking them.
x_train = torch.tensor(all_vectors_train, dtype=torch.float32)
# .to_numpy() passes the raw label values, so the conversion does not depend on the DataFrame index
y_train = torch.tensor(df_train.label.to_numpy(), dtype=torch.long)


print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")


x_train shape: torch.Size([15976, 25])
y_train shape: torch.Size([15976])


In [None]:
# prepare test set

# Save in tensors
# all_vectors_test is already one 2-D array, so it is converted in a single call
# instead of building one tensor per row and stacking them.
x_test = torch.tensor(all_vectors_test, dtype=torch.float32)
# .to_numpy() passes the raw label values, so the conversion does not depend on the DataFrame index
y_test = torch.tensor(df_test.label.to_numpy(), dtype=torch.long)


print(f"x_test shape: {x_test.shape}")
print(f"y_test shape: {y_test.shape}")

x_test shape: torch.Size([2282, 25])
y_test shape: torch.Size([2282])


# Creating data loaders

In [None]:
# Build data loaders
BatchSize = 1000

# Initialize train dataloader (reshuffled every epoch)
train_dataset = torch.utils.data.TensorDataset(x_train, y_train)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BatchSize, shuffle=True)

# Initialize test dataloader; evaluation does not need shuffling, so keep the order fixed
test_dataset = torch.utils.data.TensorDataset(x_test, y_test)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BatchSize, shuffle=False)

In [None]:
# Sanity check: pull one batch from the test loader and show its shapes.
examples = iter(test_dataloader)
# Use the built-in next(); the loader iterator does not guarantee a .next() method.
samples, labels = next(examples)
print(samples.shape, labels.shape)

torch.Size([1000, 25]) torch.Size([1000])


# Creating Neural Networks

In [None]:
import torch
import torch.nn as nn

## Neural Network 1

In [None]:

class Network_1(nn.Module):
    """Feed-forward classifier: three linear layers with a ReLU after the first two."""

    def __init__(self, input_size, hidden_size1, hidden_size2, num_classes):
        """Create the layers input_size -> hidden_size1 -> hidden_size2 -> num_classes."""
        super().__init__()
        # first hidden layer
        self.linear1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        # second hidden layer
        self.linear2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        # output layer: one score per class, no activation
        self.linear3 = nn.Linear(hidden_size2, num_classes)

    def forward(self, x):
        """Return the output of the last linear layer for the batch x."""
        first_hidden = self.relu1(self.linear1(x))
        second_hidden = self.relu2(self.linear2(first_hidden))
        return self.linear3(second_hidden)


In [None]:
# Creating the weights for the loss function
import sklearn.utils.class_weight as class_weight

# Work on the labels as a NumPy array for both the class list and the label vector.
train_labels = y_train.numpy()
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)
print('weights:', class_weights)
# The loss function takes the weights as a float tensor.
class_weights = torch.tensor(class_weights, dtype=torch.float)

weights: [0.71404309 2.56890175 0.8262736 ]


In [None]:
#Define layer sizes
input_size = x_train.shape[1]   # length of one tweet vector
hidden_size1 = 400
hidden_size2 = 300
num_classes = 3                 # labels 0, 1 and 2

#Define Hyperparameters
learning_rate = 1e-2

# Seed torch's global RNG so the weight initialisation below is reproducible across runs.
torch.manual_seed(0)

#Initialize model, loss, optimizer
model_1 = Network_1(input_size, hidden_size1, hidden_size2, num_classes)
# class-weighted cross entropy, using the weights computed from the train labels
loss_func = nn.CrossEntropyLoss(weight=class_weights, reduction='mean')

optimizer = torch.optim.Adamax(model_1.parameters(), lr=learning_rate)

model_1

Network_1(
  (linear1): Linear(in_features=25, out_features=400, bias=True)
  (relu1): ReLU()
  (linear2): Linear(in_features=400, out_features=300, bias=True)
  (relu2): ReLU()
  (linear3): Linear(in_features=300, out_features=3, bias=True)
)

### Training the Neural Network 1

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

num_epochs = 5
# one entry per epoch, averaged after training
accuracies = []
average_f1_score = []
average_recall_score = []
average_precision_score = []

for epoch in range(num_epochs):
  train_losses = []
  test_losses = []

  model_1.train()
  for x_batch, y_batch in train_dataloader: # loop over the train set; the validation set is handled the same way below
    y_pred = model_1(x_batch)

    loss = loss_func(y_pred, y_batch)

    train_losses.append(loss.item())

    #Delete previously stored gradients
    optimizer.zero_grad()
    #Perform backpropagation starting from the loss calculated for this batch
    loss.backward()
    #Update model's weights based on the gradients calculated during backprop
    optimizer.step()

  print(f"Epoch {epoch+1}/{num_epochs}: Train Loss = {np.mean(train_losses):.4f}", end = '  ')

  # Testing the model: gradients are not needed here, so no autograd graph is built.
  # Predictions and labels are collected batch by batch so that all metrics below
  # come from a single pass over the whole test set.
  epoch_predictions = []
  epoch_labels = []
  model_1.eval()
  with torch.no_grad():
    for x_batch, y_batch in test_dataloader:
      y_pred = model_1(x_batch)

      test_losses.append(loss_func(y_pred, y_batch).item())

      # the predicted class is the one with the highest score
      epoch_predictions.append(torch.argmax(y_pred, dim=1))
      epoch_labels.append(y_batch)

  print(f"Test Loss = {np.mean(test_losses):.4f}", end = '  ')

  predicted_labels = torch.cat(epoch_predictions).numpy()
  true_labels = torch.cat(epoch_labels).numpy()

  # accuracy over the whole test set (every sample counts once, whatever its batch size)
  accuracies.append(float(np.mean(predicted_labels == true_labels)))

  # Getting precision, recall and f1 scores, each as a support-weighted average over the classes.
  precision_score_temp = precision_score(true_labels, predicted_labels, labels=[0, 1, 2], average='weighted', zero_division=0)
  recall_score_temp = recall_score(true_labels, predicted_labels, labels=[0, 1, 2], average='weighted', zero_division=0)
  # weighted F1 is the weighted mean of the per-class F1 scores, not the harmonic mean of the two averages above
  f1_score_temp = f1_score(true_labels, predicted_labels, labels=[0, 1, 2], average='weighted', zero_division=0)

  average_precision_score.append(precision_score_temp)
  average_f1_score.append(f1_score_temp)
  average_recall_score.append(recall_score_temp)
  print("f1 score: {}%".format(round(f1_score_temp*100, 2)), end = '  ')
  print("precision score: {}%".format(round(precision_score_temp*100, 2)), end = '  ')
  print("recall score: {}%".format(round(recall_score_temp*100, 2)))

print()
print("Average accuracy: {} %".format(round(np.mean(accuracies)*100, 2)))
print("Average f1 score: {} %".format(round(np.mean(average_f1_score)*100, 2)))
print("Average recall score: {} %".format(round(np.mean(average_recall_score)*100, 2)))
print("Average precision score: {} %".format(round(np.mean(average_precision_score)*100, 2)))

Epoch 1/5: Train Loss = 1.2027  Test Loss = 1.0737  f1 score: 46.57%  precision score: 52.36%  recall score: 41.94%
Epoch 2/5: Train Loss = 1.0559  Test Loss = 1.0330  f1 score: 53.45%  precision score: 53.27%  recall score: 53.64%
Epoch 3/5: Train Loss = 1.0238  Test Loss = 1.0044  f1 score: 52.83%  precision score: 55.56%  recall score: 50.35%
Epoch 4/5: Train Loss = 1.0037  Test Loss = 0.9813  f1 score: 54.38%  precision score: 57.29%  recall score: 51.75%
Epoch 5/5: Train Loss = 0.9858  Test Loss = 0.9802  f1 score: 55.52%  precision score: 58.27%  recall score: 53.02%

Average accuracy: 50.17 %
Average f1 score: 52.55 %
Average recall score: 50.14 %
Average precision score: 55.35 %


## Neural Network 2


In [None]:
import torch
import torch.nn as nn


class Network_2(nn.Module):
    """Feed-forward classifier: four linear layers with CELU, GELU and ELU in between."""

    def __init__(self, input_size, hidden_size1, hidden_size2, hidden_size3, num_classes):
        """Create the layers input_size -> hidden_size1 -> hidden_size2 -> hidden_size3 -> num_classes."""
        super().__init__()
        # first hidden layer
        self.linear1 = nn.Linear(input_size, hidden_size1)
        self.celu = nn.CELU()
        # second hidden layer
        self.linear2 = nn.Linear(hidden_size1, hidden_size2)
        self.gelu = nn.GELU()
        # third hidden layer
        self.linear3 = nn.Linear(hidden_size2, hidden_size3)
        self.elu = nn.ELU()
        # output layer: one score per class, no activation
        self.linear4 = nn.Linear(hidden_size3, num_classes)

    def forward(self, x):
        """Return the output of the last linear layer for the batch x."""
        first_hidden = self.celu(self.linear1(x))
        second_hidden = self.gelu(self.linear2(first_hidden))
        third_hidden = self.elu(self.linear3(second_hidden))
        return self.linear4(third_hidden)


In [None]:
#Define layer sizes
input_size = x_train.shape[1]   # length of one tweet vector
hidden_size1 = 200
hidden_size2 = 100
hidden_size3 = 150
num_classes = 3                 # labels 0, 1 and 2

#Define Hyperparameters
learning_rate = 1e-2

# Seed torch's global RNG so the weight initialisation below is reproducible across runs.
torch.manual_seed(0)

#Initialize model, loss, optimizer
model_2 = Network_2(input_size, hidden_size1, hidden_size2, hidden_size3, num_classes)
# class-weighted cross entropy; same weights as for model_1
loss_func = nn.CrossEntropyLoss(weight=class_weights, reduction='mean')

optimizer = torch.optim.AdamW(model_2.parameters(), lr=learning_rate)

model_2

Network_2(
  (linear1): Linear(in_features=25, out_features=200, bias=True)
  (celu): CELU(alpha=1.0)
  (linear2): Linear(in_features=200, out_features=100, bias=True)
  (gelu): GELU()
  (linear3): Linear(in_features=100, out_features=150, bias=True)
  (elu): ELU(alpha=1.0)
  (linear4): Linear(in_features=150, out_features=3, bias=True)
)

### Training the Network 2

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

num_epochs = 5
# one entry per epoch, averaged after training
accuracies = []
average_f1_score = []
average_recall_score = []
average_precision_score = []


for epoch in range(num_epochs):
  train_losses = []
  test_losses = []

  model_2.train()
  for x_batch, y_batch in train_dataloader: # loop over the train set; the validation set is handled the same way below
    y_pred = model_2(x_batch)

    loss = loss_func(y_pred, y_batch)

    train_losses.append(loss.item())

    #Delete previously stored gradients
    optimizer.zero_grad()
    #Perform backpropagation starting from the loss calculated for this batch
    loss.backward()
    #Update model's weights based on the gradients calculated during backprop
    optimizer.step()

  print(f"Epoch {epoch+1}/{num_epochs}: Train Loss = {np.mean(train_losses):.4f}", end = '  ')

  # Test loop: gradients are not needed here, so no autograd graph is built.
  # Predictions and labels are collected batch by batch so that all metrics below
  # come from a single pass over the whole test set.
  epoch_predictions = []
  epoch_labels = []
  model_2.eval()
  with torch.no_grad():
    for x_batch, y_batch in test_dataloader:
      y_pred = model_2(x_batch)

      test_losses.append(loss_func(y_pred, y_batch).item())

      # the predicted class is the one with the highest score
      epoch_predictions.append(torch.argmax(y_pred, dim=1))
      epoch_labels.append(y_batch)

  print(f"Test Loss = {np.mean(test_losses):.4f}", end = '  ')

  predicted_labels = torch.cat(epoch_predictions).numpy()
  true_labels = torch.cat(epoch_labels).numpy()

  # accuracy over the whole test set (every sample counts once, whatever its batch size)
  accuracies.append(float(np.mean(predicted_labels == true_labels)))

  # Getting precision, recall and f1 scores, each as a support-weighted average over the classes.
  precision_score_temp = precision_score(true_labels, predicted_labels, labels=[0, 1, 2], average='weighted', zero_division=0)
  recall_score_temp = recall_score(true_labels, predicted_labels, labels=[0, 1, 2], average='weighted', zero_division=0)
  # weighted F1 is the weighted mean of the per-class F1 scores, not the harmonic mean of the two averages above
  f1_score_temp = f1_score(true_labels, predicted_labels, labels=[0, 1, 2], average='weighted', zero_division=0)

  average_precision_score.append(precision_score_temp)
  average_f1_score.append(f1_score_temp)
  average_recall_score.append(recall_score_temp)
  print("f1 score: {}%".format(round(f1_score_temp*100, 2)), end = '  ')
  print("precision score: {}%".format(round(precision_score_temp*100, 2)), end = '  ')
  print("recall score: {}%".format(round(recall_score_temp*100, 2)))

print()
print("Average accuracy: {} %".format(round(np.mean(accuracies)*100, 2)))
print("Average f1 score: {} %".format(round(np.mean(average_f1_score)*100, 2)))
print("Average recall score: {} %".format(round(np.mean(average_recall_score)*100, 2)))
print("Average precision score: {} %".format(round(np.mean(average_precision_score)*100, 2)))

Epoch 1/5: Train Loss = 1.0910  Test Loss = 1.0310  f1 score: 44.67%  precision score: 52.19%  recall score: 39.04%
Epoch 2/5: Train Loss = 1.0245  Test Loss = 1.0000  f1 score: 52.43%  precision score: 57.85%  recall score: 47.94%
Epoch 3/5: Train Loss = 0.9919  Test Loss = 1.0052  f1 score: 48.85%  precision score: 57.17%  recall score: 42.64%
Epoch 4/5: Train Loss = 0.9845  Test Loss = 0.9686  f1 score: 53.87%  precision score: 57.3%  recall score: 50.83%
Epoch 5/5: Train Loss = 0.9733  Test Loss = 0.9656  f1 score: 53.7%  precision score: 57.59%  recall score: 50.31%

Average accuracy: 46.28 %
Average f1 score: 50.7 %
Average recall score: 46.15 %
Average precision score: 56.42 %
