<a href="https://colab.research.google.com/github/nbs19/deep-learning/blob/main/lstm_Covid_tweets_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

In [None]:
# Fixed seed so the row shuffle below (and therefore the batch composition)
# is the same on every Restart & Run All.
SEED = 42

# The CSV files are read as latin1, as in the original notebook.
df_train = pd.read_csv("/content/Corona_NLP_train.csv", encoding="latin1")
df_test = pd.read_csv("/content/Corona_NLP_test.csv", encoding="latin1")

# Shuffle all rows of each split once (frac=1 keeps every row).
df_train = df_train.sample(frac=1, random_state=SEED)
df_test = df_test.sample(frac=1, random_state=SEED)

In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 5.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 39.4 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 36.0 MB/s 
[?25hCollecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 46.3 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3

In [None]:
from transformers import RobertaTokenizer
# Pretrained RoBERTa tokenizer used for every encoding step in this notebook.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


def token_counter(text, tokenizer):
    """Return how many token ids `tokenizer.encode` produces for `text`."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)


# Token count of every raw training tweet; the cell displays the largest one.
tok_len = df_train["OriginalTweet"].apply(lambda tweet: token_counter(tweet, tokenizer))
max(tok_len.tolist())

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

184

The longest tweet contains 184 tokens, so we do not need to pad up to the 512th token. We pad to 200 tokens instead, which reduces the size of the tensors we handle.

In [None]:
# Cap the tokenizer's maximum length at 200 tokens; the later
# `padding='max_length'` encoding presumably pads up to this value.
tokenizer.model_max_length = 200

In [None]:
# Mount Google Drive at /content/drive; model checkpoints are read from and
# written to /content/drive/MyDrive later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import re
def remove_links(text):
    """Clean a raw tweet before tokenization.

    Despite its name (kept so existing callers keep working) this does more
    than strip links:

    1. removes every ``http...`` URL (up to the next whitespace),
    2. turns line breaks into a single space, so words on consecutive lines
       are not glued together,
    3. deletes the punctuation characters ``, ; : .``,
    4. lower-cases the text, collapses runs of spaces and trims the ends.

    Args:
        text: the raw tweet as a string.

    Returns:
        The cleaned, lower-cased string.
    """
    out = re.sub(r'http\S+', '', text)

    # Line breaks separate words; replacing them with '' would merge
    # "Hello\r\nWorld" into "helloworld".
    out = re.sub(r'[\r\n]+', ' ', out)

    for token in (',', ';', ':', '.'):
        out = out.replace(token, '')

    # strip() drops the edge spaces left behind by removed URLs / line breaks.
    return re.sub(' +', ' ', out.lower()).strip()

def tokenize(text, tokenizer):
    """Encode `text` into a fixed-length list of token ids.

    The sequence is padded to the tokenizer's maximum length and, if it is
    longer than that, truncated to it. Without truncation a single over-long
    tweet would produce a longer sequence than the others and the batches
    could no longer be stacked into one tensor.

    Args:
        text: the (already cleaned) tweet.
        tokenizer: object exposing a Hugging Face style ``encode`` method.

    Returns:
        Whatever ``tokenizer.encode`` returns (a list of token ids for the
        Hugging Face tokenizers).
    """
    return tokenizer.encode(text, padding='max_length', truncation=True)

# Sentiment labels listed from most negative to most positive; a label's
# position in this tuple is its integer class index.
_SENTIMENT_ORDER = (
    "Extremely Negative",
    "Negative",
    "Neutral",
    "Positive",
    "Extremely Positive",
)

name_to_idx = {name: idx for idx, name in enumerate(_SENTIMENT_ORDER)}


def process_tgt(value):
    """Map a sentiment label string to its class index (KeyError if unknown)."""
    return name_to_idx[value]

In [None]:
def _encode_split(frame):
    """Return (token-id lists, class indices) for one split's dataframe."""
    texts = [tokenize(remove_links(tweet), tokenizer) for tweet in frame["OriginalTweet"]]
    labels = [process_tgt(sentiment) for sentiment in frame["Sentiment"]]
    return texts, labels


# Same cleaning + encoding pipeline for both splits.
train_text, train_labels = _encode_split(df_train)

test_text, test_labels = _encode_split(df_test)

In [None]:
from torch.utils.data import Dataset

class CreateDataset(Dataset):
    """Map-style dataset pairing each encoded sample with its label.

    `data` and `labels` are indexable containers of equal length; items are
    converted to tensors lazily, one at a time, on access.
    """

    def __init__(self, data, labels):
        super().__init__()
        self.data, self.labels = data, labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = torch.tensor(self.data[idx])
        target = torch.tensor(self.labels[idx])
        return sample, target

# Wrap the encoded tweets and their integer labels for the train and test splits.
train_dataset = CreateDataset(train_text, train_labels)
test_dataset = CreateDataset(test_text, test_labels)

In [None]:
from torch.utils.data import DataLoader

# Reshuffle the training samples at every epoch so SGD does not see the same
# batches in the same order each time; the evaluation loader keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size = 128, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = 128)

In [None]:
# Size of the tokenizer vocabulary; it is passed as `vocab_size` when the model is built.
print(tokenizer.vocab_size)

50265


In [None]:
class LSTMModel(nn.Module):
    """Embedding -> stacked LSTM -> two linear layers producing 5 class scores.

    The LSTM outputs of *all* time steps are flattened into one vector per
    sample before the linear layers, so the model only accepts inputs whose
    sequence length equals `seq_len`.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_size: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        dropout: dropout between LSTM layers.
        seq_len: length of the (padded) input sequences. The default of 200
            together with hidden_size=256 gives the 512*100 = 51200 input
            features that were previously hard-coded, so existing checkpoints
            still load.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, dropout, seq_len = 200):
        super().__init__()
        self.seq_len = seq_len
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size = embedding_dim,
                            hidden_size = hidden_size,
                            num_layers = num_layers,
                            dropout = dropout,
                            batch_first = True)
        # Flattened LSTM output: one hidden vector per time step.
        self.linear1 = nn.Linear(hidden_size * seq_len, 256)
        self.linear2 = nn.Linear(256, 5)
        # Not used in forward(); kept so existing attribute access keeps working.
        self.sig = nn.Sigmoid()

    def forward(self, inputs):
        """Return raw class scores of shape (batch, 5) for a batch of token ids."""
        emb = self.embedding(inputs)
        lstm_out, _ = self.lstm(emb)
        # (batch, seq_len, hidden_size) -> (batch, seq_len * hidden_size)
        output = lstm_out.reshape(lstm_out.size(0), -1)
        # NOTE(review): there is no activation between the two linear layers,
        # so together they are a single affine map. Left unchanged so that
        # saved checkpoints keep producing the same outputs.
        return self.linear2(self.linear1(output))
    
# embedding_dim=128, hidden_size=256, num_layers=3, dropout=0.2
model = LSTMModel(tokenizer.vocab_size, 128,256,3, 0.2)

In [None]:
from tqdm import tqdm

class Trainer():
    """Fine-tunes the `linear2` head of a model and tracks loss and accuracy.

    Args:
        model: the network to train. It must expose a `linear2` sub-module,
            the only part left trainable during `train_epoch`.
        train_loader: iterable of (inputs, targets) batches used for training.
        valid_loader: iterable of (inputs, targets) batches used for evaluation.
    """

    def __init__(self, model, train_loader, valid_loader):

        self.model = model
        self.train_loader = train_loader
        self.valid_loader = valid_loader

    def _freeze_all_but_head(self):
        """Disable gradients everywhere except in `self.model.linear2`."""
        for p in self.model.parameters():
            p.requires_grad = False
        for p in self.model.linear2.parameters():
            p.requires_grad = True

    def _run_epoch(self, loader, f_loss, device, optimizer = None):
        """Run one pass over `loader`; update weights only if `optimizer` is given.

        Returns:
            (mean loss per sample, accuracy), both computed over the number of
            samples actually seen, so they are correct for any dataset size.
        """
        correct = 0
        tot_loss = 0.0
        seen = 0

        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = self.model(inputs)
            loss = f_loss(outputs, targets)

            if optimizer is not None:
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

            batch_size = inputs.shape[0]
            # f_loss returns a batch mean; weight it by the batch size so a
            # smaller last batch does not skew the epoch average.
            tot_loss += batch_size * loss.item()
            correct += (outputs.argmax(dim=1) == targets).sum().item()
            seen += batch_size

        # An empty loader yields (0.0, 0.0) instead of a division by zero.
        denom = max(seen, 1)
        return tot_loss / denom, correct / denom

    def train_epoch(self, f_loss, optimizer, device):
        """Train for one epoch and return (mean loss, accuracy) on the training data."""
        self.model.train()
        self._freeze_all_but_head()
        return self._run_epoch(self.train_loader, f_loss, device, optimizer)

    def valid_epoch(self, f_loss, device):
        """Evaluate without gradients and return (mean loss, accuracy)."""
        self.model.eval()
        with torch.no_grad():
            return self._run_epoch(self.valid_loader, f_loss, device)

    def training(self, f_loss, optimizer, device, epochs = 10,
                 resume_from = "/content/drive/MyDrive/ep36.pkl",
                 save_dir = "/content/drive/MyDrive/"):
        """Run the full training loop.

        Args:
            f_loss: loss function called as ``f_loss(outputs, targets)``.
            optimizer: optimizer stepping the model parameters.
            device: device the batches are moved to.
            epochs: number of epochs to run.
            resume_from: path of a state dict loaded into the model before
                training, or None to start from the model's current weights.
            save_dir: directory receiving ``ep<i>.pkl`` after each epoch, or
                None to skip saving. With the defaults, epoch 36 overwrites
                the checkpoint that training resumed from.

        Returns:
            Four lists with one entry per epoch:
            train_loss, train_acc, valid_loss, valid_acc.
        """
        import os  # local import: only needed to build the checkpoint paths

        train_loss = []
        train_acc = []
        valid_loss = []
        valid_acc = []

        if resume_from is not None:
            # NOTE: torch.load unpickles the file; only load checkpoints you trust.
            # map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
            state = torch.load(resume_from, map_location=device)
            self.model.load_state_dict(state)

        for i in range(epochs):
            print("EPOCH {}/{}".format(i + 1, epochs))
            train_results = self.train_epoch(f_loss, optimizer, device)
            print("Training loss : {: .3f} | Training accuracy : {: .3f}".format(*train_results))
            valid_results = self.valid_epoch(f_loss, device)
            print("Validation loss : {: .3f} | Validation accuracy : {: .3f}\n".format(*valid_results))
            if save_dir is not None:
                torch.save(self.model.state_dict(), os.path.join(save_dir, 'ep' + str(i) + '.pkl'))
            train_loss.append(train_results[0])
            train_acc.append(train_results[1])
            valid_loss.append(valid_results[0])
            valid_acc.append(valid_results[1])

        return train_loss, train_acc, valid_loss, valid_acc

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# runtime without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
f_loss = torch.nn.CrossEntropyLoss()
# Plain SGD with a small L2 penalty (weight_decay) over the model's parameters.
optimizer =  torch.optim.SGD(model.parameters(), lr = 0.005, weight_decay = 0.0002)

In [None]:
# The test loader is passed as the validation loader, so the "validation"
# metrics printed during training are computed on the test set.
trainer = Trainer(model, train_loader, test_loader)

# Runs 50 epochs and returns the per-epoch loss / accuracy histories.
train_loss, train_acc, valid_loss, valid_acc = trainer.training(f_loss, optimizer, device, epochs = 50)

In [None]:
# Load the state dict saved after the last of the 50 epochs (files are named
# ep0.pkl .. ep49.pkl) and list the entries it contains.
state = torch.load("/content/drive/MyDrive/ep49.pkl")
print(state.keys())

odict_keys(['embedding.weight', 'lstm.weight_ih_l0', 'lstm.weight_hh_l0', 'lstm.bias_ih_l0', 'lstm.bias_hh_l0', 'lstm.weight_ih_l1', 'lstm.weight_hh_l1', 'lstm.bias_ih_l1', 'lstm.bias_hh_l1', 'lstm.weight_ih_l2', 'lstm.weight_hh_l2', 'lstm.bias_ih_l2', 'lstm.bias_hh_l2', 'linear1.weight', 'linear1.bias', 'linear2.weight', 'linear2.bias'])


In [None]:
# One figure per metric, each comparing the train and test curves over epochs.
for train_curve, test_curve, figure_title in (
    (train_loss, valid_loss, "Loss of the model during training"),
    (train_acc, valid_acc, "Accuracy of the model during training"),
):
    plt.plot(train_curve, label = "train set")
    plt.plot(test_curve, label = "test set")
    plt.legend()
    plt.title(figure_title)
    plt.show()