# Check GPU and data

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)


from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Mounted at /content/gdrive
Mon May 15 16:34:33 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+------------------------------------------------------------

## Loading data

In [None]:
from collections import Counter

In [None]:
import pandas as pd

# Read the raw export, expose the tweet text as string column "x", drop rows
# without a sentiment annotation and fill missing user locations.
tweets = (
    pd.read_csv('/content/gdrive/MyDrive/TweetsCOV19.csv')
    .rename(columns={"TweetText": "x"})
    .assign(x=lambda frame: frame['x'].astype(str))
    .dropna(subset=["Sentiment"])
    .assign(UserLocation=lambda frame: frame['UserLocation'].fillna("unknown"))
)
# "Sentiment" is split on the space into two parts; both stay string-typed here.
tweets[['pos', 'neg']] = tweets['Sentiment'].str.split(" ", expand=True)
# Stratification key: sum of the two integer scores.
tweets["strat"] = tweets['pos'].astype(int) + tweets['neg'].astype(int)

  tweets = pd.read_csv('/content/gdrive/MyDrive/TweetsCOV19.csv')


## Preprocessing code

In [None]:
# Maps an emoticon token to the placeholder tag that replaces it.
# Duplicate literal keys were removed; the effective mapping of every existing
# key is unchanged:
#   - ':x' was listed as both '<kiss>' and '<seallips>'; the later '<seallips>'
#     entry always won, so only that one is kept.
#   - ':-p' and ':p' were listed twice with the same '<tong>' value.
emoticons = {
    ':*': '<kiss>',
    ':-*': '<kiss>',
    ':-)': '<happy>',
    ':-))': '<happy>',
    ':-)))': '<happy>',
    ':-))))': '<happy>',
    ':-)))))': '<happy>',
    ':-))))))': '<happy>',
    ':)': '<happy>',
    ':))': '<happy>',
    ':)))': '<happy>',
    ':))))': '<happy>',
    ':)))))': '<happy>',
    ':))))))': '<happy>',
    ':)))))))': '<happy>',
    ':o)': '<happy>',
    ':]': '<happy>',
    ':3': '<happy>',
    ':c)': '<happy>',
    ':>': '<happy>',
    '=]': '<happy>',
    '8)': '<happy>',
    '=)': '<happy>',
    ':}': '<happy>',
    ':^)': '<happy>',
    '|;-)': '<happy>',
    ":'-)": '<happy>',
    ":')": '<happy>',
    '\\o/': '<happy>',
    '\\0/': '<happy>',
    ':-d': '<laugh>',
    ':d': '<laugh>',
    '8-d': '<laugh>',
    '8d': '<laugh>',
    'x-d': '<laugh>',
    'xd': '<laugh>',
    '=-d': '<laugh>',
    '=D': '<laugh>',
    '=-3': '<laugh>',
    '=3': '<laugh>',
    'b^d': '<laugh>',
    '>:[': '<sad>',
    ':-(': '<sad>',
    ':-((': '<sad>',
    ':-(((': '<sad>',
    ':-((((': '<sad>',
    ':-(((((': '<sad>',
    ':-((((((': '<sad>',
    ':-(((((((': '<sad>',
    ':(': '<sad>',
    ':((': '<sad>',
    ':(((': '<sad>',
    ':((((': '<sad>',
    ':(((((': '<sad>',
    ':((((((': '<sad>',
    ':(((((((': '<sad>',
    ':((((((((': '<sad>',
    ':-c': '<sad>',
    ':c': '<sad>',
    ':-<': '<sad>',
    ':<': '<sad>',
    ':-[': '<sad>',
    ':[': '<sad>',
    ':{': '<sad>',
    ':-||': '<sad>',
    ':@': '<sad>',
    ":'-(": '<sad>',
    ":'(": '<sad>',
    'd:<': '<sad>',
    'd:': '<sad>',
    'd8': '<sad>',
    'd;': '<sad>',
    'd=': '<sad>',
    'dX': '<sad>',
    'v.v': '<sad>',
    "d-':": '<sad>',
    '(>_<)': '<sad>',
    ':|': '<sad>',
    '>:O': '<surprise>',
    ':-O': '<surprise>',
    ':-o': '<surprise>',
    ':O': '<surprise>',
    '°o°': '<surprise>',
    'o_O': '<surprise>',
    'o_0': '<surprise>',
    'o.O': '<surprise>',
    'o-o': '<surprise>',
    '8-0': '<surprise>',
    '|-O': '<surprise>',
    ';-)': '<wink>',
    ';)': '<wink>',
    '*-)': '<wink>',
    '*)': '<wink>',
    ';-]': '<wink>',
    ';]': '<wink>',
    ';d': '<wink>',
    ';^)': '<wink>',
    ':-,': '<wink>',
    '>:p': '<tong>',
    ':-p': '<tong>',
    ':p': '<tong>',
    'x-': '<tong>',
    'x-p': '<tong>',
    'xp': '<tong>',
    '=p': '<tong>',
    ':-Þ': '<tong>',
    ':Þ': '<tong>',
    ':-b': '<tong>',
    ':b': '<tong>',
    ':-&': '<tong>',
    '>:\\': '<annoyed>',
    '>:/': '<annoyed>',
    ':-/': '<annoyed>',
    ':-.': '<annoyed>',
    ':/': '<annoyed>',
    ':\\': '<annoyed>',
    '=/': '<annoyed>',
    '=\\': '<annoyed>',
    ':L': '<annoyed>',
    '=L': '<annoyed>',
    ':S': '<annoyed>',
    '>.<': '<annoyed>',
    ':-|': '<annoyed>',
    '<:-|': '<annoyed>',
    ':-x': '<seallips>',
    ':x': '<seallips>',
    ':-#': '<seallips>',
    ':#': '<seallips>',
    'o:-)': '<angel>',
    '0:-3': '<angel>',
    '0:3': '<angel>',
    '0:-)': '<angel>',
    '0:)': '<angel>',
    '0;^)': '<angel>',
    '>:)': '<devil>',
    '>:d': '<devil>',
    '>:-d': '<devil>',
    '>;)': '<devil>',
    '>:-)': '<devil>',
    '}:-)': '<devil>',
    '}:)': '<devil>',
    '3:-)': '<devil>',
    '3:)': '<devil>',
    'o/\\o': '<highfive>',
    '^5': '<highfive>',
    '>_>^': '<highfive>',
    '^<_<': '<highfive>',
    '<3': '<heart>'
}

# `preprocess` lower-cases the text (flag 'to_lower') before it replaces
# emoticons, so mixed-case keys such as ':O' or '=D' could never match.
# Register a lower-case alias for each of them without overriding any
# explicit entry; the original mixed-case keys stay available.
emoticons.update({
    key.lower(): tag
    for key, tag in list(emoticons.items())
    if key.lower() not in emoticons
})

In [None]:
import os
def _load_noslang_data():
    noslang_dict = {}
    infile = open("/content/gdrive/MyDrive/noslang_mod.txt", 'r')
    for line in infile:
        items = line.split(' - ')
        if len(items[0]) > 0 and len(items) > 1:
            noslang_dict[items[0].strip()] = items[1].strip()
    return noslang_dict

In [None]:
from typing import Dict, Optional
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from textblob import Word
from textblob import TextBlob
from tqdm import tqdm
import re

# Fetch the NLTK resources used by the helpers below.
for _nltk_resource in ('stopwords', 'omw-1.4', 'wordnet'):
  nltk.download(_nltk_resource)

# Module-level tools shared by the preprocessing helpers.
stop_words = set(stopwords.words('english'))
tokenizer = WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

# Registers `progress_apply` on pandas objects (used by spelling_correction).
tqdm.pandas()

def to_lower(df: pd.DataFrame, x_col='x'):
  """
  Lower-case every sentence in column `x_col` of `df`, in place.

  Expects the column to hold plain strings (one sentence per row).
  """
  def _lowered(sentence):
    return sentence.lower()

  df[x_col] = df[x_col].apply(_lowered)

def remove_punctuation(df: pd.DataFrame, x_col='x'):
  """
  Strip a trailing punctuation mark from words in the sentences of `x_col`.

  A single non-word, non-space character is removed only when it directly
  follows a word character and is itself followed by whitespace or the end of
  the sentence. Punctuation inside a word, repeated marks ("!!") and
  stand-alone symbols are left untouched.
  """
  trailing_mark = re.compile(r'(?<=\w)[^\s\w](?![^\s])')
  df[x_col] = df[x_col].apply(lambda sentence: trailing_mark.sub('', sentence))



def replace_abbreviations(df: pd.DataFrame, x_col='x'):
  """
  Expand slang tokens in column `x_col` (one list of tokens per row).

  The lookup table comes from `_load_noslang_data()`; tokens without an entry
  are kept unchanged.
  """
  slang_lookup = _load_noslang_data()

  def _expand(tokens):
    return [slang_lookup.get(tok, tok) for tok in tokens]

  df[x_col] = df[x_col].apply(_expand)

def simplify_haha(df: pd.DataFrame, x_col='x'):
  """
  Collapse laughter such as "hahaha" or "ahahah" in the sentences of `x_col`
  to the canonical form "haha".
  """
  laughter = re.compile(r"\ba?h+a+\-?h+a+\-?[h+a+\-?]*\b")
  df[x_col] = df[x_col].apply(lambda sentence: laughter.sub('haha', sentence))


def replace_emoticons(df: pd.DataFrame, x_col='x'):
  """
  Swap emoticon tokens in column `x_col` (lists of tokens) for their tag from
  the module-level `emoticons` table; other tokens pass through unchanged.
  """
  def _tagged(tokens):
    return [emoticons.get(tok, tok) for tok in tokens]

  df[x_col] = df[x_col].apply(_tagged)

  
def tokenize(df: pd.DataFrame, x_col='x'):
  """
  Turn every sentence in column `x_col` into a list of tokens using the
  module-level `tokenizer`.
  """
  split_sentence = tokenizer.tokenize
  df[x_col] = df[x_col].apply(lambda sentence: split_sentence(sentence))

def remove_tag_tokens(df: pd.DataFrame, x_col='x'):
  """
  Drop placeholder tag tokens from the token lists in column `x_col`.

  Removes '<user>' and '<url>' — the placeholders written by
  `replace_user_handles` and `replace_urls` — as well as the bare 'user'
  token, which is still filtered for backward compatibility.
  """
  tag_tokens = frozenset(('user', '<user>', '<url>'))
  df[x_col] = df[x_col].apply(lambda tokens: [w for w in tokens if w not in tag_tokens])

def remove_stopwords(df: pd.DataFrame, x_col='x'):
  """
  Filter the token lists in column `x_col`, dropping every token contained in
  the module-level `stop_words` set.
  """
  def _content_words(tokens):
    return [tok for tok in tokens if tok not in stop_words]

  df[x_col] = df[x_col].apply(_content_words)

def lemmatize(df: pd.DataFrame, x_col='x'):
  """
  Replace each token in column `x_col` (lists of tokens) by the result of the
  module-level `lemmatizer.lemmatize`.
  """
  def _lemmas(tokens):
    return [lemmatizer.lemmatize(tok) for tok in tokens]

  df[x_col] = df[x_col].apply(_lemmas)

def remove_single_symbols(df: pd.DataFrame, x_col='x'):
  """
  Keep only tokens of at least two characters in the token lists of `x_col`.
  """
  def _longer_than_one(tokens):
    return [tok for tok in tokens if len(tok) >= 2]

  df[x_col] = df[x_col].apply(_longer_than_one)

def spelling_correction(df: pd.DataFrame, x_col='x'):
  """
  Run TextBlob's `Word(...).correct()` on every token in column `x_col`
  (lists of tokens), showing a tqdm progress bar while it works.
  """
  def _corrected(tokens):
    return [Word(tok).correct() for tok in tokens]

  df[x_col] = df[x_col].progress_apply(_corrected)


def replace_user_handles(df: pd.DataFrame, x_col='x'):
  """
  Mask Twitter handles in the token lists of `x_col` with "<user>".

  A handle is any token that starts with "@" and has at least one more
  character; a lone "@" is left as is.
  """
  def _mask_handles(tokens):
    masked = []
    for tok in tokens:
      is_handle = tok.startswith("@") and len(tok) > 1
      masked.append("<user>" if is_handle else tok)
    return masked

  df[x_col] = df[x_col].apply(_mask_handles)

def replace_urls(df: pd.DataFrame, x_col='x'):
  """
  Mask link tokens in the token lists of `x_col` with "<url>".

  A token counts as a link when it starts with "http://", "https://" or "www.".
  """
  url_prefixes = ("http://", "https://", "www.")

  def _mask_links(tokens):
    return ["<url>" if tok.startswith(url_prefixes) else tok for tok in tokens]

  df[x_col] = df[x_col].apply(_mask_links)

def untokenize(df: pd.DataFrame, x_col='x'):
  """
  Join each token list in column `x_col` back into one space-separated sentence.
  """
  def _joined(tokens):
    return " ".join(tokens)

  df[x_col] = df[x_col].apply(_joined)

def preprocess(df: pd.DataFrame, flags: Optional[Dict[str, bool]], x_col='x'):
  """
  Run the enabled preprocessing steps on column `x_col` of `df`, in place.

  Steps always run in the fixed order of the table below, whatever the key
  order in `flags`; a step runs only when its flag is truthy (missing flags
  count as False). Sentence-level steps come before 'tokenize', token-level
  steps after it, and 'untokenize' joins the tokens back into a sentence.
  Passing `flags=None` leaves `df` untouched.
  """
  if flags is None:
    return

  pipeline = (
      ('to_lower', to_lower),
      ('remove_punctuation', remove_punctuation),
      ('simplify_haha', simplify_haha),
      ('tokenize', tokenize),
      ('replace_abbreviations', replace_abbreviations),
      ('replace_emoticons', replace_emoticons),
      ('replace_user_handles', replace_user_handles),
      ('replace_urls', replace_urls),
      ('remove_tag_tokens', remove_tag_tokens),
      ('remove_stopwords', remove_stopwords),
      ('lemmatize', lemmatize),
      ('remove_single_symbols', remove_single_symbols),
      ('spelling_correction', spelling_correction),
      ('untokenize', untokenize),
  )
  for flag_name, step in pipeline:
    if flags.get(flag_name, False):
      step(df, x_col=x_col)
  

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


### Processing tweets

In [None]:
import copy
# Snapshot of the raw frame taken before preprocessing rewrites column "x".
unprocessed_tweets = tweets.copy(deep=True)

In [None]:
# Steps run in the order fixed inside `preprocess`, not in the order listed here.
preprocess(
    tweets,
    flags={
        'to_lower': True,
        'remove_punctuation': True,
        'simplify_haha': True,
        'tokenize': True,
        'replace_abbreviations': True,
        'replace_emoticons': True,
        'replace_user_handles': True,
        'replace_urls': True,
        'remove_stopwords': True,
        'lemmatize': True,
        'remove_single_symbols': True,
        'spelling_correction': False,
        'untokenize': True,
    },
)

In [None]:
from sklearn.model_selection import train_test_split
def split_data(tweets):
    """
    Split the tweets 80/10/10 into train, validation and test parts.

    Both cuts use random_state=0 and are stratified on the 'strat' column.

    Returns:
        (train_tweets, val_tweets, test_tweets,
         train_pos, val_pos, test_pos,
         train_neg, val_neg, test_neg)
    """
    # First cut: 80% train, 20% held out (strat is carried along for the second cut).
    (train_tweets, held_tweets,
     train_pos, held_pos,
     train_neg, held_neg,
     _, held_strat) = train_test_split(
        tweets['x'], tweets['pos'], tweets['neg'], tweets['strat'],
        test_size=0.2, random_state=0, stratify=tweets[['strat']],
    )
    # Second cut: halve the held-out part into validation and test.
    (val_tweets, test_tweets,
     val_pos, test_pos,
     val_neg, test_neg) = train_test_split(
        held_tweets, held_pos, held_neg,
        test_size=0.5, random_state=0, stratify=held_strat,
    )
    return (train_tweets, val_tweets, test_tweets,
            train_pos, val_pos, test_pos,
            train_neg, val_neg, test_neg)

In [None]:
# Split once, then give every part a fresh 0..n-1 index so that positional
# and label-based lookups agree.
(train_tweets, val_tweets, test_tweets,
 train_pos, val_pos, test_pos,
 train_neg, val_neg, test_neg) = (
    part.reset_index(drop=True) for part in split_data(tweets)
)

In [None]:


# The labels are still strings after the split; show one, cast the train and
# validation labels to int, then show the result.
print(type(train_neg[1]))
print(train_neg)
train_neg, train_pos, val_neg, val_pos = (
    labels.astype(int) for labels in (train_neg, train_pos, val_neg, val_pos)
)
print(type(train_neg[1]))
print(train_neg)

<class 'str'>
0         -1
1         -2
2         -1
3         -1
4         -1
          ..
540291    -1
540292    -3
540293    -4
540294    -1
540295    -1
Name: neg, Length: 540296, dtype: object
<class 'numpy.int64'>
0        -1
1        -2
2        -1
3        -1
4        -1
         ..
540291   -1
540292   -3
540293   -4
540294   -1
540295   -1
Name: neg, Length: 540296, dtype: int64


In [None]:
# Sanity check of the split sizes.
tuple(len(part) for part in (train_tweets, train_pos, val_tweets, test_tweets))

(540296, 540296, 67537, 67537)

In [None]:
# Peek at the preprocessed training texts (pandas abbreviates the Series when displayed).
train_tweets

0                      need stripper today’s d&amp;d sesh!!
1         intention pointing (1 administration criticize...
2         #exclusive video youtube tablighi jamaat chief...
3         <user> cover tried echo art nicholas grunas "t...
4         florida’s hepatitis outbreak prompt vaccine pu...
                                ...                        
540291    acting chief executive china's #hongkong speci...
540292    <user> human right start decent life governmen...
540293    <user> yes let's take post joke "hey italian g...
540294    {{google say it’s building nationwide coronavi...
540295    <user> maybe opened big store social distancin...
Name: x, Length: 540296, dtype: object

# Part 3: Transformers

BERT (Bidirectional Encoder Representations from Transformers) is a transformer-based language representation model that is pre-trained on unlabeled text by jointly conditioning on left and right context in all layers. The resulting representations are not task-specific, so the model can be pre-trained once and then fine-tuned on many different tasks, including sentiment classification.
The core part of BERT makes use of the Transformer model, an attention based model that learns contextual relations between words in a text. A Transformer consists of an encoder that produces representations from the input and a decoder that outputs a prediction for a specific task.
BERT only uses the encoder, because its goal is to produce a language model.
BERT is bidirectional in the sense that it doesn't read an input sequence from one direction word by word, but reads the entire sequence at once, which allows it to learn contextual information of a word from all the words to both of its sides.





As opposed to retraining all parameters of a pretrained model, a common approach is to fix the majority of a model’s parameters and only train a subset of selected parameters with the new data. For pretrained BERT models, these parameters are typically the ones of the output classification layer and the ones of the intermediate layer norms. It was shown in (Lu et al., 2021) that transformer models are even capable of generalizing to different input modalities when pre-trained on language data and only fine-tuning the above-mentioned parameters, which is why fine-tuning only a subset of the parameters is a reasonable approach for our task. In the implementation below, the embeddings and the lower encoder layers stay frozen, while the last three encoder layers are fine-tuned together with a newly added classification layer.

When fine-tuning a pre-trained model, it may be the case that performance is improved when the pretrained model was trained on data and for a task that is similar to the data and task at hand. For this reason, we used the Twitter roBERTa Base for Sentiment Analysis model as our base for fine-tuning.
It is a roBERTa-based model which was trained on approximately 124M English tweets and then fine-tuned for sentiment analysis with the TweetEval benchmark.
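It classifies tweets into the three categories Positive, Negative, and Neutral. RoBERTa shares BERT's architecture but changes the pre-training recipe: it uses dynamic masking, drops the next-sentence-prediction objective, and is trained longer, with larger batches, on more data.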

In [None]:
# Hugging Face `transformers` supplies the RoBERTa model and tokenizer imported in the next cell.
!pip install transformers

# NOTE(review): none of the visible cells import sentence-transformers — confirm it is still needed.
!pip install sentence-transformers

In [None]:
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaTokenizer, RobertaConfig
from transformers import AutoConfig, AutoTokenizer, AutoModel

class CustomSentimentClassifier(nn.Module):
    """
    Twitter-RoBERTa encoder with one linear head that scores positive and
    negative sentiment strength jointly.

    The head emits `n_classes_pos + n_classes_neg` logits. `forward` returns
    them as two tensors so that each part can receive its own cross-entropy
    loss.

    Args:
        n_classes_pos: number of positive-strength classes.
        n_classes_neg: number of negative-strength classes.
    """

    def __init__(self, n_classes_pos, n_classes_neg):
        super().__init__()
        checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"
        # The hidden states of all layers are not requested: forward() only
        # reads the pooled output, so returning them would just cost memory.
        config = AutoConfig.from_pretrained(checkpoint)
        self.roberta = RobertaModel.from_pretrained(checkpoint, config=config)
        print(self.roberta.config)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.roberta.config.hidden_size, n_classes_pos + n_classes_neg)
        # Not used by forward() (which returns raw logits); kept so the
        # attribute remains available to existing callers.
        self.softmax = nn.Softmax(dim=1)

        self.n_classes_pos = n_classes_pos
        self.n_classes_neg = n_classes_neg

        # Freeze the whole encoder first ...
        for param in self.roberta.parameters():
            param.requires_grad = False

        # ... then unfreeze the last few transformer layers.
        num_layers_to_unfreeze = 3
        for layer in self.roberta.encoder.layer[-num_layers_to_unfreeze:]:
            for param in layer.parameters():
                param.requires_grad = True

        # The sequence-classification checkpoint ships no pooler weights, so
        # RobertaModel initialises the pooler randomly. forward() feeds the
        # pooler output into the head, hence the pooler must be trainable;
        # otherwise the head sits on top of a fixed random projection.
        pooler = getattr(self.roberta, "pooler", None)
        if pooler is not None:
            for param in pooler.parameters():
                param.requires_grad = True

    def forward(self, input_ids, attention_mask):
        """
        Encode a batch and return `(pos_logits, neg_logits)`.

        `pos_logits` has `n_classes_pos` columns and `neg_logits` has
        `n_classes_neg` columns; both are raw (un-normalised) scores.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs['pooler_output']
        logits = self.out(self.drop(pooled_output))
        # Explicit section sizes: splitting by a single chunk size only works
        # when both heads happen to have the same number of classes.
        pos_logits, neg_logits = torch.split(
            logits, [self.n_classes_pos, self.n_classes_neg], dim=1
        )
        return pos_logits, neg_logits



In [None]:
from torch.utils.data import Dataset, DataLoader

class SentimentDataset(Dataset):
    """
    Torch dataset that yields tokenised tweets with 0-based class labels.

    Args:
        tweets: sequence (list or pandas Series) of tweet texts.
        pos_labels: positive scores; stored as `score - 1`
            (assumes scores start at 1 — TODO confirm against the data).
        neg_labels: negative scores; stored as `abs(score) - 1`
            (assumes scores start at -1 — TODO confirm against the data).
        tokenizer: object offering a Hugging Face style `encode_plus`.
        max_len: every encoding is padded/truncated to this many tokens.
    """

    def __init__(self, tweets, pos_labels, neg_labels, tokenizer, max_len):
        self.tweets = tweets
        self.pos_labels = pos_labels
        self.neg_labels = neg_labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.tweets)

    @staticmethod
    def _positional(values, position):
        """Return the element at integer `position`, ignoring pandas index labels."""
        locator = getattr(values, "iloc", None)
        return values[position] if locator is None else locator[position]

    def __getitem__(self, item):
        if item >= len(self.tweets):
            raise IndexError("Index out of range")

        # Positional access: `series[item]` is label-based and breaks (or
        # silently returns another row) when the index is not exactly 0..n-1.
        tweet = str(self._positional(self.tweets, item))
        pos_label = int(self._positional(self.pos_labels, item)) - 1
        neg_label = abs(int(self._positional(self.neg_labels, item))) - 1

        encoding = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",
            truncation=True
        )

        # encode_plus returns tensors with a leading batch dimension of 1;
        # flatten so the DataLoader can stack samples into (batch, max_len).
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "pos_label": torch.tensor(pos_label, dtype=torch.long),
            "neg_label": torch.tensor(neg_label, dtype=torch.long)
        }

def create_data_loader(tweets, pos_labels, neg_labels, tokenizer, max_len, batch_size,
                       max_samples=500, shuffle=False):
    """
    Wrap the given texts and labels in a `SentimentDataset` and a `DataLoader`.

    Args:
        tweets, pos_labels, neg_labels: aligned sequences of texts and scores.
        tokenizer: tokenizer handed to `SentimentDataset`.
        max_len: token length every sample is padded/truncated to.
        batch_size: number of samples per batch.
        max_samples: use only the first `max_samples` rows. The default of 500
            keeps the previous hard-coded behaviour; pass None to use all rows.
        shuffle: forwarded to `DataLoader`; the default False keeps the
            previous sequential order.

    Returns:
        A `DataLoader` over the (possibly truncated) dataset.
    """
    if max_samples is not None:
        tweets, pos_labels, neg_labels = (
            values[:max_samples] for values in (tweets, pos_labels, neg_labels)
        )
    dataset = SentimentDataset(tweets, pos_labels, neg_labels, tokenizer, max_len)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


In [None]:
from torch import optim
from transformers import get_scheduler

# Hyper-parameters
BATCH_SIZE = 16
MAX_LEN = 240
NUM_EPOCHS = 3
LEARNING_RATE = 1e-5
num_classes_pos = 5  # Number of positive sentiment classes
num_classes_neg = 5  # Number of negative sentiment classes

# NOTE: this rebinds the global `tokenizer` that the preprocessing helper
# `tokenize()` reads; re-running preprocessing after this cell would use the
# RoBERTa tokenizer instead of the whitespace tokenizer.
tokenizer = RobertaTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

train_data_loader = create_data_loader(train_tweets, train_pos, train_neg, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(val_tweets, val_pos, val_neg, tokenizer, MAX_LEN, BATCH_SIZE)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = CustomSentimentClassifier(num_classes_pos, num_classes_neg)
model = model.to(device)

loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# Total number of optimisation steps (one per batch and epoch).
nm_steps = NUM_EPOCHS * len(train_data_loader)



Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment were not used when initializing RobertaModel: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictio

RobertaConfig {
  "_name_or_path": "cardiffnlp/twitter-roberta-base-sentiment",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.29.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}



In [None]:
from tqdm import tqdm
import torch.nn.functional as F
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    """
    Run one pass over `data_loader` and report mean loss and accuracy.

    The caller selects the mode. With `model.train()` active, every batch is
    followed by a backward pass and an optimizer step. With `model.eval()`
    active, the pass is evaluation only (no gradients, no weight updates), so
    the function can be reused for validation without training on that data.

    Args:
        model: returns `(pos_logits, neg_logits)` for a batch.
        data_loader: yields dicts with "input_ids", "attention_mask",
            "pos_label" and "neg_label".
        loss_fn: loss applied separately to both heads; the two losses are summed.
        optimizer: stepped only while the model is in training mode.
        device: device the batch tensors are moved to.

    Returns:
        `(mean loss per batch, accuracy)`. Accuracy counts the positive and
        the negative prediction of each sample as two separate decisions.
        Both values are 0.0 for an empty loader.
    """
    is_training = model.training
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    bar_label = "Training" if is_training else "Evaluating"
    with tqdm(total=len(data_loader), desc=bar_label, unit="batch", leave=False) as pbar:
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            pos_labels = batch["pos_label"].to(device)
            neg_labels = batch["neg_label"].to(device)

            # Only build the autograd graph when weights are going to be updated.
            with torch.set_grad_enabled(is_training):
                pos_logits, neg_logits = model(input_ids=input_ids, attention_mask=attention_mask)
                total_batch_loss = loss_fn(pos_logits, pos_labels) + loss_fn(neg_logits, neg_labels)

            if is_training:
                optimizer.zero_grad()
                total_batch_loss.backward()
                optimizer.step()

            total_loss += total_batch_loss.item()

            # Softmax is monotonic, so the arg-max of the logits is the
            # predicted class; compare directly on the 0-based class ids.
            pos_hits = (pos_logits.argmax(dim=1) == pos_labels).sum().item()
            neg_hits = (neg_logits.argmax(dim=1) == neg_labels).sum().item()
            total_correct += pos_hits + neg_hits
            total_samples += pos_labels.size(0) + neg_labels.size(0)

            pbar.update(1)
            # pbar.n already includes the current batch after update().
            pbar.set_postfix({"Loss": total_loss / pbar.n,
                              "Accuracy": total_correct / total_samples})

    if total_samples == 0:
        return 0.0, 0.0
    return total_loss / len(data_loader), total_correct / total_samples


for epoch in range(NUM_EPOCHS):
    # --- training pass ---
    model.train()
    train_loss, train_accuracy = train_epoch(model, train_data_loader, loss_fn, optimizer, device)
    print(
        f"Epoch {epoch+1}/{NUM_EPOCHS}",
        f"Training Loss: {train_loss:.4f}",
        f"Training Accuracy: {train_accuracy:.4f} ",
        sep="\n",
    )

    # --- validation pass ---
    # NOTE(review): validation reuses train_epoch with the model in eval mode.
    # Confirm that train_epoch skips backward()/optimizer.step() in that mode;
    # otherwise the model is also fitted on the validation data.
    model.eval()
    val_loss, val_accuracy = train_epoch(model, val_data_loader, loss_fn, optimizer, device)
    print(
        f"Validation Loss: {val_loss:.4f}",
        f"Validation Accuracy: {val_accuracy:.4f} ",
        sep="\n",
    )

# Persist the weights reached after the final epoch.
torch.save(model.state_dict(), "/content/gdrive/MyDrive/joint_roberta_sentiment_model.pt")




Epoch 1/3
Training Loss: 2.1025
Training Accuracy: 0.6360 




Validation Loss: 1.7778
Validation Accuracy: 0.7070 




Epoch 2/3
Training Loss: 1.7311
Training Accuracy: 0.7030 




Validation Loss: 1.5916
Validation Accuracy: 0.7200 




Epoch 3/3
Training Loss: 1.5979
Training Accuracy: 0.6980 




Validation Loss: 1.5044
Validation Accuracy: 0.7200 


In [None]:
from sklearn.metrics import f1_score
# map_location moves the stored tensors onto the current device, so a
# checkpoint saved on a GPU runtime also loads on a CPU-only runtime.
model.load_state_dict(
    torch.load("/content/gdrive/MyDrive/joint_roberta_sentiment_model.pt", map_location=device)
)
def test_epoch(model, data_loader, device):
    """
    Evaluate `model` on `data_loader` without updating any weights.

    Args:
        model: returns `(pos_logits, neg_logits)` for a batch.
        data_loader: yields dicts with "input_ids", "attention_mask",
            "pos_label" and "neg_label" (0-based class ids).
        device: device the inputs are moved to.

    Returns:
        `(accuracy, pos_f1, neg_f1)`. Accuracy counts the positive and the
        negative prediction of each sample as two separate decisions; the F1
        scores are weighted averages computed on the original score range
        (positive 1.., negative -1..). All three are 0.0 for an empty loader.
    """
    model.eval()
    pos_preds_all, neg_preds_all = [], []
    true_pos_labels, true_neg_labels = [], []

    with torch.no_grad():
        with tqdm(total=len(data_loader), desc="Testing", unit="batch", leave=False) as pbar:
            for batch in data_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)

                pos_logits, neg_logits = model(input_ids=input_ids, attention_mask=attention_mask)

                # Softmax is monotonic: the arg-max of the logits is the predicted class.
                pos_preds_all.extend(pos_logits.argmax(dim=1).cpu().tolist())
                neg_preds_all.extend(neg_logits.argmax(dim=1).cpu().tolist())
                true_pos_labels.extend(batch["pos_label"].tolist())
                true_neg_labels.extend(batch["neg_label"].tolist())

                pbar.update(1)

    total_samples = len(true_pos_labels) + len(true_neg_labels)
    if total_samples == 0:
        return 0.0, 0.0, 0.0

    # Convert predictions and true labels back to the original score range.
    adjusted_pos_preds = [pred + 1 for pred in pos_preds_all]
    adjusted_neg_preds = [-(pred + 1) for pred in neg_preds_all]
    adjusted_true_pos_labels = [label + 1 for label in true_pos_labels]
    adjusted_true_neg_labels = [-(label + 1) for label in true_neg_labels]

    total_correct = (
        sum(pred == label for pred, label in zip(adjusted_pos_preds, adjusted_true_pos_labels))
        + sum(pred == label for pred, label in zip(adjusted_neg_preds, adjusted_true_neg_labels))
    )

    accuracy = total_correct / total_samples
    pos_f1_score = f1_score(adjusted_true_pos_labels, adjusted_pos_preds, average='weighted')
    neg_f1_score = f1_score(adjusted_true_neg_labels, adjusted_neg_preds, average='weighted')

    return accuracy, pos_f1_score, neg_f1_score

# The test labels were not cast together with the train/validation labels,
# so convert them to int before building the loader.
test_neg, test_pos = test_neg.astype(int), test_pos.astype(int)
test_data_loader = create_data_loader(test_tweets, test_pos, test_neg, tokenizer, MAX_LEN, BATCH_SIZE)

accuracy, pos_f1_score, neg_f1_score = test_epoch(model, test_data_loader, device)
print(
    f"Test Accuracy: {accuracy:.4f}",
    f"Positive sentiment F1 score: {pos_f1_score:.4f}",
    f"Negative sentiment F1 score: {neg_f1_score:.4f}",
    sep="\n",
)

                                                          

Test Accuracy: 0.6400
Positive sentiment F1 score: 0.7121
Negative sentiment F1 score: 0.3787


