# Imports

In [None]:
!pip install transformers
!pip install emoji
!pip install sentencepiece
!pip install datasets
!pip install transformers
!pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m99.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simple, http

In [None]:
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import plotly.express as px
import torch
from transformers import BertTokenizer
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
# from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import emoji
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AutoModel, AutoConfig
from sklearn.metrics import classification_report
from transformers.modeling_outputs import TokenClassifierOutput, SequenceClassifierOutput
from transformers import AdamW, get_scheduler
from datasets import load_metric, Dataset
from statistics import mean
from torchmetrics import PearsonCorrCoef, SpearmanCorrCoef
from torchmetrics.functional import pearson_corrcoef, spearman_corrcoef
import random

In [None]:
GLOBAL_SEED = 10

np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)
torch.use_deterministic_algorithms(True)
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

env: CUBLAS_WORKSPACE_CONFIG=:4096:8


# Mounting Drive for Reading Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd 'drive/MyDrive/NLP_Project'

/content/drive/MyDrive/NLP_Project


# utils

In [None]:
def read_data(file_path = "train.csv"):
  df = pd.read_csv(file_path)
  df['len'] = df['text'].apply(lambda x: len(x))
  df['label'] = df['label'].astype('float32')
  return df

def filter_tweet_language(df, language = "English"):
  return df[df['language']==language]

def filter_tweet_intimacy(df, lower_bound = 1, upper_bound = 5):
  return df.loc[(df['label'] >= lower_bound) & (df['label'] <= upper_bound)]

def train_val_test_split(df, train_portion = 0.9, val_portion = 0.05, test_portion = 0.05):
    df_train, df_val, df_test, _ = np.split(df.sample(frac=1, random_state=42), [int(train_portion * len(df)), int((train_portion + val_portion) * len(df)), int((train_portion + val_portion + test_portion) * len(df))])
    return df_train, df_val, df_test

def extract_emojis(df):
    emojis_list = list()
    for s in df['text']:
        emojis_in_text = emoji.distinct_emoji_list(s)
        if len(emojis_in_text)>0:
            emojis_list.extend(emojis_in_text)
    return list(set(emojis_list))

# HuggingFace Models

## Hyperparameters and Global Variables

In [None]:
EPOCHS = 6
BATCH_SIZE = 32
LR = 1e-4
PATH_TO_SAVE = "./checkpoint"
PATH_FOR_READ = './checkpoint'

## creating dataset

In [None]:
class RegressionIntimacyDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.all_data = df
        self.tokenizer = tokenizer
        
        # tokenized texts of our dataset
        self.encodings = {}
        self.encodings['input_ids'] = np.array([np.array(self.tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")['input_ids'])
                                for text in self.all_data['text']])
        
        self.encodings['attention_mask'] = np.array([np.array(self.tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")['attention_mask'])
                                for text in self.all_data['text']])
        
        # intimacy scores
        scaler = MinMaxScaler()
        self.labels = scaler.fit_transform(self.all_data['label'].to_numpy().reshape(-1, 1))
        # self.labels = self.all_data['label']
        # self.labels = self.all_data['label'].to_numpy().reshape(-1, 1)
        # self.labels = self.all_data['label'].to_numpy().reshape(-1, 1) * 10
    
    def classes(self,):
        return self.labels
    
    def __len__(self):
        return len(self.labels)
    
    def get_max_setntence_len(self):
        # get length of longest sentence in our dataset
        return max(self.all_data['len'])
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item


In [None]:
MODEL = "cardiffnlp/twitter-xlm-roberta-base"
# tokenizer for tokenization of texts
# bert_base_uncased_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # cased or uncased?
xlmt_tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

Downloading (…)ncepiece.bpe.model";:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
df = read_data("augmented_dataset.csv")
intim_dataset = RegressionIntimacyDataset(df, xlmt_tokenizer)
print(len(intim_dataset.labels))

print(df['label'])

23721
0        1.8
1        1.0
2        1.0
3        1.6
4        1.6
        ... 
23716    1.0
23717    2.0
23718    3.8
23719    1.8
23720    1.6
Name: label, Length: 23721, dtype: float32


## model definition

In [None]:
xlmt_model = AutoModel.from_pretrained(MODEL)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this mode

In [None]:
for name, param in xlmt_model.named_parameters():
    param.requires_grad = False
    if "encoder.layer.10" in name or "encoder.layer.11" in name or name == 'pooler.dense.weight' or name == 'pooler.dense.bias':
        param.requires_grad = True
    print(name, param.requires_grad)

embeddings.word_embeddings.weight False
embeddings.position_embeddings.weight False
embeddings.token_type_embeddings.weight False
embeddings.LayerNorm.weight False
embeddings.LayerNorm.bias False
encoder.layer.0.attention.self.query.weight False
encoder.layer.0.attention.self.query.bias False
encoder.layer.0.attention.self.key.weight False
encoder.layer.0.attention.self.key.bias False
encoder.layer.0.attention.self.value.weight False
encoder.layer.0.attention.self.value.bias False
encoder.layer.0.attention.output.dense.weight False
encoder.layer.0.attention.output.dense.bias False
encoder.layer.0.attention.output.LayerNorm.weight False
encoder.layer.0.attention.output.LayerNorm.bias False
encoder.layer.0.intermediate.dense.weight False
encoder.layer.0.intermediate.dense.bias False
encoder.layer.0.output.dense.weight False
encoder.layer.0.output.dense.bias False
encoder.layer.0.output.LayerNorm.weight False
encoder.layer.0.output.LayerNorm.bias False
encoder.layer.1.attention.self.query

In [None]:
class XLMTRegressor(nn.Module):
    def __init__(self, model, hidden_count = 20, dropout = 0.2):
        super(XLMTRegressor, self).__init__()

        self.xlmt = model
        self.linear = nn.Linear(768, hidden_count)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.regression = nn.Linear(hidden_count, 1)

    def forward(self, input_ids = None, attention_mask = None, labels = None, loss_fn_type = 'pearson'):
        encoding_output = self.xlmt(input_ids = input_ids, attention_mask = attention_mask)
        # https://huggingface.co/docs/transformers/main_classes/output#transformers.modeling_outputs.BaseModelOutputWithPooling
        linear_output = self.linear(encoding_output[1])
        relu_output = self.relu(linear_output)
        dropout_output = self.dropout(relu_output)
        final_output = self.regression(dropout_output)
        

        loss = None
        if labels is not None:
            if loss_fn_type == 'mse':
                loss_fn = nn.MSELoss()
                loss = loss_fn(final_output, labels)
            elif loss_fn_type == 'pearson':
                pearson = PearsonCorrCoef(num_outputs = 1)
                loss = -pearson(torch.squeeze(final_output, 1).to('cpu'), torch.squeeze(labels, 1).to('cpu'))
            elif loss_fn_type == 'spearman':
                spearman = SpearmanCorrCoef(num_outputs = 1)
                loss = -spearman(torch.squeeze(final_output, 1).to('cpu'), torch.squeeze(labels, 1).to('cpu'))
            elif loss_fn_type == 'mse+pearson':
                mse = nn.MSELoss()
                pearson = PearsonCorrCoef(num_outputs = 1)
                loss = mse(final_output, labels) - pearson(torch.squeeze(final_output, 1).to('cpu'), torch.squeeze(labels, 1).to('cpu'))

        
        return SequenceClassifierOutput(loss = loss, logits = final_output, hidden_states = encoding_output.hidden_states, attentions = encoding_output.attentions)



## train model

In [None]:
def train(model, train_data, val_data, learning_rate, epochs):
    train, val = RegressionIntimacyDataset(train_data, xlmt_tokenizer), RegressionIntimacyDataset(val_data, xlmt_tokenizer)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size = BATCH_SIZE, shuffle = True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size = BATCH_SIZE, shuffle = True)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = AdamW(model.parameters(), lr = learning_rate)

    if use_cuda:
        model = model.cuda()

    for epoch_num in range(epochs):
        model.train()
        # train the model on train data
        for train_batch in tqdm(train_dataloader):
            train_label = train_batch['labels'].to(device)
            mask = train_batch['attention_mask'].to(device)
            input_id = train_batch['input_ids'].squeeze(1).to(device)

            outputs = model(input_id, mask, train_label, loss_fn_type = 'mse')

            loss = outputs.loss
            loss.backward()
            optimizer.step()

            optimizer.zero_grad()
        
        torch.save(model.state_dict(), f'{PATH_TO_SAVE}_{epoch_num}')


        model.eval()
        # evaluate the model on validation data
        eval_losses = []
        for val_batch in tqdm(val_dataloader):
            val_label = val_batch['labels'].to(device)
            mask = val_batch['attention_mask'].to(device)
            input_id = val_batch['input_ids'].squeeze(1).to(device)

            with torch.no_grad():
                outputs = model(input_id, mask, val_label, loss_fn_type = 'mse')
            
            model_pred = outputs.logits
            eval_losses.append(outputs.loss.item())

        print(f'loss = {mean(eval_losses)}')


              
model = XLMTRegressor(xlmt_model)
df_train, df_val, df_test = train_val_test_split(df)
train(model, df_train, df_val, LR, EPOCHS)

100%|██████████| 668/668 [14:34<00:00,  1.31s/it]
100%|██████████| 38/38 [00:34<00:00,  1.11it/s]


loss = 0.02782576773805838


100%|██████████| 668/668 [14:37<00:00,  1.31s/it]
100%|██████████| 38/38 [00:34<00:00,  1.10it/s]


loss = 0.028499379999151354


100%|██████████| 668/668 [14:37<00:00,  1.31s/it]
100%|██████████| 38/38 [00:34<00:00,  1.10it/s]


loss = 0.025270869514267696


100%|██████████| 668/668 [14:36<00:00,  1.31s/it]
100%|██████████| 38/38 [00:34<00:00,  1.10it/s]


loss = 0.023713568274519946


100%|██████████| 668/668 [14:37<00:00,  1.31s/it]
100%|██████████| 38/38 [00:34<00:00,  1.10it/s]


loss = 0.022539541480670635


100%|██████████| 668/668 [14:36<00:00,  1.31s/it]
100%|██████████| 38/38 [00:34<00:00,  1.10it/s]

loss = 0.022511504982647142





## evaluate

In [None]:
def evaluate(model, test_data):
    test = RegressionIntimacyDataset(test_data, xlmt_tokenizer)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    model.eval()
    # evaluate the model on test data
    with torch.no_grad():
        test_losses = []
        for test_batch in tqdm(test_dataloader):
            test_label = test_batch['labels'].to(device)
            mask = test_batch['attention_mask'].to(device)
            input_id = test_batch['input_ids'].squeeze(1).to(device)

            outputs = model(input_id, mask, test_label, loss_fn_type = 'mse')

            model_pred = outputs.logits
            test_losses.append(outputs.loss.item())
    
    print(f'loss = {mean(test_losses)}')
    
evaluate(model, df_test)

100%|██████████| 594/594 [00:36<00:00, 16.37it/s]

loss = 0.022738995032861923





## evaluation on test data codalab

In [None]:
def read_test_data(file_path = "semeval_test.csv"):
  df = pd.read_csv(file_path)
  return df

In [None]:
class TestRegressionIntimacyDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.all_data = df
        self.tokenizer = tokenizer
        
        # tokenized texts of our dataset
        self.encodings = {}
        self.encodings['input_ids'] = np.array([np.array(self.tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")['input_ids'])
                                for text in self.all_data['text']])
        
        self.encodings['attention_mask'] = np.array([np.array(self.tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")['attention_mask'])
                                for text in self.all_data['text']])
    
    def __len__(self):
        return len(self.all_data)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item


In [None]:
MODEL = "cardiffnlp/twitter-xlm-roberta-base"
xlmt_tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

In [None]:
xlmt_model = AutoModel.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this mode

In [None]:
model = XLMTRegressor(xlmt_model)
model.load_state_dict(torch.load(f'{PATH_FOR_READ}_{EPOCHS-1}'))

<All keys matched successfully>

In [None]:
def predict(model, test_data):
    # generating output of the mode for test dataset
    test = TestRegressionIntimacyDataset(test_data, xlmt_tokenizer)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=32)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    model_predictions = []
    model.eval()
    with torch.no_grad():
        for test_batch in tqdm(test_dataloader):
            mask = test_batch['attention_mask'].to(device)
            input_id = test_batch['input_ids'].squeeze(1).to(device)
            outputs = model(input_id, mask, loss_fn_type = 'mse')
            model_pred = outputs.logits
            model_predictions.append(model_pred)
    return model_predictions


In [None]:
# df_coda = read_test_data("train.csv")
df_coda = read_test_data("semeval_test.csv")    
outputs = predict(model, df_coda)

100%|██████████| 429/429 [06:36<00:00,  1.08it/s]


In [None]:
flat_predictions = [item.cpu().numpy()[0] for sublist in outputs for item in sublist]

In [None]:
min_flat = min(flat_predictions)
max_flat = max(flat_predictions)

In [None]:
print(len(flat_predictions))

13697


In [None]:
df_coda['predictions'] = flat_predictions
df_coda['predictions'] = df_coda['predictions'].apply(lambda x: (x - min_flat) * 4 / (max_flat - min_flat) + 1)
# df_coda['predictions'] = df_coda['predictions'].apply(lambda x: x/10)
df_coda['predictions'] = df_coda['predictions'].apply(lambda x: 1 if x < 1 else x)
df_coda['predictions'] = df_coda['predictions'].apply(lambda x: 5 if x > 5 else x)
df_coda.to_csv('results.csv')

In [None]:
df_coda['predictions'].argmax()

1887

In [None]:
df_coda['predictions'].min()

1.0

In [None]:
print(df_coda['predictions'].argmin())
print(df_coda['text'][14])
print(df_coda['language'][14])


13686
#1Ene 1942 se firma la declaración de la Naciones Unidas. #60AñosDeRevoluciónCubana #Feliz2019 http
Spanish


In [None]:
print(df_coda['text'][12021])
print(df_coda['language'][12021])

@user ily bestie💞
Dutch


In [None]:
# find subtraction
sub = df_coda['predictions'] - df_coda['label']
print(abs(sub).argmax())

KeyError: ignored

In [None]:
print(df_coda['text'][2439])
print(df_coda['language'][2439])