# Imports

In [None]:
!pip install transformers
!pip install emoji
!pip install sentencepiece
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0
Looking in indexes: https://pypi.org/simple, https://us

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import plotly.express as px
import torch
from transformers import BertTokenizer
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
# from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import emoji
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AutoModel, AutoConfig
from sklearn.metrics import classification_report
from transformers.modeling_outputs import TokenClassifierOutput, SequenceClassifierOutput
from transformers import AdamW, get_scheduler
from datasets import load_metric, Dataset
from statistics import mean

# Mounting Drive for Reading Data

In [None]:
# Mount Google Drive at /content/drive so the data files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd 'drive/MyDrive/NLP_Project'

/content/drive/MyDrive/NLP_Project


# utils

In [None]:
def read_data(file_path = "train.csv"):
    """Load the labelled tweet CSV and add helper columns.

    Reads ``file_path`` with pandas, stores the character count of every
    ``text`` entry in a new ``len`` column and casts ``label`` to float32.

    Returns:
        The resulting DataFrame.
    """
    frame = pd.read_csv(file_path)
    frame['len'] = frame['text'].apply(len)
    frame['label'] = frame['label'].astype('float32')
    return frame

def filter_tweet_language(df, language = "English"):
    """Return only the rows of ``df`` whose ``language`` column equals ``language``."""
    language_mask = df['language'] == language
    return df[language_mask]

def filter_tweet_intimacy(df, lower_bound = 1, upper_bound = 5):
    """Return the rows whose ``label`` lies in [lower_bound, upper_bound], bounds included."""
    scores = df['label']
    in_range = (scores >= lower_bound) & (scores <= upper_bound)
    return df.loc[in_range]

def train_val_test_split(df, train_portion = 0.8, val_portion = 0.1, test_portion = 0.1):
    """Shuffle ``df`` with a fixed seed and cut it into train/validation/test frames.

    Args:
        df: DataFrame to split.
        train_portion: Fraction of rows for the training frame.
        val_portion: Fraction of rows for the validation frame.
        test_portion: Fraction of rows for the test frame.

    Returns:
        A ``(df_train, df_val, df_test)`` tuple of disjoint slices of the
        shuffled frame. Rows beyond the three portions are dropped.

    Raises:
        ValueError: If a portion is negative or the portions sum to more than 1.
    """
    portions = (train_portion, val_portion, test_portion)
    if min(portions) < 0:
        raise ValueError(f"portions must be non-negative, got {portions}")
    if sum(portions) > 1 + 1e-9:
        raise ValueError(f"portions must sum to at most 1, got {portions}")

    shuffled = df.sample(frac=1, random_state=42)
    n_rows = len(shuffled)

    def _boundary(cumulative_portion):
        # Rounding before truncation removes float drift such as
        # (0.7 + 0.1) * 10 == 7.999999999999999, which would otherwise lose a row.
        return int(round(cumulative_portion * n_rows, 6))

    train_end = _boundary(train_portion)
    val_end = _boundary(train_portion + val_portion)
    test_end = _boundary(train_portion + val_portion + test_portion)

    # Positional slicing replaces np.split on a DataFrame, which relies on the
    # deprecated DataFrame.swapaxes code path.
    df_train = shuffled.iloc[:train_end]
    df_val = shuffled.iloc[train_end:val_end]
    df_test = shuffled.iloc[val_end:test_end]
    return df_train, df_val, df_test

def extract_emojis(df):
    """Collect every distinct emoji found in the ``text`` column of ``df``.

    Returns:
        A list of the unique emojis, in no particular order.
    """
    seen = set()
    for tweet in df['text']:
        # distinct_emoji_list yields an empty list for emoji-free text,
        # so updating unconditionally is equivalent to checking the length first.
        seen.update(emoji.distinct_emoji_list(tweet))
    return list(seen)

# HuggingFace Models

## hyperparameters

In [None]:
EPOCHS = 5  # number of training epochs passed to train()
BATCH_SIZE = 32  # mini-batch size of the train/validation DataLoaders
PATH_TO_SAVE = "./checkpoint"  # prefix of the per-epoch checkpoint files ("<prefix>_<epoch>")

## creating dataset

In [None]:
class RegressionIntimacyDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset of tokenized tweets with min-max scaled labels.

    The base class is spelled out as ``torch.utils.data.Dataset`` because the
    bare name ``Dataset`` is rebound by ``from datasets import ... Dataset`` in
    the import cell, which would make this class inherit from the Hugging Face
    dataset type instead of the PyTorch one.

    Args:
        df: DataFrame with a ``text`` column and a numeric ``label`` column
            (``get_max_sentence_len`` additionally reads a ``len`` column).
        tokenizer: Callable tokenizer applied to every text.
    """

    def __init__(self, df, tokenizer):
        self.all_data = df
        self.tokenizer = tokenizer

        # Every text is tokenized eagerly, padded/truncated to 512 tokens.
        # With return_tensors="pt" each field has a leading dimension of 1.
        self.texts = [
            self.tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
            for text in self.all_data['text']
        ]

        # Intimacy scores scaled to [0, 1].
        # NOTE(review): the scaler is fit on whatever frame is passed in, so each
        # split is scaled by its own min/max. The fitted scaler is kept on the
        # instance so predictions can be mapped back with inverse_transform.
        self.scaler = MinMaxScaler()
        self.labels = self.scaler.fit_transform(self.all_data['label'].to_numpy().reshape(-1, 1))

    def classes(self,):
        """Return the array of scaled labels."""
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at position ``idx``."""
        return self.texts[idx]

    def get_batch_labels(self, idx):
        """Return the scaled label(s) at position ``idx`` as a NumPy array."""
        return np.array(self.labels[idx])

    def get_max_sentence_len(self):
        """Return the largest value of the ``len`` column of the source frame."""
        return max(self.all_data['len'])

    # Backward-compatible alias for the original (misspelled) method name.
    get_max_setntence_len = get_max_sentence_len

    def __getitem__(self, idx):
        """Return ``(tokenizer_output, scaled_label)`` for position ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)




In [None]:
# Checkpoint identifier handed to the from_pretrained() loaders below.
MODEL = "cardiffnlp/twitter-xlm-roberta-base"

In [None]:
# Fast tokenizer belonging to the MODEL checkpoint; used to tokenize the tweet texts.
xlmt_tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)



Downloading (…)lve/main/config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

Downloading (…)ncepiece.bpe.model";:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
# Load the default training CSV, tokenize all of it and print the number of labels.
df = read_data()
intim_dataset = RegressionIntimacyDataset(df, xlmt_tokenizer)
print(len(intim_dataset.labels))

9491


In [None]:
df['label']

0       1.8
1       1.0
2       1.0
3       1.6
4       1.6
       ... 
9486    1.0
9487    2.0
9488    3.8
9489    1.8
9490    1.6
Name: label, Length: 9491, dtype: float32

## model definition

In [None]:
# Load the encoder from the same MODEL checkpoint as the tokenizer so the two cannot drift apart.
xlmt_model = AutoModel.from_pretrained(MODEL)

Downloading (…)lve/main/config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this mode

In [None]:
# Freeze every encoder parameter except the pooler's dense layer, and print
# the resulting requires_grad flag of each parameter.
trainable_names = ('pooler.dense.weight', 'pooler.dense.bias')
for name, param in xlmt_model.named_parameters():
    param.requires_grad = name in trainable_names
    print(name, param.requires_grad)

embeddings.word_embeddings.weight False
embeddings.position_embeddings.weight False
embeddings.token_type_embeddings.weight False
embeddings.LayerNorm.weight False
embeddings.LayerNorm.bias False
encoder.layer.0.attention.self.query.weight False
encoder.layer.0.attention.self.query.bias False
encoder.layer.0.attention.self.key.weight False
encoder.layer.0.attention.self.key.bias False
encoder.layer.0.attention.self.value.weight False
encoder.layer.0.attention.self.value.bias False
encoder.layer.0.attention.output.dense.weight False
encoder.layer.0.attention.output.dense.bias False
encoder.layer.0.attention.output.LayerNorm.weight False
encoder.layer.0.attention.output.LayerNorm.bias False
encoder.layer.0.intermediate.dense.weight False
encoder.layer.0.intermediate.dense.bias False
encoder.layer.0.output.dense.weight False
encoder.layer.0.output.dense.bias False
encoder.layer.0.output.LayerNorm.weight False
encoder.layer.0.output.LayerNorm.bias False
encoder.layer.1.attention.self.query

In [None]:
class XLMTRegressor(nn.Module):
    """Regression head on top of a pretrained transformer encoder.

    The encoder's pooled output goes through Linear -> ReLU -> Dropout -> Linear
    to produce one score per input sequence.

    Args:
        model: Encoder module whose output exposes ``pooler_output``,
            ``hidden_states`` and ``attentions``.
        hidden_count: Width of the intermediate linear layer.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, model, hidden_count = 20, dropout = 0.2):
        super().__init__()

        self.xlmt = model
        # Read the encoder width from its config when available; 768 (the value
        # previously hard-coded) remains the fallback.
        encoder_width = getattr(getattr(model, "config", None), "hidden_size", 768)
        self.linear = nn.Linear(encoder_width, hidden_count)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.regression = nn.Linear(hidden_count, 1)

    def forward(self, input_ids = None, attention_mask = None, labels = None):
        """Run the encoder and the regression head.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            labels: Optional regression targets, one value per sequence. When
                given, the mean squared error against the predictions is computed.

        Returns:
            SequenceClassifierOutput with ``logits`` of shape (batch, 1), the
            optional ``loss``, and the encoder's hidden states and attentions.
        """
        encoding_output = self.xlmt(input_ids = input_ids, attention_mask = attention_mask)
        # https://huggingface.co/docs/transformers/main_classes/output#transformers.modeling_outputs.BaseModelOutputWithPooling
        pooled = encoding_output.pooler_output
        final_output = self.regression(self.dropout(self.relu(self.linear(pooled))))

        loss = None
        if labels is not None:
            # Align targets with the predictions: a (batch,) target would otherwise
            # broadcast against (batch, 1), and a float64 target would raise a
            # dtype mismatch inside MSELoss.
            targets = labels.to(dtype=final_output.dtype).reshape(final_output.shape)
            loss = nn.MSELoss()(final_output, targets)

        return SequenceClassifierOutput(loss = loss, logits = final_output, hidden_states = encoding_output.hidden_states, attentions = encoding_output.attentions)



## train model

In [None]:
def train(model, train_data, val_data, learning_rate, epochs):
    """Fine-tune ``model`` on ``train_data`` and report the validation loss per epoch.

    Uses the notebook-level ``xlmt_tokenizer``, ``BATCH_SIZE`` and
    ``PATH_TO_SAVE``. After every epoch the model's state dict is saved to
    ``f'{PATH_TO_SAVE}_{epoch_num}'`` and the mean of the per-batch validation
    losses is printed.

    Args:
        model: Module called as ``model(input_ids, attention_mask, labels)``
            whose output has a ``loss`` attribute.
        train_data: DataFrame used to build the training dataset.
        val_data: DataFrame used to build the validation dataset.
        learning_rate: Learning rate of the AdamW optimizer.
        epochs: Number of passes over the training data.
    """
    # NOTE(review): each dataset fits its own label scaler, so train and
    # validation targets are scaled by their own min/max.
    train_set = RegressionIntimacyDataset(train_data, xlmt_tokenizer)
    val_set = RegressionIntimacyDataset(val_data, xlmt_tokenizer)

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size = BATCH_SIZE, shuffle = True)
    # Validation order does not need shuffling; a fixed order keeps the
    # per-batch losses reproducible.
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size = BATCH_SIZE, shuffle = False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
    # weight_decay are set to that implementation's defaults.
    optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate, eps = 1e-6, weight_decay = 0.0)

    for epoch_num in range(epochs):
        model.train()

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            # The tokenizer output has a singleton dimension per example; drop it
            # from the mask as well as the ids so both are (batch, seq_len).
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            outputs = model(input_id, mask, train_label)
            outputs.loss.backward()
            optimizer.step()

        torch.save(model.state_dict(), f'{PATH_TO_SAVE}_{epoch_num}')

        model.eval()
        eval_losses = []
        with torch.no_grad():
            for val_input, val_label in tqdm(val_dataloader):
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                outputs = model(input_id, mask, val_label)
                eval_losses.append(outputs.loss.item())

        if eval_losses:
            print(f'loss = {mean(eval_losses)}')
        else:
            # statistics.mean raises on an empty sequence.
            print('loss = nan (validation set is empty)')
                
# Run configuration: EPOCHS is also the value used later to pick the checkpoint file to load.
EPOCHS = 5
model = XLMTRegressor(xlmt_model)
LR = 1e-3

# Split the labelled data and fine-tune; a checkpoint is written after every epoch.
df_train, df_val, df_test = train_val_test_split(df)
train(model, df_train, df_val, LR, EPOCHS)

100%|██████████| 238/238 [04:17<00:00,  1.08s/it]
100%|██████████| 30/30 [00:28<00:00,  1.05it/s]


loss = 0.03679414385308822


100%|██████████| 238/238 [04:18<00:00,  1.09s/it]
100%|██████████| 30/30 [00:28<00:00,  1.05it/s]


loss = 0.03377731361736854


100%|██████████| 238/238 [04:18<00:00,  1.09s/it]
100%|██████████| 30/30 [00:28<00:00,  1.05it/s]


loss = 0.03337364811450243


100%|██████████| 238/238 [04:18<00:00,  1.09s/it]
100%|██████████| 30/30 [00:28<00:00,  1.05it/s]


loss = 0.03205319847911596


100%|██████████| 238/238 [04:18<00:00,  1.09s/it]
100%|██████████| 30/30 [00:28<00:00,  1.05it/s]

loss = 0.032118543547888595





## evaluate

In [None]:
def evaluate(model, test_data, batch_size = 2):
    """Print the mean per-batch loss of ``model`` on ``test_data``.

    Uses the notebook-level ``xlmt_tokenizer`` to build the dataset.

    Args:
        model: Module called as ``model(input_ids, attention_mask, labels)``
            whose output has a ``loss`` attribute.
        test_data: DataFrame used to build the evaluation dataset.
        batch_size: Evaluation batch size (defaults to the original value of 2).
    """
    test_set = RegressionIntimacyDataset(test_data, xlmt_tokenizer)
    test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    model.eval()
    test_losses = []
    with torch.no_grad():
        for test_input, test_label in tqdm(test_dataloader):
            test_label = test_label.to(device)
            # Drop the singleton tokenizer dimension from the mask as well as
            # the ids so both are (batch, seq_len).
            mask = test_input['attention_mask'].squeeze(1).to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            outputs = model(input_id, mask, test_label)
            test_losses.append(outputs.loss.item())

    if test_losses:
        print(f'loss = {mean(test_losses)}')
    else:
        # statistics.mean raises on an empty sequence.
        print('loss = nan (test set is empty)')
    
evaluate(model, df_test)

100%|██████████| 475/475 [00:29<00:00, 16.14it/s]


loss = 0.030576393889557373


## Evaluation on the CodaLab test data

In [None]:
# Prefix of the checkpoint file loaded below; same value as PATH_TO_SAVE used during training.
PATH_FOR_READ = './checkpoint'

In [None]:
def read_test_data(file_path = "semeval_test.csv"):
    """Read the CSV at ``file_path`` and return it as a DataFrame, unmodified."""
    return pd.read_csv(file_path)

In [None]:
class TestRegressionIntimacyDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset of tokenized texts without labels.

    The base class is spelled out as ``torch.utils.data.Dataset`` because the
    bare name ``Dataset`` is rebound by ``from datasets import ... Dataset`` in
    the import cell, which would make this class inherit from the Hugging Face
    dataset type instead of the PyTorch one.

    Args:
        df: DataFrame with a ``text`` column.
        tokenizer: Callable tokenizer applied to every text.
    """

    def __init__(self, df, tokenizer):
        self.all_data = df
        self.tokenizer = tokenizer

        # Every text is tokenized eagerly, padded/truncated to 512 tokens.
        # With return_tensors="pt" each field has a leading dimension of 1.
        self.texts = [
            self.tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
            for text in self.all_data['text']
        ]

    def __len__(self):
        return len(self.all_data)

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at position ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the tokenizer output for position ``idx``."""
        return self.get_batch_texts(idx)


In [None]:
# Same checkpoint id and tokenizer as defined earlier in the notebook; repeated here,
# presumably so this section can be run on its own — TODO confirm.
MODEL = "cardiffnlp/twitter-xlm-roberta-base"
xlmt_tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

Downloading (…)ncepiece.bpe.model";:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
# Load the encoder from the same MODEL checkpoint as the tokenizer so the two cannot drift apart.
xlmt_model = AutoModel.from_pretrained(MODEL)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this mode

In [None]:
# for name, param in xlmt_model.named_parameters():
#     param.requires_grad = False

In [None]:
model = XLMTRegressor(xlmt_model)
# map_location lets a checkpoint written on a GPU runtime load on a CPU-only runtime too.
checkpoint_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.load_state_dict(torch.load(f'{PATH_FOR_READ}_{EPOCHS-1}', map_location=checkpoint_device))

<All keys matched successfully>

In [None]:
def predict(model, test_data, batch_size = 32):
    """Run ``model`` over ``test_data`` and collect its raw predictions.

    Uses the notebook-level ``xlmt_tokenizer`` to build the dataset.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` whose
            output has a ``logits`` attribute.
        test_data: DataFrame with a ``text`` column.
        batch_size: Inference batch size (defaults to the original value of 32).

    Returns:
        A list with one CPU tensor of logits per batch, in input order.
    """
    test_set = TestRegressionIntimacyDataset(test_data, xlmt_tokenizer)
    test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    model_predictions = []
    model.eval()
    with torch.no_grad():
        for test_input in tqdm(test_dataloader):
            # Drop the singleton tokenizer dimension from the mask as well as
            # the ids so both are (batch, seq_len).
            mask = test_input['attention_mask'].squeeze(1).to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)
            outputs = model(input_id, mask)
            # Move each batch off the accelerator so predictions for a large
            # file do not accumulate in GPU memory.
            model_predictions.append(outputs.logits.cpu())
    return model_predictions


In [None]:
# NOTE(review): predictions are currently produced for train.csv; the semeval_test.csv
# call is commented out. The later error-analysis cells read df_coda['label'],
# which presumably only exists in train.csv — confirm before switching files.
df_coda = read_test_data("train.csv")    
# df_coda = read_test_data("semeval_test.csv")    
outputs = predict(model, df_coda)

100%|██████████| 297/297 [04:42<00:00,  1.05it/s]


In [None]:
# Flatten the list of per-batch prediction tensors into one scalar per input row
# (each row of a batch holds a single value, taken with [0]).
flat_predictions = [item.cpu().numpy()[0] for sublist in outputs for item in sublist]

In [None]:
# Smallest and largest raw prediction; used below to rescale predictions to [1, 5].
min_flat = min(flat_predictions)
max_flat = max(flat_predictions)

In [None]:
print(len(flat_predictions))

9491


In [None]:
# Attach the raw predictions, then stretch them linearly so the smallest prediction
# maps to 1 and the largest to 5.
# NOTE(review): this uses the predictions' own min/max, not the label scaler fitted
# during training, and divides by zero if all predictions are equal.
df_coda['predictions'] = flat_predictions
df_coda['predictions'] = df_coda['predictions'].apply(lambda x: (x - min_flat) * 4 / (max_flat - min_flat) + 1)
df_coda.to_csv('results.csv')

In [None]:
df_coda['predictions'].argmax()

12021

In [None]:
df_coda['predictions'].min()

3.261665493249893

In [None]:
# Look the row up by the computed position instead of a hard-coded index, so the
# cell stays correct when the predictions or the input file change.
lowest_idx = df_coda['predictions'].argmin()
print(lowest_idx)
print(df_coda['text'].iloc[lowest_idx])
print(df_coda['language'].iloc[lowest_idx])


14
#1Ene 1942 se firma la declaración de la Naciones Unidas. #60AñosDeRevoluciónCubana #Feliz2019 http
Spanish


In [None]:
# Use the computed position of the highest prediction; a hard-coded index can be
# out of range for the file that is currently loaded.
highest_idx = df_coda['predictions'].argmax()
print(df_coda['text'].iloc[highest_idx])
print(df_coda['language'].iloc[highest_idx])

@user ily bestie💞
Dutch


In [None]:
# Signed error per row (prediction minus the label column); print the position
# of the row with the largest absolute error.
sub = df_coda['predictions'] - df_coda['label']
print(abs(sub).argmax())

2439


In [None]:
# Recompute the position of the largest absolute error instead of hard-coding it,
# so the cell stays correct when the predictions change.
worst_idx = (df_coda['predictions'] - df_coda['label']).abs().argmax()
print(df_coda['text'].iloc[worst_idx])
print(df_coda['language'].iloc[worst_idx])

Que empiece el fin de cojedera. Empezamos con un gang Bang. Activos un paso al frente
Spanish
