In [None]:
# pip install pyarabic

In [None]:
# pip install transformers

In [57]:
import gc
import math as mt
import random
import re

import numpy as np
import pandas as pd
import pyarabic.araby as araby
import scipy as sc
import torch
import torch.nn as nn
from IPython.display import clear_output
from scipy import cluster as clst
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, random_split
from tqdm import tqdm
from transformers import AdamW
from transformers import AutoTokenizer, AutoModel

In [4]:
# Both splits are tab-separated and are read with explicitly supplied column
# names, so the first line of each file is treated as data.
column_names = ['index', 'tweet', 'offensive', 'hatespeech',
                'vulgar', 'violent']

train = pd.read_csv('/content/OSACT2022-sharedTask-train.txt',
                    sep='\t', names = column_names)
dev = pd.read_csv('/content/OSACT2022-sharedTask-dev.txt',
                  sep='\t', names = column_names)

In [5]:
# Hub identifier shared by the tokenizer and the encoder weights.
marbert_checkpoint = 'UBC-NLP/MARBERT'

tokenizer = AutoTokenizer.from_pretrained(marbert_checkpoint)
# output_hidden_states=True asks the model to also return per-layer hidden states.
marbert = AutoModel.from_pretrained(marbert_checkpoint,
                                    output_hidden_states= True)

Downloading:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/701 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624M [00:00<?, ?B/s]

Some weights of the model checkpoint at UBC-NLP/MARBERT were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Cleaning, Tokenization, Encoding

In [6]:
def clean_data(tweet):
    """Normalise one raw tweet string before tokenization.

    Steps, in order:
      1. strip tashkeel and tatweel with pyarabic;
      2. remove the literal placeholders '@USER' and 'URL';
      3. drop characters that cannot be encoded as UTF-8;
      4. replace every run of uppercase ASCII letters with a single space;
      5. remove the angle brackets '<' and '>'.

    Args:
        tweet: the raw tweet text (str).

    Returns:
        The cleaned tweet text (str).
    """
    text = araby.strip_tashkeel(tweet)
    text = araby.strip_tatweel(text)
    clean_tweet = text.replace('@USER', '').replace('URL', '')
    # The 'ignore' handler has to be on the *encode* side: a str only fails to
    # round-trip through UTF-8 when it holds unencodable code points (e.g. lone
    # surrogates). A strict encode would raise on those instead of dropping them.
    clean_tweet = clean_tweet.encode('utf-8', 'ignore').decode('utf-8')
    clean_tweet = re.sub(r'[A-Z]+',' ', clean_tweet)
    return clean_tweet.replace('>', '').replace('<', '')

In [7]:
def get_max_len(tweets, tokenizer= tokenizer):
    """Return the largest token count among `tweets`.

    Each tweet is encoded with special tokens included; the result is 0 when
    `tweets` is empty.
    """
    token_counts = (
        len(tokenizer.encode(tweet, add_special_tokens = True))
        for tweet in tweets
    )
    return max(token_counts, default = 0)

In [8]:
def encode(tweets, max_len, tokenizer = tokenizer):
    """Tokenize `tweets` into fixed-length id and attention-mask tensors.

    Every tweet is padded to `max_len` tokens (special tokens included) and
    truncated if it is longer.

    Args:
        tweets: iterable of tweet strings.
        max_len: target sequence length.
        tokenizer: Hugging Face tokenizer used for the encoding.

    Returns:
        (input_tensor, attention_mask_tensor), each with one row per tweet and
        `max_len` columns.
    """
    # One batched call replaces the per-tweet encode_plus + torch.cat loop.
    # padding='max_length' is the supported spelling of the deprecated
    # pad_to_max_length=True, and truncation=True makes the previously implicit
    # (warning-emitting) truncation explicit.
    encodings = tokenizer(
        list(tweets),
        add_special_tokens = True,
        max_length = max_len,
        padding = 'max_length',
        truncation = True,
        return_tensors = 'pt'
    )

    input_tensor = encodings['input_ids']
    attention_mask_tensor = encodings['attention_mask']

    return input_tensor, attention_mask_tensor



In [96]:
def create_dataloaders(input_ids, batch_size,
                       attention_masks = None,
                      labels = None):
    """Wrap tensors in a sequential (non-shuffled) DataLoader.

    Each batch is a pair ``(inputs, companion)``. The companion tensor is
    `labels` when labels are given, otherwise `attention_masks`. When both are
    given, `attention_masks` is ignored.

    Note that `batch_size` is the second positional parameter, so masks and
    labels should be passed by keyword.

    Args:
        input_ids: tensor whose first dimension indexes the examples.
        batch_size: number of examples per batch.
        attention_masks: optional tensor aligned with `input_ids`.
        labels: optional tensor aligned with `input_ids`.

    Returns:
        A torch DataLoader over the paired tensors, with shuffle disabled.

    Raises:
        ValueError: if neither `attention_masks` nor `labels` is provided.
    """
    # Identity checks: `== None` on a tensor goes through tensor comparison.
    if labels is not None:
        companion = labels
    elif attention_masks is not None:
        companion = attention_masks
    else:
        raise ValueError(
            'create_dataloaders needs attention_masks or labels to pair with input_ids'
        )

    tensor_dataset = TensorDataset(input_ids, companion)

    dataloader = DataLoader(
        tensor_dataset,
        shuffle = False,
        batch_size = batch_size
    )

    return dataloader

In [10]:
# String tag -> integer class id, used by create_labels.
labels_dict = {'OFF': 1, 'NOT_OFF': 0}

In [11]:
def create_labels(hatespeech_labels, labels_dict = labels_dict):
    """Map string tags to a 1-D integer (long) tensor of class ids.

    Every tag is looked up in `labels_dict`; an unknown tag raises KeyError.
    """
    encoded = [labels_dict[tag] for tag in hatespeech_labels]
    return torch.Tensor(encoded).long()

In [12]:
## bad tweet
# Row 7394 is excluded from the training split. errors='ignore' plus
# re-assignment (instead of an in-place drop) keeps this cell safe to re-run:
# once the row is gone, a strict drop would raise KeyError.
train = train.drop(index = 7394, errors = 'ignore')

In [13]:
# Clean the tweet column of both splits.
for split in (train, dev):
    split['tweet'] = split['tweet'].apply(clean_data)

# Plain Python lists of cleaned tweets, one per split.
train_tweets, dev_tweets = (list(split['tweet'].values) for split in (train, dev))

# Longest tokenized length per split; used as the padding length when encoding.
train_max_len = get_max_len(train_tweets)
dev_max_len = get_max_len(dev_tweets)

In [14]:
# Raw string tags from the `offensive` column of each split ...
off_labels_train, off_labels_dev = train.offensive.values, dev.offensive.values

# ... converted to integer class-id tensors.
labels_train = create_labels(off_labels_train)
labels_dev = create_labels(off_labels_dev)

In [15]:
# Each split is encoded from its own tweets and padded to its own maximum length.
train_input, train_mask = encode(train_tweets, train_max_len)
dev_input, dev_mask = encode(dev_tweets, dev_max_len)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


### Evaluating MARBERT's Isotropy

In [78]:
def extract_features(dataloader, sequence_length, max_batches = 2):
  """Run MARBERT over a dataloader and collect token-level representations.

  Each batch must yield ``(input_ids, attention_mask)``. The first output of
  `marbert` (one vector per token position) is flattened so that the result
  has one row per token position, padding positions included.

  Args:
    dataloader: iterable of ``(input_ids, attention_mask)`` batches.
    sequence_length: padded length of every sequence in the loader. A value
      that is incompatible with the model output makes the reshape raise.
    max_batches: number of leading batches to embed. The default of 2 keeps
      the notebook's historical behaviour of embedding only the first two
      batches. Pass None to embed the whole loader.

  Returns:
    A float array of shape ``(num_tokens, hidden_size)``.
  """
  hidden_size = marbert.config.hidden_size
  features = []

  for batch_index, batch in enumerate(tqdm(dataloader)):
    if max_batches is not None and batch_index >= max_batches:
      break

    input_ids, attention_mask = batch[0], batch[1]
    with torch.no_grad():
      output = marbert(input_ids, attention_mask = attention_mask)[0]

    # The intermediate 3-D reshape validates `sequence_length`; the rows are
    # then flattened to one vector per token.
    output = output.cpu().numpy().reshape((-1, sequence_length, hidden_size))
    features.append(output.reshape((-1, hidden_size)))

  if not features:
    return np.empty((0, hidden_size), dtype = np.float32)

  return np.concatenate(features, axis = 0)

In [19]:
## code cell originally by Sara Rajaee source:
# https://github.com/Sara-Rajaee/clusterbased_isotropy_enhancement/

def isotropy(representations):
    """Estimate the isotropy of a set of row-vector representations.

    For every eigenvector v of X^T X (X = `representations`), the value
    F(v) = sum(exp(X v)) is computed. The returned score is
    min F(v) / max F(v).

    NOTE(review): X^T X is symmetric, so np.linalg.eigh may be the safer
    decomposition here. Confirm before switching, since the eigenvector sign
    convention affects F(v).
    """
    reps = np.asarray(representations)
    _, directions = np.linalg.eig(np.transpose(reps) @ reps)

    smallest, largest = mt.inf, -mt.inf
    for column in range(directions.shape[1]):
        projections = reps @ directions[:, column][:, np.newaxis]
        partition = np.exp(projections).sum()

        smallest = min(smallest, partition)
        largest = max(largest, partition)

    return smallest / largest

In [None]:
# batch_size is the second positional parameter of create_dataloaders, so the
# attention mask has to be passed by keyword.
train_dataloader = create_dataloaders(train_input, batch_size = 1,
                                      attention_masks = train_mask)
train_features = extract_features(train_dataloader, train_max_len)


In [None]:
# Isotropy score of the extracted MARBERT token representations.
original_isotropy_value = isotropy(representations = train_features)
print(original_isotropy_value)

### Make Representations Isotropic

In [56]:
def cluster_based(representations, n_cluster: int, n_pc: int):
  """Centre representations per cluster and remove each cluster's top PCs.

  The rows are clustered with k-means (`scipy.cluster.vq.kmeans2`, random
  'points' initialisation, so results vary between runs). Within every
  cluster the mean is subtracted, a PCA is fitted on the centred rows, and
  the projection onto the first `n_pc` principal components is removed.

  Args:
    representations: 2-D array, one row per vector.
    n_cluster: number of k-means clusters requested.
    n_pc: number of leading principal components to remove per cluster. If a
      cluster yields fewer components, all of them are removed.

  Returns:
    A float64 array with the same shape as `representations`, rows in the
    original order.
  """
  representations = np.asarray(representations)

  _, label = clst.vq.kmeans2(representations, n_cluster, minit='points',
                             missing='warn', check_finite=True)

  pca = PCA()
  post_rep = np.zeros(representations.shape)

  for cluster_id in range(n_cluster):
    member_rows = np.nonzero(label == cluster_id)[0]
    # k-means may leave a cluster empty (missing='warn'); nothing to process.
    if member_rows.size == 0:
      continue

    members = representations[member_rows]
    # Accumulate the mean in float64 regardless of the input dtype.
    centred = members - members.mean(axis = 0, dtype = np.float64)

    # A single-member cluster is all zeros after centring; no PCA is needed.
    if member_rows.size < 2:
      post_rep[member_rows] = centred
      continue

    pca.fit(centred)
    top_components = pca.components_[:n_pc]

    # Subtract the projection onto the leading components for all rows at once.
    post_rep[member_rows] = centred - (centred @ top_components.T) @ top_components

  # Clears the cell output produced so far (e.g. k-means warnings).
  clear_output()

  return post_rep


In [None]:
# batch_size is the second positional parameter of create_dataloaders, so the
# attention mask has to be passed by keyword. The dev inputs were padded to
# dev_max_len, so that is the sequence length to extract with.
dev_dataloader = create_dataloaders(dev_input, batch_size = 1,
                                    attention_masks = dev_mask)
dev_features = extract_features(dev_dataloader, dev_max_len)

In [91]:
# Same clustering / PCA settings for both splits.
cluster_settings = dict(n_cluster = 10, n_pc = 8)

iso_train_rep = list(cluster_based(train_features, **cluster_settings))

iso_dev_rep = list(cluster_based(np.asarray(dev_features), **cluster_settings))

### Hate Speech Classification

#### Training, Validation and Testing functions

In [None]:
class lstm(nn.Module):
  """Bidirectional multi-layer LSTM followed by a linear classifier.

  Args:
    input_size: size of each input vector.
    hidden_size: hidden size of each LSTM direction.
    num_layers: number of stacked LSTM layers.
    classes: number of output classes.
  """
  def __init__(self, input_size, hidden_size, num_layers,
               classes):
    super(lstm, self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    self.lstm = nn.LSTM(input_size,
                        hidden_size,
                        num_layers,
                        batch_first = True,
                        bidirectional = True)

    # Both directions are concatenated, hence hidden_size * 2 input features.
    self.fc = nn.Linear(hidden_size *2, classes)

  def forward(self, x):
    """Classify a batch of sequences.

    Args:
      x: tensor of shape (batch, seq_len, input_size).

    Returns:
      Tensor of shape (batch, classes) with unnormalised class scores.
    """
    # nn.LSTM starts from zero hidden/cell states on the input's device when
    # none are supplied, so no external `device` variable is required.
    _, (final_hidden, _) = self.lstm(x)

    # final_hidden is (num_layers * 2, batch, hidden_size). The last two
    # entries are the top layer's final forward and backward states, i.e.
    # each direction's summary of the whole sequence. This also works for
    # sequences of length 1.
    summary = torch.cat((final_hidden[-2], final_hidden[-1]), dim = 1)

    return self.fc(summary)



In [None]:
def validate(dataloader):
    """Score the notebook-level `model` on a labelled dataloader.

    Each batch must yield ``(embeddings, labels)``. The model is switched to
    eval mode and run without gradient tracking; its previous train/eval mode
    is restored before returning.

    Args:
        dataloader: iterable of ``(embeddings, labels)`` batches.

    Returns:
        dict with 'f1' (macro-averaged), 'accuracy', 'recall' and 'precision'
        (the last two use scikit-learn's default averaging).
    """
    preds = []
    target = []

    # Run on whatever device the model lives on instead of relying on a
    # separate global `device` variable.
    model_device = next(model.parameters()).device
    was_training = model.training
    model.eval()

    loop = tqdm(dataloader,leave = True)
    with torch.no_grad():
        for batch in loop:
            embeddings = batch[0].to(model_device)
            labels = batch[1]

            scores = model(embeddings)

            predictions = torch.argmax(scores.float(), dim=1).to('cpu').flatten()
            # Plain Python ints keep the metric inputs simple for scikit-learn.
            preds.extend(predictions.tolist())
            target.extend(labels.flatten().tolist())

    model.train(was_training)

    f1 = f1_score(target, preds, average='macro')
    acc = accuracy_score(target, preds)
    recall = recall_score(target, preds)
    precision = precision_score(target, preds)

    results = {
        'f1': f1,
        'accuracy': acc,
        'recall': recall,
        'precision': precision,
    }

    return results

In [None]:
def train(train_dataloader,dev_dataloader, num_epochs, seed_val,
          optimizer = None, model = None, criterion = None):
  """Train `model` and validate after every epoch.

  Args:
    train_dataloader: iterable of ``(embeddings, labels)`` training batches.
    dev_dataloader: dataloader handed to `validate` after each epoch.
    num_epochs: number of passes over `train_dataloader`.
    seed_val: seed for Python's `random`; the torch seed is drawn from it.
    optimizer, model, criterion: objects to train with. When left as None,
      the notebook-level variables of the same name are looked up at call
      time, so this cell can be defined before they exist.

  Returns:
    List with one `validate` result dict per epoch.

  Note:
    `validate` always scores the notebook-level `model`.
  """
  def _resolve(value, name):
    # Late-bound fallback to the notebook variable of the same name.
    if value is not None:
      return value
    try:
      return globals()[name]
    except KeyError:
      raise NameError(f"train() needs `{name}`: pass it explicitly or define it first") from None

  optimizer = _resolve(optimizer, 'optimizer')
  model = _resolve(model, 'model')
  criterion = _resolve(criterion, 'criterion')

  random.seed(seed_val)
  seed = random.randint(1, seed_val)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

  # Batches are moved to wherever the model's parameters live.
  model_device = next(model.parameters()).device

  validation_results = []
  best_f1 = 0

  for epoch in range(num_epochs):
      model.train()
      loop = tqdm(train_dataloader, leave = True)
      for batch in loop:
        optimizer.zero_grad()
        embeddings = batch[0].to(model_device)
        labels = batch[1].to(model_device)

        scores = model(embeddings)
        loss = criterion(scores, labels)

        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss = loss.item())

      validation_scores = validate(dev_dataloader)
      # Keep every epoch's metrics so the caller receives the full history.
      validation_results.append(validation_scores)
      best_f1 = max(best_f1, validation_scores['f1'])
      print('current f1 score for Epoch ' + str(epoch) + ' is: ' +  str(validation_scores['f1']))
      print('Best f1 score: ', str(best_f1))

  return validation_results
      


#### Classification

In [None]:
def _as_sequence_batch(token_reps, seq_len):
    """Stack per-token vectors into a float32 (tweets, seq_len, hidden) tensor."""
    token_matrix = np.asarray(token_reps, dtype = np.float32)
    return torch.from_numpy(token_matrix).reshape(-1, seq_len, token_matrix.shape[-1])


# The isotropy step yields one vector per token, while the LSTM expects one
# (seq_len, hidden) sequence per tweet, so the token rows are regrouped using
# each split's padding length.
iso_train_tensor = _as_sequence_batch(iso_train_rep, train_max_len)
iso_dev_tensor = _as_sequence_batch(iso_dev_rep, dev_max_len)

# Features are extracted in dataset order (shuffle = False) and may cover only
# a leading part of a split, so the labels are cut to the same leading tweets.
iso_train_dataloader = create_dataloaders(iso_train_tensor, batch_size = 32,
                                          labels = labels_train[:len(iso_train_tensor)])
iso_dev_dataloader = create_dataloaders(input_ids = iso_dev_tensor, batch_size = 32,
                                        labels = labels_dev[:len(iso_dev_tensor)])

In [None]:
# Classifier configuration: 768-dimensional inputs, two stacked layers, two classes.
# NOTE(review): hidden_size is tied to train_max_len - confirm this is intentional.
lstm_settings = dict(input_size = 768,
                     hidden_size = train_max_len,
                     num_layers = 2,
                     classes = 2)
model = lstm(**lstm_settings)

In [None]:
# Training hyper-parameters.
num_epochs = 10
learning_rate = 3e-5

# Loss and optimizer for the LSTM classifier defined above.
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(),
                  lr = learning_rate)

In [None]:
# train() requires num_epochs and seed_val; the seed is an arbitrary fixed
# value so that runs are repeatable.
seed_val = 42
dev_results = train(iso_train_dataloader, iso_dev_dataloader,
                    num_epochs = num_epochs, seed_val = seed_val,
                    optimizer = optimizer, model = model, criterion = criterion)