In [129]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import pandas as pd
import pyarabic.araby as araby
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import re
import torch.nn as nn
from transformers import AdamW
from tqdm import tqdm
import random
import gc
from IPython.display import clear_output

In [None]:
# Load the OSACT 2022 shared-task splits. Both files are tab-separated and the
# column names are supplied explicitly through `names`.
train = pd.read_csv(
    '/content/OSACT2022-sharedTask-train.txt',
    sep='\t',
    names=['index', 'tweet', 'offensive', 'hatespeech', 'vulgar', 'violent'],
)
dev = pd.read_csv(
    '/content/OSACT2022-sharedTask-dev.txt',
    sep='\t',
    names=['index', 'tweet', 'offensive', 'hatespeech', 'vulgar', 'violent'],
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained('UBC-NLP/MARBERT')
# The encoder has to be loaded as well: `extract_features` calls the
# module-level `marbert` and raises NameError when it is not defined.
marbert = AutoModel.from_pretrained('UBC-NLP/MARBERT', output_hidden_states= True)

### Cleaning, Tokenization, Encoding

In [None]:
def clean_data(tweet):
    """Normalise one raw tweet before tokenization.

    Applies, in order: pyarabic's `strip_tashkeel` and `strip_tatweel`, removal
    of the '@USER' and 'URL' placeholders, a UTF-8 encode/decode round trip,
    replacement of every run of upper-case ASCII letters with a single space,
    and removal of '>' and '<'.
    """
    cleaned = araby.strip_tatweel(araby.strip_tashkeel(tweet))
    for placeholder in ('@USER', 'URL'):
        cleaned = cleaned.replace(placeholder, '')
    cleaned = cleaned.encode('utf-8').decode('utf-8', 'ignore')
    cleaned = re.sub(r'[A-Z]+', ' ', cleaned)
    return cleaned.replace('>', '').replace('<', '')

In [None]:
def get_max_len(tweets, tokenizer= tokenizer):
    """Return the largest token count over `tweets`, special tokens included.

    Returns 0 when `tweets` is empty.
    """
    return max(
        (len(tokenizer.encode(text, add_special_tokens = True)) for text in tweets),
        default = 0,
    )

In [None]:
def encode(tweets, max_len, tokenizer = tokenizer):
    """Tokenize `tweets` into fixed-length id and attention-mask tensors.

    Args:
        tweets: iterable of tweet strings.
        max_len: length every sequence is padded or truncated to.
        tokenizer: Hugging Face tokenizer; defaults to the module-level one.

    Returns:
        (input_tensor, attention_mask_tensor), each with one row per tweet and
        `max_len` columns.
    """
    # `padding='max_length'` is the current spelling of the deprecated
    # `pad_to_max_length=True`. `truncation=True` guarantees every row has
    # exactly `max_len` tokens; without it a tweet longer than `max_len`
    # produces a longer row and the rows can no longer form one tensor.
    encodings = tokenizer(
        list(tweets),
        add_special_tokens = True,
        max_length = max_len,
        padding = 'max_length',
        truncation = True,
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    return encodings['input_ids'], encodings['attention_mask']



In [None]:
def create_dataloaders(input_ids, attention_masks,
                       batch_size, labels = None, shuffle = False):
    """Wrap the encoded tensors in a `DataLoader`.

    Args:
        input_ids: tensor of token ids, one row per example.
        attention_masks: tensor of attention masks, row-aligned with `input_ids`.
        batch_size: number of examples per batch.
        labels: optional label tensor; when given, each batch is
            (input_ids, attention_masks, labels), otherwise
            (input_ids, attention_masks).
        shuffle: whether to reshuffle the data every epoch. Defaults to False,
            which keeps the dataset order.

    Returns:
        A `torch.utils.data.DataLoader` over the tensors.
    """
    tensors = [input_ids, attention_masks]
    # Identity check: `labels == None` on a tensor is an element-wise style
    # comparison, not a presence test.
    if labels is not None:
        tensors.append(labels)

    return DataLoader(
        TensorDataset(*tensors),
        shuffle = shuffle,
        batch_size = batch_size
    )

In [None]:
# Integer class id for each string tag accepted by `create_labels`.
labels_dict = dict(OFF = 1, NOT_OFF = 0)

In [None]:
def create_labels(hatespeech_labels, labels_dict = labels_dict):
    """Convert string labels into a 1-D int64 tensor of class ids.

    Every label is looked up in `labels_dict`; a label that is not a key of
    the mapping raises KeyError.
    """
    encoded = [labels_dict[tag] for tag in hatespeech_labels]
    return torch.Tensor(encoded).long()

In [None]:
# Exclude the training row with index label 7394.
# NOTE(review): the reason this particular row is dropped is not recorded — confirm.
# Reassigning (rather than `inplace=True`) together with `errors='ignore'`
# keeps the cell idempotent: re-running it no longer raises KeyError.
train = train.drop(index = 7394, errors = 'ignore')

In [None]:
# Run every tweet of both splits through `clean_data`.
train['tweet'] = train['tweet'].apply(clean_data)
dev['tweet'] = dev['tweet'].apply(clean_data)

# Plain Python lists of the cleaned tweets.
train_tweets = train['tweet'].tolist()
dev_tweets = dev['tweet'].tolist()

# Longest tokenized length found in each split.
train_max_len = get_max_len(train_tweets)
dev_max_len = get_max_len(dev_tweets)

In [None]:
# String tags from the `offensive` column of each split.
off_labels_train = train['offensive'].values
off_labels_dev = dev['offensive'].values

# Integer class-id tensors produced by `create_labels`.
labels_train, labels_dev = (
    create_labels(tags) for tags in (off_labels_train, off_labels_dev)
)

In [None]:
train_input, train_mask = encode(train_tweets, train_max_len)
# The dev tensors must be built from the dev tweets so that they line up
# row-for-row with `labels_dev`.
dev_input, dev_mask = encode(dev_tweets, dev_max_len)


In [None]:
# Pair each split's encoded ids and masks (from `encode`) with its own labels.
train_dataloader = create_dataloaders(train_input, train_mask,
                       labels = labels_train, batch_size=32)
dev_dataloader = create_dataloaders(dev_input, dev_mask,
                       labels = labels_dev, batch_size =32)

### Training, Validation and Testing functions

In [None]:
class lstm(nn.Module):
  """Bidirectional LSTM followed by a linear layer that produces class logits.

  Args:
    input_size: size of the feature vector at each timestep.
    hidden_size: LSTM hidden size per direction.
    num_layers: number of stacked LSTM layers.
    classes: number of output logits.
  """

  def __init__(self, input_size, hidden_size, num_layers,
               classes):
    super().__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    self.lstm = nn.LSTM(input_size,
                        hidden_size,
                        num_layers,
                        batch_first = True,
                        bidirectional = True)

    # Both directions are concatenated, hence twice the hidden size.
    self.fc = nn.Linear(hidden_size *2, classes)

  def forward(self, x):
    """Return logits of shape (batch, classes) for `x` of shape (batch, seq, input_size)."""
    # Build the zero initial states from `x` itself so they always share its
    # device and dtype, instead of relying on a global `device` variable.
    state_shape = (self.num_layers * 2, x.size(0), self.hidden_size)
    h0 = x.new_zeros(state_shape)
    c0 = x.new_zeros(state_shape)

    output, _ = self.lstm(x, (h0, c0))
    # NOTE(review): the classifier reads timestep index 1, so sequences need at
    # least two timesteps; confirm index 1 (rather than 0 or -1) is intended.
    output = self.fc(output[:, 1, :])

    return output



In [130]:
# Classifier instance: 768-dimensional inputs, two stacked bidirectional
# layers, two output classes. The hidden size is set to `train_max_len`.
model = lstm(
    input_size=768,
    hidden_size=train_max_len,
    num_layers=2,
    classes=2,
)

In [131]:
# Training configuration: epoch count, step size, loss and optimizer.
num_epochs = 10
learning_rate = 3e-5
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(
    model.parameters(),
    lr=learning_rate,
)



In [None]:
def validate(dataloader):
    """Score the module-level `model` on `dataloader`.

    Each batch is read as `batch[0]` (model inputs) and `batch[1]` (labels).
    NOTE(review): loaders built with three tensors (ids, masks, labels) carry
    the labels at index 2 — confirm the loader passed here matches.

    Gradients are disabled and the model is put in eval mode for the duration
    of the call; its previous train/eval mode is restored afterwards.

    Returns:
        dict with keys 'f1' (macro average), 'accuracy', 'recall', 'precision'.
    """
    # NOTE(review): `f1_score`, `accuracy_score`, `recall_score` and
    # `precision_score` must be in scope — presumably from `sklearn.metrics`;
    # confirm they are imported before this function is called.

    # Run on whatever device the model lives on rather than a global `device`.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    preds = []
    target = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in tqdm(dataloader, leave = True):
                embeddings = batch[0].to(device)
                labels = batch[1]

                scores = model(embeddings)

                predictions = torch.argmax(scores.float(), dim=1).to('cpu').flatten()
                # Plain Python ints keep the metric inputs free of 0-d tensors.
                preds.extend(predictions.tolist())
                target.extend(labels.flatten().tolist())
    finally:
        model.train(was_training)

    # The dict no longer contains a 'results' entry: it referred to `results`
    # before that name was assigned, which raised UnboundLocalError.
    results = {
        'f1': f1_score(target, preds, average='macro'),
        'accuracy': accuracy_score(target, preds),
        'recall': recall_score(target, preds),
        'precision': precision_score(target, preds),
    }

    return results

In [None]:
def train(train_dataloader,dev_dataloader, num_epochs, seed_val,
          optimizer = optimizer, model = model, criterion = criterion):
  """Train `model` for `num_epochs` epochs, validating after each one.

  Each training batch is read as `batch[0]` (model inputs) and `batch[1]`
  (labels).
  NOTE(review): loaders built with three tensors (ids, masks, labels) carry
  the labels at index 2 — confirm the loader passed here matches.
  NOTE(review): `validate` scores the module-level `model`, not necessarily
  the `model` argument — confirm they are the same object.
  NOTE(review): defining this function rebinds the notebook-level name
  `train`, which earlier cells use for the training DataFrame.

  Args:
    train_dataloader: batches used for the optimisation steps.
    dev_dataloader: batches passed to `validate` after every epoch.
    num_epochs: number of passes over `train_dataloader`.
    seed_val: seeds Python's `random`; a value drawn from [1, seed_val] then
      seeds torch (CPU and all CUDA devices).
    optimizer, model, criterion: default to the module-level objects.

  Returns:
    List with one `validate` result dict per epoch.
  """
  random.seed(seed_val)
  seed = random.randint(1, seed_val)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

  # Run on whatever device the model lives on rather than a global `device`.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  validation_results = []
  best_f1 = 0

  for epoch in range(num_epochs):
    model.train()
    loop = tqdm(train_dataloader, leave = True)
    for batch in loop:
      optimizer.zero_grad()
      embeddings = batch[0].to(device)
      labels = batch[1].to(device)

      scores = model(embeddings)
      loss = criterion(scores, labels)

      loss.backward()
      optimizer.step()

      loop.set_description(f'Epoch {epoch}')
      loop.set_postfix(loss = loss.item())

    validation_scores = validate(dev_dataloader)
    # Record every epoch's scores so the returned list is not always empty.
    validation_results.append(validation_scores)

    epoch_f1 = validation_scores['f1']
    best_f1 = max(best_f1, epoch_f1)
    # Report this epoch's own score; the running best is shown separately.
    print('f1 score for Epoch ' + str(epoch) + ' is: ' + str(epoch_f1)
          + ' (best so far: ' + str(best_f1) + ')')

  return validation_results
      


### Evaluating MARBERT's Isotropy

In [139]:
def isotropy(representations):
    """Return the isotropy score of a set of row vectors.

    For every eigenvector v of representations^T @ representations the
    partition value F(v) = sum_i exp(<representations[i], v>) is computed; the
    score is min F / max F. A score of 1 means F is the same along every
    eigenvector direction.

    Args:
        representations: array-like of shape (n_samples, dim).

    Returns:
        The ratio min F / max F as a NumPy scalar.
    """
    # Accept lists of vectors as well as arrays.
    representations = np.asarray(representations)
    _, eig_vectors = np.linalg.eig(
        np.matmul(np.transpose(representations), representations))

    # Column k of the product holds <x_i, v_k> for every row x_i, so summing
    # exp() over the rows gives F(v_k) for all eigenvectors at once.
    partition = np.exp(np.matmul(representations, eig_vectors)).sum(axis=0)

    return partition.min() / partition.max()

In [None]:
def extract_features(dataloader, max_batches = None):
  """Run the module-level `marbert` encoder over `dataloader` and collect token vectors.

  Relies on a module-level `marbert` encoder being loaded before the call.
  Each batch is read as `batch[0]` (input ids) and `batch[1]` (attention mask).

  Args:
    dataloader: iterable of (input_ids, attention_mask, ...) batches.
    max_batches: optional cap on the number of batches processed; None
      (the default) processes the whole dataloader.

  Returns:
    (features, output): `features` is a list with one vector per kept token
    position over all processed batches; `output` is the 2-D array kept from
    the last processed batch, or None when no batch was processed.
  """
  features = []
  output = None

  with torch.no_grad():
    for batch_index, batch in enumerate(tqdm(dataloader, leave = True)):
      if max_batches is not None and batch_index >= max_batches:
        break

      input_ids = batch[0]
      attention_mask_train = batch[1]
      output = marbert(input_ids, attention_mask= attention_mask_train)[0]

      # Flatten (batch, seq, hidden) into (batch * seq, hidden).
      output = output.cpu().numpy()
      output = output.reshape((-1, output.shape[-1]))
      # NOTE(review): this drops the first and last row of the flattened batch —
      # presumably the [CLS]/[SEP] positions for batch size 1. With padded
      # inputs the last row is a padding position and padded rows stay in;
      # confirm whether the attention mask should be used to filter instead.
      output = np.delete(output, [0, len(output)-1], axis= 0)

      features.extend(output)

  return features, output

In [None]:
# NOTE: this rebinds `train_dataloader` to an unlabeled, batch-size-1 loader
# used for feature extraction.
train_dataloader = create_dataloaders(train_input, train_mask , batch_size = 1)
# `extract_features` returns a (features, last_output) tuple. Keep only the
# feature vectors and stack them into one 2-D array, the form that `isotropy`
# and `cluster_based` operate on.
train_features = np.array(extract_features(train_dataloader)[0])

In [None]:
# Score `train_features` with `isotropy` and print the value for inspection.
original_isotropy_value = isotropy(train_features)
print(original_isotropy_value)

### Make Representations Isotropic

In [None]:
def cluster_based(representations, n_cluster: int, n_pc: int):
  """Cluster the representations, then remove each cluster's mean and leading principal directions.

  Steps:
    1. k-means (`scipy.cluster.vq.kmeans2`, `minit='points'`) assigns every
       row to one of `n_cluster` clusters.
    2. The rows of each cluster are centred on that cluster's mean.
    3. The projections of the centred rows onto the cluster's first `n_pc`
       principal components are subtracted.

  Args:
    representations: array-like of shape (n_samples, dim).
    n_cluster: number of k-means clusters.
    n_pc: number of leading principal components removed per cluster; clipped
      to the number of components a small cluster actually has.

  Returns:
    np.ndarray of shape (n_samples, dim), row-aligned with `representations`.
    Rows are zero only if their cluster was processed to zero; every input row
    belongs to exactly one cluster and is therefore always written.

  Side effects:
    Calls IPython's `clear_output()` before returning, which clears the
    current cell output (including any k-means warnings).
  """
  from scipy.cluster.vq import kmeans2

  representations = np.asarray(representations, dtype=float)

  _, label = kmeans2(representations, n_cluster, minit='points',
                     missing='warn', check_finite=True)

  post_rep = np.zeros(representations.shape)
  n_removed = max(n_pc, 0)

  for cluster_id in range(n_cluster):
    members = np.flatnonzero(label == cluster_id)
    if members.size == 0:
      # With `missing='warn'` k-means may leave a cluster empty; skip it.
      continue

    cluster_rows = representations[members]
    centred = cluster_rows - cluster_rows.mean(axis=0)

    # The right singular vectors of the centred rows are the cluster's
    # principal components, ordered by decreasing singular value.
    _, _, components = np.linalg.svd(centred, full_matrices=False)
    top = components[:n_removed]

    # Subtract the projection onto the span of the leading components.
    post_rep[members] = centred - np.matmul(np.matmul(centred, top.T), top)

  clear_output()

  return post_rep

