# **Setup**

In [1]:
!git clone https://github.com/niyaryca/Idiomacity-Detection.git

fatal: destination path 'Idiomacity-Detection' already exists and is not an empty directory.


In [2]:
!pip install transformers
%cd /content/ 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
/content


In [3]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import site
site.main()

In [5]:
import os
import csv

from pathlib import Path

# **Preprocessing Data**

In [6]:
def load_csv( path, delimiter=',' ) : 
  header = None
  data   = list()
  with open( path, encoding='utf-8') as csvfile:
    reader = csv.reader( csvfile, delimiter=delimiter ) 
    for row in reader : 
      if header is None : 
        header = row
        continue
      data.append( row ) 
  return header, data

In [7]:
def write_csv( data, location ) : 
  with open( location, 'w', encoding='utf-8') as csvfile:
    writer = csv.writer( csvfile ) 
    writer.writerows( data ) 
  print( "Wrote {}".format( location ) ) 
  return

In [8]:
class Node():
  def __init__(self, sentence, label):
    self.sentence = sentence
    self.label = label

def create_idiom_dict_train(data_location, file_name) :
    idiom_dict = {}
    file_name = os.path.join( data_location, file_name ) 
    header, data = load_csv( file_name )
    for elem in data:
        label     = elem[ header.index( 'Label'  ) ]
        sentence = elem[ header.index( 'Target' ) ]
        idiom = elem[ header.index( 'MWE' ) ]
        if idiom in idiom_dict:
          idiom_dict[idiom].append(Node(sentence, label))
        else:
          idiom_dict[idiom] = [Node(sentence, label)]
    return idiom_dict

In [9]:
d1 = create_idiom_dict_train('/content/Idiomacity-Detection/Rawdata', 'train_zero_shot.csv')
d2 = create_idiom_dict_train('/content/Idiomacity-Detection/Rawdata', 'train_one_shot.csv')
for key, value in d2.items():
  if key in d1:
    d1[key].append(value)
  else:
    d1[key] = value

In [10]:
def _get_train_data( data_location, file_name) :
    
    file_name = os.path.join( data_location, file_name ) 

    header, data = load_csv( file_name )

    out_header = [ 'label1', 'label2', 'sentence1', 'sentence2', 'sentence3', 'sentence4' ]

    out_data = list()
    for elem1 in data :
        label     = elem1[ header.index( 'Label'  ) ]
        sentence1 = elem1[ header.index( 'Target' ) ]
        for elem2 in d1[elem1[ header.index( 'MWE' ) ]]:
          if elem2.sentence != sentence1:
              label2     = elem2.label
              sentence2 = elem2.sentence
              this_row = None
              sentence3 = elem1[ header.index( 'MWE' ) ]
              sentence4 = sentence3
              this_row = [ label, label2, sentence1, sentence3, sentence2, sentence4]
              out_data.append( this_row )
              assert len( out_header ) == len( this_row )
    return [ out_header ] + out_data

In [11]:
def _get_dev_eval_data( data_location, input_file_name, gold_file_name, include_idiom ) :

    input_headers, input_data = load_csv( os.path.join( data_location, input_file_name ) )
    gold_header  = gold_data = None
    if not gold_file_name is None : 
        gold_header  , gold_data  = load_csv( os.path.join( data_location, gold_file_name  ) )
        assert len( input_data ) == len( gold_data )

    # ['ID', 'Language', 'MWE', 'Previous', 'Target', 'Next']
    # ['ID', 'DataID', 'Language', 'Label']
    
    out_header = [ 'label1', 'label2', 'sentence1', 'sentence3' ]
    if include_idiom :
        out_header = [ 'label1', 'label2', 'label3', 'sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5', 'sentence6', 'language' ]

    out_data = list()
    for index in range( len( input_data ) ) :
        label = 1
        if not gold_file_name is None : 
            this_input_id = input_data[ index ][ input_headers.index( 'ID' ) ]
            this_gold_id  = gold_data [ index ][ gold_header  .index( 'ID' ) ]
            assert this_input_id == this_gold_id
            
            label     = gold_data[ index ][ gold_header.index( 'Label'  ) ]
            language = gold_data[index][gold_header.index('Language')]
        elem      = input_data[ index ]
        sentence1 = elem[ input_headers.index( 'Target' ) ]
        this_row = None
        if not include_idiom :
            this_row = [ label, sentence1 ] 
        else :
            sentence2 = elem[ input_headers.index( 'MWE' ) ]
            this_row = [ label, sentence1, sentence2 ]
        idiom = elem[ input_headers.index( 'MWE' ) ]
        other_nodes = d1[idiom]
        if(len(other_nodes)==1):
            if not include_idiom :
                this_row = [ label, other_nodes[0].label, sentence1, other_nodes[0].sentence ] 
            else :
                sentence2 = elem[ input_headers.index( 'MWE' ) ]
                this_row = [ label, other_nodes[0].label, other_nodes[0].label, sentence1, sentence2, other_nodes[0].sentence, sentence2, other_nodes[0].sentence, sentence2, language ]
        else:
            if not include_idiom :
                this_row = [ label, other_nodes[0].label, sentence1, other_nodes[0].sentence ] 
            else :
                sentence2 = elem[ input_headers.index( 'MWE' ) ]
                this_row = [ label, other_nodes[0].label, other_nodes[1].label, sentence1, sentence2, other_nodes[0].sentence, sentence2, other_nodes[1].sentence, sentence2, language ]
           
        assert len( out_header ) == len( this_row ) 
        out_data.append( this_row )
        

    return [ out_header ] + out_data

In [12]:
train_zero_data = _get_train_data(
        data_location   = '/content/Idiomacity-Detection/Rawdata',
        file_name       = 'train_zero_shot.csv'
    )
train_one_data = _get_train_data(
        data_location   = '/content/Idiomacity-Detection/Rawdata',
        file_name       = 'train_one_shot.csv'
    )

assert train_zero_data[0] == train_one_data[0] ## Headers
train_data = train_one_data + train_zero_data[1:]

dev_data = _get_dev_eval_data(
        data_location    = '/content/Idiomacity-Detection/Rawdata',
        input_file_name  = 'dev.csv',
        gold_file_name   = 'dev_gold.csv',
        include_idiom    = True
    )

# **Importing Libraries**

In [13]:
import os
import sys
import numpy as np
import random
import pickle
import logging

from typing          import Optional
from dataclasses     import dataclass, field
from sklearn.metrics import f1_score, accuracy_score

from datasets        import load_dataset, load_metric

import transformers
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.utils         import check_min_version
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from keras_preprocessing.sequence import pad_sequences
import torch

In [14]:
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.6.0.dev0")

task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

logger = logging.getLogger(__name__)

In [15]:
train_data[0:4]

[['label1', 'label2', 'sentence1', 'sentence2', 'sentence3', 'sentence4'],
 ['0',
  '1',
  'Program leaders said the scholarship defines public service broadly and imagines a variety of pathways toward civic engagement.',
  'public service',
  'In the ensuing years, Wennberg might not have managed to knock down the parking deck, but his administration helped keep Central Vermont Public Service from moving its corporate headquarters out of the city, and after successfully fighting a number of shopping centers city officials worried would pose a threat to downtown, he negotiated a deal that kept Diamond Run Mall from hosting a movie theater or supermarket and got the city a couple million dollars in payments that funded a variety of projects through the years.',
  'public service'],
 ['1',
  '0',
  'In the ensuing years, Wennberg might not have managed to knock down the parking deck, but his administration helped keep Central Vermont Public Service from moving its corporate headquarters 

In [16]:
dev_data[0:4]

[['label1',
  'label2',
  'label3',
  'sentence1',
  'sentence2',
  'sentence3',
  'sentence4',
  'sentence5',
  'sentence6',
  'language'],
 ['1',
  '1',
  '1',
  'Are these interruptions of the good life a necessary condition of the high life?',
  'high life',
  'Despite having the riches to afford the high life, PSG captain Marquinhos is still in touch with his past life before becoming a multi-millionaire footballer.',
  'high life',
  'Despite having the riches to afford the high life, PSG captain Marquinhos is still in touch with his past life before becoming a multi-millionaire footballer.',
  'high life',
  'EN'],
 ['1',
  '1',
  '1',
  "But for Australian fashion designer Abby Kheir, there's no reason not to treat her employees to a taste of the high life all-year round.",
  'high life',
  'Despite having the riches to afford the high life, PSG captain Marquinhos is still in touch with his past life before becoming a multi-millionaire footballer.',
  'high life',
  'Despite ha

# **Tokenizer**

In [17]:
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large',
                                          cache_dir=None,
                                          use_fast=True,
                                          revision="main",
                                          use_auth_token=None,)

In [18]:
def shuffle_data(data):
    indices = list(range(len(data)))
    random.shuffle(indices)
    shuffled_data = []
    for i in indices:
        shuffled_data.append(data[i])
    return shuffled_data

In [19]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

def Preprocess_Data(input, tokenizer, max_len, batch_size, data_class="train"):
    input1 = []
    input2 = []
    label1 = []
    label2 = []
    for i in input:
      """if(i[1]!='1' and i[1]!='0'):
        continue"""
      label1.append(int(i[0]))
      label2.append(int(i[1]))
      args = (
            (i[2], i[3])
      )
      input1.append(args)
      args = (
            (i[4], i[5])
      )
      input2.append(args)
    encoded_input1 = tokenizer(input1, padding='max_length', max_length = max_len, truncation=True, return_tensors="pt")
    encoded_input2 = tokenizer(input2, padding='max_length', max_length = max_len, truncation=True, return_tensors="pt")
    
    input_ids1 = encoded_input1['input_ids']
    attention_mask1 = encoded_input1['attention_mask']
    labels1 = torch.tensor(label1)

    input_ids2 = encoded_input2['input_ids']
    attention_mask2 = encoded_input2['attention_mask']
    labels2 = torch.tensor(label2)

    dataset_tensor = TensorDataset(input_ids1, attention_mask1, labels1, input_ids2, attention_mask2, labels2)

    if data_class == "train":
        sampler = RandomSampler(dataset_tensor)
    else:
        sampler = SequentialSampler(dataset_tensor)
    dataloader = DataLoader(dataset_tensor, sampler=sampler, batch_size=batch_size)

    return dataloader

In [20]:
max_len = 512
batch_size = 32
def PreProcess_Dev(input, tokenizer, max_len, batch_size, data_class="dev"):
    input1 = []
    input2 = []
    input3 = []
    label1 = []
    label2 = []
    label3 = []
    language = []
    for i in input:
      """if(i[1]!='1' and i[1]!='0'):
        continue"""
      label1.append(int(i[0]))
      label2.append(int(i[1]))
      label3.append(int(i[2]))
      args = (
            (i[3], i[4])
      )
      input1.append(args)
      args = (
            (i[5], i[6])
      )
      input2.append(args)
      args = (
          (i[7], i[8])
      )
      input3.append(args)
      if i[9]=='EN':
        language.append(0)
      else:
        language.append(1)
    encoded_input1 = tokenizer(input1, padding='max_length', max_length = max_len, truncation=True, return_tensors="pt")
    encoded_input2 = tokenizer(input2, padding='max_length', max_length = max_len, truncation=True, return_tensors="pt")
    encoded_input3 = tokenizer(input3, padding='max_length', max_length = max_len, truncation=True, return_tensors="pt")
    input_ids1 = encoded_input1['input_ids']
    attention_mask1 = encoded_input1['attention_mask']
    labels1 = torch.tensor(label1)
    print(input_ids1.size(), attention_mask1.size(), labels1.size())

    input_ids2 = encoded_input2['input_ids']
    attention_mask2 = encoded_input2['attention_mask']
    labels2 = torch.tensor(label2)

    print(input_ids2.size(), attention_mask2.size(), labels2.size())

    input_ids3 = encoded_input3['input_ids']
    attention_mask3 = encoded_input3['attention_mask']
    labels3 = torch.tensor(label3)

    print(input_ids3.size(), attention_mask3.size(), labels3.size())  
    language = torch.tensor(language)  

    dataset_tensor = TensorDataset(input_ids1, attention_mask1, labels1, input_ids2, attention_mask2, labels2, input_ids3, attention_mask3, labels3, language)

    if data_class == "train":
        sampler = RandomSampler(dataset_tensor)
    else:
        sampler = SequentialSampler(dataset_tensor)
    dataloader = DataLoader(dataset_tensor, sampler=sampler, batch_size=batch_size)

    return dataloader

In [21]:
import torch.nn as nn
class SiameseModel(nn.Module):
    def __init__(self):
        super(SiameseModel, self).__init__()
        
        self.base_model = AutoModel.from_pretrained(
            'xlm-roberta-base',
            from_tf=bool(".ckpt" in 'xlm-roberta-base'),
            config=config,
            cache_dir=None,
            revision="main",
            use_auth_token=None,
        ).cuda()
        self.dropout = nn.Dropout(0.5)
        self.linear = nn.Linear(768, 2).cuda() 
        
    def forward(self, input_ids1, attn_mask1, input_ids2, attn_mask2):
        
        outputs1 = self.base_model(input_ids1, attention_mask=attn_mask1).last_hidden_state[:, 0]
        outputs2 = self.base_model(input_ids2, attention_mask=attn_mask2).last_hidden_state[:, 0]
        difference = outputs1*outputs2
        
        outputs = self.dropout(difference)
        outputs = self.linear(outputs)
        
        return outputs

In [22]:
# import torch.nn as nn
#  class SiameseModel(nn.Module):
#      def __init__(self):
#          super(SiameseModel, self).__init__()
        
# #         self.base_model = AutoModel.from_pretrained(
# #             'xlm-roberta-large',
# #             from_tf=bool(".ckpt" in 'xlm-roberta-large'),
# #             config=config,
# #             cache_dir=None,
# #             revision="main",
# #             use_auth_token=None,
# #         ).cuda()
# #         self.dropout = nn.Dropout(0.5)
# #         self.linear = nn.Linear(768, 2).cuda() 
        
# #     def forward(self, input_ids1, attn_mask1, input_ids2, attn_mask2):
        
# #         first_output = self.base_model(input_ids1, attention_mask=attn_mask1).last_hidden_state[:, 0]
# #         second_output = self.base_model(input_ids2, attention_mask=attn_mask2).last_hidden_state[:, 0]
# #         #Similarity score calculated using dot product & the value will be between 1 & 0
# #         dot_product = first_output*second_output

# #         final_output = self.dropout(dot_product)
# #         final_output = self.linear(final_output)
        
# ###         return final_output

In [23]:
from sklearn.metrics import f1_score, accuracy_score
def Eval(bert_model, dataloader):

    bert_model.eval()
    predictions, true_labels = [], []
    predictions_en, true_labels_en = [], []
    predictions_pt, true_labels_pt = [], []
    num_correct = 0
    
    for step, batch in enumerate(tqdm(dataloader)):
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            logits1 = nn.functional.softmax(bert_model.forward(batch[0], batch[1], batch[3], batch[4]), -1)
        with torch.no_grad():
            logits2 = nn.functional.softmax(bert_model.forward(batch[0], batch[1], batch[6], batch[7]), -1)
        logits = torch.cat((logits1, logits2), dim=1)
        max_args = torch.argmax(logits, dim=1)
        batch_predictions = []
        batch_true_labels = batch[2]
        first_sentence_labels = batch[5]
        second_sentence_labels = batch[8]
        batch_en_predictions = []
        batch_pt_predictions = []
        true_en_predictions = []
        true_pt_predictions = []
        language = batch[9]
        for idx, instance in enumerate(max_args):
          if instance == 0:
            batch_predictions.append((first_sentence_labels[idx] - 1) * -1) # 0, 1 toggle
            if language[idx].item() == 0:
              batch_en_predictions.append((first_sentence_labels[idx] - 1) * -1)
              true_en_predictions.append(batch_true_labels[idx])
            else:
              batch_pt_predictions.append((first_sentence_labels[idx] - 1) * -1)
              true_pt_predictions.append(batch_true_labels[idx])
          elif instance == 1:
            batch_predictions.append(first_sentence_labels[idx])
            if language[idx].item() == 0:
              batch_en_predictions.append(first_sentence_labels[idx])
              true_en_predictions.append(batch_true_labels[idx])
            else:
              batch_pt_predictions.append(first_sentence_labels[idx])
              true_pt_predictions.append(batch_true_labels[idx])
          elif instance == 2:
            batch_predictions.append((second_sentence_labels[idx] - 1) * -1)
            if language[idx].item() == 0:
              batch_en_predictions.append((second_sentence_labels[idx] - 1) * -1)
              true_en_predictions.append(batch_true_labels[idx])
            else:
              batch_pt_predictions.append((second_sentence_labels[idx] - 1) * -1)
              true_pt_predictions.append(batch_true_labels[idx])
          else:
            batch_predictions.append(second_sentence_labels[idx])
            if language[idx].item() == 0:
              batch_en_predictions.append(second_sentence_labels[idx])
              true_en_predictions.append(batch_true_labels[idx])
            else:
              batch_pt_predictions.append(second_sentence_labels[idx])
              true_pt_predictions.append(batch_true_labels[idx])
        predictions += batch_predictions
        true_labels += batch_true_labels
        predictions_en += batch_en_predictions
        predictions_pt += batch_pt_predictions
        true_labels_en += true_en_predictions
        true_labels_pt += true_pt_predictions
    return true_labels, predictions, true_labels_en, predictions_en, true_labels_pt, predictions_pt

In [24]:
def metrics(true_labels, predictions):
    pre = []
    tl = []
    num_correct = 0
    for pred, true_label in zip(predictions, true_labels):
        pre.append(int(pred.item()))
        tl.append(int(true_label.item()))
        if pred == true_label:
            num_correct += 1
    print("\nAccuracy: %s" % (float(num_correct) / float(len(true_labels))))
    print("F1 Score ")
    print(f1_score(tl, pre, average='macro'))

In [25]:
from tqdm import tqdm

def Train_Eval(bert_model, train_data, lr, n_epoch, tokenizer, batch_size, max_len):

    print("Start Training!")
    optimizer = AdamW(bert_model.parameters(), lr=lr)
    bert_model.train()
    dev_dataloader = PreProcess_Dev(dev_data[1:], tokenizer, max_len, batch_size, data_class="dev")
    # TRAIN loop
    for epoch in range(n_epoch):
        shuffled_train_data = shuffle_data(train_data)
        shuffled_train_data = Preprocess_Data(shuffled_train_data, tokenizer, max_len, batch_size)
        print(f"\nEpoch {epoch}")
        torch.cuda.empty_cache()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(shuffled_train_data)):
            batch = tuple(t.to(device) for t in batch)
            bert_model.zero_grad()
            # forward pass
            logits = bert_model.forward(batch[0], batch[1], batch[3], batch[4])
            # print(loss)
            loss = 0
            target = torch.where(batch[2]==batch[5], 1, 0)
            #target = target.reshape(-1,1)
            loss = nn.functional.cross_entropy(logits, target)
            
            # backward pass
            loss.backward()
            # track train loss
            tr_loss += loss.item()
            nb_tr_steps += 1
            #loss = loss.detach()
            # update parameters
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            

        # print train loss per epoch
        print("Train loss on epoch {}: {}\n".format(epoch, tr_loss / nb_tr_steps))

    true_labels, predictions, true_labels_en, predictions_en, true_labels_pt, predictions_pt  = Eval(bert_model, dev_dataloader)
    print("EN-PT")
    metrics(true_labels, predictions)
    print("EN")
    metrics(true_labels_en, predictions_en)
    print("PT")
    metrics(true_labels_pt, predictions_pt)


In [26]:
config = AutoConfig.from_pretrained(
        'xlm-roberta-large',
        num_labels=2,
        finetuning_task=None,
        cache_dir=None,
        revision="main",
        use_auth_token=None,
    )

model = SiameseModel()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

learning_rate = 1e-5
num_epoch = 3                         
torch.cuda.empty_cache()
max_len = 128
batch_size = 128

if n_gpu > 1:
    model.to(device)
    model = torch.nn.DataParallel(model)
else:
    model.cuda()

Train_Eval(model, train_data[1:], learning_rate, num_epoch, tokenizer, batch_size, max_len)


RuntimeError: ignored

In [None]:
%%shell
jupyter nbconvert --to html 2_MBERT_Siamese.ipynb