In [1]:
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q git+https://github.com/huggingface/accelerate
!pip install -q git+https://github.com/huggingface/datasets
!pip install sentencepiece



In [2]:
import transformers, sentencepiece
import datasets
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import re
import sklearn.metrics
from time import sleep
from tqdm import tqdm
from transformers import AutoConfig, AutoModel, AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset, Dataset, DatasetDict
from accelerate import notebook_launcher
from accelerate import Accelerator
from accelerate.utils import set_seed
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import OneCycleLR



#### JSON file paths for the dataset

In [3]:
df_train = pd.read_json('/kaggle/input/dl-multilabel/dl_project_train.json', orient='records')
df_val = pd.read_json('/kaggle/input/dl-multilabel/dl_project_dev.json', orient='records')
df_test = pd.read_json('/kaggle/input/dl-multilabel/dl_project_test.json', orient='records')

In [4]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
total_words = 0
count_articles = 0
for article_set in df_train['articles']:
    for article in article_set:
        words = word_tokenize(article['content'])
        total_words += len(words)
        count_articles += 1
print('Total words: ', total_words)
print(f'{count_articles} articles, {total_words/count_articles} words per article')
print(f"{len(df_train['articles'])} websites, {total_words/len(df_train['articles'])} words per website")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Total words:  6114653
6994 articles, 874.2712324849871 words per article
817 websites, 7484.275397796818 words per website


In [5]:
df_train

Unnamed: 0,source,json_file_path,label,large_label,mbfc_link,articles
0,ivn.us,train_json/ivn.us.json,1,Independent Voter Network (IVN) - Least Biased...,https://mediabiasfactcheck.com/independent-vot...,"[{'id': 'b3785905376194ae', 'title': 'An Indep..."
1,newyorker.com,train_json/newyorker.com.json,0,New Yorker - Left Bias - Liberal - Democrat - ...,https://mediabiasfactcheck.com/new-yorker/,"[{'id': '940ab57d099ffd69', 'title': 'An App C..."
2,cookpolitical.com,train_json/cookpolitical.com.json,1,Cook Political Report - Least Biased - Credible,https://mediabiasfactcheck.com/cook-political-...,"[{'id': 'e52875fbe3848a5c', 'title': 'Introduc..."
3,meduza.io,train_json/meduza.io.json,0,Meduza - Left Bias - Liberal - Credible and Re...,https://mediabiasfactcheck.com/meduza/,"[{'id': '1aa2210d64c1d23a', 'title': 'Bad educ..."
4,sctimes.com,train_json/sctimes.com.json,1,St. Cloud Times - Least Baised - Credible - Re...,https://mediabiasfactcheck.com/st-cloud-times/,"[{'id': '2eb423127b952e79', 'title': 'A German..."
...,...,...,...,...,...,...
812,ammoland.com,train_json/ammoland.com.json,2,Ammoland - Right Bias - Conservative - Republi...,https://mediabiasfactcheck.com/ammoland/,"[{'id': 'b7e681d5867ae712', 'title': 'Fourth C..."
813,sciencedebate.org,train_json/sciencedebate.org.json,1,Science Debate - Least Biased - Credible - Rel...,https://mediabiasfactcheck.com/science-debate/,"[{'id': 'd4e7ee12f06e3262', 'title': 'Presiden..."
814,rfi.fr,train_json/rfi.fr.json,1,RFI (Radio France Internationale) - Least Bias...,https://mediabiasfactcheck.com/rfi-radio-franc...,"[{'id': 'bc2ba227a6fdef5d', 'title': 'Paris po..."
815,howtogeek.com,train_json/howtogeek.com.json,1,How-To Geek - Least Biased - Credible - Reliable,https://mediabiasfactcheck.com/how-to-geek/,"[{'id': 'a64aa35b0a3f88e4', 'title': 'How to U..."


In [6]:
print('Non-labeled amount of data per train, val and test:')
print(len(df_train[df_train['large_label'].isna()])/len(df_train))
print(len(df_val[df_val['large_label'].isna()])/len(df_val))
print(len(df_test[df_test['large_label'].isna()])/len(df_test))

Non-labeled amount of data per train, val and test:
0.2350061199510404
0.18269230769230768
0.3431372549019608


#### Data preparation

In [7]:
df_train.drop(df_train[df_train['large_label'].isna()].index, inplace=True)
df_val.drop(df_val[df_val['large_label'].isna()].index, inplace=True)
df_test.drop(df_test[df_test['large_label'].isna()].index, inplace=True)
df_train['large_label'] = df_train['large_label'].str.lower().str.replace(' ', '')
df_val['large_label'] = df_val['large_label'].str.lower().str.replace(' ', '')
df_test['large_label'] = df_test['large_label'].str.lower().str.replace(' ', '')

In [8]:
print(df_train['large_label'])

0      independentvoternetwork(ivn)-leastbiased-credi...
1      newyorker-leftbias-liberal-democrat-progressiv...
2               cookpoliticalreport-leastbiased-credible
3            meduza-leftbias-liberal-credibleandreliable
4            st.cloudtimes-leastbaised-credible-reliable
                             ...                        
812    ammoland-rightbias-conservative-republican-cre...
813          sciencedebate-leastbiased-credible-reliable
814    rfi(radiofranceinternationale)-leastbiased-cre...
815             how-togeek-leastbiased-credible-reliable
816    lawenforcementtoday-rightbias-conservative-rep...
Name: large_label, Length: 625, dtype: object


In [9]:
df = pd.DataFrame()
df['words'] = df_train['large_label'].str.split('-')
df['words'] = df['words'].apply(lambda x: x[1:])
all_words_train = [word for words in df['words'] for word in words]

df = pd.DataFrame()
df['words'] = df_val['large_label'].str.split('-')
df['words'] = df['words'].apply(lambda x: x[1:])
all_words_val = [word for words in df['words'] for word in words]

df = pd.DataFrame()
df['words'] = df_test['large_label'].str.split('-')
df['words'] = df['words'].apply(lambda x: x[1:])
all_words_test = [word for words in df['words'] for word in words]

unique_words = set(all_words_train + all_words_val + all_words_test)
print(unique_words)

{'', 'sentinel', 'socialism', 'police', 'proscience', 'communism', 'christian', 'newschannel5', 'brexit', 'ny1', 'standard', '23abc', 'liberal', '11news', 'notliberal(left)', 'crediblefactchecker', 'southafrica', 'centerbias', 'notrightbias', 'australia', 'reliability', 'tribune', 'legitimate', 'mytwntiers', 'notalwayscredibleorreliable', 'journal', 'credibleandreliable', 'trump', 'tech', 'marxism', 'progressive', 'highlycredible', 'cityherald', 'slightright', 'freemarket', 'feminist', 'stockton', 'christianright', 'rightbias', 'notcredibleorreliable', 'myfox8', 'conspiracy', 'leastbiased', 'factchecking', 'muslim', 'rightbiased', 'notconservativeorright', 'conservative', 'progressivedemocrat', 'chronicle', 'vote.com', 'reporter', 'credible', 'libertarian', 'prwatch', 'republican', 'american', 'fakenews', 'leftbiased', 'credibilityandreliabilitymixed', 'notleftbias', 'leftbias', 'india', 'propaganda', 'instituteforlegislativeaction', 'news9', 'review', 'conservatitve', 'gazette', 'comm

In [10]:
remove = ['notleftorright','notliberal','notliberal(left)','notliberalorconservative','notconservative']

replace_noncredible = ['notalwayscredibleorreliable','notcredible','notcredibleorreliable', 'lowtrustorreliable']

replace_mixed = ['credibilityandreliabilitymixed', 'mixedcredible', 
                 'notalwayscredibleorreliable', 'somewhatcredible']

replace_credible = ['credbile','credible','credibleandreliable','crediblefactchecker',
                    'crediblenewspaper', 'factual', 'generallycredible', 'highlycredible', 
                    'mostlycredible','mostlycredibleandreliable','mostlyfactual','reliability',
                    'reliabilityhigh','reliable', 'evidencebased', 'withhighcredibilityandreliability', 
                    'generallyreliable', 'mostlyreliable', 'highlyreliable', 'reliablenewspaper', 
                    'mostlyreliableandreliable','reliablehigh', 'reliableandreliable','reliablefactchecker', 'fact']

replace_leftlean = ['leftlean', 'leftcenterbias','notright','notrightbias']
replace_rightlean = ['rightcenterbias','rightlean','rightleaning','rightleaningbias',
                    'slightleanright','slightright', 'rleaningbias', 'notleft', 'notleftbias']

def change_labels(df):
    for st in remove:
        df['large_label'] = df['large_label'].str.replace(st, '')
    for st in replace_noncredible:
        df['large_label'] = df['large_label'].str.replace(st, 'lowtrust')
    for st in replace_mixed:
        df['large_label'] = df['large_label'].str.replace(st, 'mixed')
    for st in replace_credible:
        df['large_label'] = df['large_label'].str.replace(st, 'reliable')
    for st in replace_leftlean:
        df['large_label'] = df['large_label'].str.replace(st, 'llean')
    for st in replace_rightlean:
        df['large_label'] = df['large_label'].str.replace(st, 'rlean')
    df['large_label'] = df['large_label'].str.replace('pseudoscience', 'pseudo')
    return df
df_train, df_val, df_test = change_labels(df_train), change_labels(df_val), change_labels(df_test)

In [11]:
df = pd.DataFrame()
df['words'] = df_train['large_label'].str.split('-')
df['words'] = df['words'].apply(lambda x: x[1:])
all_words_train = [word for words in df['words'] for word in words]

df = pd.DataFrame()
df['words'] = df_val['large_label'].str.split('-')
df['words'] = df['words'].apply(lambda x: x[1:])
all_words_val = [word for words in df['words'] for word in words]

df = pd.DataFrame()
df['words'] = df_test['large_label'].str.split('-')
df['words'] = df['words'].apply(lambda x: x[1:])
all_words_test = [word for words in df['words'] for word in words]

unique_words = set(all_words_train + all_words_val + all_words_test)
print(unique_words)

{'', 'sentinel', 'socialism', 'police', 'proscience', 'communism', 'christian', 'newschannel5', 'brexit', 'ny1', 'standard', '23abc', 'liberal', '11news', 'southafrica', 'mixed', 'centerbias', 'rleaning', 'australia', '(left)', 'tribune', 'legitimate', 'mytwntiers', 'journal', 'trump', 'marxism', 'tech', 'progressive', 'cityherald', 'freemarket', 'feminist', 'stockton', 'christianright', 'rightbias', 'conspiracy', 'myfox8', 'leastbiased', 'muslim', 'rightbiased', 'conservative', 'progressivedemocrat', 'chronicle', 'llean', 'vote.com', 'reliablechecker', 'reporter', 'lleanbias', 'libertarian', 'pseudo', 'prwatch', 'republican', 'american', 'fakenews', 'leftbiased', 'leftbias', 'india', 'propaganda', 'instituteforlegislativeaction', 'news9', 'orright', 'review', 'conservatitve', 'gazette', 'communist', 'socialist', 'conservativeleaning', 'tv', 'togeek', 'telegram', 'racist', 'lowtrust', 'bulletin', 'labor', 'liberalprogressive', 'rlean', 'anarchism', 'canada', 'dictatorship', 'news', 'ca

In [12]:
id2label = {
    0: 'Left',
    1: 'Right',
    2: 'Liberal',
    3: 'Conservative',
    4: 'Credible',
    5: 'Unreliable'
}
label2id = {v: k for k, v in id2label.items()}

In [13]:
def new_column_value(label):
    
    label_mappings = {
        'left': 0,
        'llean': 0,
        'right': 1,
        'rlean': 1,
        'liberal': 2,
        'progressive': 2,
        'democrat': 2,
        'labour': 2,
        'labor': 2,
        'conserv': 3,
        'libertarian': 3,
        'republican': 3,
        'tory': 3,
        'leastbiased': 4,
        'lowbias': 4,
        'leastbaised': 4,
        'minimalbias': 4,
        'nonbiased': 4,
        'center': 4,
        'reliable': 4,
        'mixed': 4,
        'lowtrust': 5,
        'fake': 5,
    }
    label_list = [0] * len(id2label)
    for key, index in label_mappings.items():
        if key in label:
            label_list[index] = 1

    return label_list

In [14]:
df_train['labels'] = df_train['large_label'].apply(new_column_value)
df_val['labels'] = df_val['large_label'].apply(new_column_value)
df_test['labels'] = df_test['large_label'].apply(new_column_value)
df_train = df_train.drop(columns=['json_file_path','label','large_label','mbfc_link'])
df_val = df_val.drop(columns=['json_file_path','label','large_label','mbfc_link'])
df_test = df_test.drop(columns=['json_file_path','label','large_label','mbfc_link'])
df_train, df_val, df_test = df_train.explode('articles'), df_val.explode('articles'), df_test.explode('articles')
df_train, df_val, df_test = df_train.reset_index(drop=True), df_val.reset_index(drop=True), df_test.reset_index(drop=True)

In [15]:
def article_format(article):
    title = article['title'].strip("'")
    content = article['content'].strip("'")
    return title + ' ' + content

def clean_text(s):
    s = re.sub(r'[^a-zA-Z0-9\s.,\']+', ' ', s)
    s = re.sub(r'\s+', ' ', s)
    return s

df_train['articles'] = df_train['articles'].apply(article_format)
df_val['articles'] = df_val['articles'].apply(article_format)
df_test['articles'] = df_test['articles'].apply(article_format)

new_name = {'articles': 'text'}

df_train, df_val, df_test = df_train.rename(columns=new_name), df_val.rename(columns=new_name), df_test.rename(columns=new_name)

df_train['text'] = df_train['text'].apply(clean_text)
df_val['text'] = df_val['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)

In [16]:
df_train

Unnamed: 0,source,text,labels
0,ivn.us,An Independent Voter Guide to The Impeachment ...,"[0, 0, 0, 0, 1, 0]"
1,ivn.us,Witnesses of the Unseen Breaking Out of the Tw...,"[0, 0, 0, 0, 1, 0]"
2,ivn.us,Abolishing The Electoral College Could Have Ma...,"[0, 0, 0, 0, 1, 0]"
3,ivn.us,Mandate San Diego Voters Send Strong Message W...,"[0, 0, 0, 0, 1, 0]"
4,ivn.us,Chollas Creek Neighborhood Fights to Save its ...,"[0, 0, 0, 0, 1, 0]"
...,...,...,...
5355,lawenforcementtoday.com,California Supreme Court overturns 2005 death ...,"[0, 1, 0, 1, 0, 1]"
5356,lawenforcementtoday.com,Report Biden administration provided Taliban w...,"[0, 1, 0, 1, 0, 1]"
5357,lawenforcementtoday.com,What country is this Michigan s health departm...,"[0, 1, 0, 1, 0, 1]"
5358,lawenforcementtoday.com,California activist charged for ramming a car ...,"[0, 1, 0, 1, 0, 1]"


In [17]:
df_final_train = pd.concat([df_train, df_val], ignore_index=True)
train_dataset = Dataset.from_dict(df_final_train)
test_dataset = Dataset.from_dict(df_test)
my_dataset_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})

In [18]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-xsmall', model_max_length=512, use_fast=True)
def preprocess_function(mydata):
    return tokenizer(mydata['text'], truncation=True)
print(tokenizer.model_max_length)
tokenized_data = my_dataset_dict.map(preprocess_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



512


Map:   0%|          | 0/6068 [00:00<?, ? examples/s]

Map:   0%|          | 0/574 [00:00<?, ? examples/s]

In [19]:
tokenized_data = tokenized_data.with_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_data = tokenized_data.remove_columns(['source', 'text', 'token_type_ids'])
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 6068
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 574
    })
})

In [20]:
tokenized_id2label = {key: tokenizer(value, return_tensors='pt', padding='max_length', max_length=5, return_token_type_ids=False) for key, value in id2label.items()}
tokenized_id2label = {
    key: {tensor_name: tensor.squeeze(0) for tensor_name, tensor in inner_dict.items()}
    for key, inner_dict in tokenized_id2label.items()
}
tokenized_id2label[0]

{'input_ids': tensor([   1, 8038,    2,    0,    0]),
 'attention_mask': tensor([1, 1, 1, 0, 0])}

In [21]:
example = tokenized_data['train'][0]
example['labels']

tensor([0, 0, 0, 0, 1, 0])

In [22]:
model = AutoModel.from_pretrained('microsoft/deberta-v3-xsmall')
model

pytorch_model.bin:   0%|          | 0.00/241M [00:00<?, ?B/s]

DebertaV2Model(
  (embeddings): DebertaV2Embeddings(
    (word_embeddings): Embedding(128100, 384, padding_idx=0)
    (LayerNorm): LayerNorm((384,), eps=1e-07, elementwise_affine=True)
    (dropout): StableDropout()
  )
  (encoder): DebertaV2Encoder(
    (layer): ModuleList(
      (0-11): 12 x DebertaV2Layer(
        (attention): DebertaV2Attention(
          (self): DisentangledSelfAttention(
            (query_proj): Linear(in_features=384, out_features=384, bias=True)
            (key_proj): Linear(in_features=384, out_features=384, bias=True)
            (value_proj): Linear(in_features=384, out_features=384, bias=True)
            (pos_dropout): StableDropout()
            (dropout): StableDropout()
          )
          (output): DebertaV2SelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-07, elementwise_affine=True)
            (dropout): StableDropout()
          )
        )
        (intermedia

 Dataset for descriptions

In [23]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data_dict):
        self.data_dict = data_dict
        self.keys = list(data_dict.keys())

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, idx):
        key = self.keys[idx]
        sample = self.data_dict[key]
        
        return {'input_ids': sample['input_ids'], 'attention_mask': sample['attention_mask']}

descdataset = CustomDataset(tokenized_id2label)

In [24]:
def get_dataloaders(batch_size):
    batch_size = batch_size
    train_dataloader = DataLoader(
        tokenized_data['train'], shuffle=True, batch_size=batch_size, collate_fn=data_collator
    )
    test_dataloader = DataLoader(
        tokenized_data['test'], batch_size=batch_size, collate_fn=data_collator
    )
    desc_dataloader = DataLoader(descdataset, batch_size=len(descdataset), shuffle=False)
    return train_dataloader, test_dataloader, desc_dataloader

#### Easy positive-hard negative mining

In [25]:
def hard_miner(anchors, samples, labels):
    distances = F.pairwise_distance(anchors.unsqueeze(1), samples.unsqueeze(0))
    
    pos_distances = distances * labels
    easy_pos = samples[torch.argmin(pos_distances, dim=1)]
    
    neg_distances = distances * (1 - labels) + 65500 * labels # make positive ones large to avoid 0s
    hardest_neg = samples[torch.argmin(neg_distances, dim=1)]

    return easy_pos, hardest_neg

#### Easy positive-semi hard negative mining 

In [26]:
def semi_hard_miner(anchors, samples, labels, margin):
    distances = F.pairwise_distance(anchors.unsqueeze(1), samples.unsqueeze(0))

    pos_distances = distances * labels
    easy_dist, easy_idx = torch.min(pos_distances, dim=1)
    easy_pos = samples[easy_idx]

    neg_distances = distances * (1 - labels) + 65500 * labels  # make positive ones large to avoid 0s

    mask = (neg_distances > easy_dist.view(-1, 1)) & (neg_distances < (easy_dist + margin).view(-1, 1))
    indices = torch.argmax(neg_distances * mask, dim=1)
  
    semi_hard_negatives = samples[indices]

    return easy_pos, semi_hard_negatives

In [27]:
class CustomTripletMarginLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(CustomTripletMarginLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative):
        pos_distance = F.pairwise_distance(anchor, positive)
        neg_distance = F.pairwise_distance(anchor, negative)
        
        loss = F.relu((pos_distance - neg_distance) / (torch.mean(neg_distance) + 1e-4) + self.margin)

        return torch.mean(loss)

In [28]:
def training_function(model, mixed_precision='fp16', seed=42, batch_size=64, lrate=2e-5, num_epochs=10, num_proc=1):
    set_seed(seed)
    accelerator = Accelerator(mixed_precision=mixed_precision)
    
    if accelerator.is_main_process:
        datasets.utils.logging.set_verbosity_warning()
        datasets.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()
    
    accelerator.print(f'Getting dataloaders...')
    
    train_dataloader, _, desc_dataloader = get_dataloaders(batch_size)
    
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=lrate)
    #lr_scheduler = OneCycleLR(optimizer=optimizer, max_lr=0.003, epochs=num_epochs, steps_per_epoch=len(train_dataloader))
    margin = 0.2
    #triplet_loss = nn.TripletMarginWithDistanceLoss(margin=margin)
    triplet_loss = CustomTripletMarginLoss(margin=margin)
    model, optimizer, train_dataloader, desc_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, desc_dataloader
    )
    accelerator.print(f'Begin epochs...')
    progress_bar = tqdm(range(num_epochs * len(train_dataloader)), disable=not accelerator.is_main_process)
    
    model.train()
    for epoch in range(num_epochs):
        train_loss = []
        val_loss = []
        for batch in train_dataloader:
            labels = batch['labels']
            embeddings = model(input_ids=batch['input_ids'],
                                    attention_mask=batch['attention_mask']).last_hidden_state
            embeddings = torch.mean(embeddings, dim=1)
            
            for desc in desc_dataloader:
                samples = model(input_ids=desc['input_ids'], attention_mask=desc['attention_mask']).last_hidden_state
                samples = torch.mean(samples, dim=1)
            
            concat_embeds = torch.cat((embeddings, samples), dim=0)
            norm_embeds = F.normalize(concat_embeds, p=2, dim=1)
            embeddings, samples = norm_embeds[:embeddings.size(0)], norm_embeds[embeddings.size(0):]
            if epoch < 10:
                pos_samples, neg_samples = semi_hard_miner(embeddings, samples, labels, margin)
                pos_texts, neg_texts = semi_hard_miner(samples, embeddings, labels.T, margin)
            else:
                pos_samples, neg_samples = hard_miner(embeddings, samples, labels)
                pos_texts, neg_texts = hard_miner(samples, embeddings, labels.T)
            
            loss = triplet_loss(embeddings, pos_samples, neg_samples)
            loss_samples = triplet_loss(samples, pos_texts, neg_texts)
            
            total_loss = loss + loss_samples
            accelerator.backward(total_loss)
            optimizer.step()
            #lr_scheduler.step()
            optimizer.zero_grad()
            train_loss.append(total_loss.detach().cpu().numpy())
            
            progress_bar.update(1)

        avg_train_loss = np.mean(np.array(train_loss)) / num_proc # multi gpu case
        accelerator.print(f'Epoch {epoch+1}, training loss: {avg_train_loss: .4f}')
        

In [29]:
seed = 42
batchsize = 32
lr = 3e-4
epochs = 20
numproc = 2 # how many gpus are used
args = (model, 'fp16', seed, batchsize, lr, epochs, numproc)
notebook_launcher(training_function, args, num_processes=numproc)
model.save_pretrained('deberta-metric-reduced-final')

Launching training on 2 GPUs.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Getting dataloaders...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Begin epochs...


  0%|          | 0/1900 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  5%|▌         | 95/1900 [01:45<32:36,  1.08s/it]

Epoch 1, training loss:  0.1755


 10%|█         | 190/1900 [03:28<30:46,  1.08s/it]

Epoch 2, training loss:  0.1632


 15%|█▌        | 285/1900 [05:10<28:53,  1.07s/it]

Epoch 3, training loss:  0.1712


 20%|██        | 380/1900 [06:54<27:26,  1.08s/it]

Epoch 4, training loss:  0.1860


 25%|██▌       | 475/1900 [08:37<25:38,  1.08s/it]

Epoch 5, training loss:  0.2324


 30%|███       | 570/1900 [10:20<23:53,  1.08s/it]

Epoch 6, training loss:  0.2335


 35%|███▌      | 665/1900 [12:03<22:07,  1.08s/it]

Epoch 7, training loss:  0.2179


 40%|████      | 760/1900 [13:47<20:38,  1.09s/it]

Epoch 8, training loss:  0.2178


 45%|████▌     | 855/1900 [15:30<18:46,  1.08s/it]

Epoch 9, training loss:  0.2070


 50%|█████     | 950/1900 [17:13<17:06,  1.08s/it]

Epoch 10, training loss:  0.2047


 55%|█████▌    | 1045/1900 [18:56<15:24,  1.08s/it]

Epoch 11, training loss:  0.3198


 60%|██████    | 1140/1900 [20:39<13:40,  1.08s/it]

Epoch 12, training loss:  0.2229


 65%|██████▌   | 1235/1900 [22:22<11:57,  1.08s/it]

Epoch 13, training loss:  0.2101


 70%|███████   | 1330/1900 [24:05<10:14,  1.08s/it]

Epoch 14, training loss:  0.2075


 75%|███████▌  | 1425/1900 [25:48<08:30,  1.07s/it]

Epoch 15, training loss:  0.2050


 80%|████████  | 1520/1900 [27:31<06:51,  1.08s/it]

Epoch 16, training loss:  0.2048


 85%|████████▌ | 1615/1900 [29:13<05:06,  1.08s/it]

Epoch 17, training loss:  0.2034


 90%|█████████ | 1710/1900 [30:56<03:25,  1.08s/it]

Epoch 18, training loss:  0.2032


 95%|█████████▌| 1805/1900 [32:39<01:43,  1.09s/it]

Epoch 19, training loss:  0.2033


100%|██████████| 1900/1900 [34:22<00:00,  1.08s/it]

Epoch 20, training loss:  0.2026


100%|██████████| 1900/1900 [34:22<00:00,  1.09s/it]


In [30]:
sims = []
_, test_dataloader, desc_dataloader = get_dataloaders(32)
dataframe = df_test
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

model.to(device)
model.eval()
with torch.no_grad():
    for desc in desc_dataloader:
            samples = model(input_ids=desc['input_ids'].to(device), 
                            attention_mask=desc['attention_mask'].to(device)).last_hidden_state
            samples = torch.mean(samples, dim=1)

    for batch in test_dataloader:
        embeddings = model(input_ids=batch['input_ids'].to(device),
                           attention_mask=batch['attention_mask'].to(device)).last_hidden_state
        embeddings = torch.mean(embeddings, dim=1)
        concat_embeds = torch.cat((embeddings, samples), dim=0)
        norm_embeds = F.normalize(concat_embeds, p=2, dim=1)
        embeddings, samples = norm_embeds[:embeddings.size(0)], norm_embeds[embeddings.size(0):]
        similarities = F.pairwise_distance(embeddings.unsqueeze(1), samples.unsqueeze(0))
        sims.append(similarities.cpu().numpy())

result_array = np.concatenate(sims, axis=0)
df_preds = dataframe
df_preds['predictions'] = result_array.tolist()
df_final = dataframe.drop(columns=['text'])
df_final = df_final.drop_duplicates(subset=['source'])
website_labels = df_final.set_index('source')['labels'].to_dict()
k = 3 # we take top 3 closest values as we have only 6 labels

label_dict = {source: [] for source in df_preds['source']}
for index, row in df_preds.iterrows():
    label_dict[row['source']].append(row['predictions'])
for key in label_dict:
    label_dict[key] = np.array([np.mean(values) for values in zip(*label_dict[key])])

    top_k_indices = np.argsort(label_dict[key])[:k]
    mask = np.zeros_like(label_dict[key])
    mask[top_k_indices] = 1
    result = mask
    label_dict[key] = result
y_true = list(website_labels.values())
y_pred = list(label_dict.values())
print(sklearn.metrics.classification_report(y_true, y_pred, target_names=list(id2label.values())))
f1score = sklearn.metrics.f1_score(y_true, y_pred, average='samples')
exactmr = sklearn.metrics.accuracy_score(y_true, y_pred, normalize=True, sample_weight=None)
hamming = sklearn.metrics.hamming_loss(y_true, y_pred)
print(f'Exact MR: {exactmr}, f1: {f1score}, hamming loss: {hamming}')

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


              precision    recall  f1-score   support

        Left       0.25      0.04      0.07        26
       Right       0.37      0.57      0.45        23
     Liberal       0.24      0.48      0.32        21
Conservative       0.21      0.53      0.31        17
    Credible       0.78      0.58      0.67        53
  Unreliable       0.15      0.46      0.23        13

   micro avg       0.35      0.46      0.40       153
   macro avg       0.33      0.44      0.34       153
weighted avg       0.44      0.46      0.41       153
 samples avg       0.35      0.43      0.37       153

Exact MR: 0.0, f1: 0.36972281449893385, hamming loss: 0.5323383084577115


In [31]:
import gc
gc.collect()
torch.cuda.empty_cache()