# Plausibility
by Lucrezia Labardi

## Mount drive and import tokenizer

In [None]:
# Mount Google Drive at /content/drive so the files stored there can be read
# and written from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Move into the thesis project folder on the mounted drive.
# An absolute path (the drive is mounted at /content/drive) keeps this cell
# idempotent: with a relative path, re-running the cell after the working
# directory has already changed fails with FileNotFoundError.
os.chdir('/content/drive/MyDrive/Tesi-magistrale')

In [None]:
import torch
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is a notebook-level name reused by later cells.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
from transformers import AutoTokenizer
# Load the pretrained tokenizer identified as "dbmdz/bert-base-italian-cased";
# the same tokenizer object is passed to every scoring call below.
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-italian-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]



## Function definition

In [None]:
def mask_all_tokens(sentence, tokenizer):
    """Build one masked encoding of `sentence` per inner token position.

    The sentence is tokenized, and for every position except the first and the
    last one (presumably the special tokens the tokenizer adds around the
    sentence -- confirm for tokenizers other than BERT-style ones) a fresh
    encoding is produced in which that single position is replaced by
    `tokenizer.mask_token_id`.

    Args:
        sentence: Text to tokenize and mask.
        tokenizer: Callable invoked as `tokenizer(sentence, return_tensors='pt')`;
            its result must expose `['input_ids']` with a leading batch axis,
            and the tokenizer must provide `mask_token_id`.

    Returns:
        list: The masked encodings, ordered by masked position (the i-th item,
        counting from 0, has its mask at position i + 1). Empty when the
        encoded sentence has no inner position.
    """
    sequence_length = len(tokenizer(sentence, return_tensors='pt')['input_ids'][0])

    masked_sentences = []
    for token_idx in range(1, sequence_length - 1):
        # Tokenize again on every iteration so each masked variant owns its
        # tensors; writing the mask into a shared encoding would accumulate
        # masks from previous iterations.
        masked_input = tokenizer(sentence, return_tensors='pt')
        masked_input['input_ids'][0][token_idx] = tokenizer.mask_token_id
        masked_sentences.append(masked_input)
    return masked_sentences

In [None]:
def compute_plausibilita(sentence, tokenizer, model):
    """Compute the plausibility score of `sentence` under a masked language model.

    Each inner token of the sentence is masked in turn (see `mask_all_tokens`),
    the model is run on every masked variant, and the softmax probability
    assigned to the original token at the masked position is collected. The
    score is the arithmetic mean of those probabilities.

    Args:
        sentence: Text to score.
        tokenizer: Tokenizer used to encode the sentence; called directly and
            through `mask_all_tokens`.
        model: Model called as `model(input_ids=..., attention_mask=...)` whose
            result exposes `.logits`, indexed here as `[0, position]` to get
            the vocabulary scores of one position.

    Returns:
        float: Mean probability of the original tokens, or NaN when the
        sentence yields no maskable token (instead of dividing by zero).
    """
    masked_sentences = mask_all_tokens(sentence, tokenizer)
    if not masked_sentences:
        return float("nan")

    original_tokens = tokenizer(sentence)['input_ids']

    # Send the inputs to the device the model actually lives on rather than
    # relying on a notebook-level `device` global; a model without parameters
    # gets the tensors as they are.
    first_param = next(model.parameters(), None)

    correct_probs = []
    with torch.no_grad():
        # mask_all_tokens masks positions 1, 2, ... in order, so the variant
        # at list index i has its mask at position i + 1.
        for masked_idx, masked_sentence in enumerate(masked_sentences, start=1):
            input_ids = masked_sentence['input_ids']
            attention_mask = masked_sentence['attention_mask']
            if first_param is not None:
                input_ids = input_ids.to(first_param.device)
                attention_mask = attention_mask.to(first_param.device)

            # Run the model on the masked sentence.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Probability distribution over the vocabulary at the masked position.
            probs = outputs.logits[0, masked_idx].softmax(dim=-1)

            # Keep the probability assigned to the token that was masked out.
            correct_probs.append(probs[original_tokens[masked_idx]].item())

    return sum(correct_probs) / len(correct_probs)

## Run compute_plausibilita on each dataset of sentences of different lengths

In [None]:
import pandas as pd
# Load the sentence dataset from the f30_words.csv file and preview its first rows.
df = pd.read_csv("Perplexity/frasi_perplexity/f30_words.csv")
df.head()

Unnamed: 0,text,num_words,dataset,pp_gulp_c2,pp_gulp_c32,pp_gulp_c512,pp_gulp_c8192,pp_gulp_c15449,pp_antigulp_c2,pp_antigulp_c32,...,pp_random29_c2,pp_random29_c32,pp_random29_c512,pp_random29_c8192,pp_random29_c15449,pp_misto_c2,pp_misto_c32,pp_misto_c512,pp_misto_c8192,pp_misto_c15449
0,"La Linea Maginot, intesa come difesa costruita...",30,wiki,4.5e-05,0.000791,0.010241,0.046744,0.060732,4.3e-05,0.000792,...,5.2e-05,0.000947,0.00965,0.052825,0.059921,4.2e-05,0.000627,0.010616,0.046799,0.063456
1,Sul lato destro della chiesa si trova il sagra...,30,wiki,5.8e-05,0.000645,0.023274,0.103927,0.143298,5.2e-05,0.000645,...,6e-05,0.000749,0.026805,0.115305,0.125875,5.6e-05,0.00054,0.029912,0.089399,0.127669
2,Il capo famiglia degli Allan non ha un momento...,30,wiki,4.7e-05,0.000252,0.024216,0.136757,0.147485,4.8e-05,0.000249,...,4.3e-05,0.000235,0.026362,0.142172,0.150791,5e-05,0.00024,0.030869,0.135445,0.145091
3,La COLT Technology Services (ex COLT Telecom G...,30,wiki,5e-05,0.000594,0.024413,0.187951,0.221064,5e-05,0.000621,...,5.3e-05,0.000691,0.025741,0.186906,0.206871,4.5e-05,0.000486,0.029895,0.188293,0.208359
4,"La Nazionale azzurra, nella specialità della p...",30,wiki,5e-05,0.000574,0.02228,0.111527,0.156681,4.5e-05,0.000561,...,5.4e-05,0.000659,0.024827,0.111153,0.153795,4.3e-05,0.00046,0.027233,0.120712,0.162296


In [None]:
from transformers import BertForMaskedLM
from tqdm import tqdm
tqdm.pandas()

# Suffixes of the f{m}_words.csv files to process.
# Full set: [10, 15, 20, 25, 30]
modelli = [30]

# Training checkpoints and pre-training orders to evaluate; together they
# select the model directory loaded in the inner loop.
checkpoints = [2, 32, 512, 8192, 15449]
orders = ["gulp", "antigulp", "random5", "random18", "random29"]

for m in modelli:
    df = pd.read_csv(f"Perplexity/frasi_perplexity/f{m}_words.csv")

    for order in tqdm(orders, desc="Processing Orders"):
        for c in tqdm(checkpoints, desc=f"Processing Checkpoints for {order}", leave=False):
            model = BertForMaskedLM.from_pretrained(f'Pre-training/medium_{order}_order/checkpoint-{c}')
            model.to(device)
            newcol = f"pp_{order}_c{c}"
            # Score every sentence with the plausibility function defined in
            # this notebook (the previous call targeted `compute_perplexity`,
            # which is not defined here and raised NameError).
            df[newcol] = df["text"].apply(lambda x: compute_plausibilita(x, tokenizer, model))

    # NOTE: the scores are written back to the same CSV that was read above,
    # so the input file is overwritten with the extra columns.
    df.to_csv(f"Perplexity/frasi_perplexity/f{m}_words.csv", index=False)

In [None]:
# Sanity check: (rows, columns) of the frame after the scoring step.
df.shape

(100, 33)

In [None]:
# Display the scored frame (the notebook shows a truncated view of rows and columns).
df

Unnamed: 0,text,num_words,dataset,pp_gulp_c2,pp_gulp_c32,pp_gulp_c512,pp_gulp_c8192,pp_gulp_c15449,pp_antigulp_c2,pp_antigulp_c32,...,pp_random29_c32,pp_random29_c512,pp_random29_c8192,pp_random29_c15449,pp_misto_c2,pp_misto_c32,pp_misto_c512,pp_misto_c8192,pp_misto_c15449,pp_readit2_c15449
0,"La Linea Maginot, intesa come difesa costruita...",30,wiki,0.000045,0.000791,0.010241,0.046744,0.060732,0.000043,0.000792,...,0.000947,0.009650,0.052825,0.059921,0.000042,0.000627,0.010616,0.046799,0.063456,0.089183
1,Sul lato destro della chiesa si trova il sagra...,30,wiki,0.000058,0.000645,0.023274,0.103927,0.143298,0.000052,0.000645,...,0.000749,0.026805,0.115305,0.125875,0.000056,0.000540,0.029912,0.089399,0.127669,0.161039
2,Il capo famiglia degli Allan non ha un momento...,30,wiki,0.000047,0.000252,0.024216,0.136757,0.147485,0.000048,0.000249,...,0.000235,0.026362,0.142172,0.150791,0.000050,0.000240,0.030869,0.135445,0.145091,0.160665
3,La COLT Technology Services (ex COLT Telecom G...,30,wiki,0.000050,0.000594,0.024413,0.187951,0.221064,0.000050,0.000621,...,0.000691,0.025741,0.186906,0.206871,0.000045,0.000486,0.029895,0.188293,0.208359,0.239931
4,"La Nazionale azzurra, nella specialità della p...",30,wiki,0.000050,0.000574,0.022280,0.111527,0.156681,0.000045,0.000561,...,0.000659,0.024827,0.111153,0.153795,0.000043,0.000460,0.027233,0.120712,0.162296,0.177744
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"Non è escluso, dunque, che la vicenda si chiud...",30,treebank,0.000061,0.000867,0.024138,0.114936,0.141774,0.000056,0.000865,...,0.000999,0.027393,0.127301,0.142965,0.000063,0.000708,0.027339,0.115552,0.146054,0.157574
96,"La Bbc Company, l'agenzia che organizza le dat...",30,treebank,0.000053,0.000595,0.025371,0.099543,0.109656,0.000052,0.000580,...,0.000668,0.027000,0.097070,0.107353,0.000049,0.000480,0.029587,0.102895,0.121549,0.140136
97,Obiettivi conseguiti grazie anche alla complet...,30,treebank,0.000048,0.000229,0.029944,0.117975,0.132045,0.000054,0.000219,...,0.000212,0.032916,0.125583,0.142817,0.000054,0.000215,0.034251,0.119816,0.135486,0.155114
98,Ma il cinismo declinò al progredire della sua ...,30,treebank,0.000054,0.000426,0.024969,0.086818,0.115435,0.000056,0.000407,...,0.000459,0.026476,0.086985,0.121603,0.000053,0.000372,0.027483,0.086992,0.121371,0.167025


In [None]:
# Inspect the scores stored in the "pp_readit2_c15449" column.
# NOTE(review): "readit2" is not among the `orders` processed by the scoring
# loop in this notebook; presumably the column was already in the CSV from an
# earlier run -- confirm its origin.
df["pp_readit2_c15449"]

Unnamed: 0,pp_readit2_c15449
0,0.089183
1,0.161039
2,0.160665
3,0.239931
4,0.177744
...,...
95,0.157574
96,0.140136
97,0.155114
98,0.167025
