# Prompting strategies

In [2]:
# install libraries
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m70.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0
Looking in indexes: https://pypi.org/simple, https://us

In [3]:
# Google Colab: set current dir
# Mount Google Drive inside the Colab VM under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Make the Drive data folder the working directory: the relative paths used further down
# ("prompting-data/...") are resolved from here.
%cd /content/drive/MyDrive/data

Mounted at /content/drive
/content/drive/MyDrive/data


In [144]:
# load libraries
import os
import gc
import transformers
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import random
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers.file_utils import is_torch_available
from scipy.spatial import distance
from scipy.stats import pearsonr

In [5]:
# Pick the compute device once: CUDA when a GPU is visible, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

No GPU available, using the CPU instead.


In [6]:
def set_seed(seed: int):
    """
    Seed every random number generator used in this notebook for reproducible behavior.

    Seeds Python's ``random``, ``numpy`` and ``torch`` (CPU generator and all CUDA devices).

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    # torch is imported unconditionally by this notebook, so no availability check is needed.
    torch.manual_seed(seed)
    # Also called on CPU-only machines (the recorded run of this notebook had no GPU).
    torch.cuda.manual_seed_all(seed)

set_seed(1024)

In [145]:
# load data
# Paths are relative to the current working directory.
# NOTE(review): `dir` shadows the Python builtin of the same name.
dir = "prompting-data"
data_file = "dataset_shorter_chem_names.tsv"

# The file is tab-separated; print the first rows as a quick sanity check.
data = pd.read_csv(os.path.join(dir, data_file), sep="\t")
print(data.head())

   fungi_id                 fungi_name       family_name  pubchem_id  \
0    119834       Alternaria alternata     Pleosporaceae     5360741   
1    257047  Cephalosporium aphidicola   Cordycipitaceae      457964   
2    237604        Cordyceps militaris   Cordycipitaceae        6303   
3    284309          Aspergillus niger    Aspergillaceae     5748546   
4    815927     Albifimbria verrucaria  Stachybotryaceae     6326658   

                      chem_name  nb_ref  y  
0  Alternariol monomethyl ether      11  1  
1                   Aphidicolin      10  1  
2                    Cordycepin       6  1  
3                  Flavasperone       6  1  
4                  Verrucarin A       5  1  


In [8]:
# Display name -> checkpoint identifier. The identifiers are passed to
# AutoModelForMaskedLM.from_pretrained / AutoTokenizer.from_pretrained by the
# compute_* functions defined later in this notebook.
models = {'ChemicalBERT':'recobo/chemical-bert-uncased',

          'BioBERT':'dmis-lab/biobert-base-cased-v1.2',

          'BERT':'bert-base-uncased',
          'BERT-large': 'bert-large-cased-whole-word-masking',

          'RoBERTa':'roberta-base', # needs <mask>
          'RoBERTa-large':'roberta-large',

          'BigBird-RoBERTa-large':'google/bigbird-roberta-large',

          'Muppet-RoBERTa-large':'facebook/muppet-roberta-large',
          
          'PubMedBERT-full':'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext',
          'PubMedBERT':'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract',
          
          'Clinical-BigBird':'yikuan8/Clinical-BigBird',
          'Clinical-Longformer':'yikuan8/Clinical-Longformer'
}

## Task 1: Association Fungi - Chemical compound

In [9]:
# 1) Set-up manual prompts verbaliser

class ManualPromptDataset(Dataset):
  """
  Dataset that turns each (compound, fungus) row into a tokenized manual prompt.

  Args:
    data: DataFrame with a "chem_name" and a "fungi_name" column.
    tokenizer: object exposing `mask_token` and callable on a string with the
      `max_length`, `padding`, `truncation` and `return_tensors` keyword arguments.
    template: `str.format` template; it is filled with the keyword arguments
      `compound`, `mask` and `fungi`, so it may only use those placeholders.
    max_length: length every prompt is padded / truncated to.
  """

  def __init__(self, data, tokenizer, template, max_length):
    self.data = data
    self.tokenizer = tokenizer
    self.template = template
    self.max_length = max_length

  def __len__(self):
    return self.data.shape[0]

  def __getitem__(self, index):
    """Return the tokenizer output for row number `index`, without the leading batch axis."""
    # Positional lookup: a DataLoader asks for positions 0..len-1, which only coincide
    # with the row labels when the frame has been re-indexed from 0.
    chemical_name = self.data["chem_name"].iloc[index]
    fungi_name = self.data["fungi_name"].iloc[index]

    # Fill the prompt template, then tokenize it to a fixed length.
    filled_template = self.template.format(compound=chemical_name, mask=self.tokenizer.mask_token, fungi=fungi_name)
    tokenized = self.tokenizer(filled_template, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')

    # Tensors come back with a leading batch axis of size 1. Drop only that axis, so
    # that a sequence of length 1 is not collapsed to a scalar as well.
    for key in list(tokenized.keys()):
      tokenized[key] = tokenized[key].squeeze(0)

    return tokenized


def _masked_token_probabilities(batch, model, vocab_size, mask_token_id, device):
  """
  Run `model` on one batch and return the softmax distribution at every mask position.

  Returns a numpy array with one row per mask token found in `batch["input_ids"]`
  (rows in batch order, then sequence order) and `vocab_size` columns. Sequences that
  contain no mask token contribute no row.
  """
  # Keep a copy of the ids before the batch is sent to `device`, so that the mask
  # positions are located on the original tensor.
  input_ids = batch["input_ids"].clone()

  inputs = batch.to(device)

  # Inference only: no gradient is needed (the result is detached anyway).
  with torch.no_grad():
    out = model(**inputs)

  # (row, column) coordinates of each mask token. Using both coordinates is equivalent
  # to flattening (batch_size, seq_length, vocab) to (batch_size * seq_length, vocab)
  # and offsetting each column index by row * seq_length, but it stays correct when a
  # sequence holds zero or several mask tokens.
  rows, cols = (input_ids == mask_token_id).nonzero(as_tuple=True)

  logits = out.logits
  masked_logits = logits[rows.to(logits.device), cols.to(logits.device), 0:vocab_size]

  # Probabilities over the first `vocab_size` entries of the vocabulary axis.
  proba_masked_tokens = torch.nn.functional.softmax(masked_logits, dim=1)
  return proba_masked_tokens.detach().cpu().numpy()


def get_proba_matrix(dataloader, model, vocab_size, mask_token_id, device):
  """
  Collect the masked-token probability distribution of every example of `dataloader`.

  Args:
    dataloader: iterable of batches; each batch exposes `["input_ids"]` (2-D tensor),
      a `.to(device)` method and can be unpacked as keyword arguments of `model`.
    model: callable returning an object with a `.logits` tensor.
    vocab_size: number of vocabulary entries kept (columns of the result).
    mask_token_id: id of the mask token in `input_ids`.
    device: device the batches are sent to.

  Returns:
    float64 numpy array of shape (number of mask tokens, vocab_size); it has zero rows
    when the dataloader yields no batch.
  """
  # The empty float64 seed keeps the dtype and shape of the result stable, and all
  # chunks are concatenated once instead of re-copying the matrix at every batch.
  chunks = [np.empty((0, vocab_size))]

  for step, batch in enumerate(dataloader):
    print("    - batch: " + str(step))
    chunks.append(_masked_token_probabilities(batch, model, vocab_size, mask_token_id, device))

  return np.concatenate(chunks, axis=0)



def get_expected_top_k(dataloader, model, vocab_size, mask_token_id, k, device):
  """
  Get the top k expected tokens.

  The expected count of a token is the sum, over all mask positions of the dataset, of
  the probability the model gives to that token.

  Args:
    dataloader, model, vocab_size, mask_token_id, device: same as `get_proba_matrix`.
    k: number of tokens to return.

  Returns:
    (top_k_indexes, top_k_values): token ids sorted by decreasing expected count, and
    the matching expected counts.
  """

  expected_count_matrix = np.zeros(vocab_size)

  for step, batch in enumerate(dataloader):
    print("    - batch: " + str(step))

    proba_masked_tokens = _masked_token_probabilities(batch, model, vocab_size, mask_token_id, device)

    # accumulate the expected counts of this batch
    expected_count_matrix += np.sum(proba_masked_tokens, axis=0)

  top_k_indexes = np.argsort(expected_count_matrix)[::-1][:k]
  top_k_values = expected_count_matrix[top_k_indexes]

  return top_k_indexes, top_k_values


  
def get_top_1_distribution(input_matrix):
  """
  Share of rows won by each column.

  For every row of `input_matrix` (n x m) the column holding the largest value is
  taken as the row's winner; the result is the length-m vector of winner frequencies
  (winner counts divided by the total number of rows).
  """
  _, n_columns = input_matrix.shape

  # winning column of each example
  winners = np.asarray([np.argmax(row) for row in input_matrix], dtype=int)

  # tally the winners per column, then normalise to probabilities
  tallies = np.bincount(winners, minlength=n_columns).astype(float)
  return tallies / np.sum(tallies)




def compute_JS_divergences(m1, m2, n_sample):
  """
  Sample Jensen-Shannon values between random rows of m1 and random rows of m2.

  m1 and m2 are matrices of word probabilities of dim (n1 x V) and (n2 x V), where n1
  and n2 are the numbers of samples and V is the vocabulary size. The two matrices may
  have different numbers of rows; only V has to match.

  Args:
    m1, m2: 2-D numpy arrays, one probability distribution per row.
    n_sample: number of (row of m1, row of m2) pairs to draw, with replacement.

  Returns:
    numpy array of length `n_sample`, or False (after printing a message) when the two
    matrices do not have the same number of columns.

  NOTE(review): per the SciPy documentation `distance.jensenshannon` returns the
  Jensen-Shannon *distance* (square root of the divergence) - confirm that this is the
  quantity wanted downstream.
  """

  if m1.shape[1] != m2.shape[1]:
    print("m1 and m2 must have the same number of columns")
    return False

  n1 = m1.shape[0]
  n2 = m2.shape[0]
  JS_divergences = np.empty(n_sample)

  for k in range(n_sample):

    # Each row index is drawn from its own matrix. The calls to `random` are the same
    # as before, so seeded results are unchanged when n1 == n2.
    i = random.choice(range(n1))
    j = random.choice(range(n2))

    JS_divergences[k] = distance.jensenshannon(m1[i], m2[j])

  return JS_divergences

### Les templates

-  On a des templates pour faire de la sentiment analysis: on demande de compléter par un verbe ou un adjectif qui devrait être représentatif de la nature du statement : vrai ou faux

-  Les templates dits de prédiction demandent quant à eux de compléter la phrase avec un composé chimique.

In [10]:
# Prompt templates. Each one is filled with
# `template.format(compound=..., mask=..., fungi=...)` by ManualPromptDataset, so it may
# only use the {compound}, {mask} and {fungi} placeholders, and it needs a {mask}
# (the probability functions read the model output at the mask position).

# Task 1, "sentiment" style: the mask stands for a verb/adjective describing the statement.
templates_task1_sentiment = ['Compound {compound} was {mask} from fungus {fungi} with the antimicrobial guided isolation procedure.',
                 '{compound} was {mask} from {fungi} with the antimicrobial guided isolation procedure.',
                 'Compound {compound} was {mask} obtained from fungus {fungi} with the antimicrobial guided isolation procedure.',
                 '{compound} was {mask} obtained from {fungi} with the antimicrobial guided isolation procedure.',
                 'Fungus {fungi} showed {mask} {compound} activity.',
                 '{fungi} showed {mask} {compound} activity.',
                 'Fungus {fungi} {mask} {compound} activity.',
                 '{fungi} {mask} {compound} activity.',
                 'Fungus {fungi} {mask} compound {compound}.',
                 'Authors {mask} a natural product called {compound} from the {fungi}.',
                 'A strain {fungi} was isolated as a {mask} {compound} producer.',
                 'A strain {fungi} was {mask} as a high {compound} producer.',
                 'Compound {compound} is produced by fungus {fungi}. It is {mask}.']

# Task 1, "prediction" style: the mask stands for the chemical compound itself.
# The third template previously had no {mask} at all (it spelled out {compound}).
template_task1_prediction = ['The fungus {fungi} is a natural producer of the compound {mask}.',
                             'The compound {mask} was isolated and identified from culture of fungus {fungi}',
                             'Secondary metabolite {mask} has been isolated from crude extracts of fungi {fungi}.']

# Alias following the `templates_*` naming used by the other lists.
templates_task1_prediction = template_task1_prediction

# Task 2, "sentiment" style.
templates_task2_sentiment = ['Among isolated chemical compounds, {compound} presented {mask} antimicrobial activities.',
                 'Compound {compound} showed {mask} growth inhibition on strains.',
                 '{compound} showed {mask} growth inhibition on strains.',
                 'Compound {compound} showed {mask} growth inhibition on drug-resistant pathogenic strains.',
                 '{compound} showed {mask} growth inhibition on drug-resistant pathogenic strains.',
                 'Compound {compound} {mask} the growth of the strains.',
                 '{compound} {mask} the growth of the strains.',
                 'Compound {compound} has antibiotic activity. It is {mask}']


# Task 2, "prediction" style. The placeholder was {chemical}, which `.format` cannot
# fill (KeyError) since the dataset only passes compound/mask/fungi.
templates_task2_prediction = ['Compound {compound} showed {mask} activity.']

new_templates_task2_sentiment = ['Compound {compound} has antibiotic activity. It is {mask}']


# Reduced sets for quick test runs.
test_models = {'BioBERT':'dmis-lab/biobert-base-cased-v1.2',
               'RoBERTa':'roberta-base'}

test_templates_task1_sentiment = ['Compound {compound} was {mask} from fungus {fungi} with the antimicrobial guided isolation procedure.',
  '{compound} was {mask} from {fungi} with the antimicrobial guided isolation procedure.']

In [11]:
# Split the pairs on the label column `y`; reset_index() renumbers the rows from 0,
# which ManualPromptDataset relies on (it reads rows with `.loc[index]`).
positive_pairs = data[data["y"] == 1].reset_index()
negative_pairs = data[data["y"] == 0].reset_index()

# Load BioBERT with its masked-LM head, plus the matching fast tokenizer.
model = AutoModelForMaskedLM.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
model.to(device)
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2', use_fast=True)
vocab_size = tokenizer.vocab_size
mask_token_id = tokenizer.mask_token_id

# First task-2 sentiment template; every prompt is padded / truncated to `max_length` tokens.
template = templates_task2_sentiment[0]
max_length = 64
batch_size = 64

# NOTE(review): the loader is named `dataloader_positive` but is built from
# `negative_pairs` - confirm which of the two was intended.
dataset = ManualPromptDataset(data=negative_pairs, tokenizer=tokenizer, template=template, max_length=max_length)
dataloader_positive = DataLoader(dataset, batch_size = batch_size, shuffle = False, num_workers = 0)

# a = get_proba_matrix(dataloader_positive, model, vocab_size, mask_token_id, device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [85]:
# Reload BioBERT and its tokenizer for the manual checks below
# (this cell does not call `.to(device)` on the model).
model = AutoModelForMaskedLM.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2', use_fast=True)

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [129]:
# Tokenize a probe sentence ending with four consecutive [MASK] tokens, padded / truncated to 64 tokens.
# NOTE(review): `input` shadows the Python builtin of the same name.
input = tokenizer("carcinosarcoma of lung has a genetic association with [MASK] [MASK] [MASK] [MASK]", max_length=64, padding='max_length', truncation=True, return_tensors='pt')

In [130]:
(input["input_ids"] == mask_token_id).nonzero(as_tuple=True)[1]

tensor([13, 14, 15, 16])

In [140]:
tokenizer.encode("deoxy")

[101, 1260, 10649, 1183, 102]

In [143]:
tokenizer.decode(1183)

'##y'

In [131]:
# Forward pass on the tokenized probe sentence; `out.logits` is inspected in the next cell.
out = model(**input)

In [132]:
[np.argmax(out.logits[0, i, :].cpu().detach().numpy()) for i in [13, 14, 15, 16]]

[145, 1105, 3187, 119]

In [119]:
print(tokenizer.decode(5358))
print(tokenizer.decode(8508))
print(tokenizer.decode(2042))

da
##pi
##ine


In [134]:
tokenizer.decode([101,  1610, 16430,  9275, 19878,  7903,  1104, 13093,  1144,   170,
          7434,  3852,  1114,   145, 1105, 3187, 119,   102])

'[CLS] carcinosarcoma of lung has a genetic association with H and risk. [SEP]'

## Run evaluation on all combinations and save the results

### What's the idea ?

-- Hyp: The distribution of the answers should be different between true (positive) pairs and negative pairs, and this difference should be sufficiently significant.

-- How to compute the difference between the answer distributions? We compute the JS-divergence between the word probability distributions predicted for the MASKED token.

So, the JS-divergence observed between the word probability distribution of a positive pair and that of a negative pair should be high: the model should not give the same answer in both cases.

-- How to tell if it is significant?
To estimate whether a JS-divergence value is significant, we can compare it to the JS-divergence values obtained between pairs of positive examples or between pairs of negative examples. Between two positive examples (or between two negative examples) the JS-divergence should be small, as the model should produce a similar answer distribution for all positive examples, and likewise for all negative examples.

So the idea is simply to compute a kind of Monte-Carlo p-value: we estimate the probability that the JS-divergence between a positive and a negative example is greater than the JS-divergence between two positive examples, or between two negative examples.


If we find a value close to 0.5 (as we did), this means that the answer distributions of positive and negative examples do not differ much more than the natural variability observed when comparing positive examples with each other, or negative examples with each other.






In [10]:

def compute_JS_divergences_on_dataset(positive_pairs, negative_pairs, models_set, templates_set, n_sample):
  """
  Sample Jensen-Shannon values between masked-token distributions, for every model and template.

  For each (model, template) the masked-token probability matrix is computed for the
  positive and for the negative pairs, then `n_sample` values are drawn with
  `compute_JS_divergences` for positive x positive, negative x negative and
  positive x negative rows.

  Args:
    positive_pairs, negative_pairs: DataFrames accepted by `ManualPromptDataset`.
    models_set: dict mapping a display name to a checkpoint identifier.
    templates_set: list of prompt templates.
    n_sample: number of values sampled per comparison.

  Returns:
    DataFrame with the columns "model", "Template", "PosxPos", "NegxNeg" and "PosxNeg"
    (`n_sample` rows per model and template); an empty DataFrame when there is nothing
    to evaluate.

  Raises:
    ValueError: when `compute_JS_divergences` rejects the positive / negative matrices.
  """

  print("Device: " + str(device))
  batch_size = 64
  max_length = 64

  # One table per (model, template), concatenated once at the end instead of
  # re-copying the growing table at every iteration.
  tables = []

  # For each model
  for model_name, model_ref in models_set.items():

    print("Treating model " + model_name)

    # load model and tokenizer
    model = AutoModelForMaskedLM.from_pretrained(model_ref)
    model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_ref, use_fast=True)
    vocab_size = tokenizer.vocab_size
    mask_token_id = tokenizer.mask_token_id

    # just inference, no backward needed
    with torch.no_grad():

      for template_index, template in enumerate(templates_set):

        print(" - Template: " + str(template_index))

        # load data for POSITIVE pairs
        dataset = ManualPromptDataset(data=positive_pairs, tokenizer=tokenizer, template=template, max_length=max_length)
        dataloader_positive = DataLoader(dataset, batch_size = batch_size, shuffle = False, num_workers = 0)

        # get proba matrix for positive examples
        proba_matrix_positives = get_proba_matrix(dataloader_positive, model, vocab_size, mask_token_id, device)

        # load data for NEGATIVE pairs
        dataset = ManualPromptDataset(data=negative_pairs, tokenizer=tokenizer, template=template, max_length=max_length)
        dataloader_negative = DataLoader(dataset, batch_size = batch_size, shuffle = False, num_workers = 0)

        # get proba matrix for negative examples
        proba_matrix_negatives = get_proba_matrix(dataloader_negative, model, vocab_size, mask_token_id, device)

        # Compute Positive x Positive JS divergence:
        pos_pos_JS_divergences = compute_JS_divergences(proba_matrix_positives, proba_matrix_positives, n_sample)

        # Compute Negative x Negative JS divergence:
        neg_neg_JS_divergences = compute_JS_divergences(proba_matrix_negatives, proba_matrix_negatives, n_sample)

        # Compute Positive x Negative JS divergence:
        pos_neg_JS_divergences = compute_JS_divergences(proba_matrix_positives, proba_matrix_negatives, n_sample)

        # compute_JS_divergences signals incompatible matrices by returning False;
        # fail here with a clear message rather than on `len(False)` below.
        if pos_neg_JS_divergences is False:
          raise ValueError("positive and negative probability matrices are not compatible: "
                           + str(proba_matrix_positives.shape) + " vs " + str(proba_matrix_negatives.shape))
        n = len(pos_neg_JS_divergences)

        # Compile
        tables.append(pd.DataFrame({"model": [model_name] * n, "Template": [template_index] * n, "PosxPos": pos_pos_JS_divergences, "NegxNeg": neg_neg_JS_divergences, "PosxNeg": pos_neg_JS_divergences}))

    # clean: drop the references to the model, then release cached GPU memory
    model = None
    tokenizer = None
    gc.collect()
    torch.cuda.empty_cache()

  # output
  if not tables:
    return pd.DataFrame()
  return pd.concat(tables)



def compute_top_k_tokens_on_dataset(positive_pairs, negative_pairs, models_set, templates_set, k):
  """
  List the k tokens with the highest expected count at the mask, for every model and template.

  The ranking is computed separately for the positive and for the negative pairs with
  `get_expected_top_k`, and the token ids are decoded with the model's tokenizer.

  Args:
    positive_pairs, negative_pairs: DataFrames accepted by `ManualPromptDataset`.
    models_set: dict mapping a display name to a checkpoint identifier.
    templates_set: list of prompt templates.
    k: number of tokens kept per ranking.

  Returns:
    DataFrame with the columns "model", "Template", "Type" ("Positive" / "Negative"),
    "Rank" (1..k), "index" (token id), "word" and "count" - 2 * k rows per model and
    template; an empty DataFrame when there is nothing to evaluate.
  """

  print("Device: " + str(device))
  batch_size = 64
  max_length = 64

  # One table per (model, template), concatenated once at the end instead of
  # re-copying the growing table at every iteration.
  tables = []

  # For each model
  for model_name, model_ref in models_set.items():

    print("Treating model " + model_name)

    # load model and tokenizer
    model = AutoModelForMaskedLM.from_pretrained(model_ref)
    model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_ref, use_fast=True)
    vocab_size = tokenizer.vocab_size
    mask_token_id = tokenizer.mask_token_id

    # just inference, no backward needed
    with torch.no_grad():

      for template_index, template in enumerate(templates_set):

        print(" - Template: " + str(template_index))

        # load data for POSITIVE pairs
        dataset_positive = ManualPromptDataset(data=positive_pairs, tokenizer=tokenizer, template=template, max_length=max_length)
        dataloader_positive = DataLoader(dataset_positive, batch_size = batch_size, shuffle = False, num_workers = 0)

        # get tokens expected counts for positive examples
        top_k_indexes_positives, top_k_e_counts_positives = get_expected_top_k(dataloader_positive, model, vocab_size, mask_token_id, k, device)
        top_k_tokens_positives = [tokenizer.decode(t) for t in top_k_indexes_positives]

        # load data for NEGATIVE pairs
        dataset_negative = ManualPromptDataset(data=negative_pairs, tokenizer=tokenizer, template=template, max_length=max_length)
        dataloader_negative = DataLoader(dataset_negative, batch_size = batch_size, shuffle = False, num_workers = 0)

        # get tokens expected counts for negative examples
        top_k_indexes_negatives, top_k_e_counts_negatives = get_expected_top_k(dataloader_negative, model, vocab_size, mask_token_id, k, device)
        top_k_tokens_negatives = [tokenizer.decode(t) for t in top_k_indexes_negatives]

        # Compile: k rows for the positives followed by k rows for the negatives
        n = k * 2
        tables.append(pd.DataFrame({
            "model": [model_name] * n,
            "Template": [template_index] * n,
            "Type": ["Positive"] * k + ["Negative"] * k,
            "Rank": list(range(1, k + 1)) * 2,
            "index": np.concatenate((top_k_indexes_positives, top_k_indexes_negatives)),
            "word": np.concatenate((top_k_tokens_positives, top_k_tokens_negatives)),
            "count": np.concatenate((top_k_e_counts_positives, top_k_e_counts_negatives)),
        }))

    # clean: drop the references to the model, then release cached GPU memory
    model = None
    tokenizer = None
    gc.collect()
    torch.cuda.empty_cache()

  if not tables:
    return pd.DataFrame()
  return pd.concat(tables)



def compute_top1_correlation(positive_pairs, negative_pairs, models_set, templates_set):
  """
  Correlate the top-1 token distributions of positive and negative pairs.

  For every model and template, the most probable token at the mask is taken for each
  example; the resulting token frequency vectors of the positive and of the negative
  pairs are compared with the Pearson correlation coefficient.

  Returns a DataFrame with the columns "model", "Template" and "cors", one row per
  model and template.
  """

  print("Device: " + str(device))
  max_length = 64
  batch_size = 64

  n_templates = len(templates_set)
  top1_correlations = pd.DataFrame()

  def _top1_distribution(pairs, tokenizer, model, template):
    # masked-token probabilities of every pair -> frequency of each top-1 token
    prompts = ManualPromptDataset(data=pairs, tokenizer=tokenizer, template=template, max_length=max_length)
    loader = DataLoader(prompts, batch_size = batch_size, shuffle = False, num_workers = 0)
    proba_matrix = get_proba_matrix(loader, model, tokenizer.vocab_size, tokenizer.mask_token_id, device)
    return get_top_1_distribution(proba_matrix)

  # For each model
  for model_name, model_ref in models_set.items():

    print("Treating model " + model_name)

    # load model and tokenizer
    model = AutoModelForMaskedLM.from_pretrained(model_ref)
    model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_ref, use_fast=True)

    cors = np.empty(n_templates)

    # just inference, no backward needed
    with torch.no_grad():

      for template_index, template in enumerate(templates_set):

        print(" - Template: " + str(template_index))

        # positives first, then negatives
        top1_pos = _top1_distribution(positive_pairs, tokenizer, model, template)
        top1_neg = _top1_distribution(negative_pairs, tokenizer, model, template)

        # Pearson r between the two frequency vectors (the p-value is dropped)
        r_value = pearsonr(top1_pos, top1_neg)[0]
        cors[template_index] = r_value

    # one row per template for this model
    model_rows = pd.DataFrame({"model": [model_name] * n_templates , "Template": list(range(n_templates)), "cors": cors})
    top1_correlations = pd.concat([top1_correlations, model_rows])

    # clean: drop the references to the model, then release cached GPU memory
    model = None
    tokenizer = None
    gc.collect()
    torch.cuda.empty_cache()

  # output
  return top1_correlations


#### For task 1:

In [None]:
##  Get JS - Divergences
# Number of JS values sampled per comparison, for each model and template.
n_sample = 5000
# NOTE(review): task 1 splits the pairs on the sign of `nb_ref`, whereas the task-2
# cells split on `y` - confirm that negative pairs really carry a negative `nb_ref`.
# reset_index() renumbers rows from 0, as ManualPromptDataset reads rows with `.loc[index]`.
positive_pairs = data[data["nb_ref"] > 0].reset_index()
negative_pairs = data[data["nb_ref"] < 0].reset_index()

# Evaluate every model x task-1 sentiment template and save the table as TSV.
JS = compute_JS_divergences_on_dataset(positive_pairs, negative_pairs, models, templates_task1_sentiment, n_sample)
JS.to_csv("prompting-data/results/JS_divergence_complete_template_task1_5000.tsv", index=False, sep="\t")

##  Get Top k tokens
# k most expected tokens at the mask, for positives and negatives separately.
k = 20
TOP_K = compute_top_k_tokens_on_dataset(positive_pairs, negative_pairs, models, templates_task1_sentiment, k)
TOP_K.to_csv("prompting-data/results/top_k_table_task1.tsv", index=False, sep="\t")

In [None]:
# Compute the Top 1 token correlation

# Same positive / negative split as the task-1 JS cell (sign of `nb_ref`).
positive_pairs = data[data["nb_ref"] > 0].reset_index()
negative_pairs = data[data["nb_ref"] < 0].reset_index()

# One Pearson correlation per model and template, saved as TSV.
TOP_1_COR = compute_top1_correlation(positive_pairs, negative_pairs, models, templates_task1_sentiment)
TOP_1_COR.to_csv("prompting-data/results/top_1_cor_task1.tsv", index=False, sep="\t")

In [41]:
TOP_1_COR

Unnamed: 0,model,Template,cors
0,BioBERT,0,1.0
1,BioBERT,1,1.0
0,RoBERTa,0,0.998396
1,RoBERTa,1,0.998348


#### For task 2:

In [None]:
##  Get JS - Divergences
# Number of JS values sampled per comparison, for each model and template.
n_sample = 5000
# Task 2 splits the pairs on the label column `y`.
positive_pairs = data[data["y"] == 1].reset_index()
negative_pairs = data[data["y"] == 0].reset_index()

# Only the `new_templates_task2_sentiment` list is evaluated here.
JS_2 = compute_JS_divergences_on_dataset(positive_pairs, negative_pairs, models, new_templates_task2_sentiment, n_sample)
JS_2.to_csv("prompting-data/results/new_JS_divergence_table_complete_task2_2.tsv", index=False, sep="\t")

##  Get Top k tokens
# NOTE: `TOP_K` and `TOP_1_COR` are rebound here, replacing the task-1 results held in memory.
k = 20
TOP_K = compute_top_k_tokens_on_dataset(positive_pairs, negative_pairs, models, new_templates_task2_sentiment, k)
TOP_K.to_csv("prompting-data/results/new_top_k_table_task2.tsv", index=False, sep="\t")

TOP_1_COR = compute_top1_correlation(positive_pairs, negative_pairs, models, new_templates_task2_sentiment)
TOP_1_COR.to_csv("prompting-data/results/new_top_1_cor_task2.tsv", index=False, sep="\t")

In [None]:
# Rebuild the label-based (`y`) split of the pairs; same statements as in the task-2 cell.
positive_pairs = data[data["y"] == 1].reset_index()
negative_pairs = data[data["y"] == 0].reset_index()




## Run expected tokens counts 

In [None]:
# Bind a second name to the task-2 JS table (plain assignment: same DataFrame object, not a copy).
save_test_1 = JS_2

In [None]:
save_test_1.shape

(420000, 5)

In [None]:
TOP_K.shape

(3360, 7)