In [1]:
!pip install conllu

Collecting conllu
  Downloading conllu-6.0.0-py3-none-any.whl.metadata (21 kB)
Downloading conllu-6.0.0-py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-6.0.0


In [2]:
# from conllu import parse
from google.colab import drive
import pandas as pd
import torch
from tqdm import tqdm
import numpy as np
import glob
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, GPTNeoXForCausalLM

In [16]:
# Mount Google Drive; later cells read from and write to paths under this
# mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
def to_tokens_and_logprobs(model, tokenizer, input_texts):
    """Score texts with a causal LM and return per-token surprisals.

    Despite the function name, the number in each returned pair is a
    surprisal in bits, i.e. ``-log2 p(token | preceding tokens)``, not a
    log-probability.

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` whose
            result has a ``logits`` attribute; the logits are indexed below
            as (batch, sequence position, vocabulary).
        tokenizer: Object providing ``bos_token``, ``pad_token``,
            ``all_special_ids``, ``decode`` and a ``__call__`` whose result
            exposes ``input_ids`` and ``attention_mask`` tensors.
        input_texts: List of strings to score.

    Returns:
        One flat list of ``(decoded_token, surprisal_in_bits)`` tuples covering
        all texts in input order. Tokens whose id is in
        ``tokenizer.all_special_ids`` (the prepended start token, padding) are
        left out, and there is no separator between sentences.
    """
    import math  # local import: the notebook's import cell does not provide `math`

    # Prepend a start token so the first real token is predicted from some context.
    bos_token = tokenizer.bos_token if tokenizer.bos_token is not None else tokenizer.pad_token
    padded_texts = [bos_token + " " + text for text in input_texts]

    encoded = tokenizer(padded_texts, padding=True, return_tensors="pt")
    input_ids = encoded.input_ids

    # Inference only, so do not build an autograd graph. The attention mask
    # tells the model which positions are padding when texts differ in length.
    # NOTE(review): for left-padded batches the position ids may also need
    # adjusting for some models -- confirm before scoring more than one text
    # per call.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=encoded.attention_mask)

    # log_softmax stays finite where softmax followed by log2 would underflow
    # to probability 0 and yield an infinite surprisal. Dividing by ln(2)
    # converts natural-log units to bits.
    surprisals = -torch.log_softmax(outputs.logits, dim=-1) / math.log(2)

    # The distribution at position k predicts the token at position k + 1, so
    # drop the last prediction and the first token to align the two.
    surprisals = surprisals[:, :-1, :]
    target_ids = input_ids[:, 1:]
    gen_surprisals = torch.gather(surprisals, 2, target_ids[:, :, None]).squeeze(-1)

    # Read the special ids once instead of once per token.
    special_ids = set(tokenizer.all_special_ids)

    text_sequence = []
    for sentence_ids, sentence_surprisals in zip(target_ids, gen_surprisals):
        for token, surprisal in zip(sentence_ids, sentence_surprisals):
            if token.item() not in special_ids:
                text_sequence.append((tokenizer.decode(token), surprisal.item()))
    return text_sequence

In [6]:
# NOTE: despite its name, `folder_name` is used as a filename prefix, not a
# directory: the glob pattern below is "<folder_name>*.<file_type>".
folder_name = '/content/drive/MyDrive/comp_drop/sentences_with_comp_test'
file_type = 'csv'

# glob returns matches in arbitrary order; sorting keeps the row order (and
# therefore the integer index) of `sentences` identical from run to run.
csv_pattern = folder_name + "*." + file_type
csv_paths = sorted(glob.glob(csv_pattern))
if not csv_paths:
    # Fail with the pattern in the message rather than pandas' generic
    # "No objects to concatenate".
    raise FileNotFoundError("No input files match " + csv_pattern)

sentences = pd.concat([pd.read_csv(path, sep=",") for path in csv_paths], ignore_index=True)


In [8]:
# Load the tokenizer (left-padding) and the causal LM from the same checkpoint.
checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# Reuse the end-of-sequence token for padding, on both the tokenizer and the
# model config.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Added to `verb index + matrix_verb_to_cc` to get the (exclusive) end of the
# "context_verb_n" prefix.
# NOTE(review): presumably "how many embedded-clause words to score" -- confirm
# that 8 is the intended count given the exclusive slice end.
N_EXTRA_WORDS = 8


def _total_surprisal(text):
    """Return the summed per-token surprisal of `text` under `model`.

    to_tokens_and_logprobs yields (token, value) pairs; only the values are
    summed. Those values are surprisals (-log2 p), not log-probabilities.
    """
    return sum(value for _, value in to_tokens_and_logprobs(model, tokenizer, [text]))


# For every row, build three whitespace-joined prefixes of `pseudo_that` and
# store them plus two surprisal differences:
#   embedded_n_sum = S(context_verb_n) - S(context_verb)
#   verb_sum       = S(context_verb)   - S(context)
# where S is the total surprisal of the prefix.
for i, row in tqdm(sentences.iterrows(), total=len(sentences)):
    words = row.pseudo_that.split()
    verb_in_text = row.verb_in_text

    try:
        verb_index = words.index(verb_in_text)
    except ValueError:
        # The verb is not a whitespace-delimited word of pseudo_that: report
        # the row and skip it (its new columns stay empty).
        print(row.sentence)
        print(verb_in_text)
        continue

    # Everything before the verb.
    context = " ".join(words[:verb_index])
    sentences.loc[i, "context"] = context

    # Everything up to and including the verb.
    context_verb = " ".join(words[:verb_index + 1])
    sentences.loc[i, "context_verb"] = context_verb

    # The verb prefix extended into the following words.
    prefix_end = verb_index + row.matrix_verb_to_cc + N_EXTRA_WORDS
    context_verb_n = " ".join(words[:prefix_end])
    sentences.loc[i, "context_verb_n"] = context_verb_n

    # One model call per prefix; the verb prefix total is used in both
    # differences.
    surprisal_context_verb_n = _total_surprisal(context_verb_n)
    surprisal_context_verb = _total_surprisal(context_verb)
    surprisal_context = _total_surprisal(context)

    # Surprisal of the words following the verb.
    sentences.loc[i, "embedded_n_sum"] = surprisal_context_verb_n - surprisal_context_verb
    # Surprisal of the verb itself.
    sentences.loc[i, "verb_sum"] = surprisal_context_verb - surprisal_context

sentences.to_csv("/content/sentences_surprisal_alt_2.csv", index=False)

254it [02:02,  2.08it/s]


In [10]:
# Write the annotated frame to Drive, without the index column.
output_csv = "/content/drive/MyDrive/comp_drop/sentences_surprisal_alt_more_words.csv"
sentences.to_csv(output_csv, index=False)