In [1]:
import io

#from transformers import AutoTokenizer, AutoModelWithLMHead
from transformers import AutoTokenizer, AutoModelForMaskedLM

import torch

import torch.nn.functional as F
from torch import nn
import numpy as np
import pandas as pd
from scipy.stats import entropy

import matplotlib.pyplot as plt

import string #for later cleaning
import textgrid

In [2]:
print('Loading Language Model ...')

# Loading TransformerXL language model
#tokenizer_language_model = AutoTokenizer.from_pretrained("bert-base-german-cased")
#language_model =  AutoModel.from_pretrained("bert-base-german-cased")

tokenizer_language_model = AutoTokenizer.from_pretrained("distilbert-base-german-cased")
language_model =  AutoModelForMaskedLM.from_pretrained("distilbert-base-german-cased")


#with io.capture_output() as captured:

torch.no_grad()

Loading Language Model ...


<torch.autograd.grad_mode.no_grad at 0x7fcd1b601be0>

**Code provided by laura.aina@upf.edu**

In [3]:
def get_probability_distribution_over_next_word(language_model, context,
                                                tokenizer_language_model):
  '''
  Given a string containing a linguistic context,
  return the probability distribution over the vocabulary for the next word.
  '''
  # Encode context: transform words into indices (number identifier)
  context_indices = tokenizer_language_model.encode(context)
  # Find [MASK]
  mask_id = tokenizer_language_model.encode(tokenizer_language_model.mask_token)[1]
  index_target = context_indices.index(mask_id)
  # List of indices in a vector
  context = torch.tensor(context_indices).unsqueeze(0)
  # Run model on context (vector of word indices) and get next-word scores over the vocabulary
  # Vector of the size of the vocabulary containing a score per word
  next_word_scores = language_model(context)[0].squeeze(0)[index_target]
  # Trasform scores into probabilities through softmax function
  # probability distribution over the words in the vocabulary
  next_word_probability_distribution = F.softmax(next_word_scores, dim = -1)
  return next_word_probability_distribution

def get_top_next_words(language_model, context, tokenizer_language_model, n = 10, with_probabilities = False):
  '''
  Given a string containing a linguistic context,
  return the n words with the highest probability.
  If with_probabilities, also returns their associated probability scores.
  '''
  # Deploy model on context and get the probability distribution for the next word
  # Vector of the size of the vocabulary containing a probability score per word
  next_word_probability_distribution = get_probability_distribution_over_next_word(language_model,
                                                                                   context, tokenizer_language_model)
  # Sort words by probability score and get the top n words as candidates
  sorted_word_candidates = torch.argsort(next_word_probability_distribution, descending = True)[:n]
  # Decode list of candidates: from indices to word forms
  top_word_candidates = [tokenizer_language_model.decode(torch.Tensor([w])).replace(' ', '') for w in sorted_word_candidates]
  # If probabilities need to be output, match top words to their probabilities and return them together
  if with_probabilities:
    # Sort probability scores
    sorted_probabilities = list(torch.sort(next_word_probability_distribution, descending = True)[0][:n])
    # Pair words to probabilities and store them together
    top_word_candidates_with_probabilities = []
    for i in range(len(top_word_candidates)):
      top_word_candidates_with_probabilities.append((top_word_candidates[i], float(sorted_probabilities[i])))
    return top_word_candidates_with_probabilities
  else:
  # Else, return list of top words
    return top_word_candidates

def get_probability_of_word(language_model, context, tokenizer_language_model, word):
    '''
    Given two strings containing a linguistic context and a candidate word to complete it, respectively,
    returns the probability of the word according to the language model.
    '''
    # Deploy model on context and get the probability distribution for the next word
    # Vector of the size of the vocabulary containing a probability score per word
    next_word_probability_distribution = get_probability_distribution_over_next_word(language_model,
                                                                                   context, tokenizer_language_model)
    # Encode word: transform into index (number identifier)
    word = tokenizer_language_model.encode(word)
    # Acess probability for that word
    probability_score = float(next_word_probability_distribution[word][1])
    return float(probability_score)


**Remove punctuations from predicted words**

In [None]:
def remove_punctuations(next_word_list):
    '''
    removes predicted punctuations from list
    '''
    next_word_list_DF=pd.DataFrame(next_word_list,
                                  columns=['Word', 'Prob'])
    
    ind_punct = np.empty([len(next_word_list)])

    for ii in range(len(next_word_list)):
        ind_punct[ii]=int(next_word_list[ii][0] in string.punctuation)
    
    next_word_list_DF=next_word_list_DF.drop(list(np.nonzero(ind_punct)[0]))
    
    return next_word_list_DF

---------------------------------------

# STUFF FOR ENTROPY

In [10]:
import os

def textgrid2trans(textgridfile):
    tg = textgrid.TextGrid.fromFile(textgridfile)
    transcr=[]

    for ii in range(len(tg.tiers[0])):
        transcr.append(tg.tiers[0].intervals[ii].mark)
        
    return transcr

In [11]:
textgridfile='/Users/b1019548/Documents/GitHub/Experiments/RestorePhoneme/listening_paradigm/Audio final version/Fabi/textgrids/1_1_m.TextGrid'

In [15]:
def compute_entropy(textgridfile, context_length = 5, preds4entropy_N = 5):
    transcr = textgrid2trans(textgridfile)

    nwords = len(transcr)

    entropyPD = pd.DataFrame(columns=['Context', 'Entropy'])

    for ii in range(context_length-1):
        transcr.insert(0, '')
    
    ss = 0
    ee = context_length

    

    while ee <= len(transcr):
        context = ' '.join(transcr[ss:ee]) + ' [MASK]'
        ss = ss+1
        ee = ee+1
    
        number_of_candidates = 30 #@param {type:"slider", min:1, max:20, step:1}
        # Specify if you want to also output their probabilities
        with_probabilities = True #@param {type:"boolean"}

        # Deploy language model on context and get the n words with highest probability
        next_word_list = get_top_next_words(language_model, context, tokenizer_language_model,
                                    n = number_of_candidates, with_probabilities = with_probabilities)
    
        next_word_list=remove_punctuations(next_word_list)
    
        entropyPD = entropyPD.append({'Context' : context, 'Entropy' : entropy(next_word_list['Prob'][:preds4entropy_N])},
                                ignore_index=True)
    
    return entropyPD
    

## LOOP OVER FABI AND JULIANE FOLDERS

In [99]:
os.chdir('/Users/b1019548/Documents/GitHub/Experiments/RestorePhoneme/listening_paradigm/Audio final version/')
folderNames = ['Fabi', 'Julie']
outDir = '/Entropy/'

In [103]:
for ii in range(len(folderNames)):

    os.makedirs(folderNames[ii] + outDir, exist_ok = True)
    AllTextGrids = os.listdir(folderNames[ii] + '/textgrids')
    
    for tt in range(len(AllTextGrids)):
        entropyPD = compute_entropy(folderNames[ii] + '/textgrids/' + AllTextGrids[tt], context_length = 8)
        entropyPD.to_csv(folderNames[ii] + outDir + AllTextGrids[tt].split('.')[0] + '.csv', index = False)

----------------------------

# STUFF FOR SURPRISE

In [104]:
def compute_wordprob(textgridfile, context_length = 5):
    transcr = textgrid2trans(textgridfile)

    nwords = len(transcr)


    surprisePD = pd.DataFrame(columns=['Context', 'Word', 'Probability'])

    for ii in range(context_length):
        transcr.insert(0, '')
    
    ss = 0
    ee = context_length

    while ee < len(transcr):
        context = ' '.join(transcr[ss:ee]) + ' [MASK]'
    
        # Deploy language model on context and get the n words with highest probability
        next_word_prob = get_probability_of_word(language_model, context, tokenizer_language_model, transcr[ee])
    
        surprisePD = surprisePD.append({'Context' : context, 'Word' : transcr[ee], 'Probability' : next_word_prob},
                                ignore_index=True)
    
        ss = ss+1
        ee = ee+1

    return surprisePD

## LOOP OVER FABI AND JULIANE FOLDERS

In [107]:
os.chdir('/Users/b1019548/Documents/GitHub/Experiments/RestorePhoneme/listening_paradigm/Audio final version/')
folderNames = ['Fabi', 'Julie']
outDir = '/WordProb/'

In [108]:
for ii in range(len(folderNames)):

    os.makedirs(folderNames[ii] + outDir, exist_ok = True)
    AllTextGrids = os.listdir(folderNames[ii] + '/textgrids')
    
    for tt in range(len(AllTextGrids)):
        surprisePD = compute_wordprob(folderNames[ii] + '/textgrids/' + AllTextGrids[tt], context_length = 8)
        surprisePD.to_csv(folderNames[ii] + outDir + AllTextGrids[tt].split('.')[0] + '.csv', index = False)

-------------------

## Plot Entropy

## Plot Probability

To find tokens and remove them:
https://github.com/huggingface/transformers/issues/4827