## Code requirements: libraries and modules

In [None]:
# Download the spaCy Catalan transformer pipeline that is loaded further down
# with spacy.load('ca_core_news_trf').
!python -m spacy download ca_core_news_trf

In [None]:
import pip
from importlib.util import find_spec
from scipy.spatial.distance import cosine

# Packages that must be importable before the model code below can run.
import subprocess
import sys

required_packages = ['transformers', 'torch']

for package in required_packages:
  if find_spec(package) is None:
    print(f'Installing package: {package}...')
    # pip.main() is no longer part of pip's public API (removed in pip 10).
    # The supported way to install from a running program is to invoke pip as
    # a subprocess of the current interpreter, which also targets the
    # environment the kernel is actually running in.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])

import torch
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer, pipeline
from pprint import pprint
from sklearn.decomposition import PCA
import numpy as np

import spacy
# spaCy pipeline for Catalan; used further down to lemmatize sentences and
# find the inflected form of each target verb.
lemmatizer = spacy.load('ca_core_news_trf')

# Hugging Face checkpoint shared by the tokenizer, the masked-LM model and the
# fill-mask pipeline created below.
model_name = 'PlanTL-GOB-ES/roberta-base-ca'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# output_hidden_states=True makes the forward pass also return the hidden
# states, which process_with_BERT reads.
model = AutoModelForMaskedLM.from_pretrained(model_name, output_hidden_states = True, )
# Put the model in evaluation (inference) mode.
model.eval()
# NOTE(review): model_mask is not referenced anywhere else in the visible part
# of this notebook — confirm it is still needed.
model_mask = pipeline('fill-mask', model=model_name)

## Functions to process data

In [None]:
def process_with_BERT(tokenized_text):
  """Run one tokenized sentence through the model and return its hidden states.

  Args:
    tokenized_text: list of token strings (e.g. the output of
      tokenizer.tokenize for a single sentence).

  Returns:
    The second element of the model output. The model is loaded with
    output_hidden_states=True and no labels are passed, so this is the
    collection of hidden-state tensors (one per layer output), each with a
    leading batch dimension of size 1.
  """
  # Map the token strings to vocabulary ids and add the batch dimension.
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
  tokens_tensor = torch.tensor([indexed_tokens])

  # A single unpadded sentence: every position is a real token, so the mask is
  # all ones. It is passed by keyword because the second positional parameter
  # of the RoBERTa forward pass is attention_mask, not segment/token-type ids.
  attention_mask = torch.ones_like(tokens_tensor)

  # Inference only: no gradients are needed.
  with torch.no_grad():
    outputs = model(tokens_tensor, attention_mask=attention_mask)
    # Index 0 holds the masked-LM logits; index 1 holds the hidden states
    # requested through output_hidden_states=True.
    hidden_layers = outputs[1]

  return hidden_layers

In [None]:
def rearrange_BERT_object(hidden_layers):
  """Reorder per-layer hidden states so that the token axis comes first.

  Args:
    hidden_layers: sequence of equally shaped tensors, one per layer. Each is
      expected to carry a size-1 batch axis in front (that axis is squeezed).

  Returns:
    One tensor indexed as [token, layer, feature].
  """
  # stack -> [layer, batch, token, feature]; drop the batch axis, then swap the
  # layer and token axes.
  stacked = torch.stack(hidden_layers, dim=0)
  return stacked.squeeze(dim=1).permute(1, 0, 2)

In [None]:
def embeddings_by_sum(hidden_layers):
  """Build one vector per token by summing its last four layer vectors.

  Args:
    hidden_layers: per-layer hidden states as returned by process_with_BERT.

  Returns:
    A list with one 1-D tensor per token, in sentence order.
  """
  # After rearranging, iterating yields one [layers x features] tensor per
  # token; the last four rows are the four topmost layers.
  per_token_layers = rearrange_BERT_object(hidden_layers)
  return [torch.sum(layer_stack[-4:], dim=0) for layer_stack in per_token_layers]

## Computing word point in sentence and parsing data

In [None]:
import pandas as pd
# Render floats with nine decimal places when DataFrames are displayed or
# printed, so small differences between cosine values stay visible.
pd.set_option('display.float_format', lambda x: '%.9f' % x)

In [None]:
def get_sentences_from_csv_file(file):
  """Read the two sentence columns of a header-less CSV file.

  The first column is treated as 'Sense_1' and the second as 'Sense_2'. Every
  sentence is wrapped as "[CLS] <sentence> [SEP]".

  Args:
    file: path (or file-like object) accepted by pandas.read_csv.

  Returns:
    A tuple (sentences_sense_1, sentences_sense_2) of lists of strings.
  """
  # NOTE(review): '[CLS]' / '[SEP]' are BERT-style markers, while the
  # checkpoint loaded in this notebook is a RoBERTa model — confirm these
  # literals are what the tokenizer is meant to receive.
  df = pd.read_csv(file, header=None, names=['Sense_1', 'Sense_2'])

  def wrap_column(column):
    # When the two senses have a different number of sentences, the shorter
    # column is padded with NaN by read_csv; skip those cells instead of
    # failing on str + float.
    return ["[CLS] " + text + " [SEP]" for text in df[column].dropna()]

  return (wrap_column('Sense_1'), wrap_column('Sense_2'))

In [None]:
def tokenize_sentences(sentences):
  """Tokenize every sentence with the notebook's tokenizer.

  Args:
    sentences: iterable of sentence strings.

  Returns:
    A list with one token list per input sentence, in the same order.
  """
  return [tokenizer.tokenize(sentence) for sentence in sentences]

In [None]:
def merged_bpe(llista):
  """Join BPE pieces back into whole words.

  A piece that starts with the 'Ġ' marker opens a new word (the marker itself
  is dropped); any other piece is glued onto the word currently being built.

  Args:
    llista: list of token strings produced by the tokenizer.

  Returns:
    A list of merged words; an empty list for empty input.
  """
  merged_text = []
  for token in llista:
    if token.startswith('Ġ'):
      # Word-initial piece: strip the marker and start a new word.
      merged_text.append(token[1:])
    elif not merged_text:
      # The very first piece has no marker; keep all of its characters.
      merged_text.append(token)
    else:
      # Continuation piece: extend the current word.
      merged_text[-1] += token
  return merged_text

In [None]:
def find_word_in_bpe(llista, word):
  """Locate the BPE pieces that spell `word` inside a tokenized sentence.

  Pieces are grouped into words using the 'Ġ' word-start marker. A piece that
  starts with '-' and the pieces following it up to the next word start are
  skipped and never become part of a word. Candidate and target are compared
  after lower-casing and dropping every non-alphabetic character, so attached
  punctuation does not prevent a match.

  Args:
    llista: list of token strings for one sentence.
    word: surface form to look for, in normal spelling.

  Returns:
    The list of indices in `llista` that make up the first matching word, or
    an empty list when there is no match.
  """
  def normalize(text):
    return ''.join(filter(str.isalpha, text.lower()))

  # The target goes through the same normalization as the candidates.
  # Several byte-level stand-ins for accented letters contain non-alphabetic
  # characters, so filtering only one side would make those words unmatchable.
  adapted_word = normalize(adapt_word_to_tokenizer(word))
  if not adapted_word:
    # Nothing comparable is left; avoid matching punctuation-only words.
    return []

  current_word = ""
  indices = []
  i = 0
  total = len(llista)
  while i < total:
    token = llista[i]
    if token.startswith('-'):
      # Skip the hyphen piece and what follows it up to the next word start.
      # The bound check keeps a hyphen near the end from running off the list.
      i += 1
      while i < total and not llista[i].startswith('Ġ'):
        i += 1
      continue
    if i == 0 or token.startswith('Ġ'):
      # A new word begins: first decide whether the finished one matches.
      if normalize(current_word) == adapted_word:
        return indices
      # Drop the marker only when it is really there.
      current_word = token[1:] if token.startswith('Ġ') else token
      indices = []
    else:
      current_word += token
    indices.append(i)
    i += 1
  # The last word has no following word start to trigger the check.
  if normalize(current_word) == adapted_word:
    return indices
  return []

In [None]:
def _byte_level_alphabet():
  """Return the byte -> visible character table used by byte-level BPE."""
  printable = (list(range(ord('!'), ord('~') + 1))
               + list(range(ord('¡'), ord('¬') + 1))
               + list(range(ord('®'), ord('ÿ') + 1)))
  table = {byte: chr(byte) for byte in printable}
  shifted = 0
  for byte in range(256):
    if byte not in table:
      # Bytes without a printable form are moved above code point 255.
      table[byte] = chr(256 + shifted)
      shifted += 1
  return table


def adapt_word_to_tokenizer(word):
  """Rewrite a word the way the byte-level tokenizer spells it in its tokens.

  ASCII characters are returned unchanged. Every other character is replaced
  by the byte-level rendering of its UTF-8 bytes (for example 'à' -> 'Ãł',
  'é' -> 'Ã©'). This covers all accented vowels as well as the dieresis
  vowels, 'ç' and the middle dot of 'l·l'.

  Args:
    word: string in normal spelling.

  Returns:
    The adapted string; an empty string for empty input.
  """
  table = _byte_level_alphabet()
  adapted_pieces = []
  for character in word:
    if ord(character) < 128:
      adapted_pieces.append(character)
    else:
      adapted_pieces.append(''.join(table[byte] for byte in character.encode('utf-8')))
  return ''.join(adapted_pieces)

In [None]:
def find_target_word_lemmatizing(lemmatized_words, sentence):
  """Return the first word of `sentence` whose lemma is one of the targets.

  Args:
    lemmatized_words: collection of accepted lemma strings (compared against
      the lower-cased lemma of each token).
    sentence: raw sentence text to analyse with the spaCy pipeline.

  Returns:
    The matching token as it appears in the sentence (a string), or None when
    no token's lemma is in `lemmatized_words`.
  """
  matching_forms = (
      str(token)
      for token in lemmatizer(sentence)
      if token.lemma_.lower() in lemmatized_words
  )
  return next(matching_forms, None)

In [None]:
def find_word_indices(tokenized_sentences, sentences, words):
  """Find, for every sentence, the token indices of the target word.

  Args:
    tokenized_sentences: list of token lists, parallel to `sentences`.
    sentences: list of raw sentence strings.
    words: accepted lemma forms of the target word; words[0] is the name used
      in diagnostic messages.

  Returns:
    A list parallel to `tokenized_sentences`. Each entry is the list of token
    indices of the target word, or None when the word could not be located (a
    diagnostic line is printed in that case).
  """
  # The tokenizer-adapted spelling is only used in the diagnostic message.
  adapted_word = adapt_word_to_tokenizer(words[0]).lower()
  indices = []
  for idx, sent in enumerate(tokenized_sentences):
    target_word = find_target_word_lemmatizing(words, sentences[idx])
    if target_word is None:
      # No token of the sentence lemmatizes to one of the accepted forms.
      print(f"{words[0]}, {idx} - {sentences[idx]}")
      word_indices_in_sent = None
    else:
      word_indices_in_sent = find_word_in_bpe(sent, target_word)
      if not word_indices_in_sent:
        # The word was found by the lemmatizer but not in the BPE pieces.
        print(f"Word: [{words[0]} - {idx}, (adapted as: {adapted_word}), lemmatized as: {target_word}] not found in sentence: [{sent}]")
        word_indices_in_sent = None
    indices.append(word_indices_in_sent)

  return indices

In [None]:
def compute_word_position_in_sentences(tokenized_sentences, sentences, words):
  """Compute one embedding of the target word per sentence.

  Sentences in which the word cannot be located are skipped, so the result
  may be shorter than the input.

  Args:
    tokenized_sentences: list of token lists, parallel to `sentences`.
    sentences: list of raw sentence strings.
    words: accepted lemma forms of the target word.

  Returns:
    A list of tensors; each is the mean of the summed-layer vectors of the
    BPE pieces that make up the word in one sentence.
  """
  indices_in_sents = find_word_indices(tokenized_sentences, sentences, words)

  vecs_sums = []
  for sent_tokens, piece_indices in zip(tokenized_sentences, indices_in_sents):
    if piece_indices is None:
      continue
    token_vecs_sum = embeddings_by_sum(process_with_BERT(sent_tokens))
    # Add the piece vectors in index order, then average over the pieces.
    total = token_vecs_sum[piece_indices[0]]
    for piece_index in piece_indices[1:]:
      total = total + token_vecs_sum[piece_index]
    vecs_sums.append(total / len(piece_indices))

  return vecs_sums


## Analyze word in sentence

In [None]:
def cosine_similarity(point1, point2):
  """Return the cosine similarity of two vectors.

  scipy's `cosine` returns the cosine distance, so the similarity is its
  complement with respect to 1.
  """
  distance = cosine(point1, point2)
  return 1 - distance

In [None]:
def analyze_stabilization(word, context, positions, stabilization_df):
  """Record how the accumulated embedding changes as contexts are added.

  The embeddings are summed one by one. At every tenth embedding the running
  sum is compared (cosine similarity) with the running sum at the previous
  checkpoint, and a row is appended to `stabilization_df`.

  Args:
    word: label written in the first column of each row.
    context: label written in the last column of each row.
    positions: list of embedding tensors; nothing happens when it is empty.
    stabilization_df: DataFrame that receives the rows (modified in place).
      Each row is [word, number of contexts, similarity, absolute change of
      the similarity versus the previous checkpoint, context].
  """
  if len(positions) == 0:
    return

  # Both running sums start at the first embedding. Clones keep the in-place
  # additions below from touching the caller's tensors.
  accumulated = positions[0].clone()
  previous_accumulated = positions[0].clone()
  previous_similarity = None

  for i, position in enumerate(positions[1:], start=1):
    accumulated += position
    # Only every tenth context is a checkpoint.
    if i % 10 != 0:
      continue
    similarity = cosine_similarity(previous_accumulated, accumulated)
    if previous_similarity is None:
      # First checkpoint: there is no earlier similarity to compare with.
      similarity_difference = 0
    else:
      similarity_difference = abs(similarity - previous_similarity)
    stabilization_df.loc[len(stabilization_df.index)] = [word, i, similarity, similarity_difference, context]
    previous_accumulated = accumulated.clone()
    previous_similarity = similarity

In [None]:
def analyse_word_in_sentence(sentences_sense_1, sentences_sense_2, words, stabilization_df):
  """Compare the average embedding of a word across two groups of sentences.

  Args:
    sentences_sense_1: sentences for the first sense (logged as "neo").
    sentences_sense_2: sentences for the second sense (logged as "no_neo").
    words: accepted lemma forms of the target word; words[0] is its label.
    stabilization_df: DataFrame that receives the stabilization rows
      (modified in place).

  Returns:
    The cosine similarity between the two average embeddings, or NaN when at
    least one group yields no embedding.
  """
  tokenized_sents_sense_1 = tokenize_sentences(sentences_sense_1)
  tokenized_sents_sense_2 = tokenize_sentences(sentences_sense_2)

  word_positions_in_sents_sense_1 = compute_word_position_in_sentences(tokenized_sents_sense_1, sentences_sense_1, words)
  word_positions_in_sents_sense_2 = compute_word_position_in_sentences(tokenized_sents_sense_2, sentences_sense_2, words)

  analyze_stabilization(words[0], "neo", word_positions_in_sents_sense_1, stabilization_df)
  analyze_stabilization(words[0], "no_neo", word_positions_in_sents_sense_2, stabilization_df)

  # Sentences where the word is not located are dropped, so a group can end up
  # empty. An average over zero vectors is undefined: report NaN instead of
  # failing with a division by zero.
  if not word_positions_in_sents_sense_1 or not word_positions_in_sents_sense_2:
    print(f"No embeddings available for '{words[0]}' in at least one sense; the cosine is undefined.")
    return float('nan')

  # Only the comparison between the two senses matters here: average each
  # group and measure the similarity between the two averages.
  avg_point_sense_1 = sum(word_positions_in_sents_sense_1)/len(word_positions_in_sents_sense_1)
  avg_point_sense_2 = sum(word_positions_in_sents_sense_2)/len(word_positions_in_sents_sense_2)
  cosine_avg = cosine_similarity(avg_point_sense_1, avg_point_sense_2)
  print(cosine_avg)
  return cosine_avg

In [None]:
def analyse_word_in_sentence_from_csv(file, words, stabilization_df):
  """Load both sentence columns from `file` and analyse the target word.

  Returns whatever analyse_word_in_sentence returns for those sentences.
  """
  first_sense, second_sense = get_sentences_from_csv_file(file)
  return analyse_word_in_sentence(first_sense, second_sense, words, stabilization_df)

In [None]:
def analyze_multiple_words_in_sentences_from_csv(files_and_words, stabilization_df=None):
  """Analyse several words, one CSV file each, and tabulate the results.

  Args:
    files_and_words: iterable of (file, words) pairs, where `words` lists the
      accepted lemma forms of the word and words[0] is its label.
    stabilization_df: optional DataFrame that receives the stabilization rows
      (modified in place). When omitted, the rows go to a throw-away frame, so
      the function can be called with the word list alone.

  Returns:
    A DataFrame with one row per word and the columns 'word' and 'cosine'.
  """
  if stabilization_df is None:
    # Scratch frame with the column layout analyze_stabilization writes to.
    stabilization_df = pd.DataFrame({'paraula': [], 'num_contextos': [], 'cosine': [], 'diferencia': [], 'context': []})

  df = pd.DataFrame({'word': [], 'cosine': []})
  for (file, words) in files_and_words:
    cosine_avg = analyse_word_in_sentence_from_csv(file, words, stabilization_df)
    df.loc[len(df.index)] = [words[0], cosine_avg]
  return df

# Analyze the selected verbs

In [None]:
# useful to debug the lemmatizer
# Print the lemma spaCy assigns to each token of a sample sentence. This shows
# which lemma forms have to be listed for a verb in the analysis cells.
sentence_lemmatize = "això, perduda en un munt de paraules sense sentit que em condueixen a un bogeria sense fi, una bogeria dolça, complaent, anestesiant . Una bogeria que m'allibera de tot això tan imperfecte que hi ha al meu voltant, una bogeria que trobo més real que tot"  #@param {type:"string"}
lemmatized_sentence = lemmatizer(sentence_lemmatize)
for token in lemmatized_sentence:
  print(token.lemma_)

The following cells create the table that collects the stabilization data, save it to a CSV file and display it.

In [None]:
# Shared table that analyze_stabilization appends to. The column order matches
# the rows it writes: word, number of contexts accumulated, cosine similarity,
# absolute change versus the previous checkpoint, and the context label
# ("neo" / "no_neo").
stabilization_df = pd.DataFrame({'paraula': [], 'num_contextos': [], 'cosine': [], 'diferencia': [], 'context': []})

In [None]:
# Write the stabilization table to disk with nine decimal places.
# NOTE(review): in top-to-bottom execution this cell runs before any analysis
# cell below has added rows, so it saves an empty table — presumably it is
# meant to be run again after the analysis cells; confirm.
stabilization_df.to_csv('stabilization_data.csv', float_format=lambda x: '%.9f' % x)

In [None]:
# Display the stabilization rows collected so far.
stabilization_df



> Neologic verbs




In [None]:
# Neologic verb 'anestesiar'. The list holds the lemma forms accepted when
# locating the verb in each sentence; the first entry labels the result row.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('anestesiar.csv', ['anestesiar', 'anestesiat', 'anestesiant'])], stabilization_df)
print(verbs_analitzats)

# The result can be saved to a file so that it does not have to be recomputed
# the next time it is needed:
#verbs_analitzats.to_csv('verbs.csv', index=False)

In [None]:
# Neologic verb 'maquillar': compare the two sentence columns of maquillar.csv
# and log stabilization rows into stabilization_df.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('maquillar.csv', ['maquillar', 'maquillat'])], stabilization_df)
print(verbs_analitzats)

In [None]:
# Neologic verb 'blindar'.
# NOTE(review): 'blindir' is presumably listed because the lemmatizer sometimes
# produces it for forms of this verb — confirm.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('blindar.csv', ['blindar', 'blindat', 'blindir'])], stabilization_df)
print(verbs_analitzats)

In [None]:
# Neologic verb 'arrasar': compare the two sentence columns of arrasar.csv and
# log stabilization rows into stabilization_df.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('arrasar.csv', ['arrasar', 'arrasat'])], stabilization_df)
print(verbs_analitzats)

In [None]:
# Neologic verb 'disparar': compare the two sentence columns of disparar.csv
# and log stabilization rows into stabilization_df.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('disparar.csv', ['disparar', 'disparat'])], stabilization_df)
print(verbs_analitzats)

In [None]:
# Neologic verb 'quallar'.
# NOTE(review): 'quallir' is presumably listed because the lemmatizer sometimes
# produces it for forms of this verb — confirm.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('quallar.csv', ['quallar', 'quallat', 'quallir'])], stabilization_df)
print(verbs_analitzats)

In [None]:
# Neologic verb 'sacsejar'; several alternative lemma forms are accepted when
# locating the verb in each sentence.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('sacsejar.csv', ['sacsejar', 'sacsegen', 'sacsegar', 'sacsejat'])], stabilization_df)
print(verbs_analitzats)

> Monosemic verbs

In [None]:
# Monosemic verb 'nevar': cosine between the averaged embeddings of the two
# sentence columns in nevar.csv.
# NOTE(review): unlike the neologic-verb cells, this call passes no
# stabilization_df argument — confirm that stabilization data is intentionally
# not collected for the monosemic verbs.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('nevar.csv', ['nevar'])])
print(verbs_analitzats)

In [None]:
# Monosemic verb 'agilitzar': cosine between the averaged embeddings of the
# two sentence columns in agilitzar.csv.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('agilitzar.csv', ['agilitzar'])])
print(verbs_analitzats)

In [None]:
# Monosemic verb 'agredir': cosine between the averaged embeddings of the two
# sentence columns in agredir.csv.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('agredir.csv', ['agredir'])])
print(verbs_analitzats)

In [None]:
# Monosemic verb 'conversar': cosine between the averaged embeddings of the
# two sentence columns in conversar.csv.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('conversar.csv', ['conversar'])])
print(verbs_analitzats)

In [None]:
# Monosemic verb 'esmentar': cosine between the averaged embeddings of the two
# sentence columns in esmentar.csv.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('esmentar.csv', ['esmentar'])])
print(verbs_analitzats)

In [None]:
# Monosemic verb 'esmenar': cosine between the averaged embeddings of the two
# sentence columns in esmenar.csv.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('esmenar.csv', ['esmenar'])])
print(verbs_analitzats)

In [None]:
# Monosemic verb 'fullejar': cosine between the averaged embeddings of the two
# sentence columns in fullejar.csv.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('fullejar.csv', ['fullejar'])])
print(verbs_analitzats)

In [None]:
# Monosemic verb 'inculcar': cosine between the averaged embeddings of the two
# sentence columns in inculcar.csv.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('inculcar.csv', ['inculcar'])])
print(verbs_analitzats)

In [None]:
# Monosemic verb 'numerar' (the form 'numerat' is also accepted): cosine
# between the averaged embeddings of the two sentence columns in numerar.csv.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('numerar.csv', ['numerar', 'numerat'])])
print(verbs_analitzats)

In [None]:
# Monosemic verb 'precisar': cosine between the averaged embeddings of the two
# sentence columns in precisar.csv.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('precisar.csv', ['precisar'])])
print(verbs_analitzats)

In [None]:
# Monosemic verb 'teclejar': cosine between the averaged embeddings of the two
# sentence columns in teclejar.csv.
verbs_analitzats = analyze_multiple_words_in_sentences_from_csv([('teclejar.csv', ['teclejar'])])
print(verbs_analitzats)