<a href="https://colab.research.google.com/github/dgromann/MultiLexBATS/blob/main/scripts/MultiLexBATS_querying_generative_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install huggingface_hub

# **Analogical Reasoning Task**
The following description relates to setting up the classical analogy task to be completed by a generative model. To use masked-type models instead, please consult the corresponding scripts on our GitHub.

For this paper, we utilized the generative language model [BLOOM](https://huggingface.co/bigscience/bloom).

## Loading and Preprocessing MultiLexBATS

Instead of running the analogy template on all available analogies, we randomly select 30 {a}, {b}, {c}, {d} pairs that we then run on all languages.

To this end, the CSV per relation for all languages is loaded and a parallel dataset with no nan, DUPLICATE, or NO_TRANSLATION in any row is created. From this dataset, 30 random analogies are compiled.


In [246]:
import pandas as pd
import numpy as np

def find_parallels_across_all_languages(df_parallel, df):
  """Reshape both frames and reduce df_parallel to rows that are parallel in all languages.

  Both frames must provide the columns 'ID', 'Source' and 'Target'; every
  column from position 4 onwards (after the reshape) is treated as a language
  column. Both frames are modified in place by the reshape step.

  Args:
    df_parallel: frame that is filtered down to the parallel subset.
    df: full frame; it is only reshaped (EN column, forward-filled IDs).

  Returns:
    Tuple (df_parallel, df) with the filtered parallel frame and the reshaped
    full frame.
  """
  #Reshape data structure to make EN fit the format of the other languages and rename column Target to EN
  for frame in (df_parallel, df):
    frame.loc[frame['ID'].notna(), 'Target'] = frame['Source']
    frame.rename(columns={'Target': 'EN'}, inplace=True)
    #Assign the forward-filled IDs instead of calling fillna(method='ffill', inplace=True) on the
    #selected column: that form is deprecated and does not update the frame under copy-on-write
    frame['ID'] = frame['ID'].ffill()

  #Drop all rows with nan, DUPLICATE or NO_TRANSLATION in any language
  language_columns = df_parallel.columns[4:]
  df_parallel.dropna(subset=language_columns, inplace=True)
  for column in language_columns:
    df_parallel = df_parallel[~df_parallel[column].str.contains('DUPLICATE|NO_TRANSLATION')]

  #Only keep IDs whose source row survived the filtering, i.e. the source word is parallel across all languages
  all_valid_ids = df_parallel[df_parallel['Source'].notna()]['ID']
  df_parallel = df_parallel[df_parallel['ID'].isin(all_valid_ids)]

  return df_parallel, df

def find_all_pairs(df_parallel, df, lang):
  """Collect the analogy pairs and the accepted target words for one language.

  Rows of df_parallel whose 'Source' cell is a string are treated as source
  rows; the rows that follow them are the targets of that source. A target
  row therefore has to be preceded by its source row.

  Args:
    df_parallel: parallel subset used to build the (source, target) pairs.
    df: full frame with filled 'ID' column, used to collect every valid target
      of a source word in the requested language.
    lang: name of the language column to read.

  Returns:
    Tuple (lang_dict_parallel, lang_dict):
      lang_dict_parallel maps "<ID>__<EN source>_<EN target>" to
        [source word, target word] in the requested language;
      lang_dict maps an ID to {'source': word, 'targets': [words]}.
  """
  lang_dict_parallel, lang_dict = {}, {}
  for _, row in df_parallel.iterrows():
    if isinstance(row['Source'], str):
      ID = row['ID']
      en_source_word = row['EN'].replace("_", " ")
      lang_source_word = row[lang].replace("_", " ")
      targets = []
      #The first row of an ID group in df is the source row itself, so it is skipped
      for cell in list(df.loc[df['ID'] == ID, lang])[1:]:
        if not isinstance(cell, str) or "DUPLICATE" in cell or "NO_TRANSLATION" in cell:
          continue
        #A cell may list several alternatives separated by commas; strip the whitespace around
        #each alternative so that "a, b" yields "b" and not " b", which would never match a response
        for alternative in cell.replace("_", " ").split(","):
          alternative = alternative.strip()
          if alternative:
            targets.append(alternative)
      lang_dict[ID] = {'source': lang_source_word, 'targets': targets}
    elif isinstance(row['EN'], str):
      #One dictionary only contains pairs, the other one the source word and the set of all target words
      lang_dict_parallel[ID+"__"+en_source_word+"_"+row['EN']] = [lang_source_word, row[lang].replace("_", " ")]

  return lang_dict_parallel, lang_dict


## Define Analogy Templates
Use the following function to create a set of language-specific templates for the classical analogy task.

In [None]:
def get_template_with_quotes(lang, a, b, c):
    """Return the analogy prompt for language code lang with a, b and c in double quotes.

    The returned string stops after the connective that follows {c}, so a
    generative model can continue it with the fourth term. None is returned
    when no template is defined for the given language code.
    """
    #One active template per language code. Commented entries are alternative wordings kept for reference.
    templates = {
        'EN': '"{a}" is to "{b}" as "{c}" is to ',
        'FR': '"{a}" est à "{b}" ce que "{c}" est à ',
        'IT': '"{a}" sta a "{b}" come "{c}" sta a  ',
        'ES': '"{a}" es a "{b}" como "{c}" es a  ',
        #DE version 1: '"{a}" verhält sich zu "{b}" wie "{c}" zu '
        #DE version 2 (active)
        'DE': '"{a}" ist zu "{b}" wie "{c}" ist zu ',
        'PT': '"{a}" está para "{b}" assim como "{c}" está para ',
        'AL': '"{a}" është për "{b}" ashtu si "{c}" për ',
        #BM uses the same wording as EN; alternative: '"{a}" ye "{b}" ye i n’a fɔ "{c}" ye '
        'BM': '"{a}" is to "{b}" as "{c}" is to ',
        #HR version 1: '"{a}" je za "{b}" kao što je "{c}" za '
        #HR version 2: 'Riječ "{a}" je riječi "{b}" jednako što je riječ "{c}" riječi '
        #HR version 3 (active)
        'HR': 'Odnos između riječi "{a}" i "{b}" jednak je odnosu između riječi "{c}" i ',
        'LT': '"{a}" yra "{b}" taip, kaip "{c}" yra ',
        #SL version 1 (active)
        'SL': 'Beseda "{a}" je besedi "{b}" enako, kot je beseda "{c}" besedi ',
        #SL version 2: 'Beseda "{a}" je besedi "{b}" enako, kot je besedi ... beseda "{c}".'
        #SL version 3: '"{a}" in "{b}" sta kot "{c}" in '
        #SK version 1 (active)
        'SK': 'Slovo "{a}"  sa má k slovu "{b}" ako slovo "{c}" k slovu ',
        #SK version 2: 'Vzťah medzi slovami "{a}" a "{b}" je rovnaký ako medzi "{c}" a '
        #SK version 3: '"{a}" sa má k "{b}" ako "{c}" k '
        'RO': '"{a}" este pentru "{b}" cum "{c}" este pentru ',
        #HE version 1 (active)
        'HE': '"{a}" ל "{b}" כמו "{c}" ל ',
        #HE version 2: ' ל "{c}" כ "{b}" ל "{a}"'
        #EL version 1 (active)
        'EL': 'το "{a}" είναι προς το "{b}" ό,τι το "{c}" προς το ',
        #EL version 2: 'Η σχέση μεταξύ "{a}" και "{b}" είναι ίδια με τη σχέση μεταξύ "{c}" και '
        #MK version 1: '"{a}" е за "{b}" исто што и "{c}" за '
        #MK version 2: 'Зборот "{a}" за зборот "{b}" е исто што и зборот "{c}" за зборот '
        #MK version 3 (active)
        'MK': 'Односот меѓу зборовите "{a}" и "{b}" е еднаков со односот меѓу зборовите "{c}" и ',
    }
    chosen = templates.get(lang)
    if chosen is None:
        return None
    return chosen.format(a=a, b=b, c=c)

## Run Analogies on a Model
In order to test a generative model for analogy completion, the following code for querying the Hugging Face Inference API can be used. Please be aware that the free version has severe rate limits, which regularly lead to timeouts if too many queries are submitted.

In [214]:
import requests

#Specify model you wish to prompt
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
#Replace the placeholder after "Bearer " with your own access token
headers = {"Authorization": "Bearer xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}

def query(payload, timeout=120):
  """POST payload as JSON to API_URL and return the decoded JSON answer.

  Args:
    payload: JSON-serialisable request body.
    timeout: seconds to wait for the server before requests raises a timeout
      exception. Without a timeout a stalled connection would block the
      notebook indefinitely.

  Returns:
    The parsed JSON body of the response, whatever the server sent back
    (the HTTP status code is not checked here).
  """
  response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
  return response.json()

In [None]:
from numpy.lib.shape_base import row_stack
import random
import pandas as pd
from statistics import mean

#Specify paths
#Path_rel_file is a placeholder for the relation CSV that is read below
Path_rel_file = "Path_to_relation_file"
#NOTE(review): this placeholder has the same value as Path_rel_file - replace it with the output folder.
#The value is used as a plain string prefix when output file names are built.
Path_to_output_folder = "Path_to_relation_file"

#Specify path to relation CSV with all languages, type of relation and number of analogies you wish to randomly select
df = pd.read_csv(Path_rel_file)
#relation is used as column name and as part of the output file names
relation = "L01_hypernyms_animals"
num_analogies = 30

def get_analogy_response(ab_analogies, cd_analogies, lang_dict_parallel, lang_dict_all, language, num_analogies):
  """Prompt the model with num_analogies analogies in one language and score the answers.

  For position i the pair ab_analogies[i] provides {a} and {b}, the pair
  cd_analogies[i] provides {c}; an answer counts as correct when the lowercased
  completion contains any of the lowercased target words stored for the ID of
  the c/d pair.

  Args:
    ab_analogies: keys of lang_dict_parallel used for the a/b half.
    cd_analogies: keys of lang_dict_parallel used for the c/d half.
    lang_dict_parallel: maps a pair key to [source word, target word].
    lang_dict_all: maps an ID to {'source': ..., 'targets': [...]}.
    language: language code passed to get_template_with_quotes.
    num_analogies: number of analogies to run.

  Returns:
    Tuple (tracing, accuracies): one dict per prompt, and a one-element list
    with the accuracy (as string) stored under the global relation name.

  Raises:
    ValueError: if no template exists for the language.
    RuntimeError: if the API answer does not contain generated text
      (for example an error payload after hitting the rate limit).
  """
  correct = 0
  tracing, accuracies = [], []
  for i in range(num_analogies):
    ab, cd = lang_dict_parallel[ab_analogies[i]], lang_dict_parallel[cd_analogies[i]]
    #The part of the key before "__" is the ID under which all valid targets are stored
    d = lang_dict_all[cd_analogies[i].split("__")[0]]
    prompt = get_template_with_quotes(language, ab[0], ab[1], cd[0])
    if prompt is None:
      raise ValueError("No analogy template defined for language " + str(language))
    tracing.append({'Language': language, 'a': ab[0], 'b': ab[1], 'c': cd[0], 'd': d['targets'], 'prompts': prompt})
    response = query({
        "inputs": prompt,
        "parameters": {"max_new_tokens": 10, "do_sample": False},
    })
    #Fail with the payload in the message instead of an opaque KeyError/IndexError
    if not (isinstance(response, list) and response and 'generated_text' in response[0]):
      raise RuntimeError("Unexpected answer from the inference API for prompt " + repr(prompt) + ": " + repr(response))
    completion = response[0]['generated_text'].replace(prompt, "").lower()
    if any(word.lower() in completion for word in d['targets']):
      correct += 1
  #Avoid a ZeroDivisionError when no analogies were requested
  accuracy = correct/num_analogies if num_analogies else 0.0
  accuracies.append({'Language': language, relation: str(accuracy)})
  print(language, accuracy)

  return tracing, accuracies

def get_random_ids(lang_dict_parallel, num_analogies):
  """Draw the keys for the a/b halves and the c/d halves of the analogies.

  With more than 2*num_analogies keys available, the two lists are drawn
  together without replacement, so no key occurs twice. Otherwise each list
  is drawn independently with replacement.

  Returns:
    Tuple (ab_analogies, cd_analogies) of two lists with num_analogies keys each.
  """
  candidate_keys = list(lang_dict_parallel.keys())
  needed = num_analogies*2
  if len(candidate_keys) <= needed:
    #Not enough keys for two disjoint halves: sample with replacement
    first_half = random.choices(candidate_keys, k=num_analogies)
    second_half = random.choices(candidate_keys, k=num_analogies)
    return first_half, second_half
  drawn = random.sample(candidate_keys, k=needed)
  return drawn[:num_analogies], drawn[num_analogies:]

def run_same_random_sample_all_languages(df, num_analogies):
  """Run the same random analogy sample on every language column and write the results to CSV.

  The sample is drawn once from the English pairs; the pair keys are built from
  the ID and the English words, so the same keys select the corresponding pairs
  in every other language. Two files are written into Path_to_output_folder:
  "accuracies_<relation>.csv" and "prompts_<relation>.csv".

  Args:
    df: relation table as loaded from the CSV.
    num_analogies: number of analogies to sample and run per language.
  """
  import os  #local import so that this cell does not depend on another import cell

  df_parallel, df = find_parallels_across_all_languages(df.copy(), df)
  lang_dict_parallel, _ = find_all_pairs(df_parallel, df, "EN")
  ab_analogies, cd_analogies = get_random_ids(lang_dict_parallel, num_analogies)
  rows_prompts, rows_accuracies = [], []
  print("Getting accuracies on analogy task for relation "+relation+" in:")
  #Columns from position 3 onwards are EN followed by the other languages
  for lang in df_parallel.columns[3:]:
    lang_dict_parallel, lang_dict = find_all_pairs(df_parallel, df, lang)
    rows, accuracies = get_analogy_response(ab_analogies, cd_analogies, lang_dict_parallel, lang_dict, lang, num_analogies)
    rows_prompts.extend(rows)
    rows_accuracies.extend(accuracies)

  #os.path.join inserts the path separator when the folder is given without a trailing one
  accuracies_df = pd.DataFrame(rows_accuracies, columns=['Language', relation])
  accuracies_df.to_csv(os.path.join(Path_to_output_folder, "accuracies_"+relation+".csv"))
  prompt_df = pd.DataFrame(rows_prompts, columns=['Language', 'a', 'b', 'c', 'd', 'prompts'])
  prompt_df.to_csv(os.path.join(Path_to_output_folder, "prompts_"+relation+".csv"))


#Run the analogy task on the loaded relation table; this queries the API and writes the two result CSV files
run_same_random_sample_all_languages(df, num_analogies)


# **Translation Task**

The translation task uses analogies to predict translation equivalents. This entails composing one analogy template from prompt templates in two languages, e.g. {a} is to {b} as {c} es a, where the first part represents an English template and the second a Spanish template.

As an example of a filled version, the template could be "apple is to fruit as manzana es a ", in which case the model is expected to predict "fruta". The following function takes a language, the analogy pair a and b, and the part of the template (first or second) that is needed. In the above instance, we would use the function as follows to compose a translation template:  

```
prompt1 = get_translation_template_with_quotes("EN", "apple", "fruit", 1)
prompt2 = get_translation_template_with_quotes("ES", "manzana", "", 2)
prompt = prompt1 + prompt2
```









In [259]:
def get_translation_template_with_quotes(lang, a, b, part):
    """Return the first or second half of an analogy template in one language.

    Part 1 is the '"{a}" is to "{b}" as ' half and uses a and b. Part 2 is the
    '"{a}" is to ' half and only uses a (b is ignored). A translation prompt is
    composed by concatenating part 1 of one language with part 2 of another.

    Args:
        lang: language code, e.g. "EN" or "ES".
        a: first word of the half.
        b: second word; only used for part 1.
        part: 1 for the first half, 2 for the second half.

    Returns:
        The filled template half, or None when the language code or the part
        number is unknown.
    """
    #(part 1, part 2) per language code
    halves = {
        'EN': ('"{a}" is to "{b}" as ', '"{a}" is to '),
        'FR': ('"{a}" est à "{b}" ce que ', '"{a}" est à '),
        'IT': ('"{a}" sta a "{b}" come ', '"{a}" sta a  '),
        'ES': ('"{a}" es a "{b}" como ', '"{a}" es a  '),
        'DE': ('"{a}" ist zu "{b}" wie ', '"{a}" ist zu '),
        'PT': ('"{a}" está para "{b}" assim como ', '"{a}" está para '),
        'AL': ('"{a}" është për "{b}" ashtu si ', '"{a}" për '),
        #BM uses the same wording as EN
        'BM': ('"{a}" is to "{b}" as ', '"{a}" is to '),
        'HR': ('Odnos između riječi "{a}" i "{b}" jednak je ', 'odnosu između riječi "{a}" i '),
        #The second half used to start with a stray double quote
        'LT': ('"{a}" yra "{b}" taip, ', 'kaip "{a}" yra '),
        'SL': ('Beseda "{a}" je besedi "{b}" enako, kot je ', 'beseda "{a}" besedi '),
        'SK': ('Slovo "{a}"  sa má k slovu "{b}" ako ', 'slovo "{a}" k slovu '),
        'RO': ('"{a}" este pentru "{b}" cum ', '"{a}" este pentru '),
        #The connective "כמו" used to be cut in the middle of the word, leaving a fragment in each half;
        #it now belongs completely to the first half, like the connective of most other languages
        'HE': ('"{a}" ל "{b}" כמו ', '"{a}" ל '),
        'EL': ('το "{a}" είναι προς το "{b}" ό,τι ', 'το "{a}" προς το '),
        #The second half used to reference an undefined name c and raised a NameError
        'MK': ('Односот меѓу зборовите "{a}" и "{b}" е ', 'еднаков со односот меѓу зборовите "{a}" и '),
    }
    pair = halves.get(lang)
    if pair is None:
        return None
    if part == 1:
        return pair[0].format(a=a, b=b)
    if part == 2:
        return pair[1].format(a=a)
    return None

## Obtaining parallel dictionary

In order to compose analogy-based translation prompts, we require all parallel pairs in the two languages that do not contain "DUPLICATE" or "NO_TRANSLATION" strings (see the paper for more details).

The following function returns two ID- and English-aligned dictionaries of pairs in the requested languages. Since the data structure is slightly different for English, the function needs to consider this if one of the languages requested is English.

In [264]:
from pandas.core.groupby.generic import DataFrameGroupBy
import pandas as pd

def get_pairs(df, lang1, lang2):
  """Build two dictionaries of (source, target) pairs that share the same keys.

  A row whose 'ID' cell is a string is a source row and sets the current
  source words; every following row with a string in 'EN' is a target of that
  source. Keys have the form "<ID>__<EN source>_<EN target>".

  Returns:
    Tuple (lang1_dict, lang2_dict), each mapping a key to
    [source word, target word] in the respective language.
  """
  pairs_lang1, pairs_lang2 = {}, {}
  for _, entry in df.iterrows():
    if isinstance(entry['ID'], str):
      #Source row: remember the ID and the source words for the target rows that follow
      concept_id = entry['ID']
      english_source = entry['EN'].replace("_", " ")
      source_lang1 = entry[lang1].replace("_", " ")
      source_lang2 = entry[lang2].replace("_", " ")
      continue
    english_target = entry['EN']
    if not isinstance(english_target, str):
      continue
    key = concept_id+"__"+english_source+"_"+english_target
    pairs_lang1[key] = [source_lang1, entry[lang1].replace("_", " ")]
    pairs_lang2[key] = [source_lang2, entry[lang2].replace("_", " ")]

  return pairs_lang1, pairs_lang2


def reshape_en_data_stucture(df):
  """Turn the 'Target' column into an 'EN' column that also holds the source words.

  On rows that carry an ID, the value of 'Source' is written into 'Target';
  afterwards 'Target' is renamed to 'EN'. The frame is changed in place and
  returned.
  """
  has_id = df['ID'].notna()
  df.loc[has_id, 'Target'] = df['Source']
  df.rename(columns={'Target': 'EN'}, inplace=True)
  return df

def find_translation_pairs(df, lang1, lang2):
  """Return the aligned pair dictionaries of two languages for the translation task.

  Rows with an empty value, "DUPLICATE" or "NO_TRANSLATION" in either language
  are removed. Target rows are additionally removed when their own source row
  was removed, because get_pairs would otherwise attach them to the source word
  of the preceding ID.

  Args:
    df: relation table with an 'EN' column (see reshape_en_data_stucture);
      target rows have an empty 'ID' cell.
    lang1: first language column.
    lang2: second language column.

  Returns:
    Tuple (lang1_dict, lang2_dict) as produced by get_pairs.
  """
  marker = "DUPLICATE|NO_TRANSLATION"

  #Rows that are usable in both languages
  usable = df[lang1].notna() & df[lang2].notna()
  usable &= ~df[lang1].str.contains(marker, na=False)
  usable &= ~df[lang2].str.contains(marker, na=False)

  #Number the groups: a source row (ID present) opens a group, the target rows below it belong to it.
  #Rows above the first source row end up in group 0 and are never kept.
  is_source = df['ID'].notna()
  group = is_source.cumsum()
  valid_groups = group[is_source & usable]
  keep = usable & group.isin(valid_groups)

  #EN is needed for the keys; it is already part of the selection when it is one of the two languages
  if lang1 != "EN" and lang2 != "EN":
    columns = ['ID', 'Relation', 'Source', 'EN', lang1, lang2]
  else:
    columns = ['ID', 'Relation', 'Source', lang1, lang2]
  subset = df.loc[keep, columns]

  #Use the following function to display the dataframe
  #display(subset)
  return get_pairs(subset, lang1, lang2)

## Prompt a model with analogy-based templates

Specify the type of language pairs you wish to test on the analogy-based translation task, specify the languages, and the type of model you wish to use.


In [None]:
from numpy.lib.shape_base import row_stack
import random
import pandas as pd
import requests
from statistics import mean

#Load the relation CSV from Path_rel_file and reshape it so that English has its own EN column.
#Path_rel_file is not assigned in this cell; it has to be defined before this cell is run.
df = pd.read_csv(Path_rel_file)
df = reshape_en_data_stucture(df)

def find_translation(lang_dict1, lang_dict2, language1, language2):
  """Run the analogy-based translation task for one language pair.

  For every key the pair (a, b) in language1 and the word c in language2 are
  combined into one prompt; the answer counts as correct when the lowercased
  completion contains the lowercased expected word d of language2. All prompts
  and answers are written to a CSV file in Path_to_output_folder.

  Args:
    lang_dict1: maps a key to [a, b] in language1.
    lang_dict2: maps the same keys to [c, d] in language2.
    language1: language code of the first template half.
    language2: language code of the second template half.

  Returns:
    One-element list [{'Transfer': "<language1>=><language2>", 'Accuracy': value}].

  Raises:
    ValueError: if a template half is missing for one of the languages.
    RuntimeError: if the API answer does not contain generated text.
  """
  correct = 0
  tracing, accuracies = [], []
  print("Translating from ", language1, "to ", language2)
  for pair_id, (a, b) in lang_dict1.items():
    c, d = lang_dict2[pair_id]
    prompt_part1 = get_translation_template_with_quotes(language1, a, b, 1)
    prompt_part2 = get_translation_template_with_quotes(language2, c, "", 2)
    if prompt_part1 is None or prompt_part2 is None:
      raise ValueError("No translation template for " + str(language1) + " (part 1) or " + str(language2) + " (part 2)")
    prompt = prompt_part1+prompt_part2
    response = query({
        "inputs": prompt,
        "parameters": {"max_new_tokens": 10, "do_sample": False},
    })
    #Fail with the payload in the message instead of an opaque KeyError/IndexError
    if not (isinstance(response, list) and response and 'generated_text' in response[0]):
      raise RuntimeError("Unexpected answer from the inference API for prompt " + repr(prompt) + ": " + repr(response))
    response = response[0]['generated_text'].replace(prompt, "").lower()
    #Computed per prompt; the flag used to stay at 1 for every row after the first hit
    is_correct = 1 if d.lower() in response else 0
    correct += is_correct
    #The keys have to match the column list below, otherwise pandas writes empty columns
    tracing.append({'Source': language1, 'Transfer': language2, 'a': a, 'b': b, 'c': c, 'd': d, 'prompts': prompt, 'response': response, 'correct': is_correct})
  num_templates = len(lang_dict1)
  #Avoid a ZeroDivisionError when no parallel pairs exist for this language pair
  accuracy = correct/num_templates if num_templates else 0.0
  print("Number of templates: ", num_templates, "Accuracy: ", accuracy)
  accuracies.append({'Transfer': language1+"=>"+language2, 'Accuracy': accuracy})
  prompt_df = pd.DataFrame(tracing, columns=['Source', 'Transfer', 'a', 'b', 'c', 'd', 'prompts', 'response', 'correct'])
  prompt_df.to_csv(Path_to_output_folder+"/translation_experiment_"+language1+"_"+language2+"_"+relation+".csv")
  return accuracies

import os

#List of language combinations tested in the publication
rows_accuracies = []
for pair in [["EN","ES"],["EN", "DE"],["EN","FR"],["FR","EN"],["FR","RO"],["ES","PT"],["ES","IT"],["HR","SL"],["HR","SK"],["SK","HR"], ["EN","BM"], ["EN","AL"],["EN","EL"],["EN","HE"],["FR","BM"]]:
  language1, language2 = pair
  lang1_dict, lang2_dict = find_translation_pairs(df, language1, language2)
  rows_accuracies.extend(find_translation(lang1_dict, lang2_dict, language1, language2))

  #Output all accuracies collected so far for this relation to CSV. The file is rewritten after every
  #language pair, so finished results are on disk even if a later API call fails.
  #The columns have to match the keys returned by find_translation ('Transfer', 'Accuracy');
  #with any other column names pandas writes a file that only contains empty cells.
  accuracies_df = pd.DataFrame(rows_accuracies, columns=['Transfer', 'Accuracy'])
  #os.path.join inserts the path separator when the folder is given without a trailing one
  accuracies_df.to_csv(os.path.join(Path_to_output_folder, "translation_acc_"+relation+".csv"))

