In [None]:
# Author: Malik Altakrori, PhD
# IBM Research
# malik.altakrori@ibm.com

## Notebook 1
Prepare the Belebele dataset and convert it to the SQuAD format so it can be used with IBM's PrimeQA toolkit.

In [None]:
import os
from datasets import load_dataset   
import json

In [None]:
# Root of the repository; replace the placeholder with an absolute path before running.
root_folder = "<Provide the abs path to the repo>"
# Data directory name; it is joined onto root_folder when file paths are built.
data_path = "Data"

### 1. Read both files

In [None]:
# Build the paths of the two annotated CSV sheets, then load each one as a dataset.
annotated_en_csv = os.path.join(root_folder, data_path, "Annotated/416_Annotated_En.csv")
annotated_ar_csv = os.path.join(root_folder, data_path, "Annotated/416_Annotated_Ar.csv")

All_data_en = load_dataset("csv", data_files=annotated_en_csv, split="train")
All_data_ar = load_dataset("csv", data_files=annotated_ar_csv, split="train")

In [None]:
print(f"We have {len(All_data_ar)} Arabic Questions, and:")
print(f"We have {len(All_data_en)} English Questions.")

## Remove rows with X annotation

In [None]:
print(f"Total number of rows English BEFORE automatic filtering: {len(All_data_en)}")
print(f"Total number of rows Arabic  BEFORE automatic filtering: {len(All_data_ar)}")

In [None]:
# Keep only the rows whose annotation column is not exactly 'X'.
# The English sheet stores the annotation in 'Span', the Arabic one in 'Annotated Passage'.
All_data_en = All_data_en.filter(lambda row: not (row['Span'] == 'X'))
All_data_ar = All_data_ar.filter(lambda row: not (row['Annotated Passage'] == 'X'))

rows_en_after = len(All_data_en)
rows_ar_after = len(All_data_ar)
print(f"Total number of rows English AFTER automatic filtering: {rows_en_after}")
print(f"Total number of rows Arabic  AFTER automatic filtering: {rows_ar_after}")

In [None]:
All_data_en, All_data_ar

In [None]:
# Function to find the offset (where the answer span start from)
# with a test example

def find_offset_En(example):
    """Extract the answer span and its start offset from an annotated English passage.

    The answer is delimited inside ``example['Span']`` by an opening ``$!`` and a
    closing ``!$`` marker.

    Args:
        example: mapping with a 'Span' key holding the annotated passage string.

    Returns:
        The same mapping with two keys added:
        'Span_en'   - the text between the two markers;
        'offset_en' - the index of the opening marker (-1 when it is missing).
        Rows where either marker is missing are printed so they can be inspected.
    """
    annotated = example['Span']
    spn_start = annotated.find('$!')
    if spn_start != -1:
        # Look for the closing marker only *after* the opening one. Searching from
        # the start breaks on answers that begin with '$' (e.g. "$!$5 million!$"):
        # the '!' of the opening marker plus the answer's '$' looks like '!$'.
        spn_end = annotated.find('!$', spn_start + 2)
    else:
        spn_end = annotated.find('!$')

    example['Span_en'] = annotated[spn_start + 2:spn_end]
    example['offset_en'] = spn_start

    # A -1 on either side means the annotation is malformed; surface the row.
    if spn_start == -1 or spn_end == -1:
        print(example)
    return example

example = {"flores_passage": "The American plan relied on launching coordinated attacks from three different directions. General John Cadwalder would launch a diversionary attack against the British garrison at Bordentown, in order to block off any reinforcements. General James Ewing would take 700 militia across the river at Trenton Ferry, seize the bridge over the Assunpink Creek and prevent any enemy troops from escaping. The main assault force of 2,400 men would cross the river nine miles north of Trenton, and then split into two groups, one under Greene and one under Sullivan, in order to launch a pre-dawn attack.",
           "Span": "The American plan relied on launching coordinated attacks from three different directions. General John Cadwalder would launch a diversionary attack against the British garrison at $!Bordentown!$, in order to block off any reinforcements. General James Ewing would take 700 militia across the river at Trenton Ferry, seize the bridge over the Assunpink Creek and prevent any enemy troops from escaping. The main assault force of 2,400 men would cross the river nine miles north of Trenton, and then split into two groups, one under Greene and one under Sullivan, in order to launch a pre-dawn attack.",
            "Arabic passage":"وطالب المستعمرون، الذين شاهدوا هذا النشاط، بتعزيزات. ضمّت تعزيزات المواقع الأمامية كتيبتي نيوهامبشير الأولى والثالثة في صفوفهما 200 من الرجال يقودهم العقيد چون ستارك والعقيد چيمس ريد (والذين أصبحا بعدها جنرالين). تمركز رجال ستارك في مواقع بطول السياج من الطرف الشمالي لمكان المستعمر. عندما فتح المدّ المنخفض فجوة على طول النهر الغامض على طول الجانب الشمالي الشرقي من شبه الجزيرة، مدّوا السياج سريعاً بجدار حجري قصير إلى الشمال ينتهي عند حافة المياه على شاطئ صغير. وضع جريدلي أو ستارك وتدًا على بعد حوالي 100 قدم (30 مترًا) أمام السياج وأمر ألا يطلق أحد النار حتى يعبره ضباط الجيش.",
            "Pre-Annotated Arabic Passage": "وطالب المستعمرون، الذين شاهدوا هذا النشاط، بتعزيزات. ضمّت تعزيزات المواقع الأمامية كتيبتي نيوهامبشير الأولى والثالثة في صفوفهما 200 من الرجال يقودهم العقيد چون ستارك والعقيد چيمس ريد (والذين أصبحا بعدها جنرالين). تمركز رجال ستارك في $!مواقع بطول السياج من الطرف الشمالي لمكان المستعمر.!$ عندما فتح المدّ المنخفض فجوة على طول النهر الغامض على طول الجانب الشمالي الشرقي من شبه الجزيرة، مدّوا السياج سريعاً بجدار حجري قصير إلى الشمال ينتهي عند حافة المياه على شاطئ صغير. وضع جريدلي أو ستارك وتدًا على بعد حوالي 100 قدم (30 مترًا) أمام السياج وأمر ألا يطلق أحد النار حتى يعبره ضباط الجيش."
           }

find_offset_En(example)

In [None]:
def find_offset_Ar(example):
    """Extract the answer span and its start offset from an annotated Arabic passage.

    The answer is delimited inside ``example['Annotated Passage']`` by ``$!`` and
    ``!$``. If the two markers appear in the opposite order, they are swapped
    (after printing "oops" and the row) so the span can still be recovered.

    Args:
        example: mapping with an 'Annotated Passage' key holding the annotated string.

    Returns:
        The same mapping with two keys added:
        'Span_ar'   - the text between the two markers;
        'offset_ar' - the index of the first marker (-1 when a marker is missing).
        Rows where either marker is missing are printed so they can be inspected.
    """
    passage = example['Annotated Passage']
    spn_start = passage.find('$!')
    if spn_start == -1:
        spn_end = passage.find('!$')
    else:
        # Look for the closing marker only *after* the opening one; otherwise an
        # answer starting with '$' ("$!$9 ...!$") matches the '!$' formed by the
        # opening marker and the answer's first character.
        spn_end = passage.find('!$', spn_start + 2)
        if spn_end == -1:
            # Nothing after the opening marker: the markers may be in reverse
            # order, so look for '!$' in the text before the '$!'.
            spn_end = passage.find('!$', 0, spn_start)

    if spn_end < spn_start:
        # Reversed (or missing) marker: report the row and swap the two positions.
        print("oops")
        print(example)
        spn_start, spn_end = spn_end, spn_start

    example['Span_ar'] = passage[spn_start + 2:spn_end]
    example['offset_ar'] = spn_start

    # A -1 on either side means the annotation is malformed; surface the row.
    if spn_start == -1 or spn_end == -1:
        print(example)
    return example

example = {"flores_passage": "The Colonists, seeing this activity, had also called for reinforcements. Troops reinforcing the forward positions included the 1st and 3rd New Hampshire regiments of 200 men, under Colonels John Stark and James Reed (both later became generals). Stark's men took positions along the fence on the north end of the Colonist's position. When low tide opened a gap along the Mystic River along the northeast of the peninsula, they quickly extended the fence with a short stone wall to the north ending at the water's edge on a small beach. Gridley or Stark placed a stake about 100 feet (30 m) in front of the fence and ordered that no one fire until the regulars passed it.",
           "Span": "The Colonists, seeing this activity, had also called for reinforcements. Troops reinforcing the forward positions included the 1st and 3rd New Hampshire regiments of 200 men, under Colonels John Stark and James Reed (both later became generals). Stark's men took positions $!along the fence on the north end of the Colonist's position!$. When low tide opened a gap along the Mystic River along the northeast of the peninsula, they quickly extended the fence with a short stone wall to the north ending at the water's edge on a small beach. Gridley or Stark placed a stake about 100 feet (30 m) in front of the fence and ordered that no one fire until the regulars passed it.",
            "Arabic passage":"وطالب المستعمرون، الذين شاهدوا هذا النشاط، بتعزيزات. ضمّت تعزيزات المواقع الأمامية كتيبتي نيوهامبشير الأولى والثالثة في صفوفهما 200 من الرجال يقودهم العقيد چون ستارك والعقيد چيمس ريد (والذين أصبحا بعدها جنرالين). تمركز رجال ستارك في مواقع بطول السياج من الطرف الشمالي لمكان المستعمر. عندما فتح المدّ المنخفض فجوة على طول النهر الغامض على طول الجانب الشمالي الشرقي من شبه الجزيرة، مدّوا السياج سريعاً بجدار حجري قصير إلى الشمال ينتهي عند حافة المياه على شاطئ صغير. وضع جريدلي أو ستارك وتدًا على بعد حوالي 100 قدم (30 مترًا) أمام السياج وأمر ألا يطلق أحد النار حتى يعبره ضباط الجيش.",
            "Annotated Passage": "وطالب المستعمرون، الذين شاهدوا هذا النشاط، بتعزيزات. ضمّت تعزيزات المواقع الأمامية كتيبتي نيوهامبشير الأولى والثالثة في صفوفهما 200 من الرجال يقودهم العقيد چون ستارك والعقيد چيمس ريد (والذين أصبحا بعدها جنرالين). تمركز رجال ستارك في $!مواقع بطول السياج من الطرف الشمالي لمكان المستعمر.!$ عندما فتح المدّ المنخفض فجوة على طول النهر الغامض على طول الجانب الشمالي الشرقي من شبه الجزيرة، مدّوا السياج سريعاً بجدار حجري قصير إلى الشمال ينتهي عند حافة المياه على شاطئ صغير. وضع جريدلي أو ستارك وتدًا على بعد حوالي 100 قدم (30 مترًا) أمام السياج وأمر ألا يطلق أحد النار حتى يعبره ضباط الجيش."
           }

find_offset_Ar(example)



In [None]:
# Apply the offset function, then check that no example has an offset of -1 (the sign of a wrong annotation)
test_ar = All_data_ar.map(find_offset_Ar)


In [None]:
# Making sure we don't have any wrong offsets
test_ar.filter(lambda example: example['offset_ar'] == -1)

In [None]:
# Sanity check: this filter should return the whole dataset
test_ar = test_ar.filter(lambda example: example['offset_ar'] != -1) # Notice that we flipped the == to !=
test_ar

In [None]:
test_en = All_data_en.map(find_offset_En)

In [None]:
# test example
ex = {'Annotator': 'Anon', 'link': 'https://en.wikibooks.org/wiki/Basic_Physics_of_Digital_Radiography/The_Basics', 'question_number': 2, 'flores_passage': 'The atom can be considered to be one of the fundamental building blocks of all matter. Its a very complex entity which consists, according to a simplified Bohr model, of a central nucleus orbited by electrons, somewhat similar to planets orbiting the sun - see Figure 1.1. The nucleus consists of two particles - neutrons and protons. Protons have a positive electric charge while neutrons have no charge. The electrons have a negative electric charge.', 'question': 'The nucleus is composed of which particles?', 'dialect': 'eng_Latn', 'Span': 'The atom can be considered to be one of the fundamental building blocks of all matter. Its a very complex entity which consists, according to a simplified Bohr model, of a central nucleus orbited by electrons, somewhat similar to planets orbiting the sun - see Figure 1.1. $!neutrons and protons!$. Protons have a positive electric charge while neutrons have no charge. The electrons have a negative electric charge.', 'Arabic passage': 'يُمكن اعتبار الذرة واحدة من وحدات البناء الأساسية لكل المواد. إنه كيان معقد للغاية يتكون، وفقًا لنموذج بور البسيط، من نواة مركزية تدور حولها الإلكترونات، تشبه إلى حد ما الكواكب التي تدور حول الشمس - انظر الشكل 1.1. تتكون النواة من جسيمين - النيوترونات والبروتونات. للبروتونات شحنة كهربية موجبة، بينما النيوترونات ليس لها شحنة، أما الإلكترونات فشحنتها سالبة.', 'Arabic Question': 'من أي جسيمات تتكون النواة؟', 'Pre-Annotated Arabic Passage': 'يُمكن اعتبار الذرة واحدة من وحدات البناء الأساسية لكل المواد. إنه كيان معقد للغاية يتكون، وفقًا لنموذج بور البسيط، من نواة مركزية تدور حولها الإلكترونات، تشبه إلى حد ما الكواكب التي تدور حول الشمس - انظر الشكل 1.1. تتكون النواة من جسيمين - $!النيوترونات والبروتونات.!$ للبروتونات شحنة كهربية موجبة، بينما النيوترونات ليس لها شحنة، أما الإلكترونات فشحنتها سالبة.', 'Span_en': 'neutrons and protons', 'offset_en': 273}
ex

In [None]:
print(test_en.column_names)

In [None]:
print(test_ar.column_names)

In [None]:
# Not really original, but fixed manually by the authors ... one problematic row at a time .. 
# Important: This is modified from the HF files
original_data_path = "Data/Belebele_original"

In [None]:
test_en

In [None]:
"""
map the dialectal questions to English and MSA passages

# First, we load the original questions (URL issue solved manually)

# The following URLs have commas in the URL field. They had to be removed manually:

# * problematic questions: 73, 290, 291, 357, 358, 400, 401, 452, 453,473, 474, 479, 480, 491, 492, 515, 516,535, 569, 570, 571, 572
"""

for f_name in ['acm', 'apc', 'ars', 'ary', 'arz']:
    # link -> {question_number -> question text} for the dialect being processed
    Qs = {}

    def map_dialect(example):
        """Add the current dialect's question to a row, looked up by link and question number."""
        example[f'question_{f_name}_Arab'] = Qs[example['link']][example['question_number']]
        return example

    print(f"Processing {f_name}")
    # Read explicitly as UTF-8: the files hold Arabic text and the platform's
    # default encoding is not guaranteed to be UTF-8 (the export cells write utf8 too).
    with open(os.path.join(root_folder, original_data_path, f"{f_name}_Arab.jsonl"), "r", encoding="utf8") as f:
        for line in f:
            line = json.loads(line)
            # setdefault creates the per-link dict on first sight of that link
            Qs.setdefault(line['link'], {})[int(line["question_number"])] = line["question"]

    print(f"Prcessing {f_name} for English")
    test_en = test_en.map(map_dialect)
    print(f"Prcessing {f_name} for Arabic")
    test_ar = test_ar.map(map_dialect)

    print(f"Prcessing {f_name} is complete")

### Map MSA questions to the EN dataset

In [None]:
f_name ='arb'
# link -> {question_number -> MSA question text}
Qs = {}
def map_dialect(example):
    """Add the MSA question to a row, looked up by the row's link and question number."""
    example[f'question_{f_name}_Arab'] = Qs[example['link']][example['question_number']]

    return example

print(f"Processing {f_name}")
# Read explicitly as UTF-8: the file holds Arabic text and the platform's
# default encoding is not guaranteed to be UTF-8.
with open(os.path.join(root_folder, original_data_path, f"{f_name}_Arab.jsonl"), "r", encoding="utf8") as f:
    for line in f:
        line = json.loads(line)
        # setdefault creates the per-link dict on first sight of that link
        Qs.setdefault(line['link'], {})[int(line["question_number"])] = line["question"]

test_en = test_en.map(map_dialect)

### Map EN questions to the MSA dataset

In [None]:
f_name ='eng'
# link -> {question_number -> English question text}
Qs = {}

print(f"Processing {f_name}")
# Explicit UTF-8 keeps reading independent of the platform's default encoding.
with open(os.path.join(root_folder, original_data_path, f"{f_name}_Latn.jsonl"), "r", encoding="utf8") as f:
    for line in f:
        line = json.loads(line)
        # setdefault creates the per-link dict on first sight of that link
        Qs.setdefault(line['link'], {})[int(line["question_number"])] = line["question"]

def map_dialect(example):
    """Add the English question to a row, looked up by the row's link and question number."""
    example[f'question_{f_name}_Latn'] = Qs[example['link']][example['question_number']]

    return example

test_ar = test_ar.map(map_dialect)

In [None]:
# Notice how the number of columns increased (ignore the "Unnamed" columns: they are artifacts of Excel and won't affect the script)
test_en, test_ar

In [None]:
sample = test_en[0]
sample

In [None]:
sample_ar = test_ar[0]
sample_ar

In [None]:
# Example from the English set: check that passage[offset : offset + span length] matches the actual answer span
sample['flores_passage'][sample['offset_en']:sample['offset_en']+len(sample['Span_en'])]

In [None]:
# Same as above but for Ar
sample_ar['Arabic passage'][sample_ar['offset_ar']:sample_ar['offset_ar']+len(sample_ar['Span_ar'])]

In [None]:
# the same verification but on a sample from the end
sample_end = test_en[-1]
sample_end

In [None]:
sample_end['flores_passage'][sample_end['offset_en']:sample_end['offset_en']+len(sample_end['Span_en'])]

In [None]:
sample_end_ar = test_ar[-2]
sample_end_ar

In [None]:
sample_end_ar['Arabic passage'][sample_end_ar['offset_ar']:sample_end_ar['offset_ar']+len(sample_end_ar['Span_ar'])]

In [None]:
print(test_en.column_names[:10])
print(test_en.column_names[10:])

## Splitting All questions into Files

In [None]:
# work on the original or the translated dialectal questions
Translated = True

if not Translated:
    settings_folder = "Settings"
else:
    settings_folder = "Settings_Translated"

print(f"working with {settings_folder}")

### Splitting English (All)

create the home folder

In [None]:
output_folder = os.path.join(root_folder, settings_folder, "All")
output_folder

In [None]:
# When True, an already existing output folder is reused (files in it are overwritten by later cells).
overwrite_existing = True
try:
    os.makedirs(output_folder, exist_ok=overwrite_existing)
except FileExistsError as err:
    # Only the "already exists" case gets this message; other OS errors
    # (permissions, bad path, ...) propagate unchanged instead of being mislabeled.
    print(f"WARNING: Folder {output_folder} exists, either delete manually, or set overwrite_existing to True")
    raise ValueError(f"Output folder already exists: {output_folder}") from err

In [None]:
def _dump_squad_jsonl(dataset, context_col, question_col, answer_col, offset_col, out_path):
    """Write `dataset` to `out_path` as SQuAD-style JSON lines.

    Each row becomes one JSON object with the keys "context", "question", "id"
    (the row's position in `dataset`, as a string) and "answers"
    ({"text": [answer], "answer_start": [offset]}). Lines are joined with a
    newline; no trailing newline is written.
    """
    json_samples = [
        {"context": example[context_col],
         "question": example[question_col],
         "id": str(row_id),
         "answers": {"text": [example[answer_col]], "answer_start": [example[offset_col]]}
         }
        for row_id, example in enumerate(dataset)
    ]
    with open(out_path, encoding="utf8", mode="w") as f:
        f.write("\n".join(json.dumps(json_sample) for json_sample in json_samples))


print("En Passage  -- En Qs")
_dump_squad_jsonl(test_en, 'flores_passage', 'question', 'Span_en', 'offset_en',
                  os.path.join(output_folder, "EN-P_EN-Q.jsonl"))

print("=====================================")
print("En Passage  -- MSA Qs")
_dump_squad_jsonl(test_en, 'flores_passage', 'question_arb_Arab', 'Span_en', 'offset_en',
                  os.path.join(output_folder, "EN-P_MSA-Q.jsonl"))

print("=====================================")

print("En Passage  -- Dialect Qs")

for dialect in ['question_acm_Arab',
                'question_apc_Arab',
                'question_ars_Arab',
                'question_ary_Arab',
                'question_arz_Arab']:
    # The middle token of the column name is the dialect code (e.g. 'acm').
    dial = dialect.split('_')[1]
    print(f"En Passage  -- {dial} Qs")
    _dump_squad_jsonl(test_en, 'flores_passage', dialect, 'Span_en', 'offset_en',
                      os.path.join(output_folder, f"EN-P_{dial}-Q.jsonl"))

print("Done!")

### Splitting Arabic (All)

In [None]:
def _dump_squad_jsonl(dataset, context_col, question_col, answer_col, offset_col, out_path):
    """Write `dataset` to `out_path` as SQuAD-style JSON lines.

    Each row becomes one JSON object with the keys "context", "question", "id"
    (the row's position in `dataset`, as a string) and "answers"
    ({"text": [answer], "answer_start": [offset]}). Lines are joined with a
    newline; no trailing newline is written.
    """
    json_samples = [
        {"context": example[context_col],
         "question": example[question_col],
         "id": str(row_id),
         "answers": {"text": [example[answer_col]], "answer_start": [example[offset_col]]}
         }
        for row_id, example in enumerate(dataset)
    ]
    with open(out_path, encoding="utf8", mode="w") as f:
        f.write("\n".join(json.dumps(json_sample) for json_sample in json_samples))


print("Ar Passage  -- En Qs")
_dump_squad_jsonl(test_ar, 'Arabic passage', 'question_eng_Latn', 'Span_ar', 'offset_ar',
                  os.path.join(output_folder, "MSA-P_EN-Q.jsonl"))

print("=====================================")
print("Ar Passage  -- MSA Qs")
_dump_squad_jsonl(test_ar, 'Arabic passage', 'Arabic Question', 'Span_ar', 'offset_ar',
                  os.path.join(output_folder, "MSA-P_MSA-Q.jsonl"))

print("=====================================")
for dialect in ['question_acm_Arab',
                'question_apc_Arab',
                'question_ars_Arab',
                'question_ary_Arab',
                'question_arz_Arab']:
    # The middle token of the column name is the dialect code (e.g. 'acm').
    dial = dialect.split('_')[1]
    print(f"Ar Passage  -- {dial} Qs")
    _dump_squad_jsonl(test_ar, 'Arabic passage', dialect, 'Span_ar', 'offset_ar',
                      os.path.join(output_folder, f"MSA-P_{dial}-Q.jsonl"))

print("Done!")


## Excluding Belebele questions

In [None]:
test_ar

In [None]:
test_ar_noBB = test_ar.filter(lambda example: example['Belebele problem?']== 'No')
test_ar_noBB

In [None]:
print(test_ar.num_rows-test_ar_noBB.num_rows,' was removed')

In [None]:
test_en_noBB = test_en.filter(lambda example: example['Belebele problem?']== 'No')
print(test_en.num_rows-test_en_noBB.num_rows,' was removed')

Creating another output folder

In [None]:
output_folder = os.path.join(root_folder, settings_folder, "NoBB")
output_folder

In [None]:
# When True, an already existing output folder is reused (files in it are overwritten by later cells).
overwrite_existing = True
try:
    os.makedirs(output_folder, exist_ok=overwrite_existing)
except FileExistsError as err:
    # Only the "already exists" case gets this message; other OS errors
    # (permissions, bad path, ...) propagate unchanged instead of being mislabeled.
    print(f"WARNING: Folder {output_folder} exists, either delete manually, or set overwrite_existing to True")
    raise ValueError(f"Output folder already exists: {output_folder}") from err

### Splitting English (NoBB)

In [None]:
def _dump_squad_jsonl(dataset, context_col, question_col, answer_col, offset_col, out_path):
    """Write `dataset` to `out_path` as SQuAD-style JSON lines.

    Each row becomes one JSON object with the keys "context", "question", "id"
    (the row's position in `dataset`, as a string) and "answers"
    ({"text": [answer], "answer_start": [offset]}). Lines are joined with a
    newline; no trailing newline is written.
    """
    json_samples = [
        {"context": example[context_col],
         "question": example[question_col],
         "id": str(row_id),
         "answers": {"text": [example[answer_col]], "answer_start": [example[offset_col]]}
         }
        for row_id, example in enumerate(dataset)
    ]
    with open(out_path, encoding="utf8", mode="w") as f:
        f.write("\n".join(json.dumps(json_sample) for json_sample in json_samples))


print("En Passage  -- EN Qs")
_dump_squad_jsonl(test_en_noBB, 'flores_passage', 'question', 'Span_en', 'offset_en',
                  os.path.join(output_folder, "EN-P_EN-Q.jsonl"))

print("=====================================")
print("En Passage  -- MSA Qs")
_dump_squad_jsonl(test_en_noBB, 'flores_passage', 'question_arb_Arab', 'Span_en', 'offset_en',
                  os.path.join(output_folder, "EN-P_MSA-Q.jsonl"))

print("=====================================")
print("En Passage  -- Dialectal Qs")
for dialect in ['question_acm_Arab',
                'question_apc_Arab',
                'question_ars_Arab',
                'question_ary_Arab',
                'question_arz_Arab']:
    # The middle token of the column name is the dialect code (e.g. 'acm').
    dial = dialect.split('_')[1]
    print(f"En Passage  -- {dial} Qs")
    _dump_squad_jsonl(test_en_noBB, 'flores_passage', dialect, 'Span_en', 'offset_en',
                      os.path.join(output_folder, f"EN-P_{dial}-Q.jsonl"))



### Splitting Arabic (NoBB)

In [None]:
def _dump_squad_jsonl(dataset, context_col, question_col, answer_col, offset_col, out_path):
    """Write `dataset` to `out_path` as SQuAD-style JSON lines.

    Each row becomes one JSON object with the keys "context", "question", "id"
    (the row's position in `dataset`, as a string) and "answers"
    ({"text": [answer], "answer_start": [offset]}). Lines are joined with a
    newline; no trailing newline is written.
    """
    json_samples = [
        {"context": example[context_col],
         "question": example[question_col],
         "id": str(row_id),
         "answers": {"text": [example[answer_col]], "answer_start": [example[offset_col]]}
         }
        for row_id, example in enumerate(dataset)
    ]
    with open(out_path, encoding="utf8", mode="w") as f:
        f.write("\n".join(json.dumps(json_sample) for json_sample in json_samples))


print("Ar Passage  -- EN Qs")
_dump_squad_jsonl(test_ar_noBB, 'Arabic passage', 'question_eng_Latn', 'Span_ar', 'offset_ar',
                  os.path.join(output_folder, "MSA-P_EN-Q.jsonl"))

print("=====================================")
print("Ar Passage  -- MSA Qs")
_dump_squad_jsonl(test_ar_noBB, 'Arabic passage', 'Arabic Question', 'Span_ar', 'offset_ar',
                  os.path.join(output_folder, "MSA-P_MSA-Q.jsonl"))

print("=====================================")
print("Ar Passage  -- Dialectal Qs")
for dialect in ['question_acm_Arab',
                'question_apc_Arab',
                'question_ars_Arab',
                'question_ary_Arab',
                'question_arz_Arab']:
    # The middle token of the column name is the dialect code (e.g. 'acm').
    dial = dialect.split('_')[1]
    print(f"Ar Passage  -- {dial} Qs")
    _dump_squad_jsonl(test_ar_noBB, 'Arabic passage', dialect, 'Span_ar', 'offset_ar',
                      os.path.join(output_folder, f"MSA-P_{dial}-Q.jsonl"))

Go to notebook 2 to prepare the experiments!