# EDA on Backtranslated Samples

Now that we have backtranslated samples for both Spanish and Japanese, this
notebook will analyze those samples as well as the original English samples.

In [136]:
import ast
import os
from typing import List

import pandas as pd
import spacy

In [41]:
# One-time setup: uncomment the next line to download the `en_core_web_sm` model.
#!python -m spacy download en_core_web_sm
# Shared spaCy pipeline; used further down to extract named entities from text.
nlp = spacy.load('en_core_web_sm')

In [103]:
# Disable column-width truncation so full question texts are visible in displayed frames.
pd.set_option('display.max_colwidth', None)

## Load data

In [28]:
# Directory holding the consolidated backtranslation files (relative to this notebook).
ROOT_DIR = "../data/consolidated"

# Spanish-pivot backtranslations: file name and full path.
SPANISH_BACKTRANSLATIONS = "spanish_consolidated_backtranslations.jsonl"
SPANISH_DATA_FP = os.path.join(ROOT_DIR, SPANISH_BACKTRANSLATIONS)

# Japanese-pivot backtranslations: file name and full path.
JAPANESE_BACKTRANSLATIONS = "japanese_consolidated_backtranslations.jsonl"
JAPANESE_DATA_FP = os.path.join(ROOT_DIR, JAPANESE_BACKTRANSLATIONS)

In [29]:
# Container for the parsed Spanish-pivot records.
spanish_consolidated_data = []

In [30]:
# Parse one record per non-blank line of the Spanish file.
#
# * The encoding is pinned because the records contain non-ASCII text and the
#   platform default encoding is not UTF-8 everywhere.
#   NOTE(review): assumes the file was written as UTF-8 — confirm against the writer.
# * The list is rebuilt from scratch, so re-running this cell cannot append the
#   same records a second time.
# * NOTE(review): lines are parsed as Python literals, not with `json`; a line
#   containing JSON-only tokens (true/false/null) would fail to parse.
with open(SPANISH_DATA_FP, 'r', encoding='utf-8') as f:
    spanish_consolidated_data = [
        ast.literal_eval(line)
        for line in f  # stream the file instead of materialising readlines()
        if line.strip()  # skip blank lines, which literal_eval cannot parse
    ]

In [31]:
# Container for the parsed Japanese-pivot records.
japanese_consolidated_data = []

In [32]:
# Parse one record per non-blank line of the Japanese file.
#
# * The encoding is pinned because the platform default encoding is not UTF-8
#   everywhere and the records are expected to contain non-ASCII text.
#   NOTE(review): assumes the file was written as UTF-8 — confirm against the writer.
# * The list is rebuilt from scratch, so re-running this cell cannot append the
#   same records a second time.
# * NOTE(review): lines are parsed as Python literals, not with `json`; a line
#   containing JSON-only tokens (true/false/null) would fail to parse.
with open(JAPANESE_DATA_FP, 'r', encoding='utf-8') as f:
    japanese_consolidated_data = [
        ast.literal_eval(line)
        for line in f  # stream the file instead of materialising readlines()
        if line.strip()  # skip blank lines, which literal_eval cannot parse
    ]

In [33]:
# Single list with the Spanish-pivot records first, then the Japanese-pivot ones.
consolidated_backtranslated_data = [
    *spanish_consolidated_data,
    *japanese_consolidated_data,
]

In [34]:
# Tabular view of all loaded records (Spanish- and Japanese-pivot combined).
df = pd.DataFrame(consolidated_backtranslated_data)

### How many of the backtranslations are exactly the same as the originals?

In [35]:
# Sanity check: number of rows per pivot language.
df["pivot_lang"].value_counts()

es    1000
ja    1000
Name: pivot_lang, dtype: int64

In [36]:
# Per pivot language, count the rows whose backtranslation is an exact string
# match of the original question.
#
# The comparison is done once, vectorised, on a temporary copy of `df`
# (`assign` does not modify `df`), and the boolean flags are then summed per
# group. This replaces a `groupby.apply` that built a `pd.Series` for every
# group and that triggers a deprecation warning on recent pandas versions
# because `apply` also receives the grouping column.
(
    df.assign(identical=df["question"] == df["backtranslation"])
    .groupby("pivot_lang")[["identical"]]
    .sum()
)

Unnamed: 0_level_0,identical
pivot_lang,Unnamed: 1_level_1
es,200
ja,61


### How many backtranslations had their entities changed?

In [85]:
def get_text_entities(text: str):
    """Return the set of entity strings found in ``text``.

    The text is run through the module-level ``nlp`` pipeline; each item of
    the resulting ``doc.ents`` is converted with ``str`` and collected into a
    set, so duplicates collapse and order is not kept.
    """
    return set(map(str, nlp(text).ents))

In [None]:
def get_text_entities_for_series(col: pd.Series):
    """Return one entity set per element of ``col``, in iteration order.

    Every element is converted with ``str`` before being handed to
    ``get_text_entities``.
    """
    entity_sets = []
    for value in col:
        entity_sets.append(get_text_entities(str(value)))
    return entity_sets

In [94]:
# One boolean per row of `df`: True when the question and its backtranslation
# yield exactly the same set of entity strings.
ents_identical = [
    get_text_entities(original) == get_text_entities(roundtrip)
    for original, roundtrip in df[["question", "backtranslation"]].itertuples(
        index=False, name=None
    )
]

In [100]:
# Entity-match flag next to the pivot language and both texts, one row per row of `df`.
ents_identical_by_lang = pd.DataFrame(
    {
        "ents_identical": ents_identical,
        "pivot_lang": df["pivot_lang"].tolist(),
        "question": df["question"].tolist(),
        "backtranslation": df["backtranslation"].tolist(),
    }
)

In [99]:
# Number of rows per pivot language whose entity sets are unchanged.
ents_identical_by_lang.groupby("pivot_lang")[["ents_identical"]].sum()

Unnamed: 0_level_0,ents_identical
pivot_lang,Unnamed: 1_level_1
es,785
ja,705


In [112]:
# First 30 rows whose entity sets differ between question and backtranslation.
ents_identical_by_lang.loc[~ents_identical_by_lang["ents_identical"]].head(30)

Unnamed: 0,ents_identical,pivot_lang,question,backtranslation
13,False,es,The Kingdom of the Netherlands was formed by which countries?,What countries made up the Kingdom of the Netherlands?
24,False,es,What Treaty is the CAF part of?,What Treaty is CAF part of?
27,False,es,What is an example of a Sassanid language?,What is an example of a Sasanian language?
28,False,es,At what century's start did revivalist fall into disfavor?,At the beginning of what century did the revival fall out of favor?
29,False,es,What is one type of fixture commonly found in offices?,What is a type of accessory commonly found in offices?
53,False,es,Each Brigade contains how many regiments?,How many regiments does each Brigade contain?
55,False,es,What characteristic of Middle Iranian civilization is shown by the number of different languages and their speakers?,Which characteristic of the average Iranian civilization shows the number of different languages and their speakers?
70,False,es,Who was the author of Seven Lamps of Architecture?,Who was the author of Seven Architecture Lamps?
75,False,es,Who advises the Chief of the Defence?,Who advises the Chief of Defense?
79,False,es,Where is the current focus of the Canadian Military set?,Where is the current focus of the Canadian military ensemble?


In [111]:
# First 30 Japanese-pivot rows whose entity sets differ.
#
# Both conditions are combined into one mask over the full frame. Filtering
# by language first and then indexing the filtered frame with a mask built
# from the unfiltered frame forces pandas to re-align the mask and emits a
# "Boolean Series key will be reindexed" warning.
ents_identical_by_lang[
    (ents_identical_by_lang["pivot_lang"] == "ja")
    & ~ents_identical_by_lang["ents_identical"]
].head(30)

  ents_identical_by_lang[ents_identical_by_lang["pivot_lang"] == "ja"][


Unnamed: 0,ents_identical,pivot_lang,question,backtranslation
1003,False,ja,Who commands the reserves?,Who will command the Reserve Army?
1004,False,ja,Who issued the Pragmatic Sanction?,Who issued practical sanctions?
1007,False,ja,"What country's government, on which the US government was modeled, did not formally implement separation of powers?",Which government has served as a model for the U.S. government but never formally implemented the separation of powers?
1012,False,ja,"In what year did Francis, Duke of Anjou leave the Netherlands?","What year did Franz, Duke of Anjou, leave the Netherlands?"
1013,False,ja,The Kingdom of the Netherlands was formed by which countries?,Which country was the Kingdom of the Netherlands formed by?
1014,False,ja,Which city in the Netherlands has the oldest stock exchange?,Which Dutch city has the oldest stock exchange?
1019,False,ja,Who had more power during times of war than the raadspensionaris?,Who held more power during the war than Lars Pensioner?
1027,False,ja,What is an example of a Sassanid language?,An example in Sasanian?
1028,False,ja,At what century's start did revivalist fall into disfavor?,At the beginning of what century were revivalists hated?
1052,False,ja,When did Middle Persian start being u sed?,When did Medieval Persian come into use?


In [93]:
def _count_rows_with_identical_entities(group: pd.DataFrame) -> pd.Series:
    """Count rows of ``group`` whose question and backtranslation entity sets are equal."""
    question_entity_sets = get_text_entities_for_series(group.question)
    backtranslation_entity_sets = get_text_entities_for_series(group.backtranslation)
    n_identical = sum(
        question_ents == backtranslation_ents
        for question_ents, backtranslation_ents
        in zip(question_entity_sets, backtranslation_entity_sets)
    )
    return pd.Series({"identical": n_identical})


# Same per-language count as the entity comparison earlier in this section,
# recomputed directly from `df`.
df.groupby("pivot_lang").apply(_count_rows_with_identical_entities)

Type of x: 0      The Dutch operated a slave trade from which lo...
1      What land is near the worms' first known locat...
2      How many days can the Daysimeter gather for an...
3                             Who commands the reserves?
4                     Who issued the Pragmatic Sanction?
                             ...                        
995    When did usage of Middle Persian script fall off?
996    Where does the information stored on the Wayba...
997    Which courts decisions are binding across the ...
998    Who had granted Franklin Roosevelt sweeping au...
999    What made it possible to design architecture t...
Name: question, Length: 1000, dtype: object
Idx 0:	Elem: The Dutch operated a slave trade from which locations?	Type: <class 'str'>
Type of x: 0      The Dutch operated a slave trade from which lo...
1      What land is near the first known location of ...
2      How many days can the Daysimeter collect for a...
3                             Who commands the r

Unnamed: 0_level_0,identical
pivot_lang,Unnamed: 1_level_1
es,785
ja,705


In [121]:
# Peek at the first rows to see every available column.
df.head()

Unnamed: 0,id,title,context,question,answers,translation,backtranslation,source_lang,pivot_lang
0,56dde2609a695914005b964b,Dutch_Republic,"Between 1590–1712 the Dutch also possessed one of the strongest and fastest navies in the world, allowing for their varied conquests including breaking the Portuguese sphere of influence on the Indian Ocean and in the Orient, as well as a lucrative slave trade from Africa and the Pacific.",The Dutch operated a slave trade from which locations?,"{'text': ['Africa and the Pacific'], 'answer_start': [266]}",¿Los holandeses operaron un comercio de esclavos desde qué lugares?,The Dutch operated a slave trade from which locations?,en-US,es
1,56de25ab4396321400ee260b,Symbiosis,"One of the most spectacular examples of obligate mutualism is between the siboglinid tube worms and symbiotic bacteria that live at hydrothermal vents and cold seeps. The worm has no digestive tract and is wholly reliant on its internal symbionts for nutrition. The bacteria oxidize either hydrogen sulfide or methane, which the host supplies to them. These worms were discovered in the late 1980s at the hydrothermal vents near the Galapagos Islands and have since been found at deep-sea hydrothermal vents and cold seeps in all of the world's oceans.",What land is near the worms' first known location?,"{'text': ['the Galapagos Islands'], 'answer_start': [429]}",¿Qué tierra está cerca de la primera ubicación conocida de los gusanos?,What land is near the first known location of the worms?,en-US,es
2,56df9a3738dc4217001520bf,Lighting,"The small, head-mounted device measures an individual's daily rest and activity patterns, as well as exposure to short-wavelength light that stimulates the circadian system. The device measures activity and light together at regular time intervals and electronically stores and logs its operating temperature. The Daysimeter can gather data for up to 30 days for analysis.",How many days can the Daysimeter gather for analysis?,"{'text': ['30'], 'answer_start': [351]}",¿Cuántos días puede recolectar el Daysimeter para el análisis?,How many days can the Daysimeter collect for analysis?,en-US,es
3,56df065d3277331400b4d8c7,Canadian_Armed_Forces,"Approximately 26,000 citizen soldiers, sailors, and airmen and women, trained to the level of and interchangeable with their Regular Force counterparts, and posted to CAF operations or duties on a casual or ongoing basis, make up the Primary Reserve. This group is represented, though not commanded, at NDHQ by the Chief of Reserves and Cadets, who is usually a major general or rear admiral, and is divided into four components that are each operationally and administratively responsible to its corresponding environmental command in the Regular Force – the Naval Reserve (NAVRES), Land Force Reserve (LFR), and Air Reserve (AIRRES) – in addition to one force that does not fall under an environmental command, the Health Services Reserve under the Canadian Forces Health Services Group.",Who commands the reserves?,"{'text': ['Chief of Reserves and Cadets'], 'answer_start': [315]}",¿Quién manda en las reservas?,Who commands the reserves?,en-US,es
4,56dddab666d3e219004dad30,Dutch_Republic,"Most of the Low Countries had come under the rule of the House of Burgundy and subsequently the House of Habsburg. In 1549 Holy Roman Emperor Charles V issued the Pragmatic Sanction, which further unified the Seventeen Provinces under his rule. Charles was succeeded by his son, King Philip II of Spain. In 1568 the Netherlands, led by William I of Orange, revolted against Philip II because of high taxes, persecution of Protestants by the government, and Philip's efforts to modernize and centralize the devolved-medieval government structures of the provinces. This was the start of the Eighty Years' War.",Who issued the Pragmatic Sanction?,"{'text': ['Holy Roman Emperor Charles V'], 'answer_start': [123]}",¿Quién dictó la Pragmática Sanción?,Who issued the Pragmatic Sanction?,en-US,es


In [126]:
# Narrow view: only the columns needed to compare questions with their backtranslations.
df_small = df[["id", "question", "backtranslation", "pivot_lang"]]

In [127]:
# Display the narrowed frame.
df_small

Unnamed: 0,id,question,backtranslation,pivot_lang
0,56dde2609a695914005b964b,The Dutch operated a slave trade from which locations?,The Dutch operated a slave trade from which locations?,es
1,56de25ab4396321400ee260b,What land is near the worms' first known location?,What land is near the first known location of the worms?,es
2,56df9a3738dc4217001520bf,How many days can the Daysimeter gather for analysis?,How many days can the Daysimeter collect for analysis?,es
3,56df065d3277331400b4d8c7,Who commands the reserves?,Who commands the reserves?,es
4,56dddab666d3e219004dad30,Who issued the Pragmatic Sanction?,Who issued the Pragmatic Sanction?,es
...,...,...,...,...
1995,56de0b40cffd8e1900b4b571,When did usage of Middle Persian script fall off?,When did the use of Middle Persian script decline?,ja
1996,56ddb46c9a695914005b958e,Where does the information stored on the Wayback Machine come from?,Where does the information stored on the Wayback Machine come from?,ja
1997,56de40c0cffd8e1900b4b710,Which courts decisions are binding across the entire United States?,Which court decisions are binding across the United States?,ja
1998,56de485ccffd8e1900b4b789,Who had granted Franklin Roosevelt sweeping authority during the great depression?,Who gave Franklin Roosevelt full power during the Great Depression?,ja


In [129]:
# Reshape to one row per question id, with the question and backtranslation
# texts spread into one column per pivot language.
df_pivot = df_small.pivot(index="id", columns=["pivot_lang"], values=["question", "backtranslation"])

In [133]:
# Show the first 60 ids with both pivot languages side by side.
df_pivot.head(60)

Unnamed: 0_level_0,question,question,backtranslation,backtranslation
pivot_lang,es,ja,es,ja
id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
56ddb46c9a695914005b958e,Where does the information stored on the Wayback Machine come from?,Where does the information stored on the Wayback Machine come from?,Where does the information stored in the Wayback Machine come from?,Where does the information stored on the Wayback Machine come from?
56ddb46c9a695914005b958f,Which company made the Wayback Machine?,Which company made the Wayback Machine?,What company made the Wayback Machine?,What company made the Wayback Machine?
56ddb46c9a695914005b9590,Where is Internet Archive headquartered?,Where is Internet Archive headquartered?,Where is the Internet Archive based?,Where is the Internet Archive headquarters?
56ddb46c9a695914005b9591,What individuals founded Internet Archive?,What individuals founded Internet Archive?,What people founded the Internet Archive?,Who founded the Internet Archive?
56ddb46c9a695914005b9592,What is the term used by Internet Archive to describe the Wayback Machine?,What is the term used by Internet Archive to describe the Wayback Machine?,What is the term used by the Internet Archive to describe the Wayback Machine?,What term does the Internet Archive use to describe the Wayback Machine?
56ddb53b66d3e219004daca3,What operating system is used on Wayback Machine's servers?,What operating system is used on Wayback Machine's servers?,What operating system is used on the Wayback Machine servers?,What operating system is used on the Wayback Machine's servers?
56ddb53b66d3e219004daca4,When does Wayback Machine save a copy of a website?,When does Wayback Machine save a copy of a website?,When does the Wayback Machine save a copy of a website?,When does the Wayback Machine store a copy of the website?
56ddb53b66d3e219004daca5,What is the ultimate aim of the Wayback Machine?,What is the ultimate aim of the Wayback Machine?,What is the ultimate goal of the Wayback Machine?,What is the ultimate purpose of the Wayback Machine?
56ddb92966d3e219004daca9,What TV show served as inspiration for the Wayback Machine's name?,What TV show served as inspiration for the Wayback Machine's name?,What television show inspired the name of the Wayback Machine?,What TV show inspired the name Wayback Machine?
56ddb92966d3e219004dacaa,Which characters on The Rocky and Bullwinkle Show used a device that allowed them to travel through time?,Which characters on The Rocky and Bullwinkle Show used a device that allowed them to travel through time?,Which characters on The Rocky and Bullwinkle Show used a device that allowed them to travel through time?,Which character on The Rocky and Bullwinkle Show used a device that allowed them to travel through time?


## How many sentences that began with "who", "what", "when", "where", "why", and "how" were reworded?

In [155]:
def question_starts_with_word(text: str, word: str) -> bool:
    """Return True if ``text`` begins with ``word``, ignoring case.

    Leading whitespace in ``text`` is skipped, and both ``text`` and ``word``
    are lower-cased before comparing, so ``word="Who"`` behaves the same as
    ``word="who"``.

    This is a plain prefix test, not a whole-word match: ``"how"`` also
    matches a text starting with ``"However"``.

    Args:
        text: The sentence to inspect.
        word: The prefix to look for.

    Returns:
        True when the lower-cased, left-stripped text starts with the
        lower-cased word; False otherwise (including for empty ``text``
        with a non-empty ``word``).
    """
    return text.lstrip().lower().startswith(word.lower())

In [156]:
def question_starts_with_certain_words(text: str, word_list: List[str]) -> bool:
    """Return True if ``text`` starts with at least one entry of ``word_list``.

    Each entry is checked with ``question_starts_with_word``; an empty
    ``word_list`` yields False.
    """
    per_word_hits = [
        question_starts_with_word(text=text, word=candidate)
        for candidate in word_list
    ]
    return any(per_word_hits)

In [157]:
# Lower-case interrogative openers; used as prefixes when classifying questions.
INTERROGATIVE_WORD_LIST = ["who", "what", "when", "where", "why", "how"]

In [158]:
# For each text column, add a boolean column to `df` telling whether the text
# starts with one of the interrogative words.
flag_column_by_text_column = {
    "question": "question_starts_with_interrogative_words",
    "backtranslation": "backtranslation_starts_with_interrogative_words",
}

for text_column, flag_column in flag_column_by_text_column.items():
    df[flag_column] = [
        question_starts_with_certain_words(
            text=text, word_list=INTERROGATIVE_WORD_LIST
        )
        for text in df[text_column]
    ]

In [159]:
# One boolean per row of `df`: True when the backtranslation is an exact
# string match of the original question.
is_identical_bool_list = [
    original == roundtrip
    for original, roundtrip in df[["question", "backtranslation"]].itertuples(
        index=False, name=None
    )
]

In [160]:
# Store the exact-match flags as a column of `df` (adds or overwrites the column in place).
df["question_backtranslation_identical_bool"] = is_identical_bool_list

In [161]:
# Columns kept for the interrogative-word analysis: the id, both texts, the
# pivot language and the three boolean flag columns.
interrogation_columns = [
    "id",
    "question",
    "backtranslation",
    "pivot_lang",
    "question_starts_with_interrogative_words",
    "backtranslation_starts_with_interrogative_words",
    "question_backtranslation_identical_bool",
]
df_interrogation = df[interrogation_columns]

In [162]:
# Peek at the first rows of the interrogative-word view.
df_interrogation.head()

Unnamed: 0,id,question,backtranslation,pivot_lang,question_starts_with_interrogative_words,backtranslation_starts_with_interrogative_words,question_backtranslation_identical_bool
0,56dde2609a695914005b964b,The Dutch operated a slave trade from which locations?,The Dutch operated a slave trade from which locations?,es,False,False,True
1,56de25ab4396321400ee260b,What land is near the worms' first known location?,What land is near the first known location of the worms?,es,True,True,False
2,56df9a3738dc4217001520bf,How many days can the Daysimeter gather for analysis?,How many days can the Daysimeter collect for analysis?,es,True,True,False
3,56df065d3277331400b4d8c7,Who commands the reserves?,Who commands the reserves?,es,True,True,True
4,56dddab666d3e219004dad30,Who issued the Pragmatic Sanction?,Who issued the Pragmatic Sanction?,es,True,True,True


In [164]:
# How many English seed questions start with an interrogative word?
# NOTE(review): `df` combines the Spanish- and Japanese-pivot records, so a
# question present under both languages is counted twice here — confirm that
# this is intended.
df_interrogation["question_starts_with_interrogative_words"].value_counts()

True     1526
False     474
Name: question_starts_with_interrogative_words, dtype: int64

#### How many phrases that initially start with an interrogative word are then reworded?

In [135]:
"foo".startswith("fa")

False