# Here we start by importing our curated Datasets about factual knowledge on famous people

In [1]:
import pandas as pd

# Assuming your CSV file is named 'UK People.csv' and is in the same directory as your notebook
data = pd.read_csv(R"C:\Users\PC\Downloads\master_thesis\Datasets\Events\Multi_Culture_Events_filled.csv")

# Display the first few rows of the dataset
data.head()


Unnamed: 0,item,itemLabel_EN,itemLabel_DE,itemLabel_FR,itemLabel_JA,itemLabel_TR,countryLabel_EN,countryLabel_DE,countryLabel_FR,countryLabel_JA,...,participantLabel_EN,participantLabel_DE,participantLabel_FR,participantLabel_JA,participantLabel_TR,effectLabel_EN,effectLabel_DE,effectLabel_FR,effectLabel_JA,effectLabel_TR
0,http://www.wikidata.org/entity/Q4872730,Battle of White Oak Swamp,Battle of White Oak Swamp,bataille de White Oak Swamp,ホワイトオーク湿地の戦い,Battle of White Oak Swamp,United States of America,Vereinigte Staaten,États-Unis,アメリカ合衆国,...,,,,,,,,,,
1,http://www.wikidata.org/entity/Q4872426,Battle of Springfield,Battle of Springfield,bataille de Springfield,スプリングフィールドの戦い (1780年),Battle of Springfield,United States of America,Vereinigte Staaten,États-Unis,アメリカ合衆国,...,United States of America,Vereinigte Staaten,États-Unis,アメリカ合衆国,Amerika Birleşik Devletleri,,,,,
2,http://www.wikidata.org/entity/Q88849674,Battle of la Truyère,Battle of la Truyère,bataille de la Truyère,Battle of la Truyère,Battle of la Truyère,France,Frankreich,France,フランス,...,,,,,,,,,,
3,http://www.wikidata.org/entity/Q4870244,Battle of Alazan Creek,Battle of Alazan Creek,Battle of Alazan Creek,Battle of Alazan Creek,Battle of Alazan Creek,United States of America,Vereinigte Staaten,États-Unis,アメリカ合衆国,...,,,,,,,,,,
4,http://www.wikidata.org/entity/Q11608769,Battle of Mihonoseki,Battle of Mihonoseki,Battle of Mihonoseki,美保関の合戦,Battle of Mihonoseki,Japan,Japan,Japon,日本,...,,,,,,,,,,


## Here is where we extract the data that we have on people, which are their birth year, Nationality, and their profession

In [2]:
# Display the first few rows of the dataset
data.head()

# Define a generic function to retrieve event information
def get_event_info(name=None, index=None, language='EN'):
    """
    Retrieve event information by name or index.
    
    :param name: Name of the event (case-insensitive).
    :param index: Index of the event in the DataFrame.
    :param language: Language code (EN, FR, DE, TR, JA).
    :return: Dictionary of event information.
    """
    if index is not None:
        # Accessing by index
        event_info = data.iloc[index]
    elif name is not None:
        # Accessing by name
        name_column = f'itemLabel_{language}'
        event_info = data[data[name_column].str.lower() == name.lower()].iloc[0]
    else:
        return "Error: Either name or index must be provided."

    country_column = f'countryLabel_{language}'
    participant_column = f'participantLabel_{language}'
    effect_column = f'effectLabel_{language}'
    name_column = f'itemLabel_{language}'

    return {
        'Name': event_info[name_column],
        'Country': event_info[country_column],
        'Participant': event_info.get(participant_column, 'N/A'),
        'Effect': event_info.get(effect_column, 'N/A')
    }

# Define language-specific functions for convenience
def get_event_info_en(name=None, index=None):
    return get_event_info(name, index, language='EN')

def get_event_info_fr(name=None, index=None):
    return get_event_info(name, index, language='FR')

def get_event_info_tr(name=None, index=None):
    return get_event_info(name, index, language='TR')

def get_event_info_ja(name=None, index=None):
    return get_event_info(name, index, language='JA')

def get_event_info_de(name=None, index=None):
    return get_event_info(name, index, language='DE')

# Example usage
print(get_event_info_en(name="Battle of Springfield"))
print(get_event_info_fr(index=1))

{'Name': 'Battle of Springfield', 'Country': 'United States of America', 'Participant': 'United States of America', 'Effect': nan}
{'Name': 'bataille de Springfield', 'Country': 'États-Unis', 'Participant': 'États-Unis', 'Effect': nan}


# In this part we fill in the missing names for some languages by their english names, as not all names are translated to all other languages. 

### This step effectively puts the number of columns with at least 1 missing value from 455 to 955, which is a substantial upgrade

In [3]:
# Update specific language columns with the English names where they are missing
#for lang_code in ['TR', 'JA','FR','DE']:  # Add other language codes if needed
#    name_column = f'itemName{lang_code}'
#    data[name_column] = data[name_column].fillna(data['itemNameEN'])
#

In [3]:
info_de = get_event_info_de(index=999)
print("German Info:", info_de)

info_en = get_event_info_en(index=999)
print("English Info:", info_en)

info_fr = get_event_info_fr(index=999)
print("French Info:", info_fr)

info_tr = get_event_info_tr(index=999)
print("Turkish Info:", info_tr)

info_ja = get_event_info_ja(index=999)
print("Japanese Info:", info_ja)


German Info: {'Name': 'Battle of Le Mans', 'Country': 'Frankreich', 'Participant': nan, 'Effect': nan}
English Info: {'Name': 'Battle of Le Mans', 'Country': 'France', 'Participant': nan, 'Effect': nan}
French Info: {'Name': 'bataille du Mans', 'Country': 'France', 'Participant': nan, 'Effect': nan}
Turkish Info: {'Name': 'Battle of Le Mans', 'Country': 'Fransa', 'Participant': nan, 'Effect': nan}
Japanese Info: {'Name': 'Battle of Le Mans', 'Country': 'フランス', 'Participant': nan, 'Effect': nan}


In [6]:
# Count entries with at least 10 empty columns
entries_with_at_least_10_missing = data.isna().sum(axis=1).ge(11).sum()

print(f"Number of entries with at least 10 missing columns: {entries_with_at_least_10_missing}")


Number of entries with at least 10 missing columns: 375


## In this section we create and form the Multiple Choice Questiosn that will be asked to LLMs.

#### We show the formatting in all languages

## The first cell is used to save all questions and answers in DFs to later be used
## The second cell is msotly used for visualization purposes to see if there are any missmatches

In [9]:
import pandas as pd
import random

def generate_choices(correct_answer, options_list, num_choices=3):
    """Generate random choices for a given correct answer."""
    if correct_answer not in options_list:
        options_list.append(correct_answer)
    choices = random.sample([option for option in options_list if option != correct_answer], num_choices)
    choices.append(correct_answer)
    random.shuffle(choices)  # Randomize order of choices
    return choices

def create_quiz_questions(data, name_column, country_column, year_column, question_format_country, question_format_year):
    """
    Create quiz questions for country and year for each entry in the dataset.
    
    :param data: DataFrame containing the dataset.
    :param name_column: Column name for the event name.
    :param country_column: Column name for the country.
    :param year_column: Column name for the year.
    :param question_format_country: Question format string for country.
    :param question_format_year: Question format string for year.
    :return: Lists of dictionaries containing the questions and options.
    """
    unique_countries = data[country_column].dropna().unique()
    unique_years = data[year_column].dropna().unique().tolist()

    country_questions = []
    year_questions = []

    for index, row in data.iterrows():
        name = row[name_column]
        correct_country = row[country_column]
        correct_year = row[year_column]

        if pd.isna(name) or pd.isna(correct_country) or pd.isna(correct_year):
            continue

        country_choices = generate_choices(correct_country, unique_countries)
        year_choices = generate_choices(correct_year, unique_years, num_choices=3)

        country_question = question_format_country.format(name=name)
        year_question = question_format_year.format(name=name)

        # Get correct letter for the correct answer
        correct_letter_country = chr(65 + country_choices.index(correct_country))
        correct_letter_year = chr(65 + year_choices.index(correct_year))

        country_questions.append({'question': country_question, 'options': {f'{chr(65+i)}': choice for i, choice in enumerate(country_choices)}, 'correct_answer': correct_country, 'correct_letter': correct_letter_country})
        year_questions.append({'question': year_question, 'options': {f'{chr(65+i)}': choice for i, choice in enumerate(year_choices)}, 'correct_answer': correct_year, 'correct_letter': correct_letter_year})

    return country_questions, year_questions

# Define column names and question formats for each language
languages = {
    'EN': ('itemLabel_EN', 'countryLabel_EN', 'year', "In which country did the event '{name}' take place?", "In which year did the event '{name}' occur?"),
    'FR': ('itemLabel_FR', 'countryLabel_FR', 'year', "Dans quel pays a eu lieu l'événement '{name}' ?", "En quelle année l'événement '{name}' a-t-il eu lieu ?"),
    'DE': ('itemLabel_DE', 'countryLabel_DE', 'year', "In welchem Land fand das Ereignis '{name}' statt?", "In welchem Jahr fand das Ereignis '{name}' statt?"),
    'TR': ('itemLabel_TR', 'countryLabel_TR', 'year', "'{name}' olayı hangi ülkede gerçekleşti?", "'{name}' olayı hangi yıl gerçekleşti?"),
    'JA': ('itemLabel_JA', 'countryLabel_JA', 'year', "イベント'{name}'はどの国で行われましたか？", "イベント'{name}'は何年に行われましたか？")
}

# Define a function to nicely print the question and options
def print_question_details(question_details):
    print(f"Question: {question_details['question']}")
    for option_key, option_value in question_details['options'].items():
        print(f"  {option_key}: {option_value}")
    print(f"Correct Answer: {question_details['correct_answer']} (Option {question_details['correct_letter']})")
    print()

# Generate questions for each language and print them in a structured way
for code, (name_col, country_col, year_col, country_q, year_q) in languages.items():
    country_questions, year_questions = create_quiz_questions(data, name_col, country_col, year_col, country_q, year_q)
    
    print(f"===== {code} - Country Questions =====")
    if country_questions:
        print_question_details(country_questions[0])
    
    print(f"===== {code} - Year Questions =====")
    if year_questions:
        print_question_details(year_questions[0])


===== EN - Country Questions =====
Question: In which country did the event 'Battle of White Oak Swamp' take place?
  A: Japan
  B: Germany
  C: United States of America
  D: France
Correct Answer: United States of America (Option C)

===== EN - Year Questions =====
Question: In which year did the event 'Battle of White Oak Swamp' occur?
  A: 1806.0
  B: 1862.0
  C: 1942.0
  D: 1975.0
Correct Answer: 1862.0 (Option B)

===== FR - Country Questions =====
Question: Dans quel pays a eu lieu l'événement 'bataille de White Oak Swamp' ?
  A: Allemagne
  B: États-Unis
  C: Turquie
  D: France
Correct Answer: États-Unis (Option B)

===== FR - Year Questions =====
Question: En quelle année l'événement 'bataille de White Oak Swamp' a-t-il eu lieu ?
  A: 1637.0
  B: 1043.0
  C: 942.0
  D: 1862.0
Correct Answer: 1862.0 (Option D)

===== DE - Country Questions =====
Question: In welchem Land fand das Ereignis 'Battle of White Oak Swamp' statt?
  A: Vereinigte Staaten
  B: Deutschland
  C: Vereinigt

In [10]:
import pandas as pd
import random
def generate_choices(correct_answer, options_list, num_choices=3):
    """Generate random choices for a given correct answer."""
    if correct_answer not in options_list:
        options_list.append(correct_answer)
    choices = random.sample([option for option in options_list if option != correct_answer], num_choices)
    choices.append(correct_answer)
    random.shuffle(choices)  # Randomize order of choices
    return choices

def create_quiz_questions(data, name_column, country_column, year_column, question_format_country, question_format_year):
    """
    Create quiz questions for country and year for each entry in the dataset.
    
    :param data: DataFrame containing the dataset.
    :param name_column: Column name for the event name.
    :param country_column: Column name for the country.
    :param year_column: Column name for the year.
    :param question_format_country: Question format string for country.
    :param question_format_year: Question format string for year.
    :return: Lists of dictionaries containing the questions and options.
    """
    unique_countries = data[country_column].dropna().unique()
    unique_years = data[year_column].dropna().unique().tolist()

    country_questions = []
    year_questions = []

    for index, row in data.iterrows():
        name = row[name_column]
        correct_country = row[country_column]
        correct_year = row[year_column]

        if pd.isna(name) or pd.isna(correct_country) or pd.isna(correct_year):
            continue

        country_choices = generate_choices(correct_country, unique_countries)
        year_choices = generate_choices(correct_year, unique_years, num_choices=3)

        country_question = question_format_country.format(name=name)
        year_question = question_format_year.format(name=name)

        # Get correct letter for the correct answer
        correct_letter_country = chr(65 + country_choices.index(correct_country))
        correct_letter_year = chr(65 + year_choices.index(correct_year))

        country_questions.append({'question': country_question, 'options': {f'{chr(65+i)}': choice for i, choice in enumerate(country_choices)}, 'correct_answer': correct_country, 'correct_letter': correct_letter_country})
        year_questions.append({'question': year_question, 'options': {f'{chr(65+i)}': choice for i, choice in enumerate(year_choices)}, 'correct_answer': correct_year, 'correct_letter': correct_letter_year})

    return country_questions, year_questions

# Define column names and question formats for each language
languages = {
    'EN': ('itemLabel_EN', 'countryLabel_EN', 'year', "In which country did the event '{name}' take place?", "In which year did the event '{name}' occur?"),
    'FR': ('itemLabel_FR', 'countryLabel_FR', 'year', "Dans quel pays a eu lieu l'événement '{name}' ?", "En quelle année l'événement '{name}' a-t-il eu lieu ?"),
    'DE': ('itemLabel_DE', 'countryLabel_DE', 'year', "In welchem Land fand das Ereignis '{name}' statt?", "In welchem Jahr fand das Ereignis '{name}' statt?"),
    'TR': ('itemLabel_TR', 'countryLabel_TR', 'year', "'{name}' olayı hangi ülkede gerçekleşti?", "'{name}' olayı hangi yıl gerçekleşti?"),
    'JA': ('itemLabel_JA', 'countryLabel_JA', 'year', "イベント'{name}'はどの国で行われましたか？", "イベント'{name}'は何年に行われましたか？")
}

# Create DataFrames to store questions for each language
country_dfs = {}
year_dfs = {}

# Generate questions for each language
for code, (name_col, country_col, year_col, country_q, year_q) in languages.items():
    country_questions, year_questions = create_quiz_questions(data, name_col, country_col, year_col, country_q, year_q)
    
    country_df = pd.DataFrame(country_questions)
    country_df['language'] = code  # Add language column
    year_df = pd.DataFrame(year_questions)
    year_df['language'] = code  # Add language column
    
    country_dfs[code] = country_df
    year_dfs[code] = year_df

# Accessing questions for a specific language later on
print("Accessing questions for English:")
print(country_dfs['EN'])
print(year_dfs['EN'])


Accessing questions for English:
                                               question  \
0     In which country did the event 'Battle of Whit...   
1     In which country did the event 'Battle of Spri...   
2     In which country did the event 'Battle of la T...   
3     In which country did the event 'Battle of Miho...   
4     In which country did the event 'Battle of Hang...   
...                                                 ...   
1967  In which country did the event 'Battle of Ropp...   
1968  In which country did the event 'Battle at the ...   
1969  In which country did the event 'Battle of Gilg...   
1970  In which country did the event 'coronation of ...   
1971  In which country did the event 'Battle of Sout...   

                                                options  \
0     {'A': 'United States of America', 'B': 'Turkey...   
1     {'A': 'United States of America', 'B': 'Japan'...   
2     {'A': 'Turkey', 'B': 'United Kingdom', 'C': 'F...   
3     {'A': 'United Ki

## Now we move on to the second format of questions. 

### This format will be in the form of True/False statements where 50% of questions framed will be True.

In [11]:
import pandas as pd
import random

def generate_language_specific_questions(name, country, year, unique_countries, unique_years, language):
    result = []

    # Generate country question
    country_true = random.choice([True, False])
    if country_true:
        question_country = {
            'EN': f"The event '{name}' took place in {country}.",
            'FR': f"L'événement '{name}' a eu lieu en {country}.",
            'DE': f"Das Ereignis '{name}' fand in {country} statt.",
            'TR': f"'{name}' olayı {country} ülkesinde gerçekleşti.",
            'JA': f"イベント'{name}'は{country}で行われました。"
        }[language]
        correct_answer_country = True
        real_answer_country = country
    else:
        fake_country = random.choice([c for c in unique_countries if c != country])
        question_country = {
            'EN': f"The event '{name}' took place in {fake_country}.",
            'FR': f"L'événement '{name}' a eu lieu en {fake_country}.",
            'DE': f"Das Ereignis '{name}' fand in {fake_country} statt.",
            'TR': f"'{name}' olayı {fake_country} ülkesinde gerçekleşti.",
            'JA': f"イベント'{name}'は{fake_country}で行われました。"
        }[language]
        correct_answer_country = False
        real_answer_country = country

    # Generate year question
    year_true = random.choice([True, False])
    if year_true:
        question_year = {
            'EN': f"The event '{name}' occurred in {year}.",
            'FR': f"L'événement '{name}' a eu lieu en {year}.",
            'DE': f"Das Ereignis '{name}' fand im Jahr {year} statt.",
            'TR': f"'{name}' olayı {year} yılında gerçekleşti.",
            'JA': f"イベント'{name}'は{year}年に行われました。"
        }[language]
        correct_answer_year = True
        real_answer_year = year
    else:
        fake_year = random.choice([y for y in unique_years if y != year])
        question_year = {
            'EN': f"The event '{name}' occurred in {fake_year}.",
            'FR': f"L'événement '{name}' a eu lieu en {fake_year}.",
            'DE': f"Das Ereignis '{name}' fand im Jahr {fake_year} statt.",
            'TR': f"'{name}' olayı {fake_year} yılında gerçekleşti.",
            'JA': f"イベント'{name}'は{fake_year}年に行われました。"
        }[language]
        correct_answer_year = False
        real_answer_year = year

    result.extend([
        {'question': question_country, 'correct_answer': correct_answer_country, 'real_answer': real_answer_country},
        {'question': question_year, 'correct_answer': correct_answer_year, 'real_answer': real_answer_year}
    ])

    return result

# Define the function to generate true/false questions
def generate_true_false_questions(data, settings):
    questions = {lang: [] for lang in settings}
    for index, row in data.iterrows():
        for lang, lang_settings in settings.items():
            name = row[lang_settings['name_column']]
            actual_country = row[lang_settings['country_column']]
            actual_year = row[lang_settings['year_column']]
            
            if pd.isna(name) or pd.isna(actual_country) or pd.isna(actual_year):
                continue  # Skip incomplete data entries

            questions[lang].extend(generate_language_specific_questions(
                name, actual_country, actual_year,
                lang_settings['unique_countries'],
                lang_settings['unique_years'],
                lang
            ))

    return questions

# Define the language settings for the historical events dataset
language_settings = {
    'EN': {
        'name_column': 'itemLabel_EN',
        'country_column': 'countryLabel_EN',
        'year_column': 'year',
        'unique_countries': data['countryLabel_EN'].dropna().unique(),
        'unique_years': data['year'].dropna().unique()
    },
    'FR': {
        'name_column': 'itemLabel_FR',
        'country_column': 'countryLabel_FR',
        'year_column': 'year',
        'unique_countries': data['countryLabel_FR'].dropna().unique(),
        'unique_years': data['year'].dropna().unique()
    },
    'DE': {
        'name_column': 'itemLabel_DE',
        'country_column': 'countryLabel_DE',
        'year_column': 'year',
        'unique_countries': data['countryLabel_DE'].dropna().unique(),
        'unique_years': data['year'].dropna().unique()
    },
    'TR': {
        'name_column': 'itemLabel_TR',
        'country_column': 'countryLabel_TR',
        'year_column': 'year',
        'unique_countries': data['countryLabel_TR'].dropna().unique(),
        'unique_years': data['year'].dropna().unique()
    },
    'JA': {
        'name_column': 'itemLabel_JA',
        'country_column': 'countryLabel_JA',
        'year_column': 'year',
        'unique_countries': data['countryLabel_JA'].dropna().unique(),
        'unique_years': data['year'].dropna().unique()
    }
}

# Now call the function using the defined settings
true_false_questions = generate_true_false_questions(data, language_settings)

# Define a function to nicely print the true/false question details
def print_question_details(question):
    print(f"Question: {question['question']}")
    print(f"True/False: {'True' if question['correct_answer'] else 'False'}")
    print(f"Actual Answer: {question['real_answer']}")
    print()

# Define the function to generate true/false questions and print them
def generate_and_print_true_false_questions(data, settings):
    questions = {lang: [] for lang in settings}
    for index, row in data.iterrows():
        for lang, lang_settings in settings.items():
            name = row[lang_settings['name_column']]
            actual_country = row[lang_settings['country_column']]
            actual_year = row[lang_settings['year_column']]
            
            if pd.isna(name) or pd.isna(actual_country) or pd.isna(actual_year):
                continue  # Skip incomplete data entries

            questions[lang].extend(generate_language_specific_questions(
                name, actual_country, actual_year,
                lang_settings['unique_countries'],
                lang_settings['unique_years'],
                lang
            ))
    
    # Now print the formatted questions for each language
    for language, qs in questions.items():
        print(f"===== {language} - Sample Questions: =====")
        for question in qs[:3]:  # Adjust the range for the number of questions you want to print
            print_question_details(question)

# Call the function using the defined settings
generate_and_print_true_false_questions(data, language_settings)


===== EN - Sample Questions: =====
Question: The event 'Battle of White Oak Swamp' took place in Japan.
True/False: False
Actual Answer: United States of America

Question: The event 'Battle of White Oak Swamp' occurred in 1516.0.
True/False: False
Actual Answer: 1862.0

Question: The event 'Battle of Springfield' took place in Germany.
True/False: False
Actual Answer: United States of America

===== FR - Sample Questions: =====
Question: L'événement 'bataille de White Oak Swamp' a eu lieu en États-Unis.
True/False: True
Actual Answer: États-Unis

Question: L'événement 'bataille de White Oak Swamp' a eu lieu en 2014.0.
True/False: False
Actual Answer: 1862.0

Question: L'événement 'bataille de Springfield' a eu lieu en Royaume-Uni.
True/False: False
Actual Answer: États-Unis

===== DE - Sample Questions: =====
Question: Das Ereignis 'Battle of White Oak Swamp' fand in Japan statt.
True/False: False
Actual Answer: Vereinigte Staaten

Question: Das Ereignis 'Battle of White Oak Swamp' fa

In [12]:
import csv
# Define the function to generate and collect true/false questions
def generate_true_false_questions_to_csv(data, settings, output_file):
    questions = {lang: [] for lang in settings}
    
    # Collect all questions
    for index, row in data.iterrows():
        for lang, lang_settings in settings.items():
            name = row[lang_settings['name_column']]
            actual_country = row[lang_settings['country_column']]
            actual_year = row[lang_settings['year_column']]
            
            if pd.isna(name) or pd.isna(actual_country) or pd.isna(actual_year):
                continue  # Skip incomplete data entries

            questions[lang].extend(generate_language_specific_questions(
                name, actual_country, actual_year,
                lang_settings['unique_countries'],
                lang_settings['unique_years'],
                lang
            ))
    
    # Write the questions to a CSV file
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['question', 'True/False', 'ActualAnswer', 'Language'])
        
        for language, qs in questions.items():
            for question in qs:
                writer.writerow([question['question'], 'True' if question['correct_answer'] else 'False', question['real_answer'], language])

# Define the output file name
output_file = 'true_false_questions_events.csv'

# Call the function to generate the questions and export them to CSV
generate_true_false_questions_to_csv(data, language_settings, output_file)

print(f"Questions exported to {output_file}")

Questions exported to true_false_questions_events.csv


## Checking the calculations to see if all questions that have no NA columns are answered we do simple math.

#### (we take Total True annotations + Total False annotations) / (3*5) 
####  3 because we check 3 statements ( Nationality, Profession, and birth year)
#### 5 because there are 5 languages 
## All iterations and cases yield 955, which is equal to the total amount of statements that have no NA columns

In [13]:
def count_true_false_annotations(questions):
    true_count = 0
    false_count = 0

    # Iterate through each language's list of questions
    for lang_questions in questions.values():
        for question in lang_questions:
            if question['correct_answer']:
                true_count += 1
            else:
                false_count += 1

    return true_count, false_count

# Example usage:
true_count, false_count = count_true_false_annotations(true_false_questions)
total_statements=(true_count+false_count)/15
percentage_of_false=false_count/(false_count+true_count)
percentage_of_true=true_count/(false_count+true_count)
print(f"Total True annotations: {true_count}")
print(f"Total False annotations: {false_count}")
print(f"Total statements: {total_statements}")
print(f"Percentage of False statemetns: {(percentage_of_false)*100}%")
print(f"Percentage of True statemetns: {(percentage_of_true)*100}%")

Total True annotations: 9812
Total False annotations: 10006
Total statements: 1321.2
Percentage of False statemetns: 50.48945403168836%
Percentage of True statemetns: 49.51054596831164%


## Now we will form open-ended statements 

In [15]:
import pandas as pd

def generate_open_ended_questions(name, country, year, language):
    # Open-ended questions in different languages
    question_templates = {
        'EN': {
            'country': f"In which country did the event '{name}' take place?",
            'year': f"In which year did the event '{name}' occur?"
        },
        'FR': {
            'country': f"Dans quel pays a eu lieu l'événement '{name}' ?",
            'year': f"En quelle année a eu lieu l'événement '{name}' ?"
        },
        'DE': {
            'country': f"In welchem Land fand das Ereignis '{name}' statt?",
            'year': f"In welchem Jahr fand das Ereignis '{name}' statt?"
        },
        'TR': {
            'country': f"'{name}' olayı hangi ülkede gerçekleşti?",
            'year': f"'{name}' olayı hangi yıl gerçekleşti?"
        },
        'JA': {
            'country': f"イベント'{name}'はどの国で行われましたか？",
            'year': f"イベント'{name}'は何年に行われましたか？"
        }
    }

    # Generate open-ended questions
    question_country = question_templates[language]['country']
    question_year = question_templates[language]['year']

    # Output structure includes only the questions and correct answers
    result = [
        {'question': question_country, 'correct_answer': country},
        {'question': question_year, 'correct_answer': year}
    ]

    return result

# Define the function to generate all open-ended questions for the dataset
def generate_all_open_ended_questions(data, settings):
    all_questions = {lang: [] for lang in settings}
    for index, row in data.iterrows():
        for lang, lang_settings in settings.items():
            name = row[lang_settings['name_column']]
            country = row[lang_settings['country_column']]
            year = row[lang_settings['year_column']]

            if pd.isna(name) or pd.isna(country) or pd.isna(year):
                continue  # Skip incomplete data entries

            all_questions[lang].extend(generate_open_ended_questions(
                name, country, year, lang
            ))

    return all_questions

# Define a function to nicely print the open-ended question details
def print_open_ended_question_details(question):
    print(f"Question: {question['question']}")
    print(f"Correct Answer: {question['correct_answer']}")
    print()

# Define the function to generate, print, and collect all open-ended questions for the dataset
def generate_and_print_all_open_ended_questions(data, settings):
    all_questions = {lang: [] for lang in settings}
    questions_list = []
    for index, row in data.iterrows():
        for lang, lang_settings in settings.items():
            name = row[lang_settings['name_column']]
            country = row[lang_settings['country_column']]
            year = row[lang_settings['year_column']]

            if pd.isna(name) or pd.isna(country) or pd.isna(year):
                continue  # Skip incomplete data entries

            questions = generate_open_ended_questions(name, country, year, lang)
            all_questions[lang].extend(questions)
            for question in questions:
                questions_list.append({
                    'language': lang,
                    'question': question['question'],
                    'correct_answer': question['correct_answer']
                })

    # Now print the formatted questions for each language
    for language, qs in all_questions.items():
        print(f"===== {language} - Sample Open-Ended Questions: =====")
        for question in qs[:3]:  # Adjust the range for the number of questions you want to print
            print_open_ended_question_details(question)
        print()
    
    # Create a DataFrame from the collected questions
    all_questions_oe = pd.DataFrame(questions_list)
    return all_questions_oe

# Define the language settings for the historical events dataset
language_settings = {
    'EN': {
        'name_column': 'itemLabel_EN',
        'country_column': 'countryLabel_EN',
        'year_column': 'year'
    },
    'FR': {
        'name_column': 'itemLabel_FR',
        'country_column': 'countryLabel_FR',
        'year_column': 'year'
    },
    'DE': {
        'name_column': 'itemLabel_DE',
        'country_column': 'countryLabel_DE',
        'year_column': 'year'
    },
    'TR': {
        'name_column': 'itemLabel_TR',
        'country_column': 'countryLabel_TR',
        'year_column': 'year'
    },
    'JA': {
        'name_column': 'itemLabel_JA',
        'country_column': 'countryLabel_JA',
        'year_column': 'year'
    }
}

# Call the function using the defined settings
all_questions_oe = generate_and_print_all_open_ended_questions(data, language_settings)

# Display the resulting DataFrame
print(all_questions_oe)

===== EN - Sample Open-Ended Questions: =====
Question: In which country did the event 'Battle of White Oak Swamp' take place?
Correct Answer: United States of America

Question: In which year did the event 'Battle of White Oak Swamp' occur?
Correct Answer: 1862.0

Question: In which country did the event 'Battle of Springfield' take place?
Correct Answer: United States of America


===== FR - Sample Open-Ended Questions: =====
Question: Dans quel pays a eu lieu l'événement 'bataille de White Oak Swamp' ?
Correct Answer: États-Unis

Question: En quelle année a eu lieu l'événement 'bataille de White Oak Swamp' ?
Correct Answer: 1862.0

Question: Dans quel pays a eu lieu l'événement 'bataille de Springfield' ?
Correct Answer: États-Unis


===== DE - Sample Open-Ended Questions: =====
Question: In welchem Land fand das Ereignis 'Battle of White Oak Swamp' statt?
Correct Answer: Vereinigte Staaten

Question: In welchem Jahr fand das Ereignis 'Battle of White Oak Swamp' statt?
Correct Answe

In [16]:
excel_filename = "events_openended.csv"
all_questions_oe.to_csv(excel_filename, index=False)

# We have effectively formed all types of questions/structures to prompt the API of models.

## We have 3 formats of questions:

- Multiple Choice Questions
- True/False Questions
- Open Ended Questions

#### At this current stage we will only prompt Open AI's models because of time constraints. 

In [34]:
all_questions_oe

Unnamed: 0,language,question,correct_answer
0,EN,In which country did the event 'Battle of Whit...,United States of America
1,EN,In which year did the event 'Battle of White O...,1862.0
2,FR,Dans quel pays a eu lieu l'événement 'bataille...,États-Unis
3,FR,En quelle année a eu lieu l'événement 'bataill...,1862.0
4,DE,In welchem Land fand das Ereignis 'Battle of W...,Vereinigte Staaten
...,...,...,...
19813,DE,In welchem Jahr fand das Ereignis 'Battle of S...,1862.0
19814,TR,'Battle of South Mills' olayı hangi ülkede ger...,Amerika Birleşik Devletleri
19815,TR,'Battle of South Mills' olayı hangi yıl gerçek...,1862.0
19816,JA,イベント'Battle of South Mills'はどの国で行われましたか？,アメリカ合衆国


In [18]:
import pandas as pd

# Assuming the MCQ questions DataFrames are already created and named accordingly
country_question_en = country_dfs['EN'][['question', 'options', 'correct_letter', 'language']].copy()
country_question_fr = country_dfs['FR'][['question', 'options', 'correct_letter', 'language']].copy()
country_question_de = country_dfs['DE'][['question', 'options', 'correct_letter', 'language']].copy()
country_question_tr = country_dfs['TR'][['question', 'options', 'correct_letter', 'language']].copy()
country_question_ja = country_dfs['JA'][['question', 'options', 'correct_letter', 'language']].copy()

year_question_en = year_dfs['EN'][['question', 'options', 'correct_letter', 'language']].copy()
year_question_fr = year_dfs['FR'][['question', 'options', 'correct_letter', 'language']].copy()
year_question_de = year_dfs['DE'][['question', 'options', 'correct_letter', 'language']].copy()
year_question_tr = year_dfs['TR'][['question', 'options', 'correct_letter', 'language']].copy()
year_question_ja = year_dfs['JA'][['question', 'options', 'correct_letter', 'language']].copy()

# Concatenating all questions for each language
all_questions_en = pd.concat([country_question_en, year_question_en])
all_questions_fr = pd.concat([country_question_fr, year_question_fr])
all_questions_de = pd.concat([country_question_de, year_question_de])
all_questions_tr = pd.concat([country_question_tr, year_question_tr])
all_questions_ja = pd.concat([country_question_ja, year_question_ja])

# Optional: Reset index if needed
all_questions_en.reset_index(drop=True, inplace=True)
all_questions_fr.reset_index(drop=True, inplace=True)
all_questions_de.reset_index(drop=True, inplace=True)
all_questions_tr.reset_index(drop=True, inplace=True)
all_questions_ja.reset_index(drop=True, inplace=True)

# Printing the first few rows of each concatenated DataFrame for verification
print("All questions in English:")
print(all_questions_en.head())

print("All questions in French:")
print(all_questions_fr.head())

print("All questions in German:")
print(all_questions_de.head())

print("All questions in Turkish:")
print(all_questions_tr.head())

print("All questions in Japanese:")
print(all_questions_ja.head())


All questions in English:
                                            question  \
0  In which country did the event 'Battle of Whit...   
1  In which country did the event 'Battle of Spri...   
2  In which country did the event 'Battle of la T...   
3  In which country did the event 'Battle of Miho...   
4  In which country did the event 'Battle of Hang...   

                                             options correct_letter language  
0  {'A': 'United States of America', 'B': 'Turkey...              A       EN  
1  {'A': 'United States of America', 'B': 'Japan'...              A       EN  
2  {'A': 'Turkey', 'B': 'United Kingdom', 'C': 'F...              C       EN  
3  {'A': 'United Kingdom', 'B': 'France', 'C': 'J...              C       EN  
4  {'A': 'United Kingdom', 'B': 'Germany', 'C': '...              D       EN  
All questions in French:
                                            question  \
0  Dans quel pays a eu lieu l'événement 'bataille...   
1  Dans quel pays a eu lie

In [26]:
all_questions_fr


Unnamed: 0,question,options,correct_letter,language
0,Dans quel pays a eu lieu l'événement 'bataille...,"{'A': 'États-Unis', 'B': 'Japon', 'C': 'Royaum...",A,FR
1,Dans quel pays a eu lieu l'événement 'bataille...,"{'A': 'États-Unis', 'B': 'France', 'C': 'Royau...",A,FR
2,Dans quel pays a eu lieu l'événement 'bataille...,"{'A': 'États-Unis', 'B': 'Allemagne', 'C': 'Fr...",C,FR
3,Dans quel pays a eu lieu l'événement 'Battle o...,"{'A': 'Allemagne', 'B': 'Japon', 'C': 'Royaume...",B,FR
4,Dans quel pays a eu lieu l'événement 'bataille...,"{'A': 'Royaume-Uni', 'B': 'États-Unis', 'C': '...",B,FR
...,...,...,...,...
3989,En quelle année l'événement 'Battle of Roppaga...,"{'A': 1579.0, 'B': 1091.0, 'C': 1576.0, 'D': 8...",A,FR
3990,En quelle année l'événement 'site archéologiqu...,"{'A': 1871.0, 'B': 1541.0, 'C': 235.0, 'D': 15...",C,FR
3991,En quelle année l'événement 'Battle of Gilgal ...,"{'A': 1993.0, 'B': 1814.0, 'C': 235.0, 'D': 18...",D,FR
3992,En quelle année l'événement 'coronation of Que...,"{'A': 1430.0, 'B': 1507.0, 'C': 1559.0, 'D': 1...",C,FR


In [27]:
all_questions_mc = pd.concat([all_questions_en,all_questions_fr,all_questions_de,all_questions_ja,all_questions_tr])

In [31]:
all_questions_mc

Unnamed: 0,question,options,correct_letter,language
0,In which country did the event 'Battle of Whit...,"{'A': 'United States of America', 'B': 'Turkey...",A,EN
1,In which country did the event 'Battle of Spri...,"{'A': 'United States of America', 'B': 'Japan'...",A,EN
2,In which country did the event 'Battle of la T...,"{'A': 'Turkey', 'B': 'United Kingdom', 'C': 'F...",C,EN
3,In which country did the event 'Battle of Miho...,"{'A': 'United Kingdom', 'B': 'France', 'C': 'J...",C,EN
4,In which country did the event 'Battle of Hang...,"{'A': 'United Kingdom', 'B': 'Germany', 'C': '...",D,EN
...,...,...,...,...
3939,'Battle of Roppagawa' olayı hangi yıl gerçekle...,"{'A': 471.0, 'B': 1579.0, 'C': 1732.0, 'D': 45...",B,TR
3940,'Battle at the Harzhorn' olayı hangi yıl gerçe...,"{'A': 47.0, 'B': 736.0, 'C': 235.0, 'D': 1936.0}",C,TR
3941,'Battle of Gilgal Church' olayı hangi yıl gerç...,"{'A': 1517.0, 'B': 1590.0, 'C': 841.0, 'D': 18...",D,TR
3942,'coronation of Queen Elizabeth I' olayı hangi ...,"{'A': 2022.0, 'B': 1043.0, 'C': 1559.0, 'D': 1...",C,TR


In [29]:
excel_filename1 = "all_questions_events_mc.csv"
all_questions_mc.to_csv(excel_filename1, index=False)

In [165]:
birth_year_dfs['EN']['options'].iloc[6]

{'A': 1981.0, 'B': 1932.0, 'C': 1939.0, 'D': 1944.0}

In [166]:
birth_year_dfs['EN']['question'].iloc[6]

'What year was Jeff Beck born?'

In [167]:
birth_year_dfs['EN'].iloc[6]

question                              What year was Jeff Beck born?
options           {'A': 1981.0, 'B': 1932.0, 'C': 1939.0, 'D': 1...
correct_answer                                               1944.0
correct_letter                                                    D
Name: 6, dtype: object