In [3]:
pip install datasets



In [4]:
pip install --upgrade openai



In [5]:
# Imports
import json
import re
import os
import numpy as np
import pandas as pd
from openai import OpenAI
from datasets import load_dataset

from google.colab import userdata
client = OpenAI(api_key=userdata.get('OPEN_API_KEY_IR'))

# Dataset

In [20]:
#load the data for all languages
data = load_dataset("Eloquent/HalluciGen-Translation", "test_generation")

print(data.keys())

data_de_en = data['test_generation_de_en'].to_pandas()
data_en_de = data['test_generation_en_de'].to_pandas()
data_fr_en = data['test_generation_fr_en'].to_pandas()
data_en_fr = data['test_generation_en_fr'].to_pandas()

display(data_de_en.head())
display(data_en_de.head())

dict_keys(['test_generation_de_en', 'test_generation_en_de', 'test_generation_fr_en', 'test_generation_en_fr'])


Unnamed: 0,id,langpair,source
0,0,de-en,In Ländern ist das Anbieten von Luxus-Gästehäu...
1,1,de-en,"Dies ist in Großbritannien gängige Praxis, abe..."
2,2,de-en,„Auf Grundlage dieses Fossils lässt sich sagen...
3,3,de-en,"Wir werden niemals ein Auto überholen, einen B..."
4,4,de-en,"Ein Minenabwehrschiff der Avenger-Klasse, das ..."


Unnamed: 0,id,langpair,source
0,0,en-de,He graduated from the College of Arts & Scienc...
1,1,en-de,"Most are independent nations, or associated wi..."
2,2,en-de,But there are a lot of things about birds that...
3,3,en-de,The police had trouble using their speed radar...
4,4,en-de,"""The Giza Plateau, or """"Giza Necropolis"""" in t..."


## Generate Data

In [7]:
def generate_hypothesis(source, target_language):

  answer_format = {"hyp+": "", "hyp-": ""}

  system_msg = "You are a text generator for translation"

  # prompt for translation generation task
  user_msg = f'''Your task is to generate two translation hypothesis given the 'src' below.
      The first translation labelled as 'hyp+' should be supported by 'src' and the second translation labelled as 'hyp-' should not be supported by 'src'.
      Provide the result in the following format: {answer_format}. Target language: {target_language}

       Src: {source}

       Result:
       '''

  response = client.chat.completions.create(model="gpt-3.5-turbo",
                                        messages=[{"role": "system", "content": system_msg},
                                         {"role": "user", "content": user_msg}])

  #print("Gen text: ", outputs[0]["generated_text"][len(prompt):])
  generated_translation = response.choices[0].message.content
  #print(generated_translation)

  return generated_translation

## Save Data as csv file

In [8]:
def extract_json_data(result_text):
    # Find the start and end indices of the JSON data
    start_index = result_text.find('{')
    end_index = result_text.find('}') + 1  # Add 1 to include the closing brace

    # Extract the JSON data from the string
    result_text = result_text[start_index:end_index]
    #print("result text: ", result_text)

    # Replace single quotes used as delimiters with double quotes
    result_text = json.loads(result_text.replace("'", "\""))
    return result_text

In [22]:
#save results
def save_results(target_language, df, csv_filename):
    skipped_rows = []
    # Split DataFrame into chunks of 10 rows
    chunks = np.array_split(df, len(df) // 10 if len(df) % 10 == 0 else len(df) // 10 + 1)

    for chunk in chunks:
        # Iterate through each row in the chunk
        for index, row in chunk.iterrows():
            source = row['source']

            try:
                # Generate translation
                result = generate_hypothesis(source, target_language)
                extracted_hyp = extract_json_data(result)

                hyp_support = extracted_hyp['hyp+']
                #print(hyp_support)
                hyp_hallu = extracted_hyp['hyp-']
                #print(hyp_hallu)
                #print('source:',source)

                # Update DataFrame with new values
                chunk.at[index, 'hyp+'] = hyp_support
                chunk.at[index, 'hyp-'] = hyp_hallu
                #chunk.at[index, 'prompt'] = prompt
            except:
                print(f"Skipping row {index} due to error in data extraction.")
                skipped_rows.append(index)
                continue

        # Append chunk to CSV file
        if not os.path.isfile(csv_filename):
            chunk.to_csv(csv_filename, mode='w', header=True, index=False)
        else:
            chunk.to_csv(csv_filename, mode='a', header=False, index=False)

        # Load CSV file
        df = pd.read_csv(csv_filename)
        #display(df)
    return skipped_rows

In [10]:
def fix_skipped_rows(target_language, df, csv_filename, skipped_rows):
    skipped_rows2 = []
    for index in skipped_rows:
        row = df.loc[index]

        source = row['source']

        try:
            result = generate_hypothesis(source, target_language)
            # Find the start and end indices of the JSON data
            start_index = result.find('{')
            end_index = result.find('}') + 1
            result_text = result[start_index:end_index]
            result_text = result_text.replace('"', "'").replace("{'hyp+': '", '{"hyp+": "').replace(".', 'hyp-': '", '.", "hyp-": "').replace(".',\n'hyp-': '", '.",\n"hyp-": "').replace(".', \n'hyp-': '", '.", \n"hyp-": "').replace("'}", '"}')
            print("result text ", result_text)
            #print("result text ", result_text)
            extracted_hyp = json.loads(result_text)
            df.at[index, 'hyp+'] =  extracted_hyp['hyp+']
            df.at[index, 'hyp-'] =  extracted_hyp['hyp-']
        except Exception as e:
            print(f"Skipping row {index} due to error in data extraction.")
            skipped_rows2.append(index)
            continue

    df.to_csv(csv_filename, index=False)
    return skipped_rows2

In [8]:
data_de_en.loc[10]

id                                                         10
langpair                                                de-en
source      "Perry erklärte, er werde "nach Texas zurückke...
Name: 10, dtype: object

In [9]:
data_de_en.loc[10].values

array([10, 'de-en',
       '"Perry erklärte, er werde "nach Texas zurückkehren, um das Ergebnis der Vorwahl von heute Abend auszuwerten und zu entscheiden, ob es in diesem Rennen einen weiterführenden Weg für mich gibt"", aber sagte später, er werde im Rennen bleiben und bei den Vorwahlen in South Carolina am 21. Januar antreten."'],
      dtype=object)

# Translation German - English

In [23]:
csv_filename = "/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Generation_task/gpt/results_gpt_de_en.csv"

skipped_rows_de_en = save_results("English", data_de_en, csv_filename)

Skipping row 9 due to error in data extraction.
Skipping row 10 due to error in data extraction.
Skipping row 16 due to error in data extraction.
Skipping row 31 due to error in data extraction.
Skipping row 41 due to error in data extraction.
Skipping row 44 due to error in data extraction.
Skipping row 60 due to error in data extraction.
Skipping row 64 due to error in data extraction.
Skipping row 67 due to error in data extraction.
Skipping row 79 due to error in data extraction.
Skipping row 84 due to error in data extraction.
Skipping row 85 due to error in data extraction.


In [24]:
skipped_rows_de_en

[9, 10, 16, 31, 41, 44, 60, 64, 67, 79, 84, 85]

In [25]:
df_results_de_en = pd.read_csv("/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Generation_task/gpt/results_gpt_de_en.csv")

In [26]:
csv_filename2 = "/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Generation_task/gpt/results_gpt_de_en_fixed.csv"

skipped_rows_de_en = fix_skipped_rows("English", df_results_de_en, csv_filename2, skipped_rows_de_en)

result text  {"hyp+": "In the last 20 years, the selection in Uptown Charlotte has grown exponentially.", "hyp-": "Uptown Charlotte has not experienced significant growth in the past two decades."}
result text  {"hyp+": "Perry stated that he will return to Texas to assess the results of tonight's primary and decide if there is a viable path forward for him in this race, but later said he will stay in the race and compete in the South Carolina primaries on January 21.", "hyp-": "Perry announced he is dropping out of the race and will not be participating in any future primaries."}
result text  {"hyp+": "The clearance height under the bridge is 15 meters. The construction work was completed in August 2011, but the approval for traffic was only given in March 2017.", "hyp-": "The bridge clearance height is 20 meters. The construction work finished in October 2010, and the traffic approval was granted in December 2016."}
result text  {"hyp+": "In 2009, Coolidge played the dramatic role of 

In [29]:
skipped_rows_de_en

[31, 41, 44]

In [27]:
def fix_skipped_rows_manual(target_language, df, skipped_rows):
    for index in skipped_rows:
        row = df.loc[index]

        source = row['source']

        result = generate_hypothesis(source, target_language)
        # Find the start and end indices of the JSON data
        start_index = result.find('{')
        end_index = result.find('}') + 1
        result_text = result[start_index:end_index]
        #result_text = result_text.replace('"', "'").replace("{'hyp+': '", '{"hyp+": "').replace(".', 'hyp-': '", '.", "hyp-": "').replace(".',\n'hyp-': '", '.",\n"hyp-": "').replace(".', \n'hyp-': '", '.", \n"hyp-": "').replace("'}", '"}')
        print("result text ", result_text)
        #print("result text ", result_text)
        #extracted_hyp = json.loads(result_text)
        #df.at[index, 'hyp+'] =  extracted_hyp['hyp+']
        #df.at[index, 'hyp-'] =  extracted_hyp['hyp-']


    #df.to_csv(csv_filename, index=False)

In [28]:
fix_skipped_rows_manual("English", df_results_de_en, skipped_rows_de_en)

result text  {'hyp+': 'In 2009, Coolidge played the dramatic role of “Geneviève McDonagh.”', 'hyp-': 'Coolidge played a comedic role in 2009.'}
result text  {'hyp+': 'In 2009, Coolidge played the dramatic role of "Geneviève McDonagh."', 'hyp-': 'In 2009, Coolidge directed the dramatic film "Geneviève McDonagh".'}
result text  {'hyp+': 'The next version was created in 1986 by Robert Fuller's brother Ron.', 'hyp-': 'The following edition was designed by Robert Fuller in 1986.'}


# Translation English - German

In [31]:
csv_filename = "/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Generation_task/gpt/results_gpt_en_de.csv"

skipped_rows_en_de = save_results("German", data_en_de, csv_filename)

Skipping row 4 due to error in data extraction.
Skipping row 5 due to error in data extraction.
Skipping row 10 due to error in data extraction.
Skipping row 21 due to error in data extraction.
Skipping row 33 due to error in data extraction.
Skipping row 45 due to error in data extraction.
Skipping row 57 due to error in data extraction.
Skipping row 62 due to error in data extraction.
Skipping row 89 due to error in data extraction.
Skipping row 98 due to error in data extraction.


In [35]:
df_results_en_de = pd.read_csv("/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Generation_task/gpt/results_gpt_en_de.csv")

In [36]:
csv_filename2 = "/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Generation_task/gpt/results_gpt_en_de_fixed.csv"

skipped_rows_en_de = fix_skipped_rows("German", df_results_en_de, csv_filename2, skipped_rows_en_de)

result text  {"hyp+": "Das Gizeh-Plateau oder die ''Gizeh-Nekropole'' im ägyptischen Tal der Toten enthält mehrere Pyramiden (von denen die Pyramide die größte ist), mehrere kleine Gräber, mehrere Tempel und die große Sphinx.", "hyp-": "Der Gizeh-Plateau oder die 'Gizeh-Nekropole' im ägyptischen Tal der Toten enthält keine Pyramiden, sondern nur kleine Tempel und Gräber."}
result text  {"hyp+": "Der Premierminister Stephen Harper hat zugestimmt, das Regierungsgesetz 'Clean Act' zur Überprüfung an einen Ausschuss aller Parteien zu senden, bevor es zur zweiten Lesung kommt, nach einem 25-minütigen Treffen am Dienstag mit dem NDP-Führer Jack Layton im PMO.", "hyp-": "Der Premierminister Stephen Harper hat entschieden, das Regierungsgesetz 'Clean Act' ohne jegliche Überprüfung durch ein allparteiliches Komitee zur zweiten Lesung zu schicken, nach einem 25-minütigen Treffen am Dienstag mit dem NDP-Führer Jack Layton im PMO."}
result text  {"hyp+": "Am 24. September 1759 unterzeichnete Arthu

In [37]:
fix_skipped_rows_manual("German", df_results_en_de, skipped_rows_en_de)

result text  {'hyp+': 'Und schließlich, nach vollen vier Stunden, in denen ich gelogen und es angestarrt habe, klingelte das Telefon, und irgendwie schaffte ich es aufzuheben, und es war mein Vater, und ich sagte: "Ich bin in ernsthaften Schwierigkeiten. Wir müssen etwas unternehmen."', 'hyp-': 'Nach vier vollen Stunden des Lügens und Starrens klingelte das Telefon und als ich abhob, war es meine Mutter, und ich sagte: "Alles in Ordnung hier."'}
result text  {'hyp+': 'Sie sagte: "Früher hatte ich einen Job, aber ich musste aufhören, weil ich nicht aus dem Haus gehen konnte."', 'hyp-': 'Sie erzählte, dass sie immer Arbeit hatte und gerne zur Arbeit ging, bevor sie nicht mehr aus dem Haus gehen konnte.'}


# Translation French - English

In [38]:
csv_filename = "/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Generation_task/gpt/results_gpt_fr_en.csv"

skipped_rows_fr_en = save_results("English", data_fr_en, csv_filename)

Skipping row 11 due to error in data extraction.
Skipping row 23 due to error in data extraction.
Skipping row 32 due to error in data extraction.
Skipping row 35 due to error in data extraction.
Skipping row 36 due to error in data extraction.
Skipping row 42 due to error in data extraction.
Skipping row 48 due to error in data extraction.
Skipping row 50 due to error in data extraction.
Skipping row 51 due to error in data extraction.
Skipping row 58 due to error in data extraction.
Skipping row 60 due to error in data extraction.
Skipping row 71 due to error in data extraction.
Skipping row 74 due to error in data extraction.
Skipping row 75 due to error in data extraction.
Skipping row 83 due to error in data extraction.
Skipping row 86 due to error in data extraction.
Skipping row 88 due to error in data extraction.
Skipping row 89 due to error in data extraction.
Skipping row 90 due to error in data extraction.


In [39]:
df_results_fr_en = pd.read_csv("/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Generation_task/gpt/results_gpt_fr_en.csv")
csv_filename2 = "/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Generation_task/gpt/results_gpt_fr_en_fixed.csv"

skipped_rows_fr_en = fix_skipped_rows("English", df_results_fr_en, csv_filename2, skipped_rows_fr_en)

result text  {"hyp+": "The issue began on January 1st when dozens of local residents started complaining to the Obanazawa post office that they had not received their traditional and regular New Year's cards.", "hyp-": "The problem started on December 25th when a few local residents complained to the Obanazawa post office that they had not received their Christmas cards."}
result text  {"hyp+": "The chef became friends with the assistant.", "hyp-": "The chef fired the assistant."}
result text  {"hyp+": "Octavius Warre Malet's second daughter, Alice Anna Catherine, married Thomas at the British Consulate in Cologne on June 24, 1852.", "hyp-": "Alice Anna Catherine, the second daughter of Octavius Warre Malet, married Thomas at the French Consulate in Cologne on June 24, 1852."}
result text  {"hyp+": "In 2012, Gil joined the TV program - Remake of the Salvador-Royales-Films 'Mundo Man ay Magunaw' in the role of Jennifer la Pena.", "hyp-": "Gil was a contestant on a cooking show in 2012."

In [40]:
fix_skipped_rows_manual("English", df_results_fr_en, skipped_rows_fr_en)

result text  {'hyp+': 'Of the twelve stories included, six had already been published in the author's first collection, "Evening News".', 'hyp-': 'The stories are all new and have never been published before.'}
result text  {'hyp+': 'Quentin Collins, Ed Griffin, in "House of Despair", recounts how the residents of Collinsport have not forgotten how "the winter girl inexplicably disappeared."', 'hyp-': 'Quentin Collins, Ed Griffin, in "House of Despair", recounts how the residents of Collinsport have not forgotten the mysterious disappearance of a young girl in the winter.'}
result text  {'hyp+': 'Daniil Ostrogski also states that Dmytro is the father of Bilinsky, also known as "Danylo Dmytrovych."', 'hyp-': 'Daniil Ostrogski claims that Bilinsky is the father of Dmytro, also known as "Danylo Dmytrovych."'}


# Translation English - French

In [41]:
csv_filename = "/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Generation_task/gpt/results_gpt_en_fr.csv"

skipped_rows_en_fr =  save_results("French", data_en_fr, csv_filename)

Skipping row 0 due to error in data extraction.
Skipping row 1 due to error in data extraction.
Skipping row 2 due to error in data extraction.
Skipping row 4 due to error in data extraction.
Skipping row 6 due to error in data extraction.
Skipping row 8 due to error in data extraction.
Skipping row 9 due to error in data extraction.
Skipping row 10 due to error in data extraction.
Skipping row 11 due to error in data extraction.
Skipping row 13 due to error in data extraction.
Skipping row 14 due to error in data extraction.
Skipping row 15 due to error in data extraction.
Skipping row 16 due to error in data extraction.
Skipping row 17 due to error in data extraction.
Skipping row 18 due to error in data extraction.
Skipping row 19 due to error in data extraction.
Skipping row 20 due to error in data extraction.
Skipping row 21 due to error in data extraction.
Skipping row 22 due to error in data extraction.
Skipping row 24 due to error in data extraction.
Skipping row 26 due to erro

In [42]:
df_results_en_fr = pd.read_csv("/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Generation_task/gpt/results_gpt_en_fr.csv")
csv_filename2 = "/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Generation_task/gpt/results_gpt_en_fr_fixed.csv"

skipped_rows_en_fr = fix_skipped_rows("French", df_results_en_fr, csv_filename2, skipped_rows_en_fr)

result text  {"hyp+": "Né dans la capitale croate, Bobek a acquis la renommée en jouant pour le Partizan de Belgrade.", "hyp-": "Bobek est né en Serbie et a déménagé à Zagreb où il est devenu célèbre."}
result text  {"hyp+": "Utiliser des applications de navigation GPS peut être le moyen le plus facile et le plus pratique de se déplacer lorsque l'on est en dehors de son pays d'origine.", "hyp-": "Les applications de navigation GPS sont rarement utiles en voyage à l'étranger."}
result text  {"hyp+": "Seuls les effets du catastrophisme, et non de la dépression et de la peur, étaient conditionnels des séances hebdomadaires régulières de PA.", "hyp-": "Seuls les effets de la dépression et de la peur, pas le catastrophisme, étaient conditionnels des séances hebdomadaires régulières de PA."}
result text  {"hyp+": "Les étoiles émettent de la lumière et de la chaleur en raison de l'énergie produite lorsque les atomes sont fusionnés pour former des éléments plus lourds.", "hyp-": "Les étoiles é

In [43]:
fix_skipped_rows_manual("French", df_results_en_fr, skipped_rows_en_fr)