In [None]:
import pandas as pd
import random
import os
import re
import json
import requests
import inspect
from openai import OpenAI
import openai
from dotenv import load_dotenv
import sys
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

## Dataset Preparation

In [11]:
# set seed
seed = 42

### Wikilarge (own) - English

In [12]:
df_wikiEN = pd.read_csv("../data/TS_datasets/WikiMB_EN.csv", sep=",")

df_wikiEN = df_wikiEN.drop(columns=["Source Link"])
df_wikiEN.columns = ['complex_sentence']

df_wikiEN.head()

Unnamed: 0,complex_sentence
0,"The N1 road also designated as RN1, is a road ..."
1,"Originating in the capital city of Bangui, the..."
2,As a key component of the country's road netwo...
3,"The N1 route commences in the heart of Bangui,..."
4,The terrain along this stretch is characterize...


### Wikilarge (own) - German

In [13]:
df_wikiDE = pd.read_csv("../data/TS_datasets/WikiMB_DE.csv", sep=",")

df_wikiDE = df_wikiDE.drop(columns=["Source Link"])

df_wikiDE.columns = ['complex_sentence']

df_wikiDE.head()

Unnamed: 0,complex_sentence
0,Am 22. März 1985 unterzeichnete die Europäisch...
1,Im folgenden Jahr wurde die VERORDNUNG (EWG) N...
2,Zur Anpassung an die Änderungen des Montrealer...
3,Unternehmen müssen die Emissionen von geregelt...
4,Francesco Calzolari stammte aus einer alteinge...


### Cochrane Database (Medical) (own)

In [14]:
df_cochrane = pd.read_csv("../data/TS_datasets/CochraneMB.csv", sep=",")

df_cochrane.drop(columns=['Source Link'], inplace=True)
df_cochrane.columns = ['complex_sentence', 'original_simple_sentence']

df_cochrane.head()

Unnamed: 0,complex_sentence,original_simple_sentence
0,TEP technique may carry a higher risk of conve...,"If the TEP technique is used, there may be a h..."
1,An inguinal hernia occurs when part of the int...,A groin hernia is a weakness or defect in the ...
2,There may be little to no difference between T...,The evidence suggests there may be little to n...
3,TEP technique may carry a higher risk of conve...,The evidence suggests that the risk of needing...
4,CBT may result in a small‐to‐moderate reductio...,Evidence suggests that cognitive behavioural t...


### Legal Dataset (own)

In [15]:
df_scotus = pd.read_csv("../data/TS_datasets/ScotusMB.csv", sep=",")

df_scotus.drop(columns=['Source Link'], inplace=True)
df_scotus.columns = ['complex_sentence']

df_scotus.head()

Unnamed: 0,complex_sentence
0,To determine whether Corner Post’s APA claim i...
1,While the Board argues that §2401(a) should no...
2,"Importantly, contemporaneous dictionaries also..."
3,The Board leaves open the possibility that som...
4,The APA provisions grant Corner Post a cause o...


## API Calls

In [16]:
# Load environment variables from .env file
load_dotenv()
# Setup API Keys
openai.api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key=openai.api_key)

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [17]:
# a general function to run a specified model on a set of datasets and save the output

def run_model_on_multiple_datasets(model_function, datasets, input_column, output_column, save_dir, model_name):
    """
    Run a given model function on multiple datasets and save the results.
    """
    processed_datasets = {}
    
    # Ensure the save directory exists
    os.makedirs(save_dir, exist_ok=True)
    
    for dataset_name, dataset in datasets.items():
        # Create a copy of the dataset to avoid modifying the original
        dataset_copy = dataset.copy()
        
        # Apply the model function to the input column and store results in the output column
        dataset_copy[output_column] = dataset_copy[input_column].apply(model_function)
        
        # Store the processed dataset in the result dictionary
        processed_datasets[dataset_name] = dataset_copy
        
        # Generate filename
        filename = f"{dataset_name}_{model_name}_simplified.csv"
        file_path = os.path.join(save_dir, filename)
        
        # Save the dataset
        dataset_copy.to_csv(file_path, index=False)
        
        print(f"Processed and saved dataset: {filename}")
    
    return processed_datasets

In [18]:
datasets = {
    'wikiEN': df_wikiEN,
    'cochrane': df_cochrane,
    'scotus': df_scotus
}

datasets_DE = {
    'wikiDE': df_wikiDE
}

In [19]:
# load asset (alva-manchego-etal-2020-asset) for prompt few shot examples 

# read original complex sentences
with open("../data/TS_datasets/asset-main/dataset/asset.valid.orig", "r") as file:
    asset_original = file.readlines()

# Reading user generated simplified sentences

directory = "../data/TS_datasets/asset-main/dataset/"

search_string = 'asset.valid.simp.'

# Dictionary to store lists for each file
file_lines_dict = {}

# Loop through all files in the specified directory
for filename in os.listdir(directory):
    # Check if the filename contains the search string
    if search_string in filename:
        print(f"Processing file: {filename}")
        # Extract the identifier (e.g., simp1, simp2, etc.)
        identifier = filename.split('asset.valid.')[1]
        print("Identifier:  ", identifier)
        
        # Create a list for this file
        file_lines_dict[identifier] = []

        # Open and read the file
        with open(os.path.join(directory, filename), 'r') as file:
            # Read all lines and store them in the list
            file_lines_dict[identifier] = file.readlines()

Processing file: asset.valid.simp.1
Identifier:   simp.1
Processing file: asset.valid.simp.6
Identifier:   simp.6
Processing file: asset.valid.simp.8
Identifier:   simp.8
Processing file: asset.valid.simp.9
Identifier:   simp.9
Processing file: asset.valid.simp.7
Identifier:   simp.7
Processing file: asset.valid.simp.0
Identifier:   simp.0
Processing file: asset.valid.simp.5
Identifier:   simp.5
Processing file: asset.valid.simp.2
Identifier:   simp.2
Processing file: asset.valid.simp.3
Identifier:   simp.3
Processing file: asset.valid.simp.4
Identifier:   simp.4


In [20]:
# generate a random number to select a random few shot example (testing out)

# set number based on length of asset_original
random_number = random.randint(0, len(asset_original))
print("Random number: ", random_number)

# select random entry in file_lines_dict based on length of file_lines_dict
random_entry = random.choice(list(file_lines_dict.keys()))
print("Random entry: ", random_entry)

# print the original and simplified sentences on the same line
print(" Original:", asset_original[random_number], "Simplified:", file_lines_dict[random_entry][random_number])

Random number:  1239
Random entry:  simp.7
 Original: He was an exceptionally active campaigner and critic in the crisis years in Italy after the First World War and into the early years of Fascist rule.
 Simplified: He was an active campaigner and critic in the crisis years. He was vocal in Italy after the First World War. As well as into the early years of Fascist rule.



In [21]:
# This function will generate a prompt sentence for few shot examples
def generate_few_shot_example_prompt():
    random_number = random.randint(0, len(asset_original) - 1)
    random_entry = random.choice(list(file_lines_dict.keys()))

    return f" Original: {asset_original[random_number]} Simplified: {file_lines_dict[random_entry][random_number]}"


generate_few_shot_example_prompt()

' Original: This flag was the same, except the golden stripe had a human stick figure, a kanaga, in black, with arms raised to the sky.\n Simplified: This flag was the same, a human stick figure on it was the only difference. He was a kanaga, in black, with arms raised to the sky.\n'

In [22]:
# same for german examples (using TextComplexityDE)

df_TCDE = pd.read_csv("../data/TS_datasets/TextComplexityDE-master/TextComplexityDE19/parallel_corpus.csv", sep=",", encoding="ISO-8859-1")

df_TCDE

Unnamed: 0,Sentence_Id,Article_ID,Article,Original_Sentence,Simplification,Rating
0,5,1,Seifenblase,"Wegen dieser leichten Vergänglichkeit wurde ,S...","Weil Seifenblasen nicht lange halten, wurden s...",Etwas einfacher
1,7,1,Seifenblase,In der Kunst wird spätestens seit dem Barock d...,In der Kunst wird die Seifenblase spätestens s...,Deutlich einfacher
2,11,1,Seifenblase,"Eine Seifenblase entsteht, wenn sich ein dünne...","Eine Seifenblase entsteht, wenn sich eine klei...",Etwas einfacher
3,13,1,Seifenblase,Infolge des gravitationsbedingten Auslaufens (...,Wegen der Erdanziehungskraft läuft die Flüssig...,Etwas einfacher
4,15,1,Seifenblase,Zudem erfolgt im Laufe des Auslaufprozesses ei...,Beim Auslaufen von einer Seifenblase sammeln s...,Deutlich einfacher
...,...,...,...,...,...,...
245,974,22,Geschichte der Europäischen Union,"Kohl war es, der Mitte 1988 Jacques Delors für...",Im Jahr 1988 schlug der deutsche Bundeskanzler...,Deutlich einfacher
246,979,23,Martin Luther King,Wesentlich durch Kings Einsatz und Wirkkraft i...,Das Civil Rights Movement ist durch den Einsat...,Deutlich einfacher
247,990,23,Martin Luther King,Der Vater änderte beide Namen nach einer Europ...,Der Vater änderte beide Namen nach einer Europ...,Etwas einfacher
248,1004,23,Martin Luther King,Am 20. September 1944 begann King sein Studium...,Am 20. September 1944 begann King sein Studium...,Deutlich einfacher


In [23]:
def generate_few_shot_example_prompt_DE():
    random_number = random.randint(0, len(df_TCDE) - 1)

    return f" Original: {df_TCDE['Original_Sentence'][random_number]}\n Vereinfacht: {df_TCDE['Simplification'][random_number]}\n"

# testing
generate_few_shot_example_prompt_DE()

' Original: In der Appekunny Formation im Osten des Parks, die auf ein Alter von 1,5\x961,3 Milliarden Jahre datiert wird, wurden 1982 Abdrücke gefunden, die von den Entdeckern als Metazoa interpretiert und nach neuen Untersuchungen 2002 als Horodyskia moniliformis beschrieben wurden.\n Vereinfacht: In der Appekunny Formation im Osten des Parks, die auf ein Alter von 1,5-1,3 Milliarden Jahre datiert wird, wurden 1982 Abdrücke gefunden. Diese Abdrücke wurden von den Entdeckern als vielzellige Lebenwesen (Metazoa) interpretiert. Nach neuen Untersuchungen wurden diese Metazoa als Horodyskai moniliformis beschrieben. \n'

In [24]:
save_directory = "../data/TS_model_outputs"

### OpenAI - GPT 4o

In [25]:
main_prompt  = ( # inspired by BLESS paper -> which was inspired by Maddela et al. (2023) "LENS"
    "Please rewrite the following complex sentence in order to make it easier to understand by non-native speakers of English. ",
    "You can do so by reordering parts of the sentence, replacing complex words with simpler synonyms (i.e. paraphrasing), deleting unimportant information (i.e. compression), and/or splitting a long complex sentence into several simpler ones. ",
    "The final simplified sentence needs to be grammatical, fluent, and retain the main ideas of its original counterpart without altering its meaning.\n",
    "Respond only and always with the simplified sentence, nothing else. Do not start the response with any introduction.\n",
    "Examples:\n"
)

main_prompt_DE = ( 
    "Bitte schreibe den folgenden komplexen Satz um, um ihn für Nicht-Muttersprachler leichter verständlich zu machen. ",
    "Du kannst dies tun, indem du Teile des Satzes umstellst, komplexe Wörter durch einfachere Synonyme ersetzt (d.h. umformulieren), unwichtige Informationen weglässt (d.h. komprimieren) und/oder einen langen, komplexen Satz in mehrere einfachere Sätze aufteilst. ",
    "Der vereinfachte Satz muss grammatikalisch korrekt, fließend und verständlich sein und die Hauptideen des ursprünglichen Satzes beibehalten, ohne dessen Bedeutung zu verändern.\n",
    "Antworte ausschließlich und immer nur mit dem vereinfachten Satz, nichts weiter. Beginne die Antwort nicht mit einer Einleitung.\n",
    "Beispiele:\n"
)

In [26]:
def generate_main_prompt(main_prompt, language="EN"):
    """ Generate the main prompt for the task including the main instruction and 3 examples.
    Depending on the language, the prompt will be generated in English or German.
    """
    if language == "DE":
        prompt = ''.join(main_prompt_DE)
        prompt += generate_few_shot_example_prompt_DE()
        prompt += "\n"
        prompt += generate_few_shot_example_prompt_DE()
        prompt += "\n"
        prompt += generate_few_shot_example_prompt_DE()
        prompt += "\n"
    else:
        prompt = ''.join(main_prompt)
        prompt += generate_few_shot_example_prompt()
        prompt += "\n"
        prompt += generate_few_shot_example_prompt()
        prompt += "\n"
        prompt += generate_few_shot_example_prompt()
        prompt += "\n"

    return prompt

# testing
generate_main_prompt(main_prompt)

'Please rewrite the following complex sentence in order to make it easier to understand by non-native speakers of English. You can do so by reordering parts of the sentence, replacing complex words with simpler synonyms (i.e. paraphrasing), deleting unimportant information (i.e. compression), and/or splitting a long complex sentence into several simpler ones. The final simplified sentence needs to be grammatical, fluent, and retain the main ideas of its original counterpart without altering its meaning.\nRespond only and always with the simplified sentence, nothing else. Do not start the response with any introduction.\nExamples:\n Original: In Augustan Rome, Quirinus was also an epithet of Janus, as Janus Quirinus.\n Simplified: Quirinus was another name used for Janus.\n\n Original: She first rose to prominence as an academic, barrister, campaigner and member of the Irish senate (1969 – 1989).\n Simplified: She was an academic, barrister, campaigner and member of the Irish senate (19

In [None]:
# testing for german
generate_main_prompt(main_prompt, language="DE")

'Bitte schreibe den folgenden komplexen Satz um, um ihn für Nicht-Muttersprachler leichter verständlich zu machen. Du kannst dies tun, indem du Teile des Satzes umstellst, komplexe Wörter durch einfachere Synonyme ersetzt (d.h. umformulieren), unwichtige Informationen weglässt (d.h. komprimieren) und/oder einen langen, komplexen Satz in mehrere einfachere Sätze aufteilst. Der vereinfachte Satz muss grammatikalisch korrekt, fließend und verständlich sein und die Hauptideen des ursprünglichen Satzes beibehalten, ohne dessen Bedeutung zu verändern.\nAntworte ausschließlich und immer nur mit dem vereinfachten Satz, nichts weiter. Beginne die Antwort nicht mit einer Einleitung.\nBeispiele:\n Original: Darüber hinaus sind Würfel in einer Vielzahl von Brettspielen bedeutend, um etwa die Bewegungsgeschwindigkeit von Spielfiguren oder den Ausgang von Zufallsereignissen zu bestimmen.\n Vereinfacht: Außerdem sind Würfel wichtig für viele Brettspiele. Zum Beispiel um die Bewegung von Spielfiguren 

In [None]:
# generate simplified sentences through GPT-4 API

def generate_simplified_sentence_OpenAI(complex_sentence):
    # Custom prompt to generate simplified sentences
    #system_prompt = "You are a helpful assistant."

    prompt = generate_main_prompt(main_prompt)
    prompt += f"Original sentences to simplify:\n\"{complex_sentence}\".\n Simplified: "

    print("Prompt: ", prompt)
    
    response = client.chat.completions.create(
        model="gpt-4o-2024-05-13",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that is an expert in simplifying sentences for non-native speakers."},
            {"role": "user", "content": prompt},
        ]
        )

    # Extract the simplified sentence from the response

    response_text = response.choices[0].message.content
    print("Response text:" + response_text) 
    simplified_sentence = response_text
    
    print(f"\n\nSimplified OG sentence: {complex_sentence} to {simplified_sentence}")
    print("----------------------------\n\n")

    return simplified_sentence

In [57]:
# Process all datasets with the OpenAI API and save them
run_model_on_multiple_datasets(
    model_function = generate_simplified_sentence_OpenAI, 
    datasets = datasets, 
    input_column = 'complex_sentence', 
    output_column = 'simplified_sentence', 
    save_dir = save_directory, 
    model_name = 'OpenAI')

Prompt:  Please rewrite the following complex sentence in order to make it easier to understand by non-native speakers of English. You can do so by reordering parts of the sentence, replacing complex words with simpler synonyms (i.e. paraphrasing), deleting unimportant information (i.e. compression), and/or splitting a long complex sentence into several simpler ones. The final simplified sentence needs to be grammatical, fluent, and retain the main ideas of its original counterpart without altering its meaning.
Respond only and always with the simplified sentence, nothing else. Do not start the response with any introduction.
Examples:
 Original: Arunachal Pradesh (,) is the easternmost state of India.
 Simplified: Arunachal Pradesh is the Indian state farthest to the east.

 Original: C. L. Barber, “Shakespearian Comedy in the Comedy of Errors, ” College English 25.7 (1964), p. 493.
 Simplified: C. L. Barber, “Shakespearian Comedy in the Comedy of Errors,” in College English book.

 O

KeyboardInterrupt: 

In [None]:
# now run on GERMAN dataset(s)

def generate_simplified_sentence_OpenAI_DE(complex_sentence):
    prompt = generate_main_prompt(main_prompt_DE, language="DE")
    prompt += f"Originaler Satz zu vereinfachen:\n\"{complex_sentence}\".\n Vereinfacht: "

    print("Prompt: ", prompt)
    
    response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "Du bist ein hilfreicher Assistent und Experte im vereinfachen von Sätzen für nicht-muttersprachliche Leser."}, #different system prompt!
        {"role": "user", "content": prompt},
    ]
    )

    # Extract the simplified sentence from the response
    response_text = response.choices[0].message.content
    print("Response text:" + response_text) 
    simplified_sentence = response_text

    # Remove leading and trailing quotation marks if present
    simplified_sentence = response_text.strip('"')
    
    print(f"\n\nSimplified OG sentence: {complex_sentence} to {simplified_sentence}")
    print("----------------------------\n\n")

    return simplified_sentence

In [32]:
# GERMAN Execution for OpenAI's ChatGPT model

run_model_on_multiple_datasets(
    model_function = generate_simplified_sentence_OpenAI_DE, 
    datasets = datasets_DE, 
    input_column = 'complex_sentence', 
    output_column = 'simplified_sentence', 
    save_dir = save_directory, 
    model_name = 'OpenAI')

Prompt:  Bitte schreibe den folgenden komplexen Satz um, um ihn für Nicht-Muttersprachler leichter verständlich zu machen. Du kannst dies tun, indem du Teile des Satzes umstellst, komplexe Wörter durch einfachere Synonyme ersetzt (d.h. umformulieren), unwichtige Informationen weglässt (d.h. komprimieren) und/oder einen langen, komplexen Satz in mehrere einfachere Sätze aufteilst. Der vereinfachte Satz muss grammatikalisch korrekt, fließend und verständlich sein und die Hauptideen des ursprünglichen Satzes beibehalten, ohne dessen Bedeutung zu verändern.
Antworte ausschließlich und immer nur mit dem vereinfachten Satz, nichts weiter. Beginne die Antwort nicht mit einer Einleitung.
Beispiele:
 Original: Zudem erfolgt im Laufe des Auslaufprozesses eine Anreicherung von Seifenfilm-stabilisierenden Tensidmolekülen im unteren Bereich der Seifenblase, sodass deren obere Region infolge des relativen Mangels von an die Oberfläche adsorbierten Tensidmolekülen zusätzlich destabilisiert wird.
 Ver

{'wikiDE':                                      complex_sentence  \
 0   Am 22. März 1985 unterzeichnete die Europäisch...   
 1   Im folgenden Jahr wurde die VERORDNUNG (EWG) N...   
 2   Zur Anpassung an die Änderungen des Montrealer...   
 3   Unternehmen müssen die Emissionen von geregelt...   
 4   Francesco Calzolari stammte aus einer alteinge...   
 5   Francescos kindliche Neugierde beschränkte sic...   
 6   Die Apotheke seines Vaters wurde auch von Gele...   
 7   Calzolari trat in die Fußstapfen seines Vaters...   
 8   Auch wenn ihn seine Arbeit als Apotheker davon...   
 9   Um den Monte Baldo noch näher zu sein, ließ si...   
 10  Es gilt als erstes und ältestes Naturkundemuse...   
 11  Diese kostete nur 20 Pfennig (ab 1871) und war...   
 12  Im Mittelalter lag südöstlich des heutigen Hof...   
 13  Nach der Auflösung der Garnison Weingarten nac...   
 14  In den Wäldern um Nessenreben findet man häufi...   
 15  In den Wäldern leben Rehe, Wildschweine und Fü...   
 16 

### LLama 3.1 (8bn)

In [20]:
# LLAMA simpler few shot prompt --> mention source in paper that similarly did this in a simpler fashion

smaller_main_prompt  = ( # inspired by BLESS paper -> which was inspired by Maddela et al. (2023) "LENS"
    "Please rewrite the following complex sentence in order to make it easier to understand by non-native speakers of English.\n",
    "Keep the meaning same, but make it simpler.\n",
    "Respond only and always with the simplified sentence, nothing else.\n",
    "Examples:\n"
)

smaller_main_prompt_DE = (
    "Bitte schreibe den folgenden komplexen Satz um, damit er für Nicht-Muttersprachler leichter verständlich wird.\n",
    "Behalte die Bedeutung bei, aber mache ihn einfacher.\n",
    "Antworte ausschließlich und immer nur mit dem vereinfachten Satz, nichts weiter.\n",
    "Beispiele:\n"
)

In [None]:
# Hint: Uses locally installed LLama model (with LM Studio)

from openai import OpenAI

LMstudio_client = OpenAI(
    base_url="http://localhost:1234/v1",
    api_key="lm-studio"
)

def generate_simplified_sentence_LLama(complex_sentence):

    prompt = generate_main_prompt(smaller_main_prompt)
    prompt += f"Original sentences to simplify:\n\"{complex_sentence}\".\n Simplified: "

    print("Prompt: ", prompt)
    
    response = LMstudio_client.chat.completions.create(
        model="someString", 
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is the capital of Paris?"},
        ])
    # Extract the simplified sentence from the response

    response_text = response.choices[0].message.content
    print("Response text:" + response_text) 
    simplified_sentence = response_text
    
    print(f"\n\nSimplified OG sentence: {complex_sentence} to {simplified_sentence}")
    print("----------------------------\n\n")

    return simplified_sentence

In [401]:
# Process all datasets with the LLama model and save them

run_model_on_multiple_datasets(
    model_function = generate_simplified_sentence_LLama, 
    datasets = datasets, 
    input_column = 'complex_sentence', 
    output_column = 'simplified_sentence', 
    save_dir = save_directory, 
    model_name = 'LLAMA31')

Prompt:  Please rewrite the following complex sentence in order to make it easier to understand by non-native speakers of English.
Keep the meaning same, but make it simpler.
Respond only and always with the simplified sentence, nothing else.
Examples:
 Original: The year 1976 saw Punk arrive on what some people perceived to be a stagnant music scene and NME, like other "specialist" publications, was slow to cover this new phenomenon.
 Simplified: Punk arrived in 1976 on what was thought to be a stale music scene. NME was slow to cover this new situation.

 Original: It is the fastest growing census division in Canada, expected to surpass 1.5 million residents by 2031.
 Simplified: It is the fastest growing census division in Canada. It is expected to surpass 1.5 million residents by 2031.

 Original: The BCG matrix (aka B. C. G. analysis, BCG-matrix, Boston Box, Boston Matrix, Boston Consulting Group analysis) is a chart that had been created by Bruce Henderson for the Boston Consulti

{'wikiEN':                                      complex_sentence  \
 0   The N1 road also designated as RN1, is a road ...   
 1   Originating in the capital city of Bangui, the...   
 2   As a key component of the country's road netwo...   
 3   The N1 route commences in the heart of Bangui,...   
 4   The terrain along this stretch is characterize...   
 5   She married Marcus Claudius Marcellus Aesernin...   
 6   Asinia was a noblewoman of ancient Roman who w...   
 7   Influenced by his father Jinghui, whom was als...   
 8   In 1853, Saišangga was stripped of official po...   
 9   2-Chloronicotinic acid (2-CNA) is a halogenate...   
 10  Dasymutilla sicheliana is a species of velvet ...   
 11  He is known for the brutality of his raids, hi...   
 12  His actions led to the forced relocation of th...   
 13  After the death of Cochise, Pionsenay remained...   
 14  In 1876, the Sulphur Springs overland mail sta...   
 15  The attack was instigated after an argument ov...   
 16 

In [None]:
# GERMAN prompt building and execution

def generate_simplified_sentence_LLama_DE(complex_sentence):
   
    prompt = generate_main_prompt(smaller_main_prompt_DE, language="DE")
    prompt += f"Originaler Satz zu vereinfachen:\n\"{complex_sentence}\".\n Vereinfacht: "

    print("Prompt: ", prompt)
    
    response = LMstudio_client.chat.completions.create(
        model="someString", 
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is the capital of Paris?"},
        ])
    # Extract the simplified sentence from the response

    response_text = response.choices[0].message.content
    print("Response text:" + response_text) 
    simplified_sentence = response_text
    
    print(f"\n\nSimplified OG sentence: {complex_sentence} to {simplified_sentence}")
    print("----------------------------\n\n")

    return simplified_sentence

In [None]:
# GERMAN Execution

run_model_on_multiple_datasets(
    model_function = generate_simplified_sentence_LLama_DE, 
    datasets = datasets_DE, 
    input_column = 'complex_sentence', 
    output_column = 'simplified_sentence', 
    save_dir = save_directory, 
    model_name = 'LLAMA31')

Prompt:  Bitte schreibe den folgenden komplexen Satz um, um ihn für Nicht-Muttersprachler leichter verständlich zu machen. Du kannst dies tun, indem du Teile des Satzes umstellst, komplexe Wörter durch einfachere Synonyme ersetzt (d.h. umformulieren), unwichtige Informationen weglässt (d.h. komprimieren) und/oder einen langen, komplexen Satz in mehrere einfachere Sätze aufteilst. Der vereinfachte Satz muss grammatikalisch korrekt, fließend und verständlich sein und die Hauptideen des ursprünglichen Satzes beibehalten, ohne dessen Bedeutung zu verändern.
Antworte ausschließlich und immer nur mit dem vereinfachten Satz, nichts weiter. Beginne die Antwort nicht mit einer Einleitung.
Beispiele:
 Original: Werden keine Maßnahmen zur Bekämpfung des Klimawandels getroffen, sind weltweit 16 % aller Arten vom Aussterben bedroht, wie eine 2015 in Science erschienene Übersichtsarbeit ergab.
 Vereinfacht: Ohne  Maßnahmen zur Bekämpfung des Klimawandels sind weltweit 16 % aller Arten vom Aussterben

{'wikiDE':                                      complex_sentence  \
 0   Am 22. März 1985 unterzeichnete die Europäisch...   
 1   Im folgenden Jahr wurde die VERORDNUNG (EWG) N...   
 2   Zur Anpassung an die Änderungen des Montrealer...   
 3   Unternehmen müssen die Emissionen von geregelt...   
 4   Francesco Calzolari stammte aus einer alteinge...   
 5   Francescos kindliche Neugierde beschränkte sic...   
 6   Die Apotheke seines Vaters wurde auch von Gele...   
 7   Calzolari trat in die Fußstapfen seines Vaters...   
 8   Auch wenn ihn seine Arbeit als Apotheker davon...   
 9   Um den Monte Baldo noch näher zu sein, ließ si...   
 10  Es gilt als erstes und ältestes Naturkundemuse...   
 11  Diese kostete nur 20 Pfennig (ab 1871) und war...   
 12  Im Mittelalter lag südöstlich des heutigen Hof...   
 13  Nach der Auflösung der Garnison Weingarten nac...   
 14  In den Wäldern um Nessenreben findet man häufi...   
 15  In den Wäldern leben Rehe, Wildschweine und Fü...   
 16 

### MUSS

RUN FILE WITH FOLLOWING COMMAND:
python scripts/simplify.py scripts/examples.en --model-name "muss_en_wikilarge_mined"

In [28]:
# Export data to file for processing by MUSS

MUSS_export_dir = "../data/MUSS_inputs"

def export_complex_sentences(datasets, export_directory):
    os.makedirs(export_directory, exist_ok=True)
    
    for dataset_name, dataset in datasets.items():
        filename = f"{dataset_name}_complex_sentences.txt"
        file_path = os.path.join(export_directory, filename)
        
        with open(file_path, 'w', encoding='utf-8') as f:
            for sentence in dataset['complex_sentence']:
                f.write(f"{sentence}\n")
        
        print(f"Exported complex sentences for {dataset_name} to {file_path}")

export_complex_sentences(datasets, MUSS_export_dir)

Exported complex sentences for wikiEN to ../data/MUSS_inputs/wikiEN_complex_sentences.txt
Exported complex sentences for cochrane to ../data/MUSS_inputs/cochrane_complex_sentences.txt
Exported complex sentences for scotus to ../data/MUSS_inputs/scotus_complex_sentences.txt


In [None]:
# 1. Paste the exported data file into the MUSS Input folder

# 2. Run commands in terminal in dedicated env (py38muss):

# conda activate py38muss

# python scripts/simplify.py FOLDER/cochrane_complex_sentences.txt --model-name "muss_en_wikilarge_mined"
# python scripts/simplify.py FOLDER/wikiEN_complex_sentences.txt --model-name "muss_en_wikilarge_mined"
# python scripts/simplify.py FOLDER/scotus_complex_sentences.txt --model-name "muss_en_wikilarge_mined"

# python scripts/simplify.py FOLDER/FILENAME --model-name "muss_en_wikilarge_mined"

# Copy the simplified sentences from the MUSS_system_output folder back to the TS data folder and name accordingly

### mBART DE (similar to MUSS; for German)

https://huggingface.co/DEplain/trimmed_mbart_sents_apa_web

In [29]:
# Load model
tokenizer = AutoTokenizer.from_pretrained("DEplain/trimmed_mbart_sents_apa_web",  use_fast=False)
model = AutoModelForSeq2SeqLM.from_pretrained("DEplain/trimmed_mbart_sents_apa_web")

In [17]:
def run_mBART_on_sentence(sentence): 
    # Tokenize and generate output
    print("Komplexer Satz: " + sentence)
    inputs = tokenizer(sentence, return_tensors="pt")
    outputs = model.generate(**inputs)
    simplified_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Simpler Satz: " + simplified_sentence + "\n")
    
    return simplified_sentence

In [18]:
run_mBART_on_sentence("Dies ist ein komplexer Satz, den ich, sofern möglich und irgendwie machbar, vereinfachen möchte.")

Komplexer Satz: Dies ist ein komplexer Satz, den ich, sofern möglich und irgendwie machbar, vereinfachen möchte.
Simpler Satz: Dies ist ein komplexer Satz, den ich vereinfachen möchte.



'Dies ist ein komplexer Satz, den ich vereinfachen möchte.'

In [21]:
run_model_on_multiple_datasets(
    model_function = run_mBART_on_sentence, 
    datasets = datasets_DE, 
    input_column = 'complex_sentence', 
    output_column = 'simplified_sentence', 
    save_dir = save_directory, 
    model_name = 'mBART_DE')

Komplexer Satz: Am 22. März 1985 unterzeichnete die Europäische Gemeinschaft gemeinsam mit mehreren Mitgliedstaaten das Wiener Übereinkommen zum Schutz der Ozonschicht und darauf folgend am 16. September 1987 auch das Montrealer Protokoll.
Simpler Satz: Am 22. März 1985 unterzeichnete die Europäische Gemeinschaft das Wiener Übereinkommen zum Schutz der Ozonschicht.

Komplexer Satz: Im folgenden Jahr wurde die VERORDNUNG (EWG) Nr. 3322/88, als erste europäische Verordnung zur Regelung ozonschädigender Stoffe verabschiedet.
Simpler Satz: Im Jahr 2020 wurde die Verordnung zur Regelung ozonschädigender Stoffe verabschiedet.

Komplexer Satz: Zur Anpassung an die Änderungen des Montrealer Protokolls wurden durch nachfolgende Verordnungen die geregelten Substanzen immer weiter ergänzt und Ein- und Ausfuhr, Produktion und Verwendung weiter eingeschränkt, Fristen verkürzt und Substanzen letztendlich verboten.
Simpler Satz: Zur Anpassung an die Änderungen des Montrealer Protokolls wurden die ger

{'wikiDE':                                      complex_sentence  \
 0   Am 22. März 1985 unterzeichnete die Europäisch...   
 1   Im folgenden Jahr wurde die VERORDNUNG (EWG) N...   
 2   Zur Anpassung an die Änderungen des Montrealer...   
 3   Unternehmen müssen die Emissionen von geregelt...   
 4   Francesco Calzolari stammte aus einer alteinge...   
 5   Francescos kindliche Neugierde beschränkte sic...   
 6   Die Apotheke seines Vaters wurde auch von Gele...   
 7   Calzolari trat in die Fußstapfen seines Vaters...   
 8   Auch wenn ihn seine Arbeit als Apotheker davon...   
 9   Um den Monte Baldo noch näher zu sein, ließ si...   
 10  Es gilt als erstes und ältestes Naturkundemuse...   
 11  Diese kostete nur 20 Pfennig (ab 1871) und war...   
 12  Im Mittelalter lag südöstlich des heutigen Hof...   
 13  Nach der Auflösung der Garnison Weingarten nac...   
 14  In den Wäldern um Nessenreben findet man häufi...   
 15  In den Wäldern leben Rehe, Wildschweine und Fü...   
 16 

### 4th Approach: DisSim by Niklaus et al.

In [None]:
# (first: intall DisSim: https://github.com/Lambda-3/DiscourseSimplification)

# open Terminal and run following commands in the DisSim folder:


# (installation; first time only)
# mvn clean install -DskipTests

# Create a text file input.txt (in the main DisSi folder) with the complex sentences to simplify
# hint: use the files in the "TS_MUSS_Sentences" folder, as the structure is identical

# run the program
# mvn clean compile exec:java

# copy the file JSON output file from the DisSim folder to the TS_model_outputs and name accordinlgy
# (e.g. wikiEN_DisSim_simplified.json)

In [None]:
import re
import pandas as pd
import json

def extract_simplified_sentences(json_data):
    sentences = []
    if 'sentences' in json_data:
        for sentence in json_data['sentences']:
            sentences.extend(extract_from_sentence(sentence))
    elif isinstance(json_data, list):
        for sentence in json_data:
            sentences.extend(extract_from_sentence(sentence))
    elif isinstance(json_data, dict):
        sentences.extend(extract_from_sentence(json_data))
    return sentences

def extract_from_sentence(sentence):
    extracted = []
    simple_contexts = {}
    element_texts = []

    for element in sentence['elementMap'].values():
        # Check if the element is a simplified version or part of the original sentence broken down
        if element['contextLayer'] > 0 or (element['contextLayer'] == 0 and element['text'] != sentence.get('originalSentence', '')):
            element_text = replace_brackets(element['text'].strip())
            element_texts.append(element_text)

        # Track simpleContexts
        if 'simpleContexts' in element:
            for context in element['simpleContexts']:
                if 'text' in context:
                    context_text = replace_brackets(context['text'].strip())
                    simple_contexts[context_text] = len(element_texts) - 1

    # Insert simpleContexts after their last mention
    for context_text, last_index in simple_contexts.items():
        element_texts.insert(last_index + 1, context_text)

    return element_texts

def replace_brackets(text):
    text = re.sub(r'-LRB- ', '(', text)
    text = re.sub(r' -RRB-', ')', text)
    text = re.sub(r'-LSB- ', '[', text)
    text = re.sub(r' -RSB-', ']', text)
    return text

def format_sentences(sentences):
    if not sentences:
        return "No simplified sentences found."
    formatted_sentences = ' '.join(sentences)
    formatted_sentences = re.sub(r'\s+([.,!?])', r'\1', formatted_sentences)
    return formatted_sentences

def process_json(json_data):
    results = []
    if 'sentences' in json_data:
        sentences = json_data['sentences']
    elif isinstance(json_data, list):
        sentences = json_data
    else:
        sentences = [json_data]
    
    for sentence in sentences:
        original_sentence = replace_brackets(sentence.get('originalSentence', 'No original sentence found.'))
        simplified = extract_from_sentence(sentence)
        formatted_simplified = format_sentences(simplified)
        results.append({
            'original_sentence': original_sentence,
            'simplified_sentences': formatted_simplified
        })
    return pd.DataFrame(results)

def run_DisSim_on_dataset(dataset_path):
    with open(dataset_path, "r") as file:
        data = json.load(file)
    return process_json(data)

In [None]:
lst_DisSim_files = [
    "../data/wikiEN_DisSim_simplified.json", 
    "../data/scotus_DisSim_simplified.json", 
    "../data/cochrane_DisSim_simplified.json"]

# turn into dataframe format and save as CSV
for file in lst_DisSim_files:
    df = run_DisSim_on_dataset(file)
    df.to_csv(file.replace(".json", ".csv"), index=False)

NameError: name 'run_DisSim_on_dataset' is not defined

---

## Convert CSVs to SALSA Annotation Tool Format

In [None]:
data_f_wikiEN_DisSim = pd.read_csv("../data/TS_model_outputs/wikiEN_DisSim_simplified.csv")
data_f_wikiEN_DisSim.head(10)

Unnamed: 0,original_sentence,simplified_sentences
0,"The N1 road also designated as RN1, is a road ...","The N1 road also designated as RN1, is a road ..."
1,"Originating in the capital city of Bangui, the...",The road is originating in the capital city of...
2,As a key component of the country's road netwo...,The N1 facilitates connectivity between the ca...
3,"The N1 route commences in the heart of Bangui,...","The N1 route commences in the heart of Bangui,..."
4,The terrain along this stretch is characterize...,The terrain along this stretch is characterize...
5,She married Marcus Claudius Marcellus Aesernin...,She married Marcus Claudius Marcellus Aesernin...
6,Asinia was a noblewoman of ancient Roman who w...,Asinia was a noblewoman of ancient Roman. A no...
7,"Influenced by his father Jinghui, whom was als...","Jinghui, Saišangga was proficient in Manchu, M..."
8,"In 1853, Saišangga was stripped of official po...",Saišangga was stripped of official position. S...
9,2-Chloronicotinic acid (2-CNA) is a halogenate...,2-Chloronicotinic acid (2-CNA) is a halogenate...


In [None]:
data_f_SCOTUS_DisSim = pd.read_csv("../data/TS_model_outputs/scotus_DisSim_simplified.csv")
data_f_SCOTUS_DisSim.head(10)

Unnamed: 0,original_sentence,simplified_sentences
0,To determine whether Corner Post’s APA claim i...,"The Court must interpret § 2401 (a), which pro..."
1,While the Board argues that §2401(a) should no...,§ 2401 (a) should not be interpreted to adopt ...
2,"Importantly, contemporaneous dictionaries also...",A cause of action accrues `` on [the] date tha...
3,The Board leaves open the possibility that som...,The Board leaves open the possibility. Someone...
4,The APA provisions grant Corner Post a cause o...,Corner Post a cause of action subject to certa...
5,Statutes of limitations ‘require plaintiffs to...,Statutes of limitations ` require plaintiffs t...
6,The Board’s interpretation would thereby decou...,The Board 's interpretation would thereby deco...
7,The Board’s arguments to the contrary lack merit.,The Board 's arguments to the contrary lack me...
8,Departing from the traditional rule is particu...,Departing from the traditional rule is particu...
9,While §702 equips injured parties with a cause...,§ 702 equips injured parties. This is with a c...


In [None]:
data_f_Cochrane_DisSim = pd.read_csv("../data/TS_model_outputs/cochrane_DisSim_simplified.csv")
data_f_Cochrane_DisSim.head(10)

Unnamed: 0,original_sentence,simplified_sentences
0,TEP technique may carry a higher risk of conve...,TEP technique may carry a higher risk of conve...
1,An inguinal hernia occurs when part of the int...,An inguinal hernia occurs. Part of the intesti...
2,There may be little to no difference between T...,There may be little to no difference between T...
3,TEP technique may carry a higher risk of conve...,TEP technique may carry a higher risk of conve...
4,CBT may result in a small‐to‐moderate reductio...,CBT may result in a small‐to‐moderate reductio...
5,"However, compared to this benefit with CBT imm...",There was little to no difference between CBT ...
6,CBT may result in a reduction of comorbid depr...,CBT may result in a reduction of comorbid depr...
7,There is not enough evidence to determine whet...,There is not enough evidence to determine whet...
8,To assess the effects of Cognitive Behavioural...,To assess the effects of Cognitive Behavioural...
9,Percutaneous and surgical interventions combin...,Percutaneous can be used to treat uncomplicate...


In [30]:
# read in model generated output files

data_f_wikiEN_ChatGPT = pd.read_csv("../data/TS_model_outputs/TS_final_sentences/wikiEN_OpenAI_simplified.csv")
data_f_wikiEN_LLAMA = pd.read_csv("../data/TS_model_outputs/TS_final_sentences/wikiEN_LLAMA31_simplified.csv")
data_f_wikiEN_MUSS = pd.read_csv("../data/TS_model_outputs/TS_final_sentences/wikiEN_MUSS_simplified.csv") # TODO: change to TS_final_sentences folder
data_f_wikiEN_MUSS.columns = ['complex_sentence', 'simplified_sentence']
data_f_wikiEN_DisSim = pd.read_csv("../data/TS_model_outputs/TS_final_sentences/wikiEN_DisSim_simplified.csv")
data_f_wikiEN_DisSim.columns = ['complex_sentence', 'simplified_sentence']

data_f_Cochrane_ChatGPT = pd.read_csv("../data/TS_model_outputs/TS_final_sentences/cochrane_OpenAI_simplified.csv")
data_f_Cochrane_LLAMA = pd.read_csv("../data/TS_model_outputs/TS_final_sentences/cochrane_LLAMA31_simplified.csv")
data_f_Cochrane_MUSS = pd.read_csv("../data/TS_model_outputs/TS_final_sentences/cochrane_MUSS_simplified.csv")
data_f_Cochrane_MUSS.columns = ['complex_sentence', 'simplified_sentence']
data_f_Cochrane_DisSim = pd.read_csv("../data/TS_model_outputs/TS_final_sentences/cochrane_DisSim_simplified.csv")
data_f_Cochrane_DisSim.columns = ['complex_sentence', 'simplified_sentence']

data_f_SCOTUS_ChatGPT = pd.read_csv("../data/TS_model_outputs/TS_final_sentences/scotus_OpenAI_simplified.csv")
data_f_SCOTUS_LLAMA = pd.read_csv("../data/TS_model_outputs/TS_final_sentences/scotus_LLAMA31_simplified.csv")
data_f_SCOTUS_MUSS = pd.read_csv("../data/TS_model_outputs/TS_final_sentences/scotus_MUSS_simplified.csv")
data_f_SCOTUS_MUSS.columns = ['complex_sentence', 'simplified_sentence']
data_f_SCOTUS_DisSim = pd.read_csv("../data/TS_model_outputs/TS_final_sentences/scotus_DisSim_simplified.csv")
data_f_SCOTUS_DisSim.columns = ['complex_sentence', 'simplified_sentence']

data_f_wikiDE_ChatGPT = pd.read_csv("../data/TS_model_outputs/TS_final_sentences/wikiDE_OpenAI_simplified.csv")
data_f_wikiDE_LLAMA = pd.read_csv("../data/TS_model_outputs/TS_final_sentences/wikiDE_LLAMA31_simplified.csv")
data_f_wikiDE_MBART = pd.read_csv("../data/TS_model_outputs/TS_final_sentences/wikiDE_mBART_DE_simplified.csv")
# No DisSim for German



In [31]:
# rename data_f_SCOTUS_DisSim.columns to complex_sentence and simplified_sentence
#data_f_SCOTUS_DisSim.columns = ['complex_sentence', 'simplified_sentence']

data_f_SCOTUS_DisSim.columns

Index(['complex_sentence', 'simplified_sentence'], dtype='object')

In [32]:
def convert_and_save_to_json(df, output_directory, df_name):
    result = []

    # Extract the model name from the DataFrame name
    model_name = df_name.split('_')[-1] if '_' in df_name else 'unknown_model'
    
    for _, row in df.iterrows():
        entry = {
            "source": row["complex_sentence"],
            "target": row["simplified_sentence"],
            "metadata": {
                "annotator": "annotator_0",
                "system": model_name
            },
            "edits": []
        }
        result.append(entry)

    # for all target sentences, replace ". " with ". || "
    for entry in result:
        entry['target'] = entry['target'].replace(". ", ". || ")
    
    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)
    
    # Generate filename based on the DataFrame name
    filename = f"{df_name}.json"
    
    # Full path for the output file
    output_path = os.path.join(output_directory, filename)
    
    # Save the JSON to the file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    
    #print(f"JSON file saved to: {output_path}")
    print(f"Exported {var_name} to {filename}")
    return json.dumps(result, indent=2)

In [33]:
output_directory = "../data/salsa_annotations/raw_unannotated"

# for ds in datasets_complete:
#     convert_and_save_to_json(ds, output_directory, df_name)

# Get all global variables
global_vars = sys.modules['__main__'].__dict__.copy()

# Iterate through global variables
for var_name, var_value in global_vars.items():
    # Check if the variable is a pandas DataFrame and starts with 'data_'
    if isinstance(var_value, pd.DataFrame) and var_name.startswith('data_f'):
        print(".. trying to convert: ", var_name)
        convert_and_save_to_json(var_value, output_directory, var_name)
        

.. trying to convert:  data_f_wikiEN_ChatGPT
Exported data_f_wikiEN_ChatGPT to data_f_wikiEN_ChatGPT.json
.. trying to convert:  data_f_wikiEN_LLAMA
Exported data_f_wikiEN_LLAMA to data_f_wikiEN_LLAMA.json
.. trying to convert:  data_f_wikiEN_MUSS
Exported data_f_wikiEN_MUSS to data_f_wikiEN_MUSS.json
.. trying to convert:  data_f_wikiEN_DisSim
Exported data_f_wikiEN_DisSim to data_f_wikiEN_DisSim.json
.. trying to convert:  data_f_Cochrane_ChatGPT
Exported data_f_Cochrane_ChatGPT to data_f_Cochrane_ChatGPT.json
.. trying to convert:  data_f_Cochrane_LLAMA
Exported data_f_Cochrane_LLAMA to data_f_Cochrane_LLAMA.json
.. trying to convert:  data_f_Cochrane_MUSS
Exported data_f_Cochrane_MUSS to data_f_Cochrane_MUSS.json
.. trying to convert:  data_f_Cochrane_DisSim
Exported data_f_Cochrane_DisSim to data_f_Cochrane_DisSim.json
.. trying to convert:  data_f_SCOTUS_ChatGPT
Exported data_f_SCOTUS_ChatGPT to data_f_SCOTUS_ChatGPT.json
.. trying to convert:  data_f_SCOTUS_LLAMA
Exported data_f