In [1]:
import pandas as pd
from openai import OpenAI
import time

In [2]:
# Do not hardcode the API key in the notebook (it ends up in version control
# and shared copies). With no api_key argument the OpenAI client reads the key
# from the OPENAI_API_KEY environment variable and raises if it is not set.
client = OpenAI()

In [3]:
# Read the 'sentence_pairs.xlsx' workbook into the working DataFrame.
df = pd.read_excel(io='sentence_pairs.xlsx')

The data contains 21 examples. The last one is reserved as the example for the prompt we will build, so that the model has a reference for what "simplified" means in this context; the dataset therefore takes only the first 20 rows.

In [4]:
# Keep only the first 20 rows and print how many rows remain.
df = df.head(20)
print(df.shape[0])

20


The example below will be passed to the model.

In [5]:
def simplify_text(complex_text, example_complex, example_simple, model="gpt-4"):
    """Ask the chat model for a simplified version of ``complex_text``.

    One worked example (``example_complex`` -> ``example_simple``) is placed in
    the prompt so the model can imitate its style. The request uses
    ``temperature=0``.

    Args:
        complex_text: The sentence to simplify.
        example_complex: Complex side of the reference example.
        example_simple: Simplified side of the reference example.
        model: Name of the chat model to call; defaults to "gpt-4".

    Returns:
        The model's reply with surrounding whitespace stripped, or None when
        the input is missing/blank or the request fails (the error is printed
        rather than raised so a batch loop can continue).
    """
    # A missing cell arrives as None/NaN; formatting it into the prompt would
    # ask the model to simplify the literal text "nan"/"None", so skip the call.
    if not isinstance(complex_text, str) or not complex_text.strip():
        print("Error: no text to simplify")
        return None

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an expert at simplifying complex sentences while maintaining their core meaning. Simplify the given text in the style of the example provided."},
                {"role": "user", "content": f"""Example simplification:
Complex: {example_complex}
Simple: {example_simple}

Please simplify this text in a similar way:
Complex: {complex_text}
Simple:"""}
            ],
            temperature=0
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller go on.
        print(f"Error: {e}")
        return None

# Example text for reference
example_complex = "Coal is likely to rival oil as the world's biggest source of energy in the next five years, with potentially disastrous consequences for the climate, according to the world's leading authority on energy economics."
example_simple = "Coal will probably rival oil as the world's biggest source of energy in the next five years. This might be a disaster for the climate."

# Rename the original 'simple' column to 'original_simple' before generating new simplifications
df = df.rename(columns={'simple': 'original_simple'})

# Generate simplifications
df['generated_simple'] = None

# Report progress by position against the real frame length, instead of
# "idx + 1" over a hard-coded 20, so the counter stays correct if the number
# of rows or the index labels change.
total_rows = len(df)
for position, idx in enumerate(df.index, start=1):
    complex_text = df.loc[idx, 'complex']
    simplified = simplify_text(complex_text, example_complex, example_simple)
    df.loc[idx, 'generated_simple'] = simplified
    time.sleep(1)  # API token rate limiter
    print(f"Processed row {position}/{total_rows}")

# Save intermediate results
df.to_csv('results_with_simplifications.csv', index=False)


Processed row 1/20
Processed row 2/20
Processed row 3/20
Processed row 4/20
Processed row 5/20
Processed row 6/20
Processed row 7/20
Processed row 8/20
Processed row 9/20
Processed row 10/20
Processed row 11/20
Processed row 12/20
Processed row 13/20
Processed row 14/20
Processed row 15/20
Processed row 16/20
Processed row 17/20
Processed row 18/20
Processed row 19/20
Processed row 20/20


#### Sentence alterations
Alterations to the simplified sentence will be generated in a similar manner. For this, we first need to change the example so that it gives the OpenAI model an idea of what we intend.

##### Actual simplification:
Coal will probably rival oil as the world's biggest source of energy in the next five years. This might be a disaster for the climate. 

##### Fact Reversal:
"Coal will be much less important than oil as the world's biggest source of energy in the next five years. This might be great for the climate."


##### Critical Information Omission:
"Coal will probably rival oil as the world's biggest source of energy in the next five years."
(Omits the crucial climate impact information)

##### Addition of Unsupported Information:
"Coal will probably rival oil as the world's biggest source of energy in the next five years. This might be a disaster for the climate, but new carbon capture technology will completely solve this problem."

##### Subject/Object Reversal:
"Oil will probably be rivaled by coal as the world's biggest source of energy in the next five years. The climate might be a disaster for coal production."

##### Partial Meaning Preservation with Altered Conclusions:
"Coal will probably rival oil as the world's biggest source of energy in the next five years. This will create many new jobs in the mining sector."
(Keeps the energy fact but changes the conclusion from climate impact to economic impact)

In [6]:
# Label expected in the model's reply -> key used in the result dict, listed
# in the order the prompt asks for them.
_ALTERATION_LABELS = {
    "1. Fact Reversal:": 'fact_reversal',
    "2. Critical Information Omission:": 'info_omission',
    "3. Addition of Unsupported Information:": 'unsupported_info',
    "4. Subject/Object Reversal:": 'subject_object_reversal',
    "5. Partial Meaning Preservation with Altered Conclusions:": 'partial_meaning',
}


def _parse_alterations(raw_text):
    """Split a numbered model reply into a dict keyed by alteration type."""
    result = dict.fromkeys(_ALTERATION_LABELS.values())

    current_key = None
    for line in raw_text.strip().split('\n'):
        matched_label = next(
            (label for label in _ALTERATION_LABELS if label in line), None
        )
        if matched_label is not None:
            current_key = _ALTERATION_LABELS[matched_label]
            result[current_key] = line.replace(matched_label, "").strip()
        elif not line.strip():
            # A blank line ends the current item, so text after it is ignored.
            current_key = None
        elif current_key is not None:
            # The model wrapped an alteration onto a following line; keep it
            # instead of silently dropping the continuation.
            result[current_key] = f"{result[current_key]} {line.strip()}".strip()

    return result


def generate_alterations(simplified_text, model="gpt-4"):
    """Ask the chat model for five labelled alterations of ``simplified_text``.

    The prompt contains one worked example of the five alteration types and
    asks the model to keep the same numbering and labels; the reply is then
    parsed by those labels.

    Args:
        simplified_text: The simplified sentence(s) to alter.
        model: Name of the chat model to call; defaults to "gpt-4".

    Returns:
        A dict with the keys 'fact_reversal', 'info_omission',
        'unsupported_info', 'subject_object_reversal' and 'partial_meaning'
        (a value stays None when its label is absent from the reply), or None
        when the input is missing/blank or the request fails (the error is
        printed rather than raised).
    """
    example = """Original: Coal will probably rival oil as the world's biggest source of energy in the next five years. This might be a disaster for the climate.

1. Fact Reversal: Coal will be much less important than oil as the world's biggest source of energy in the next five years. This might be great for the climate.

2. Critical Information Omission: Coal will probably rival oil as the world's biggest source of energy in the next five years.

3. Addition of Unsupported Information: Coal will probably rival oil as the world's biggest source of energy in the next five years. This might be a disaster for the climate, but new carbon capture technology will completely solve this problem.

4. Subject/Object Reversal: Oil will probably be rivaled by coal as the world's biggest source of energy in the next five years. The climate might be a disaster for coal production.

5. Partial Meaning Preservation with Altered Conclusions: Coal will probably rival oil as the world's biggest source of energy in the next five years. This will create many new jobs in the mining sector."""

    # A failed simplification is stored as None; formatting it into the prompt
    # would request alterations of the literal text "None", so skip the call.
    if not isinstance(simplified_text, str) or not simplified_text.strip():
        print("Error: no simplified text to alter")
        return None

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "Generate five alterations of the simplified text following the exact pattern shown in the example. Keep the same numbering and labels."},
                {"role": "user", "content": f"""Here's an example of the five alterations:

{example}

Now generate the same five types of alterations for this text:
{simplified_text}"""}
            ],
            temperature=0
        )

        return _parse_alterations(response.choices[0].message.content)

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller go on.
        print(f"Error: {e}")
        return None

# Create one empty column per alteration type
alteration_columns = ['fact_reversal', 'info_omission', 'unsupported_info',
                      'subject_object_reversal', 'partial_meaning']
for column_name in alteration_columns:
    df[column_name] = None

# Fill the alteration columns row by row
for idx in df.index:
    altered = generate_alterations(df.loc[idx, 'generated_simple'])

    # generate_alterations returns None when the request failed; such rows
    # keep their empty placeholders.
    if altered:
        for column_name, text in altered.items():
            df.loc[idx, column_name] = text

    time.sleep(1)  # Similar delay for token rate limiting
    print(f"Processed row {idx + 1}/{len(df)}")

# Keep only the columns of interest, sources first and alterations after
column_order = ['complex', 'original_simple', 'generated_simple'] + alteration_columns
df = df[column_order]

df.to_csv('results_with_alterations.csv', index=False)

Processed row 1/20
Processed row 2/20
Processed row 3/20
Processed row 4/20
Processed row 5/20
Processed row 6/20
Processed row 7/20
Processed row 8/20
Processed row 9/20
Processed row 10/20
Processed row 11/20
Processed row 12/20
Processed row 13/20
Processed row 14/20
Processed row 15/20
Processed row 16/20
Processed row 17/20
Processed row 18/20
Processed row 19/20
Processed row 20/20


We'll also save the dataframe to an Excel file for readability.

In [7]:
# Write the final table to an Excel workbook, without the index column.
df.to_excel(excel_writer='results_with_alterations.xlsx', index=False)