# Task
Simplify PDF documents from "https://api.github.com/repos/feliperussi/bridging-the-gap-in-health-literacy/contents/llms_testing/Cochrane/ground_truth" using the Gemini 2.5 Pro API and evaluate the results using BERTScore for relevance, AlignScore for factuality, and for readability: CLI FRE, GFI, SMOG, FKGL, DCRS.

## Setup and authentication

Install necessary libraries and set up authentication for Gemini API.


In [1]:
%pip install google-generativeai evaluate[bertscore] transformers sentence-transformers matplotlib pandas numpy textstat

Collecting textstat
  Downloading textstat-0.7.11-py3-none-any.whl.metadata (15 kB)
Collecting evaluate[bertscore]
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.11-py3-none-any.whl (176 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.4/176.4 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m67.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat, evaluate
Successfully installed evaluate-0.4.6 pyphen-0.17.2 textstat-0.7.11


In [2]:
import os
from google.colab import userdata

os.environ['GOOGLE_API_KEY'] = userdata.get('GOOGLE_API_KEY')

In [3]:
import torch
print(torch.__version__)

2.8.0+cu126


## Data Ingestion - Cochrane
Load text files from the GitHub repository "https://api.github.com/repos/feliperussi/bridging-the-gap-in-health-literacy/".

In [4]:
%pip install requests



Load abstracts



In [5]:
import csv

file_names = []
with open('/content/abstract_names.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        file_names.append(row[0]) # Assuming each row contains one filename

print(f"Loaded {len(file_names)} filenames from abstract_names.csv")

Loaded 300 filenames from abstract_names.csv


In [6]:
import requests
import os

github_base_urls = [
    "https://raw.githubusercontent.com/feliperussi/bridging-the-gap-in-health-literacy/main/llms_testing/Cochrane/ground_truth/",
    "https://raw.githubusercontent.com/feliperussi/bridging-the-gap-in-health-literacy/main/data_collection_and_processing/Data%20Sources/Cochrane/test/non_pls/",
    "https://raw.githubusercontent.com/feliperussi/bridging-the-gap-in-health-literacy/main/data_collection_and_processing/Data%20Sources/Cochrane/train/non_pls/"
]
local_download_dir = 'downloaded_texts'

# Create the local directory if it doesn't exist
os.makedirs(local_download_dir, exist_ok=True)

downloaded_count = 0

for file_name in file_names:
    local_file_path = os.path.join(local_download_dir, file_name)
    downloaded = False

    if os.path.exists(local_file_path):
        print(f"File {file_name} already exists locally. Skipping download.")
        downloaded_count += 1
        downloaded = True
        continue # Skip to the next file_name if already downloaded


    for base_url in github_base_urls:
        file_url = base_url + file_name

        try:
            print(f"Attempting to download {file_name} from {base_url}")
            file_content_response = requests.get(file_url)
            file_content_response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

            with open(local_file_path, 'w', encoding='utf-8') as f:
                f.write(file_content_response.text)
            downloaded_count += 1
            downloaded = True
            print(f"Successfully downloaded {file_name}")
            break  # Move to the next file_name if download is successful

        except requests.exceptions.RequestException as e:
            print(f"Could not download {file_name} from {base_url}: {e}")
        except Exception as e:
            print(f"An unexpected error occurred while processing {file_name} from {base_url}: {e}")

    if not downloaded:
        print(f"Warning: Could not download {file_name} from any of the provided URLs.")


print(f"Download complete. Successfully downloaded {downloaded_count} out of {len(file_names)} files.")

Attempting to download 10.1002-14651858.CD000371.pub7-abstract.txt from https://raw.githubusercontent.com/feliperussi/bridging-the-gap-in-health-literacy/main/llms_testing/Cochrane/ground_truth/
Successfully downloaded 10.1002-14651858.CD000371.pub7-abstract.txt
Attempting to download 10.1002-14651858.CD001218.pub3-abstract.txt from https://raw.githubusercontent.com/feliperussi/bridging-the-gap-in-health-literacy/main/llms_testing/Cochrane/ground_truth/
Successfully downloaded 10.1002-14651858.CD001218.pub3-abstract.txt
Attempting to download 10.1002-14651858.CD001977.pub2-abstract.txt from https://raw.githubusercontent.com/feliperussi/bridging-the-gap-in-health-literacy/main/llms_testing/Cochrane/ground_truth/
Successfully downloaded 10.1002-14651858.CD001977.pub2-abstract.txt
Attempting to download 10.1002-14651858.CD002201.pub6-abstract.txt from https://raw.githubusercontent.com/feliperussi/bridging-the-gap-in-health-literacy/main/llms_testing/Cochrane/ground_truth/
Successfully dow

### Load abstract text

Load the content of the downloaded text files into a dictionary.

In [7]:
import os

downloaded_texts_content = {}

for filename in os.listdir(local_download_dir):
    if filename.endswith('.txt'):
        filepath = os.path.join(local_download_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                downloaded_texts_content[filename] = f.read()
            print(f"Loaded text from {filename}")
        except Exception as e:
            print(f"Error loading text from {filename}: {e}")


Loaded text from 10.1002-14651858.CD013170.pub2-abstract.txt
Loaded text from 10.1002-14651858.CD004957.pub3-abstract.txt
Loaded text from 10.1002-14651858.CD013717.pub2-abstract.txt
Loaded text from 10.1002-14651858.CD013384.pub2-abstract.txt
Loaded text from 10.1002-14651858.CD013674.pub2-abstract.txt
Loaded text from 10.1002-14651858.CD006251.pub4-abstract.txt
Loaded text from 10.1002-14651858.CD013285.pub2-abstract.txt
Loaded text from 10.1002-14651858.CD015397-abstract.txt
Loaded text from 10.1002-14651858.CD003552.pub4-abstract.txt
Loaded text from 10.1002-14651858.CD013699-abstract.txt
Loaded text from 10.1002-14651858.CD013515.pub2-abstract.txt
Loaded text from 10.1002-14651858.CD013251.pub2-abstract.txt
Loaded text from 10.1002-14651858.CD013190-abstract.txt
Loaded text from 10.1002-14651858.CD012979.pub3-abstract.txt
Loaded text from 10.1002-14651858.CD013664.pub2-abstract.txt
Loaded text from 10.1002-14651858.CD013196.pub2-abstract.txt
Loaded text from 10.1002-14651858.CD013

In [8]:
print("Downloaded filenames:")
for filename in downloaded_texts_content.keys():
    print(filename)

Downloaded filenames:
10.1002-14651858.CD013170.pub2-abstract.txt
10.1002-14651858.CD004957.pub3-abstract.txt
10.1002-14651858.CD013717.pub2-abstract.txt
10.1002-14651858.CD013384.pub2-abstract.txt
10.1002-14651858.CD013674.pub2-abstract.txt
10.1002-14651858.CD006251.pub4-abstract.txt
10.1002-14651858.CD013285.pub2-abstract.txt
10.1002-14651858.CD015397-abstract.txt
10.1002-14651858.CD003552.pub4-abstract.txt
10.1002-14651858.CD013699-abstract.txt
10.1002-14651858.CD013515.pub2-abstract.txt
10.1002-14651858.CD013251.pub2-abstract.txt
10.1002-14651858.CD013190-abstract.txt
10.1002-14651858.CD012979.pub3-abstract.txt
10.1002-14651858.CD013664.pub2-abstract.txt
10.1002-14651858.CD013196.pub2-abstract.txt
10.1002-14651858.CD013501.pub2-abstract.txt
10.1002-14651858.CD013172.pub2-abstract.txt
10.1002-14651858.CD013679-abstract.txt
10.1002-14651858.CD007263.pub3-abstract.txt
10.1002-14651858.CD013829.pub2-abstract.txt
10.1002-14651858.CD013512.pub2-abstract.txt
10.1002-14651858.CD013487-abst

In [9]:
print(len(downloaded_texts_content))

300


In [10]:
import pandas as pd

# Create a DataFrame from the dictionary
# Assuming the dictionary keys are filenames and values are text content
df_downloaded = pd.DataFrame.from_dict(downloaded_texts_content, orient='index', columns=['text_content'])

# Save the DataFrame to a CSV file
csv_filepath = 'downloaded_texts_content.csv'
df_downloaded.to_csv(csv_filepath, index_label='filename')

print(f"Downloaded texts content saved to {csv_filepath}")

Downloaded texts content saved to downloaded_texts_content.csv


## Text Simplification

### Subtask:
Use the Gemini API with the provided prompt to simplify the extracted text.

In [24]:
import time
from google import genai
from google.genai import types
from google.api_core import exceptions as core_exceptions

simplified_texts = {}
prompt = """Using the following abstract of a biomedical study as input, generate a Plain Language Summary
(PLS) understandable by any patient, regardless of their health literacy. Ensure that the generated text
adheres to the following instructions which should be followed step-by-step:
a. Specific Structure: The generated PLS should be presented in a logical order, using the following
order:
1. Plain Title
2. Rationale
3. Trial Design
4. Results
b. Sections should be authored following these parameters:
1. Plain Title: Simplified title understandable to a layperson that summarizes the research that was
done.
2. Rationale: Include: background or study rationale providing a general description of the
condition, what it may cause or why it is a burden for the patients; the reason and main hypothesis
for the study; and why the study is needed, and why the study medication has the potential to
treat the condition.
3. Trial Design: Answer ‘How is this study designed?’ Include the description of the design,
description of study and patient population (age, health condition, gender), and the expected
amount of time a person will be in the study.
4. Results: Answer ‘What were the main results of the study’, include the benefits for the patients,
how the study was relevant for the area of study, and the conclusions from the investigator.
c. Consistency and Replicability: The generated PLS should be consistent regardless of the order of
sentences or the specific phrasing used in the input protocol text.
d. Compliance with Plain Language Guidelines: The generated PLS must follow all these plain
language guidelines:
• Have readability grade level of 6 or below.
• Do not have jargon. All technical or medical words or terms should be defined or broken down
into simple and logical explanations.
• Active voice, not passive.
• Mostly one or two syllable words.
• Sentences of 15 words or less.
• Short paragraphs of 3-5 sentences.
• Simple numbers (e.g., ratios, no percentages).
e. Do not invent Content: The AI model should not invent information. If the AI model includes data
other than the one given in the input abstract, the AI model should guarantee such data is verified and
real.
f. Aim for an approximate PLS length of 500-900 words.


Abstract of a biomedical study text: {text}
"""

client = genai.Client(api_key=os.environ['GOOGLE_API_KEY'])
counter = 0
sum_times = 0

for filename, text in downloaded_texts_content.items():
    counter += 1
    retries = 3  # Set a number of retries


    while retries > 0:
        init_time = time.time()

        try:
            response = client.models.generate_content(
                model="gemini-2.5-pro",
                contents=prompt.format(text=text),
                config=types.GenerateContentConfig()
            )
            simplified_texts[filename] = response.text
            response_time = time.time() - init_time
            sum_times += response_time
            print(f"Simplified text # {counter} for {filename}. Duration: {response_time}")
            break  # Exit the retry loop on success
        except core_exceptions.ServiceUnavailable as e:
            print(f"ServiecUnavailable error for {filename}: {e}. Retrying in 60 seconds...")
            retries -= 1
            time.sleep(60)
            if retries == 0:
                print(f"Failed to simplify {filename} after multiple retries.")
        except Exception as e:
            print(f"An unexpected error occurred while simplifying {filename}: {e}")
            break # Exit the retry loop for other errors

print(f"Average time: {sum_times/counter}")


Simplified text # 1 for 10.1002-14651858.CD013170.pub2-abstract.txt. Duration: 28.35413360595703
Simplified text # 2 for 10.1002-14651858.CD004957.pub3-abstract.txt. Duration: 30.7657253742218
Simplified text # 3 for 10.1002-14651858.CD013717.pub2-abstract.txt. Duration: 38.24468636512756
Simplified text # 4 for 10.1002-14651858.CD013384.pub2-abstract.txt. Duration: 31.463656187057495
Simplified text # 5 for 10.1002-14651858.CD013674.pub2-abstract.txt. Duration: 25.135801076889038
Simplified text # 6 for 10.1002-14651858.CD006251.pub4-abstract.txt. Duration: 26.888251304626465
Simplified text # 7 for 10.1002-14651858.CD013285.pub2-abstract.txt. Duration: 27.35495138168335
Simplified text # 8 for 10.1002-14651858.CD015397-abstract.txt. Duration: 25.36493992805481
Simplified text # 9 for 10.1002-14651858.CD003552.pub4-abstract.txt. Duration: 27.485113859176636
Simplified text # 10 for 10.1002-14651858.CD013699-abstract.txt. Duration: 25.361170291900635
Simplified text # 11 for 10.1002-14

In [25]:
import pandas as pd

# Create a DataFrame from the extracted and simplified texts
data = {'original_text': downloaded_texts_content, 'simplified_text': simplified_texts}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_filepath = 'simplified_texts_g25pro.csv'
df.to_csv(csv_filepath, index_label='filename')

print(f"Results saved to {csv_filepath}")

Results saved to simplified_texts_g25pro.csv


# Task
Load original pls and compare to simplified texts from "/content/simplified_texts.csv" and provide a Relevance score using BERTScore, Factual consistence score using AlignScore and Readability score using Flesch–Kincaid Grade Level for each row.

## Load pls text

Load pls

In [31]:
import csv

pls_file_names = []
with open('/content/pls_names.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        pls_file_names.append(row[0]) # Assuming each row contains one filename

print(f"Loaded {len(pls_file_names)} filenames from pls_names.csv")

Loaded 300 filenames from pls_names.csv


In [32]:
import requests
import os

github_base_urls = [
    "https://raw.githubusercontent.com/feliperussi/bridging-the-gap-in-health-literacy/main/llms_testing/Cochrane/ground_truth/",
    "https://raw.githubusercontent.com/feliperussi/bridging-the-gap-in-health-literacy/main/data_collection_and_processing/Data%20Sources/Cochrane/test/pls/",
    "https://raw.githubusercontent.com/feliperussi/bridging-the-gap-in-health-literacy/main/data_collection_and_processing/Data%20Sources/Cochrane/train/pls/"
]
local_download_dir = 'pls_downloaded_texts'

# Create the local directory if it doesn't exist
os.makedirs(local_download_dir, exist_ok=True)

downloaded_count = 0

for file_name in pls_file_names:
    local_file_path = os.path.join(local_download_dir, file_name)
    downloaded = False

    if os.path.exists(local_file_path):
        print(f"File {file_name} already exists locally. Skipping download.")
        downloaded_count += 1
        downloaded = True
        continue # Skip to the next file_name if already downloaded


    for base_url in github_base_urls:
        file_url = base_url + file_name

        try:
            print(f"Attempting to download {file_name} from {base_url}")
            file_content_response = requests.get(file_url)
            file_content_response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

            with open(local_file_path, 'w', encoding='utf-8') as f:
                f.write(file_content_response.text)
            downloaded_count += 1
            downloaded = True
            print(f"Successfully downloaded {file_name}")
            break  # Move to the next file_name if download is successful

        except requests.exceptions.RequestException as e:
            print(f"Could not download {file_name} from {base_url}: {e}")
        except Exception as e:
            print(f"An unexpected error occurred while processing {file_name} from {base_url}: {e}")

    if not downloaded:
        print(f"Warning: Could not download {file_name} from any of the provided URLs.")


print(f"Download complete. Successfully downloaded {downloaded_count} out of {len(pls_file_names)} files.")

File 10.1002-14651858.CD000371.pub7-pls.txt already exists locally. Skipping download.
File 10.1002-14651858.CD001218.pub3-pls.txt already exists locally. Skipping download.
File 10.1002-14651858.CD001977.pub2-pls.txt already exists locally. Skipping download.
File 10.1002-14651858.CD002201.pub6-pls.txt already exists locally. Skipping download.
File 10.1002-14651858.CD002779.pub3-pls.txt already exists locally. Skipping download.
File 10.1002-14651858.CD002948.pub2-pls.txt already exists locally. Skipping download.
File 10.1002-14651858.CD003147.pub5-pls.txt already exists locally. Skipping download.
File 10.1002-14651858.CD003315.pub3-pls.txt already exists locally. Skipping download.
File 10.1002-14651858.CD003459.pub4-pls.txt already exists locally. Skipping download.
File 10.1002-14651858.CD003552.pub4-pls.txt already exists locally. Skipping download.
File 10.1002-14651858.CD003737.pub4-pls.txt already exists locally. Skipping download.
File 10.1002-14651858.CD004019.pub4-pls.txt

In [33]:
import os

pls_downloaded_texts_content = {}

for filename in os.listdir(local_download_dir):
    if filename.endswith('.txt'):
        filepath = os.path.join(local_download_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                clean_name = filename.replace("-pls.txt", "")
                pls_downloaded_texts_content[clean_name] = f.read()
            print(f"Loaded text from {filename}")
        except Exception as e:
            print(f"Error loading text from {filename}: {e}")


Loaded text from 10.1002-14651858.CD013190-pls.txt
Loaded text from 10.1002-14651858.CD009134.pub2-pls.txt
Loaded text from 10.1002-14651858.CD013168.pub2-pls.txt
Loaded text from 10.1002-14651858.CD013491.pub2-pls.txt
Loaded text from 10.1002-14651858.CD013246.pub2-pls.txt
Loaded text from 10.1002-14651858.CD013270.pub2-pls.txt
Loaded text from 10.1002-14651858.CD013172.pub2-pls.txt
Loaded text from 10.1002-14651858.CD013320.pub2-pls.txt
Loaded text from 10.1002-14651858.CD013826.pub2-pls.txt
Loaded text from 10.1002-14651858.CD013836.pub2-pls.txt
Loaded text from 10.1002-14651858.CD013699-pls.txt
Loaded text from 10.1002-14651858.CD013845.pub2-pls.txt
Loaded text from 10.1002-14651858.CD013497.pub2-pls.txt
Loaded text from 10.1002-14651858.CD013438-pls.txt
Loaded text from 10.1002-14651858.CD013247.pub2-pls.txt
Loaded text from 10.1002-14651858.CD014257.pub2-pls.txt
Loaded text from 10.1002-14651858.CD013376.pub2-pls.txt
Loaded text from 10.1002-14651858.CD013305.pub2-pls.txt
Loaded 

In [34]:
print(pls_downloaded_texts_content)



In [35]:
import pandas as pd

# Load the simplified_texts.csv file
df_simplified = pd.read_csv('simplified_texts_g25pro.csv')

# Remove "-abstract.txt" from the 'filename' column
df_simplified['filename'] = df_simplified['filename'].str.replace('-abstract.txt', '', regex=False)

# Set 'filename' as the index for both dataframes for merging
df_simplified = df_simplified.set_index('filename')
df_pls = pd.DataFrame.from_dict(pls_downloaded_texts_content, orient='index', columns=['pls_text_content'])


# Merge the two dataframes
df_merged = df_simplified.join(df_pls)

# Display the head of the merged DataFrame
display(df_merged.head())

Unnamed: 0_level_0,original_text,simplified_text,pls_text_content
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10.1002-14651858.CD013170.pub2,Background\nPeople with neuromuscular disorder...,Of course. Here is the Plain Language Summary ...,The safety and effectiveness of techniques to ...
10.1002-14651858.CD004957.pub3,Multidisciplinary rehabilitation programmes fo...,Of course. Here is a Plain Language Summary ba...,Multidisciplinary rehabilitation programmes fo...
10.1002-14651858.CD013717.pub2,"Background\nIn late 2019, the first cases of c...",**Plain Title**\nDo Rules for Travelers Help S...,Can international travel‐related control measu...
10.1002-14651858.CD013384.pub2,Background\nChronic obstructive pulmonary dise...,Of course. Here is a Plain Language Summary ba...,Approaches to help people with COPD who have o...
10.1002-14651858.CD013674.pub2,Background\nMajor depressive disorders have a ...,**Plain Title**\nComparing Newer Medicines for...,Newer generation antidepressants for depressio...


In [36]:
display(df_merged.head())
display(df_merged.info())

Unnamed: 0_level_0,original_text,simplified_text,pls_text_content
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10.1002-14651858.CD013170.pub2,Background\nPeople with neuromuscular disorder...,Of course. Here is the Plain Language Summary ...,The safety and effectiveness of techniques to ...
10.1002-14651858.CD004957.pub3,Multidisciplinary rehabilitation programmes fo...,Of course. Here is a Plain Language Summary ba...,Multidisciplinary rehabilitation programmes fo...
10.1002-14651858.CD013717.pub2,"Background\nIn late 2019, the first cases of c...",**Plain Title**\nDo Rules for Travelers Help S...,Can international travel‐related control measu...
10.1002-14651858.CD013384.pub2,Background\nChronic obstructive pulmonary dise...,Of course. Here is a Plain Language Summary ba...,Approaches to help people with COPD who have o...
10.1002-14651858.CD013674.pub2,Background\nMajor depressive disorders have a ...,**Plain Title**\nComparing Newer Medicines for...,Newer generation antidepressants for depressio...


<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, 10.1002-14651858.CD013170.pub2 to 10.1002-14651858.CD015270
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   original_text     300 non-null    object
 1   simplified_text   300 non-null    object
 2   pls_text_content  300 non-null    object
dtypes: object(3)
memory usage: 17.5+ KB


None

In [37]:
# Save the DataFrame to a CSV file
csv_filepath = 'abstract_generated_pls_gemini25pro.csv'
df_merged.to_csv(csv_filepath, index_label='filename')

print(f"DataFrame saved to {csv_filepath}")

DataFrame saved to abstract_generated_pls_gemini25pro.csv


## Load data

Load the data from "/content/abstract_generated_pls_gemini25.csv" into a pandas DataFrame.


In [38]:
import pandas as pd

df = pd.read_csv("abstract_generated_pls_gemini25pro.csv", index_col='filename')
display(df.head())
display(df.info())

Unnamed: 0_level_0,original_text,simplified_text,pls_text_content
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10.1002-14651858.CD013170.pub2,Background\nPeople with neuromuscular disorder...,Of course. Here is the Plain Language Summary ...,The safety and effectiveness of techniques to ...
10.1002-14651858.CD004957.pub3,Multidisciplinary rehabilitation programmes fo...,Of course. Here is a Plain Language Summary ba...,Multidisciplinary rehabilitation programmes fo...
10.1002-14651858.CD013717.pub2,"Background\nIn late 2019, the first cases of c...",**Plain Title**\nDo Rules for Travelers Help S...,Can international travel‐related control measu...
10.1002-14651858.CD013384.pub2,Background\nChronic obstructive pulmonary dise...,Of course. Here is a Plain Language Summary ba...,Approaches to help people with COPD who have o...
10.1002-14651858.CD013674.pub2,Background\nMajor depressive disorders have a ...,**Plain Title**\nComparing Newer Medicines for...,Newer generation antidepressants for depressio...


<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, 10.1002-14651858.CD013170.pub2 to 10.1002-14651858.CD015270
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   original_text     300 non-null    object
 1   simplified_text   300 non-null    object
 2   pls_text_content  300 non-null    object
dtypes: object(3)
memory usage: 9.4+ KB


None

## Calculate bertscore

Compute BERTScore for each pair of original and simplified texts.


In [39]:
%pip install google-generativeai evaluate[bertscore] transformers sentence-transformers matplotlib pandas numpy textstat



In [40]:
%pip install bert_score



In [41]:
from evaluate import load

bertscore = load("bertscore")

precision_scores = []
recall_scores = []
f1_scores = []

for index, row in df.iterrows():
    original_text = row['pls_text_content']
    simplified_text = row['simplified_text']

    # BERTScore expects lists of strings
    results = bertscore.compute(predictions=[simplified_text], references=[original_text], model_type="allenai/longformer-large-4096-finetuned-triviaqa")

    f1_scores.append(results['f1'][0])

df['bertscore'] = f1_scores

display(df.head())

config.json:   0%|          | 0.00/866 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Input ids are automatically padded to be a multiple of `config.attention_window`: 512


Unnamed: 0_level_0,original_text,simplified_text,pls_text_content,bertscore
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10.1002-14651858.CD013170.pub2,Background\nPeople with neuromuscular disorder...,Of course. Here is the Plain Language Summary ...,The safety and effectiveness of techniques to ...,0.821945
10.1002-14651858.CD004957.pub3,Multidisciplinary rehabilitation programmes fo...,Of course. Here is a Plain Language Summary ba...,Multidisciplinary rehabilitation programmes fo...,0.796075
10.1002-14651858.CD013717.pub2,"Background\nIn late 2019, the first cases of c...",**Plain Title**\nDo Rules for Travelers Help S...,Can international travel‐related control measu...,0.831442
10.1002-14651858.CD013384.pub2,Background\nChronic obstructive pulmonary dise...,Of course. Here is a Plain Language Summary ba...,Approaches to help people with COPD who have o...,0.823051
10.1002-14651858.CD013674.pub2,Background\nMajor depressive disorders have a ...,**Plain Title**\nComparing Newer Medicines for...,Newer generation antidepressants for depressio...,0.830407


In [42]:
print(sum(df['bertscore']) / len(df['bertscore']))

0.8297694844007492


## Calculate AlignScore

Compute AlignScore for each pair of original and simplified texts.

In [43]:
# Clone the alignscore repository
!pip install git+https://github.com/yuh-zha/AlignScore.git --no-deps
!pip install pytorch-lightning==1.9.5

import torch
import transformers
if not hasattr(transformers, "AdamW"):
    from torch.optim import AdamW
    transformers.AdamW = AdamW


Collecting git+https://github.com/yuh-zha/AlignScore.git
  Cloning https://github.com/yuh-zha/AlignScore.git to /tmp/pip-req-build-40_tuw1x
  Running command git clone --filter=blob:none --quiet https://github.com/yuh-zha/AlignScore.git /tmp/pip-req-build-40_tuw1x
  Resolved https://github.com/yuh-zha/AlignScore.git to commit a0936d5afee642a46b22f6c02a163478447aa493
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: alignscore
  Building wheel for alignscore (pyproject.toml) ... [?25l[?25hdone
  Created wheel for alignscore: filename=alignscore-0.1.3-py3-none-any.whl size=18369 sha256=97fddaa7c81702c0c9870c7772cf3f383060db904bdafa4493ecc4d8f0bace49
  Stored in directory: /tmp/pip-ephem-wheel-cache-fqqq49tj/wheels/61/78/cf/7adbff432a586bfb8a1673348218105047a4845e89a536007d
Successfully built alignscore
Installing collected pack

In [44]:
from alignscore import AlignScore

In [45]:
# Download the spaCy language model
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m130.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [46]:
# Install textstat if not already installed
%pip install textstat

import nltk

nltk.download('punkt_tab')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [47]:
import torch

torch.cuda.empty_cache()
print("CUDA cache cleared.")

CUDA cache cleared.


In [48]:
import textstat

device_id = 'cuda' if torch.cuda.is_available() else 'cpu'

alignscorer = AlignScore(model='roberta-large',
    batch_size=32,
    device=device_id,
    ckpt_path='https://huggingface.co/yzha/AlignScore/resolve/main/AlignScore-large.ckpt',
    evaluation_mode='nli_sp',
    verbose=False
)

align_scores = []

for index, row in df.iterrows():
    original_text = row['original_text']
    simplified_text = row['simplified_text']
    # Pass texts as lists to the score method
    align_scores.append(alignscorer.score([original_text], [simplified_text])[0])
    print(align_scores[-1])

df['alignscore'] = align_scores

display(df.head())

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading: "https://huggingface.co/yzha/AlignScore/resolve/main/AlignScore-large.ckpt" to /root/.cache/torch/hub/checkpoints/AlignScore-large.ckpt


100%|██████████| 4.56G/4.56G [00:10<00:00, 450MB/s]
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.0.post1 to v1.9.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file https:/huggingface.co/yzha/AlignScore/resolve/main/AlignScore-large.ckpt`
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  rank_zero_warn(


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

0.6857905983924866
0.5358808040618896
0.6752579808235168
0.535833477973938
0.6054209470748901
0.6989210844039917
0.6749808192253113
0.4956766664981842
0.6019885540008545
0.5797251462936401
0.5848668217658997
0.6238995790481567
0.6197251677513123
0.5571577548980713
0.708121120929718
0.6184607148170471
0.6357120871543884
0.5248801112174988
0.6019692420959473
0.5426196455955505
0.6032261848449707
0.6243612170219421
0.46362149715423584
0.6131362915039062
0.5514811873435974
0.6464672088623047
0.525438129901886
0.7017285227775574
0.4835911691188812
0.4249909520149231
0.6315527558326721
0.6364691853523254
0.7980251908302307
0.4181959927082062
0.6506932973861694
0.7265092730522156
0.745640754699707
0.42831480503082275
0.5763954520225525
0.5331505537033081
0.6799107789993286
0.7596796751022339
0.5214750170707703
0.5396844744682312
0.7254374623298645
0.5321499705314636
0.578877866268158
0.5453670024871826
0.5373035073280334
0.4635982811450958
0.6381949782371521
0.5359413623809814
0.5988406538963

Unnamed: 0_level_0,original_text,simplified_text,pls_text_content,bertscore,alignscore
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10.1002-14651858.CD013170.pub2,Background\nPeople with neuromuscular disorder...,Of course. Here is the Plain Language Summary ...,The safety and effectiveness of techniques to ...,0.821945,0.685791
10.1002-14651858.CD004957.pub3,Multidisciplinary rehabilitation programmes fo...,Of course. Here is a Plain Language Summary ba...,Multidisciplinary rehabilitation programmes fo...,0.796075,0.535881
10.1002-14651858.CD013717.pub2,"Background\nIn late 2019, the first cases of c...",**Plain Title**\nDo Rules for Travelers Help S...,Can international travel‐related control measu...,0.831442,0.675258
10.1002-14651858.CD013384.pub2,Background\nChronic obstructive pulmonary dise...,Of course. Here is a Plain Language Summary ba...,Approaches to help people with COPD who have o...,0.823051,0.535833
10.1002-14651858.CD013674.pub2,Background\nMajor depressive disorders have a ...,**Plain Title**\nComparing Newer Medicines for...,Newer generation antidepressants for depressio...,0.830407,0.605421


In [49]:
print(sum(df['alignscore']) / len(df['alignscore']))

0.6022462669014931


In [50]:

flesch_kincaid_scores = []
coleman_liau_scores = []
flesch_reading_ease_scores = []
gunning_fog_scores = []
smog_index_scores = []
dale_chall_scores = []


for index, row in df.iterrows():
    simplified_text = row['simplified_text']

    # Calculate Flesch-Kincaid Grade Level
    flesch_kincaid_scores.append(textstat.flesch_kincaid_grade(simplified_text))

    # Calculate Coleman-Liau Index
    coleman_liau_scores.append(textstat.coleman_liau_index(simplified_text))

    # Calculate Flesch Reading Ease
    flesch_reading_ease_scores.append(textstat.flesch_reading_ease(simplified_text))

    # Calculate Gunning Fog Index
    gunning_fog_scores.append(textstat.gunning_fog(simplified_text))

    # Calculate SMOG Index
    smog_index_scores.append(textstat.smog_index(simplified_text))

    # Calculate Dale-Chall Readability Score
    dale_chall_scores.append(textstat.dale_chall_readability_score(simplified_text))

df['flesch_kincaid_grade'] = flesch_kincaid_scores
df['coleman_liau_index'] = coleman_liau_scores
df['flesch_reading_ease'] = flesch_reading_ease_scores
df['gunning_fog_index'] = gunning_fog_scores
df['smog_index'] = smog_index_scores
df['dale_chall_score'] = dale_chall_scores


display(df.head())

Unnamed: 0_level_0,original_text,simplified_text,pls_text_content,bertscore,alignscore,flesch_kincaid_grade,coleman_liau_index,flesch_reading_ease,gunning_fog_index,smog_index,dale_chall_score
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10.1002-14651858.CD013170.pub2,Background\nPeople with neuromuscular disorder...,Of course. Here is the Plain Language Summary ...,The safety and effectiveness of techniques to ...,0.821945,0.685791,5.853158,8.300888,75.888466,7.391716,8.841846,8.393943
10.1002-14651858.CD004957.pub3,Multidisciplinary rehabilitation programmes fo...,Of course. Here is a Plain Language Summary ba...,Multidisciplinary rehabilitation programmes fo...,0.796075,0.535881,6.138661,7.916469,74.894021,7.188162,8.507473,8.294878
10.1002-14651858.CD013717.pub2,"Background\nIn late 2019, the first cases of c...",**Plain Title**\nDo Rules for Travelers Help S...,Can international travel‐related control measu...,0.831442,0.675258,6.060379,8.791054,75.335553,7.627979,8.656518,8.736491
10.1002-14651858.CD013384.pub2,Background\nChronic obstructive pulmonary dise...,Of course. Here is a Plain Language Summary ba...,Approaches to help people with COPD who have o...,0.823051,0.535833,5.412784,9.019847,76.99566,6.108159,7.635375,8.734103
10.1002-14651858.CD013674.pub2,Background\nMajor depressive disorders have a ...,**Plain Title**\nComparing Newer Medicines for...,Newer generation antidepressants for depressio...,0.830407,0.605421,7.80057,10.347826,62.000885,10.06242,11.113648,8.964457


In [51]:
# Save the DataFrame to a CSV file
csv_filepath = 'simplified_texts_with_scores_g25pro.csv'
df.to_csv(csv_filepath, index_label='filename')

print(f"DataFrame saved to {csv_filepath}")

DataFrame saved to simplified_texts_with_scores_g25pro.csv


In [52]:
print(sum(df['bertscore']) / len(df['bertscore']))
print(sum(df['alignscore']) / len(df['alignscore']))
print(sum(df['flesch_reading_ease']) / len(df['flesch_reading_ease']))
print(sum(df['flesch_kincaid_grade']) / len(df['flesch_kincaid_grade']))
print(sum(df['coleman_liau_index']) / len(df['coleman_liau_index']))
print(sum(df['gunning_fog_index']) / len(df['gunning_fog_index']))
print(sum(df['smog_index']) / len(df['smog_index']))
print(sum(df['dale_chall_score']) / len(df['dale_chall_score']))


0.8297694844007492
0.6022462669014931
74.32918226236988
5.969264010220906
8.133881299696958
7.539820972000171
8.760984741292814
8.58676943335424
