In [7]:
pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [2]:
pip install transformers



In [36]:
pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━[0m [32m51.2/61.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bert_score
Successfully installed bert_score-0.3.13


In [3]:
pip install textstat

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.14.0 textstat-0.7.3


# Import Libraries

In [4]:
from transformers import pipeline, set_seed
import matplotlib.pyplot as plt
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
import torch
from datasets import load_dataset, load_metric
import textstat
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq

# Load eLife Data

In [6]:
train_key='train.json'
test_key='test.json'
val_key='val.json'

In [5]:
data_location=f'/content/drive/MyDrive/Thesis/Data/elife/'

## Train

In [8]:
train_data=pd.read_json(f'{data_location}{train_key}')
print("Train Shape: ", train_data.shape)
train_data.head(1)

Train Shape:  (4346, 8)


Unnamed: 0,id,year,title,sections,headings,abstract,summary,keywords
0,elife-35500-v1,2018,National and regional seasonal dynamics of all...,[[It is well-established that death rates vary...,"[Introduction, Results, Discussion, Materials ...","[In temperate climates , winter deaths exceed ...","[In the USA , more deaths happen in the winter...",[epidemiology and global health]


## Test

In [9]:
test_data=pd.read_json(f'{data_location}{test_key}')
print("Test Shape: ", test_data.shape)
test_data.head(1)

Test Shape:  (241, 8)


Unnamed: 0,id,year,title,sections,headings,abstract,summary,keywords
0,elife-37443-v3,2018,Cerebellar implementation of movement sequence...,"[[Most movements are comprised of sequences .,...","[Introduction, Results, Discussion, Materials ...","[Most movements are not unitary , but are comp...",[Imagine a gymnastics competition in which par...,[neuroscience]


The article below can be found here: https://elifesciences.org/articles/37443#s3 Note that some paragraphs are missing.

In [15]:
# test_data['sections'][0]

## Validation

In [16]:
val_data=pd.read_json(f'{data_location}{val_key}')
print("Val Shape: ", val_data.shape)
val_data.head(1)

Val Shape:  (241, 8)


Unnamed: 0,id,year,title,sections,headings,abstract,summary,keywords
0,elife-15477-v3,2016,Increasing Notch signaling antagonizes PRC2-me...,"[[Cell-fate decisions are controlled , on the ...","[Introduction, Results, Discussion, Materials ...",[Cell-fate reprograming is at the heart of dev...,[The DNA in genes encodes the basic informatio...,[developmental biology]


# Preprocessing

## Reshape DataFrames based on the headings

In [17]:
def reshape_dataframe(df, columns_to_keep):
    """
    Reshapes a dataframe based on its 'sections' and 'headings' columns. Each unique heading
    becomes a column in the reshaped dataframe, where the entries are the corresponding sections.

    Parameters:
    - df (pd.DataFrame): The input dataframe, which  have columns named 'id', 'sections',
                         and 'headings' etc. The 'sections' column should contain lists of strings,
                         while the 'headings' column should contain lists of headings corresponding
                         to the sections.

    - columns_to_keep (list of str): List of columns from the original dataframe that should
                                     be retained in the final reshaped dataframe.

    Returns:
    - pd.DataFrame: A reshaped dataframe where each unique heading from the 'headings' column
                    is now its own column. Original columns specified in 'columns_to_keep' are
                    also retained.
    """

    section_dicts = []
    for index, row in df.iterrows():
        temp_dict = {'id': row['id']}
        row_headings = row['headings']

        row_sections = row['sections']

        for heading, section in zip(row_headings, row_sections):
            temp_dict[heading] = section

        section_dicts.append(temp_dict)

    section_df = pd.DataFrame(section_dicts)
    result_df = pd.merge(df, section_df, on='id', how='outer')
    result_df.rename(columns={'summary': 'lay summary'}, inplace=True)
    result_df = result_df[columns_to_keep]
    result_df = result_df.dropna(subset=['Introduction', 'abstract'])
    result_df = result_df.reset_index(drop=True)
    return result_df

Observations: Materials and methods (not only) have many duplicates because the authors use similar names-> requires preprocess to the headings.

In [18]:
# The columns you want to keep BASED ON IITR:
cols_to_keep = ['Introduction', 'abstract', 'lay summary']
IITR_train_df = reshape_dataframe(train_data, cols_to_keep)
IITR_val_df = reshape_dataframe(val_data, cols_to_keep)
IITR_test_df = reshape_dataframe(test_data, cols_to_keep)

## Combine Introduction & Abstract based on fragment

In [19]:
def combine_texts(row, k=0.6):
    intro_part = row['Introduction'][:int(len(row['Introduction']) * k)]
    return intro_part + row['abstract']

In [20]:
IITR_train_df['intro_abstract_combined'] = IITR_train_df.apply(lambda row: combine_texts(row, k=0.6), axis=1)
IITR_test_df['intro_abstract_combined'] = IITR_test_df.apply(lambda row: combine_texts(row, k=0.6), axis=1)
IITR_val_df['intro_abstract_combined'] = IITR_val_df.apply(lambda row: combine_texts(row, k=0.6), axis=1)

In [21]:
IITR_train_df.drop(['Introduction', 'abstract'],inplace=True, axis=1)
IITR_test_df.drop(['Introduction', 'abstract'],inplace=True, axis=1)
IITR_val_df.drop(['Introduction', 'abstract'],inplace=True, axis=1)
IITR_train_df

Unnamed: 0,lay summary,intro_abstract_combined
0,"[In the USA , more deaths happen in the winter...",[It is well-established that death rates vary ...
1,[Most people have likely experienced the disco...,[Dysregulated complement activation is increas...
2,[The immune system protects an individual from...,"[HOIL-1 ( encoded by the RBCK1 gene ) , HOIP (..."
3,[The brain adapts to control our behavior in d...,[Flexible control of cognitive processes is fu...
4,[Cells use motor proteins that to move organel...,[Myosin 5a moves in a hand-over-hand fashion w...
...,...,...
4311,[To defend itself against bacteria and viruses...,[Antibodies are immunogenic proteins expressed...
4312,[DNA is tightly packaged in a material called ...,[The eukaryotic genome is packaged into chroma...
4313,[Associative learning is a simple learning abi...,[The temporal and spatial heterogeneity of any...
4314,"[In 1848 , a railroad worker named Phineas Gag...",[Correlates of decision variables are routinel...


## Convert list instances to String

In [22]:
import ast

def str_list_to_str(s):
    # If it's a list, then join its items into a string
    if isinstance(s, list):
        return ' '.join(s)
    # If it's already a string, return as it is
    return s
IITR_train_df = IITR_train_df.applymap(str_list_to_str)
IITR_test_df = IITR_test_df.applymap(str_list_to_str)
IITR_val_df = IITR_val_df.applymap(str_list_to_str)

In [23]:
IITR_val_df

Unnamed: 0,lay summary,intro_abstract_combined
0,The DNA in genes encodes the basic information...,"Cell-fate decisions are controlled , on the on..."
1,Klebsiella pneumoniae is a type of bacteria th...,Klebsiella pneumoniae is an opportunistic Gram...
2,Malaria is one of the world's most deadly infe...,The study of the transmission dynamics of vect...
3,The Amazon rainforest in South America is the ...,"With on-going climate change , attention is in..."
4,Neurons that arise in the adult nervous system...,Neural stem cells ( NSCs ) reside in two regio...
...,...,...
234,Our genomes contain a record of historical eve...,Advances in DNA analysis technology and the dr...
235,"In the earliest stages of development , animal...",Cell size varies widely among different organi...
236,Life on Earth is magnificently complex . The i...,A variety of riboswitch classes regulate gene ...
237,Our nervous system allows us to rapidly sense ...,Excitatory glutamatergic synapses mediate the ...


# Bart Model

In [24]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [25]:
model_ckpt = "facebook/bart-large-cnn"

In [26]:
model_bart= AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [27]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [28]:
from transformers import pipeline

#need to call pipeline and tell task u r performing, so initiliaze configuration
#we need to use the pretrained model
pipe=pipeline("summarization", model=model_ckpt)

In [29]:
intro_abstract_test= IITR_test_df['intro_abstract_combined'][0]

In [30]:
pipe_out=pipe(intro_abstract_test)

Create Chunks

In [31]:
def generate_batch_sized_chunks(df, batch_size):
    """Yield successive batch-sized chunks from dataframe."""

    for i in range(0, len(df), batch_size):
        yield df.iloc[i : i + batch_size]

## Metrics

### Rouge

In [32]:
rouge_metric = load_metric('rouge')

  rouge_metric = load_metric('rouge')


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

### FKGL

In [33]:
# Function to compute FKGL
def compute_fkgl(text):
    return textstat.flesch_kincaid_grade(text)

### DCRS

In [34]:
def compute_dale_chall(text):
    return textstat.dale_chall_readability_score(text)

### Bert Score

In [37]:
bert_score=load_metric('bertscore')

## Evaluation

In [38]:
def calculate_all_metrics_on_test_ds(dataset, model, tokenizer, batch_size=16, device='cuda',
                                     column_text="intro_abstract_combined",
                                     column_summary="lay summary"):

    results = {
        "ROUGE 1": None,
        "ROUGE 2": None,
        "ROUGE L": None,
        "ROUGE L SUM": None,
        "BERTScore": None,
        "FKGL": [],
        "Dale-Chall": []
    }

    for batch in tqdm(generate_batch_sized_chunks(dataset, batch_size), total=len(dataset) // batch_size):

        inputs = tokenizer(batch[column_text].tolist(), max_length=1024, truncation=True,
                           padding="max_length", return_tensors="pt")

        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=600)

        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                              clean_up_tokenization_spaces=True) for s in summaries]

        # Ensure spaces are properly formatted
        decoded_summaries = [d.replace("  ", " ") for d in decoded_summaries]

        # Compute FKGL and Dale-Chall for decoded summaries
        results["FKGL"].extend([compute_fkgl(summary) for summary in decoded_summaries])
        results["Dale-Chall"].extend([compute_dale_chall(summary) for summary in decoded_summaries])

        # Add to rouge and bertscore metrics
        rouge_metric.add_batch(predictions=decoded_summaries, references=batch[column_summary].tolist())
        bert_score.add_batch(predictions=decoded_summaries, references=batch[column_summary].tolist())

    rouge_output = rouge_metric.compute()
    for rouge_name, key in zip(["rouge1", "rouge2", "rougeL", "rougeLsum"], ["ROUGE 1", "ROUGE 2", "ROUGE L"]):
        results[key] = rouge_output[rouge_name].mid.fmeasure

    results["BERTScore"] = bert_score.compute()

    # Average FKGL and Dale-Chall scores
    results["FKGL"] = sum(results["FKGL"]) / len(results["FKGL"])
    results["Dale-Chall"] = sum(results["Dale-Chall"]) / len(results["Dale-Chall"])

    return results


metrics_results = calculate_all_metrics_on_test_ds(IITR_test_df, model_bart, tokenizer)
metrics_results

  0%|          | 0/15 [00:07<?, ?it/s]


OutOfMemoryError: ignored

I

## Fine Tuning

### Numerical Representation - Batches

In [39]:
def convert_examples_to_features(example_batch):
  # calling tokenizer for dialogue -1024
    input_encodings = tokenizer(example_batch['dialogue'] , max_length = 1024, truncation = True )
  #calling for summary
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length = 600, truncation = True )

    return {
        'input_ids' : input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

In [None]:
def dataframe_to_batches(df, batch_size):
    num_batches = len(df) // batch_size + (1 if len(df) % batch_size else 0)
    for i in range(num_batches):
        yield df.iloc[i*batch_size : (i+1)*batch_size].to_dict(orient='list')

batch_size = 16

batches = list(dataframe_to_batches(IITR_train_df, batch_size))

batched_encodings = [convert_examples_to_features(batch) for batch in batches]

In [None]:
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_bart)

In [None]:
# training arguments  - output directory...
trainer_args = TrainingArguments(
    output_dir='bart-elife-infoabstract', num_train_epochs=1, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16
)

In [None]:
# wil take everything and train
trainer = Trainer(
   model_bart,
   trainer_args,
   train_dataset=IITR_train_df,
   eval_dataset=IITR_test_df,
   data_collator=seq2seq_data_collator,
   tokenizer=tokenizer
)

In [None]:
trainer.train()

In [None]:
model_bart.save_pretrained("/content/drive/MyDrive/Thesis/Notebooks/models/bart-elife-infoabstract-model")

In [None]:
tokenizer.save_pretrained("/content/drive/MyDrive/Thesis/Notebooks/models/tokenizer")