In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
nbdir = "/content/drive/MyDrive/DS_ML_Project/text_summarization/"

In [3]:
%cd /content/drive/MyDrive/DS_ML_Project/text_summarization/

/content/drive/MyDrive/DS_ML_Project/text_summarization


In [None]:
!pip install transformers



In [None]:
!pip install accelerate -U



In [None]:
!pip install rouge_score



In [None]:
!pip install datasets



In [None]:
import pandas as pd
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq, pipeline
from datasets import Dataset, load_metric

In [None]:
df = pd.read_csv("./data/Reviews.csv")

In [None]:
# Function to remove outliers based on IQR
def remove_outliers(df):
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    for col in numerical_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df

In [None]:
# Perform statistical analysis before removing outliers
print("Statistical analysis before removing outliers:")
print(df.describe())

# Remove outliers
df_cleaned = remove_outliers(df)

# Perform statistical analysis after removing outliers
print("\nStatistical analysis after removing outliers:")
print(df_cleaned.describe())

Statistical analysis before removing outliers:
                  Id  HelpfulnessNumerator  HelpfulnessDenominator  \
count  568454.000000         568454.000000            568454.00000   
mean   284227.500000              1.743817                 2.22881   
std    164098.679298              7.636513                 8.28974   
min         1.000000              0.000000                 0.00000   
25%    142114.250000              0.000000                 0.00000   
50%    284227.500000              0.000000                 1.00000   
75%    426340.750000              2.000000                 2.00000   
max    568454.000000            866.000000               923.00000   

               Score          Time  
count  568454.000000  5.684540e+05  
mean        4.183199  1.296257e+09  
std         1.310436  4.804331e+07  
min         1.000000  9.393408e+08  
25%         4.000000  1.271290e+09  
50%         5.000000  1.311120e+09  
75%         5.000000  1.332720e+09  
max         5.000000  1.35

In [None]:
df_cleaned['Text'] = df_cleaned['Text'].astype(str)
df_cleaned['Summary'] = df_cleaned['Summary'].astype(str)

In [None]:
# Preparing our dataset for the Hugging Face model (Pegasus)
data = {'text': df_cleaned['Text'].tolist(), 'summary': df_cleaned['Summary'].tolist()}
dataset = Dataset.from_dict(data)

In [None]:
# Extract 5% of the dataset
dataset = dataset.shuffle(seed=42).select(range(int(len(dataset) * 0.05)))

# Split the dataset into train (70%), validation (15%), and test (15%) sets
train_val_test_split = dataset.train_test_split(test_size=0.3)
val_test_split = train_val_test_split["test"].train_test_split(test_size=0.5)

train_dataset = train_val_test_split["train"]
val_dataset = val_test_split["train"]
test_dataset = val_test_split["test"]

In [None]:
dataset

Dataset({
    features: ['text', 'summary'],
    num_rows: 21844
})

In [None]:
train_dataset

Dataset({
    features: ['text', 'summary'],
    num_rows: 15290
})

In [None]:
val_dataset

Dataset({
    features: ['text', 'summary'],
    num_rows: 3277
})

In [None]:
test_dataset

Dataset({
    features: ['text', 'summary'],
    num_rows: 3277
})

In [None]:
# Set up tokenizer and model
model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to("cuda" if torch.cuda.is_available() else "cpu")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [None]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch["text"], max_length=1024, truncation=True)

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch["summary"], max_length=128, truncation=True)

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"]
    }

dataset = dataset.map(convert_examples_to_features, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])

Map:   0%|          | 0/21844 [00:00<?, ? examples/s]



In [None]:
train_dataset = train_dataset.map(convert_examples_to_features, batched=True)
val_dataset = val_dataset.map(convert_examples_to_features, batched=True)
test_dataset = test_dataset.map(convert_examples_to_features, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
val_dataset.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
test_dataset.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])

Map:   0%|          | 0/15290 [00:00<?, ? examples/s]

Map:   0%|          | 0/3277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3277 [00:00<?, ? examples/s]

In [None]:
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

**Model Training & Fine-Tuning**

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./pegasus",
    learning_rate=5e-05,
    num_train_epochs=5,
    warmup_steps=0,
    per_device_train_batch_size=1,
    per_gpu_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=10,
    push_to_hub=False,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1e6,
    gradient_accumulation_steps=16,
    report_to="none"
)



In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [None]:
# Training the model
trainer.train()

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss,Validation Loss
500,3.6209,3.233063
1000,3.3539,3.16801
1500,3.2012,3.124571
2000,3.1521,3.109488
2500,2.9564,3.097964
3000,2.7652,3.092759
3500,2.7416,3.081404
4000,2.8396,3.078016
4500,2.9082,3.079726


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_de

TrainOutput(global_step=4775, training_loss=3.0527611697531496, metrics={'train_runtime': 15667.5521, 'train_samples_per_second': 4.88, 'train_steps_per_second': 0.305, 'total_flos': 2.059703069778739e+16, 'train_loss': 3.0527611697531496, 'epoch': 4.99672988881622})

In [None]:
# Saving the model
torch.save(model.state_dict(), './pegasus/pegasus_model.pt')

# Saving tokenizer
tokenizer.save_pretrained('./pegasus/pegasus_tokenizer')

('./pegasus/pegasus_tokenizer/tokenizer_config.json',
 './pegasus/pegasus_tokenizer/special_tokens_map.json',
 './pegasus/pegasus_tokenizer/spiece.model',
 './pegasus/pegasus_tokenizer/added_tokens.json',
 './pegasus/pegasus_tokenizer/tokenizer.json')

In [None]:
# Loading the model and tokenizer
model_ckpt = "google/pegasus-cnn_dailymail"
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
model.load_state_dict(torch.load('./pegasus/pegasus_model.pt'))
tokenizer = AutoTokenizer.from_pretrained('./pegasus/pegasus_tokenizer/')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(1024, 1024)
      (layers): ModuleList(
        (0-15): 16 x PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_no

In [None]:
# Evaluating the model
rouge_metric = load_metric("rouge")
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

  rouge_metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [None]:
def chunks(list_of_elements, batch_size):
    """Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i: i + batch_size]

**ROUGE Score Computation**

In [None]:
def evaluate_summaries_pegasus(dataset, metric, model, tokenizer, batch_size=1, device="cuda" if torch.cuda.is_available() else "cpu", column_text="text", column_summary="summary"):
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):
        inputs = tokenizer(article_batch, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")
        summaries = model.generate(input_ids=inputs["input_ids"].to(device), attention_mask=inputs["attention_mask"].to(device), length_penalty=4, num_beams=2, max_length=16)
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True) for s in summaries]
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    score = metric.compute()
    return score

In [None]:
score = evaluate_summaries_pegasus(test_dataset, rouge_metric, model, tokenizer, batch_size=1)
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)

print(pd.DataFrame.from_records(rouge_dict, index=[f"pegasus"]))

100%|██████████| 3277/3277 [27:18<00:00,  2.00it/s]


           rouge1    rouge2    rougeL  rougeLsum
pegasus  0.138476  0.041943  0.130961   0.130827


In [None]:
# Generating a sample summary
gen_kwargs = {"length_penalty": 4, "num_beams": 2, "max_length": 16, "min_length": 8}
sample_text = test_dataset["text"][0]
reference = test_dataset["summary"][0]
pipe = pipeline("summarization", model=model, tokenizer=tokenizer)

print("Review:")
print(sample_text)
print("\nReference Summary:")
print(reference)
print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])



Review:
Of all the gormet Fancy Feast varieties, this one seems to be the least favorite.  Not sure why - they won't eat salmon fresh from the market either.  Must not be on the regular cat's palate for some reason.  The smell is quite nice though - smells like people food really.  You can't really go wrong with these Fancy Feast varieties - they're the hit in this house!

Reference Summary:
Not the favorite, but passable

Model Summary:
Not a favorite - but they're the hit in this house!
