In [6]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Download the NLTK 'punkt' and 'stopwords' resources.
# Only needed once per environment; later runs just report the packages as up to date.
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
import pandas as pd
import numpy as np
import re

# Load the NYT dataset. The path below is where the CSV lived for this run;
# change it to wherever the file is in your environment.
nyt = pd.read_csv('/content/NYT_Dataset.csv')


In [8]:
# Drop rows where 'abstract' or 'title' is missing.
# This mutates `nyt` in place, so the unfiltered frame is only recoverable by reloading the CSV.
nyt.dropna(subset=['abstract', 'title'], inplace=True)

# No text preprocessing (lowercasing, stop-word removal, ...) is applied in this cell;
# the two text columns are used as loaded.

# Keep only the two text columns
nyt_clean = nyt[['abstract', 'title']]

# Print the first rows of the filtered dataframe as a sanity check
print(nyt_clean.head())

                                            abstract  \
0  Pakistan’s ambassador to the U.S. said his gov...   
1  Kenya sank deeper into trouble, with a curfew ...   
2  Prime Minister Ehud Olmert has sent a letter t...   
3  The monthly club night known as Gayhane is an ...   
4  But even as partygoers embraced the New Year, ...   

                                               title  
0  In Reversal, Pakistan Welcomes Outside Help Wi...  
1       Fighting Intensifies After Election in Kenya  
2                   Israel: Olmert Curbs Settlements  
3        Gay Muslims Pack a Dance Floor of Their Own  
4                Iraqi Revelers Embrace the New Year  


In [9]:
# Confirm the row count, the dtypes, and that neither column has nulls left
nyt_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 105923 entries, 0 to 106505
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   abstract  105923 non-null  object
 1   title     105923 non-null  object
dtypes: object(2)
memory usage: 2.4+ MB


In [10]:
# Sample 2,000 rows from the dataset
nyt_sample = nyt_clean.sample(n=2000, random_state=1)  # random_state for reproducibility

In [11]:
# Preview the sampled rows; the index labels are those of the original frame, not 0..n-1
nyt_sample.head()

Unnamed: 0,abstract,title
99458,How different groups voted,Florida Exit Polls: How Different Groups Voted
38416,Plans by the United States and other countries...,International Effort Seeks to Counter Jihadist...
48370,"Teams from Russia, Ukraine and the Internation...",Inspection of Russian Trucks at Ukraine Border...
105082,Censured by his party and shunned by family me...,Adam Kinzinger’s Lonely Mission
12407,Dozens were arrested after martial law was imp...,Arrests in Philippine Province


In [12]:
pip install openai


Collecting openai
  Downloading openai-1.24.1-py3-none-any.whl (312 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/312.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m307.2/312.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.3/312.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/75.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0

In [13]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [14]:
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import time
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge import Rouge

# Tokenizer data for nltk.word_tokenize, which the BLEU computation below relies on
nltk.download('punkt')

# Load pre-trained DistilBART model and tokenizer (fetched from the Hugging Face Hub on first use).
# NOTE(review): the checkpoint name suggests a CNN/DailyMail summarization model rather than
# one tuned for headline generation -- confirm this is the intended model.
model_name = "sshleifer/distilbart-cnn-6-6"  # Adjusted to a smaller model variant
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def generate_title_summaries_in_batches(titles, abstracts, batch_size=8, max_length=20):
    """Generate one short summary per (title, abstract) pair, in mini-batches.

    Every input text is built as ``"<title>: <abstract>"``, tokenized with the
    module-level ``tokenizer`` (padded per batch, truncated to 1024 tokens) and
    passed to the module-level ``model`` for 2-beam generation. The total
    wall-clock time is printed once all batches are done.

    Args:
        titles: Sequence of title strings.
        abstracts: Sequence of abstract strings, aligned with ``titles``.
        batch_size: Number of texts sent to the model per call; must be >= 1.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        A list of decoded summaries, one per input pair, in input order.

    Raises:
        ValueError: If ``titles`` and ``abstracts`` differ in length, or if
            ``batch_size`` is smaller than 1.
    """
    # zip() would silently drop the unmatched tail and leave the output
    # misaligned with `titles`, so reject mismatched inputs up front.
    if len(titles) != len(abstracts):
        raise ValueError(
            f"titles and abstracts must have the same length "
            f"(got {len(titles)} and {len(abstracts)})"
        )
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1 (got {batch_size})")

    summaries = []
    start_time = time.perf_counter()
    for start in range(0, len(titles), batch_size):
        batch_titles = titles[start:start + batch_size]
        batch_abstracts = abstracts[start:start + batch_size]

        # NOTE(review): the title itself is part of the model input. If the
        # output is later scored against these same titles, the model has
        # seen the reference text and the scores will be inflated.
        batch_texts = [title + ": " + abstract for title, abstract in zip(batch_titles, batch_abstracts)]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=1024)

        # Pass the attention mask explicitly so padded positions are ignored
        # instead of relying on generate() to infer the mask from pad tokens.
        summary_ids = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            num_beams=2,
            early_stopping=True,
        )
        summaries.extend(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))

    elapsed = time.perf_counter() - start_time
    print(f"Total time taken: {elapsed:.2f} seconds")
    return summaries

# Pull the sampled titles and abstracts out as plain Python lists and generate one
# summary per article. max_length=10 is forwarded to model.generate, which is why
# the generated summaries are only a few words long.
titles = nyt_sample["title"].tolist()
abstracts = nyt_sample["abstract"].tolist()
title_summaries = generate_title_summaries_in_batches(titles, abstracts, max_length=10)

# Function to compute BLEU scores
def compute_bleu_scores(generated_titles, actual_titles):
    """Return the corpus-level BLEU score of the generated titles.

    Each actual title is the single reference for the generated title at the
    same position. Both sides are tokenized with ``nltk.word_tokenize`` and
    the score is smoothed with NLTK's ``SmoothingFunction().method1``.
    """
    # corpus_bleu expects a list of reference *lists* per hypothesis, hence
    # the extra level of nesting around each tokenized reference.
    references = []
    for reference in actual_titles:
        references.append([nltk.word_tokenize(reference)])

    hypotheses = list(map(nltk.word_tokenize, generated_titles))

    return corpus_bleu(
        references,
        hypotheses,
        smoothing_function=SmoothingFunction().method1,
    )

# Function to compute ROUGE scores
def compute_rouge_scores(generated_titles, actual_titles):
    """Return ROUGE scores averaged over all (generated, actual) title pairs.

    The generated titles are passed as hypotheses and the actual titles as
    references to ``Rouge.get_scores`` with ``avg=True``; its result is
    returned unchanged.
    """
    scorer = Rouge()
    return scorer.get_scores(generated_titles, actual_titles, avg=True)

# Evaluate generated titles
def evaluate_titles(generated_titles, actual_titles):
    """Score generated titles against the actual ones.

    Returns:
        A ``(bleu, rouge)`` tuple: the value from ``compute_bleu_scores``
        followed by the value from ``compute_rouge_scores``.
    """
    return (
        compute_bleu_scores(generated_titles, actual_titles),
        compute_rouge_scores(generated_titles, actual_titles),
    )

# Run evaluation: score the generated summaries against the original titles.
# NOTE(review): the generation input was built as "<title>: <abstract>", so the model
# saw each reference title; treat these scores as optimistic.
bleu_score, rouge_score = evaluate_titles(title_summaries, titles)

# Print evaluation results
print(f"BLEU Score: {bleu_score}")
print(f"ROUGE Scores: {rouge_score}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/460M [00:00<?, ?B/s]



Total time taken: 177.80 seconds
BLEU Score: 0.11712522347291397
ROUGE Scores: {'rouge-1': {'r': 0.2170229021064934, 'p': 0.3501892857142862, 'f': 0.26017320430332824}, 'rouge-2': {'r': 0.14858752150627155, 'p': 0.26791666666666675, 'f': 0.1840663693154553}, 'rouge-l': {'r': 0.2160669316880229, 'p': 0.3489404761904767, 'f': 0.2591140007016323}}


In [16]:
# Function to compare and print some example titles and summaries
def compare_examples(actual_titles, generated_summaries, num_examples=10):
    """Print actual titles next to their generated summaries.

    Pairs are taken positionally from the start of both inputs. At most
    ``num_examples`` pairs are printed; if either input holds fewer items,
    printing stops at the shorter one instead of raising ``IndexError``.

    Args:
        actual_titles: Iterable of reference title strings.
        generated_summaries: Iterable of generated strings, aligned with
            ``actual_titles``.
        num_examples: Maximum number of pairs to print.
    """
    print("Comparing actual titles and generated summaries:")
    pairs = zip(actual_titles, generated_summaries)
    for position, (title, summary) in enumerate(pairs, start=1):
        # Stop once the requested number of examples has been shown.
        if position > num_examples:
            break
        print(f"Example {position}:")
        print(f"Original Title: {title}")
        print(f"Generated Summary: {summary}")
        print("----")

# Show the first 10 original-title / generated-summary pairs for a qualitative check
compare_examples(titles, title_summaries, num_examples=10)

Comparing actual titles and generated summaries:
Example 1:
Original Title: Florida Exit Polls: How Different Groups Voted
Generated Summary: Florida Exit Polls: How different
----
Example 2:
Original Title: International Effort Seeks to Counter Jihadists in Africa
Generated Summary: International Effort Seeks to Counter
----
Example 3:
Original Title: Inspection of Russian Trucks at Ukraine Border Begins
Generated Summary:  Teams from Russia, Ukraine and the
----
Example 4:
Original Title: Adam Kinzinger’s Lonely Mission
Generated Summary: Adam Kinzinger, a six
----
Example 5:
Original Title: Arrests in Philippine Province
Generated Summary:  Dozens arrested after martial law was imposed
----
Example 6:
Original Title: An Ever-Bleaker Syria, From All Vantage Points 
Generated Summary:  Syria enters its fifth year of conflict
----
Example 7:
Original Title: Taliban Seize Building for Attack on Afghan Government Offices
Generated Summary: Taliban Seize Building for Attack
----
Example 8