In [1]:
!pip install transformers datasets

Defaulting to user installation because normal site-packages is not writeable


In [1]:
# !pip install transformers==4.37.2
# !pip install datasets==2.17.0
!pip install evaluate
!pip install rouge-score

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [1]:
#importing libraries

import torch
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset

import pandas as pd
import numpy as np
import warnings

from tqdm import tqdm
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


# Technology Data

In [2]:
#loading the training dataset

df = pd.read_csv('/storage/ice1/6/4/tchavan3/technology_train.csv')

In [3]:
#using the bart tokenizer

tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

# Preprocess the input and output text
def preprocess_function(examples):
    inputs = tokenizer(examples['text'], max_length=1024, truncation=True, padding='max_length')
    labels = tokenizer(examples['summary'], max_length=128, truncation=True, padding='max_length')

    # Set labels for training
    inputs['labels'] = labels['input_ids']
    return inputs

In [4]:
#loading the paraphrased datasets

df_food = pd.read_csv('food-aug.csv')
df_sports = pd.read_csv('sports-aug.csv')
df_entertainment = pd.read_csv('entertainment-aug.csv')
df_architecture = pd.read_csv('architecture_aug.csv')

In [5]:
#getting a combined dataset

result = pd.concat([df_food[['text', 'summary']], df_sports[['text', 'summary']], df_entertainment[['text', 'summary']], df_architecture[['text', 'summary']], df[['text', 'summary']]], ignore_index=True)

In [6]:
result

Unnamed: 0,text,summary
0,"**""Taste of the Past: Revival of Classic Briti...",Traditional British cuisine is experiencing a ...
1,**Federal Regulations Set to Revolutionize Ame...,Federal regulations aim to reduce food waste a...
2,**Sustainable Seafood: A Growing Concern for C...,Coastal communities face unprecedented pressur...
3,"""Foodies on the Move: The Rise of Plant-Based ...",Plant-based cuisine is revolutionizing the foo...
4,"**""Revolutionizing the World of Fruit: The Ris...",Apple's versatility is revolutionizing the cul...
...,...,...
53973,Perhaps Microsoft has finally gotten a little ...,...
53974,Microsoft today released updates to fix at lea...,...
53975,Microsoft today issued software updates to fix...,"The latest news on computer, technology and ne..."
53976,"Vote: Abolish Middle-East ""THEOCRACY & MONARCH...",Need to Know - PostGlobal on PostGlobal; blog ...


In [7]:
from datasets import Dataset

# Convert pandas DataFrame to Hugging Face Dataset
hf_dataset = Dataset.from_pandas(result)

# Tokenize the dataset
tokenized_datasets = hf_dataset.map(preprocess_function, batched=True, remove_columns=["text", "summary"])


Map: 100%|██████████| 53978/53978 [02:25<00:00, 371.73 examples/s]


In [8]:
#splitting the dataset

from sklearn.model_selection import train_test_split
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

In [None]:
# We use the BART large dataset, loading it

model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

# Define training arguments
training_args = TrainingArguments(
    output_dir="/storage/ice1/6/4/tchavan3/results-new",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy='epoch',
#     logging_dir='/storage/ice1/6/4/tchavan3/logs',
#     logging_steps=10,
#     report_to="none",  # Disable unnecessary logging reports
    
)


In [None]:
# Code for metrics to evaluate the performance

from evaluate import load
# Load the ROUGE metric
import evaluate
metric = evaluate.load('rouge')

def compute_metrics(eval_pred):
    predictions, labels=eval_pred # obtaining predictions and true labels
    
    # decoding predictions
    decoded_preds=tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # obtaining the true labels tokens, while eliminating any possible masked token (i.e: label=-100)
    labels=np.where(labels!=-100, labels, tokenizer.pad_token_id)
    decoded_labels=tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # rouge expects a newline after each sentence
    decoded_preds=['\n'.join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels=['\n'.join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    # computing rouge score
    result=metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result={key: value.mid.fmeasure*100 for key, value in result.items()} # extracting some results
    
    # add mean-genrated length
    prediction_lens=[np.count_nonzero(pred!=tokenizer.pad_token_id) for pred in predictions]
    result['gen_len']=np.mean(prediction_lens)
    return {k: round(v,4) for k,v in result.items()}

In [None]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # 20% data for evaluation
#     compute_metrics=compute_metrics,
)

In [None]:
# Train the model with a progress bar (tqdm is automatically included)
# trainer.train()
for epoch in tqdm(range(training_args.num_train_epochs), desc="Training Epochs"):
    trainer.train()
    checkpoint_path = f"/storage/ice1/6/4/tchavan3/results/checkpoint_epoch_{epoch}.pth"
    torch.save({
        'epoch': epoch,
#             'model_state_dict': model.state_dict(),
#             'optimizer_state_dict': optimizer.state_dict(),
#             # Add any other necessary components to save
    }, checkpoint_path)

Training Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss
1,0.35,0.327209
2,0.3009,0.32013
3,0.2705,0.324577
4,0.2426,0.326094
5,0.217,0.332994
6,0.1954,0.341063
7,0.178,0.350172
8,0.1635,0.356447


In [None]:
#running evaluation
with torch.no_grad():
    model.eval()
    validation=trainer.evaluate(eval_dataset=eval_dataset)
print(validation)

In [44]:
#code to empty cache when cuda throws memory error after running several cells

torch.cuda.empty_cache()

In [None]:
# Testing evaluation pipelines

text = """ Like all gadgets, cellphones can break. In fact, our habit of carrying our phones constantly--even in bad weather--and stuffing them into our pockets and bags makes them more prone to breakage. Not all phone mishaps can be fixed, but many can, either at home or by professionals. For remedies to 10 common cellphone accidents, read on. """# Tokenize the input text

inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
 
# Generate summary (adjust max_length and min_length as needed)

summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=30, num_beams=4, early_stopping=True)

summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 

inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
 
# Generate summary (adjust max_length and min_length as needed)
summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=30, num_beams=4, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
from transformers import BartTokenizer, BartModel, BartForConditionalGeneration

tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
# model = BartModel.from_pretrained('facebook/bart-large')
path = "/storage/ice1/6/4/tchavan3/results/" + "checkpoint-3750"
model = BartForConditionalGeneration.from_pretrained(path)
# model.load_state_dict(torch.load(checkpoint_file)['model'])

text = testdf["text"][2]

inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)

# last_hidden_states = outputs.last_hidden_state



In [None]:
outputs

In [None]:
summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=30, num_beams=4, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [53]:
summary

"Tim Cook thinks he knows how to put $59.7 billion to good use One of the things that's keeping Apple's market cap from overtaking Exxon Mobil's XOM -- besides Steve Jobs' health problems and the world's unquenchable thirst for petroleum products -- is the fear that the company will do something stupid with the nearly $60 billion in cash and marketable securities that seems to be burning a hole in Wall Street's pocket. The Street has made it abundantly clear what it thinks Apple should do with that cash: Declare a dividend or launch a stock repurchase program or both -- anything to drive up the value of institutional investors' Apple holdings. What the analysts who work for those institutions fear is that"

In [28]:
inputs["input_ids"]

tensor([[    0,  2011,    70, 21485,     6,  3551, 15797,    64,  1108,     4,
            96,   754,     6,    84, 10870,     9,  3406,    84,  4247,  5861,
          5579, 12963,    11,  1099,  1650,  5579,   463, 32189,   106,    88,
            84, 12189,     8,  5565,   817,   106,    55, 16292,     7,  1108,
          1580,     4,  1491,    70,  1028, 24601,  7527,    64,    28,  4460,
             6,    53,   171,    64,     6,  1169,    23,   184,    50,    30,
          5197,     4,   286, 26552,     7,   158,  1537, 13605,  9960,     6,
          1166,    15,     4,  1437,     2]])

In [17]:
eval_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 9996
})

In [27]:
inputs

{'input_ids': tensor([[    0,  2011,    70, 21485,     6,  3551, 15797,    64,  1108,     4,
            96,   754,     6,    84, 10870,     9,  3406,    84,  4247,  5861,
          5579, 12963,    11,  1099,  1650,  5579,   463, 32189,   106,    88,
            84, 12189,     8,  5565,   817,   106,    55, 16292,     7,  1108,
          1580,     4,  1491,    70,  1028, 24601,  7527,    64,    28,  4460,
             6,    53,   171,    64,     6,  1169,    23,   184,    50,    30,
          5197,     4,   286, 26552,     7,   158,  1537, 13605,  9960,     6,
          1166,    15,     4,  1437,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])}

In [31]:
!pip install rouge_score

Defaulting to user installation because normal site-packages is not writeable
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting absl-py
  Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.7/133.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=6c289d05247247fe9b93160d4224c3a33346011aa9adf1e3a594d7350d6058d7
  Stored in directory: /home/hice1/tchavan3/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: absl-py, rouge_score
Successfully installed absl-py-2.1.0 rouge_score-0.1.2


In [13]:
from rouge_score import rouge_scorer

In [14]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

In [38]:
a = 'Simple home remedies for repairing your mobile phone--and when to get professional help.'
b = 'How to fix cellphones that break, or fix them yourself.    For remedies to 10 common cellphone accidents, read on.'

In [35]:
scores

{'rouge1': Score(precision=0.15789473684210525, recall=0.21428571428571427, fmeasure=0.18181818181818182),
 'rougeL': Score(precision=0.10526315789473684, recall=0.14285714285714285, fmeasure=0.12121212121212122)}

In [39]:
scores = scorer.score(b,a)

In [40]:
scores

{'rouge1': Score(precision=0.21428571428571427, recall=0.15789473684210525, fmeasure=0.18181818181818182),
 'rougeL': Score(precision=0.14285714285714285, recall=0.10526315789473684, fmeasure=0.12121212121212122)}

In [None]:
tokenized_val=val_ds.map(preprocess_func, batched=True, remove_columns=['id', 'dialogue', 'summary'])

In [None]:
validation=trainer.evaluate(eval_dataset=tokenized_val)
print(validation)

In [23]:
eval_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 9996
})

In [12]:
testdf = pd.read_csv('/storage/ice1/6/4/tchavan3/technology_test.csv')

In [None]:

scores = []
for i in range(len(testdf['summary'])):
    text = testdf['text'][i] #torch.tensor(testdf['text'][i], device="cuda")
#     inputs = tokenizer(text, return_tensors="pt")
    inputs = tokenizer(text, max_length=1024, truncation=True, padding='max_length', return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=30, num_beams=2, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    summtest = testdf["summary"][i]
    score = scorer.score(summary, summtest)
    print(score)
    scores.append(score)


{'rouge1': Score(precision=0.35714285714285715, recall=0.2, fmeasure=0.25641025641025644), 'rougeL': Score(precision=0.14285714285714285, recall=0.08, fmeasure=0.10256410256410256)}
{'rouge1': Score(precision=0.26666666666666666, recall=0.3076923076923077, fmeasure=0.28571428571428575), 'rougeL': Score(precision=0.23333333333333334, recall=0.2692307692307692, fmeasure=0.25)}
{'rouge1': Score(precision=1.0, recall=0.4793388429752066, fmeasure=0.6480446927374302), 'rougeL': Score(precision=1.0, recall=0.4793388429752066, fmeasure=0.6480446927374302)}
{'rouge1': Score(precision=0.28, recall=0.07865168539325842, fmeasure=0.12280701754385967), 'rougeL': Score(precision=0.16, recall=0.0449438202247191, fmeasure=0.07017543859649122)}
{'rouge1': Score(precision=1.0, recall=0.8571428571428571, fmeasure=0.923076923076923), 'rougeL': Score(precision=1.0, recall=0.8571428571428571, fmeasure=0.923076923076923)}
{'rouge1': Score(precision=0.2, recall=0.12, fmeasure=0.15), 'rougeL': Score(precision=0

In [None]:
for i in range(len(testdf['summary'])):
    text = testdf['text'][i]
    # Move inputs to GPU after tokenization
    inputs = tokenizer(text, return_tensors="pt")
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    
    # Generate summary on GPU
    summary_ids = model.generate(
        inputs["input_ids"], 
        max_length=150, 
        min_length=30, 
        num_beams=4, 
        early_stopping=True
    )
    
    # Decode the generated summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    summtest = testdf["summary"][i]
    score = scorer.score(summary, summtest)
    scores.append(score)

In [46]:
testdf["text"][0]

'Like all gadgets, cellphones can break. In fact, our habit of carrying our phones constantly--even in bad weather--and stuffing them into our pockets and bags makes them more prone to breakage. Not all phone mishaps can be fixed, but many can, either at home or by professionals. For remedies to 10 common cellphone accidents, read on.'

In [None]:
import torch
from tqdm import tqdm
import os

# Set CUDA launch blocking for better error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Move model to GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Initialize lists to store results
generated_summaries = []
scores = []

def preprocess_text(text):
    # Remove any None or empty strings
    if text is None or not isinstance(text, str):
        return ""
    # Remove excessive whitespace
    text = ' '.join(text.split())
    return text

def generate_summary(text, max_length=150, min_length=30):
    # Tokenize with padding and truncation
    inputs = tokenizer(
        text,
        max_length=1024,  # BART's maximum input length
        truncation=True,
        padding=True,
        return_tensors="pt"
    )
    
    # Move to GPU
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Generate summary
    with torch.no_grad():
        summary_ids = model.generate(
            inputs["input_ids"],
            max_length=max_length,
            min_length=min_length,
            num_beams=4,
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    return summary_ids

# Process all texts with progress bar
for i in tqdm(range(len(testdf)), desc="Generating Summaries"):
    try:
        # Get and preprocess text
        text = preprocess_text(testdf["text"][i])
        
        # Skip empty texts
        if not text:
            print(f"Warning: Empty or invalid text at index {i}")
            generated_summaries.append("")
            scores.append(None)
            continue
            
        # Generate summary
        summary_ids = generate_summary(text)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        generated_summaries.append(summary)
        
        # Calculate score if reference summary exists
        if 'summary' in testdf.columns:
            reference_summary = testdf["summary"][i]
            score = scorer.score(summary, reference_summary)
            scores.append(score)
            
    except RuntimeError as e:
        if "CUDA" in str(e):
            print(f"CUDA error at index {i}. Text length: {len(text)}")
            print(f"Error details: {str(e)}")
            # Try processing on CPU as fallback
            try:
                model.to('cpu')
                inputs = tokenizer(text, return_tensors="pt")
                summary_ids = model.generate(
                    inputs["input_ids"],
                    max_length=150,
                    min_length=30,
                    num_beams=4,
                    early_stopping=True
                )
                summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
                model.to(device)  # Move back to GPU
                generated_summaries.append(summary)
                if 'summary' in testdf.columns:
                    score = scorer.score(summary, reference_summary)
                    scores.append(score)
            except Exception as cpu_e:
                print(f"CPU fallback also failed: {str(cpu_e)}")
                generated_summaries.append(None)
                scores.append(None)
        else:
            print(f"Error processing text {i}: {str(e)}")
            generated_summaries.append(None)
            scores.append(None)
            
    except Exception as e:
        print(f"Unexpected error processing text {i}: {str(e)}")
        generated_summaries.append(None)
        scores.append(None)
        continue

# Add generated summaries to dataframe
testdf['generated_summary'] = generated_summaries
if scores:
    testdf['score'] = scores

# Calculate average score if scores exist
if scores:
    valid_scores = [s for s in scores if s is not None]
    if valid_scores:
        avg_score = sum(valid_scores) / len(valid_scores)
        print(f"\nAverage Score: {avg_score:.4f}")
        print(f"Successfully processed: {len(valid_scores)}/{len(scores)} texts")