In [1]:
!pip install opendatasets datasets transformers bitsandbytes accelerate peft --upgrade --quiet

In [2]:
import opendatasets as od
od.download("https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail")

Skipping, found downloaded files in "./newspaper-text-summarization-cnn-dailymail" (use force=True to force download)


In [3]:
import os
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig, GenerationConfig
from peft import LoraConfig, get_peft_model
import torch
import re

2024-04-13 13:54:26.610793: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-13 13:54:26.610853: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-13 13:54:26.612366: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [5]:
data = pd.read_csv('/kaggle/working/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv')
data = data.sample(1000)
data.head()

Unnamed: 0,id,article,highlights
195079,88864c4c813f280bf0a2773adc46924967b71528,By . Steve Robson and William Turvill . PUBLIS...,"Christi and Dave Cason, from Lake Elsinore, Ca..."
152903,51a10d69ad6716644c54fb6b9490a222d94b9139,"By . Tom Gardner . PUBLISHED: . 03:06 EST, 3 A...",Police investigating incident after Geoff Holt...
188552,802fb5b46df921647b546e72c7032c72436f9d93,Southampton climbed to third in the Premier Le...,Southampton moved up to third with their 1-0 w...
97470,09732b9d232aabdc7ea3ed6c5a4ed87fd71ab915,"By . Andrew Levy . With buttons, levers and fl...",The 61-year-old Harwell Dekatron (aka WITCH) c...
64097,b6060c602d01643adc500128eba223dfbbc57c14,Washington (CNN) -- Millions of phone calls ha...,"To date, Gov. Scott Walker has raised 7Â½ time..."


In [6]:
data.drop('id',axis=1,inplace=True)

In [7]:
data.head()

Unnamed: 0,article,highlights
195079,By . Steve Robson and William Turvill . PUBLIS...,"Christi and Dave Cason, from Lake Elsinore, Ca..."
152903,"By . Tom Gardner . PUBLISHED: . 03:06 EST, 3 A...",Police investigating incident after Geoff Holt...
188552,Southampton climbed to third in the Premier Le...,Southampton moved up to third with their 1-0 w...
97470,"By . Andrew Levy . With buttons, levers and fl...",The 61-year-old Harwell Dekatron (aka WITCH) c...
64097,Washington (CNN) -- Millions of phone calls ha...,"To date, Gov. Scott Walker has raised 7Â½ time..."


In [8]:
def preprocess_text(text):
  text = text.lower()
  text = re.sub('[^A-Za-z0-9]+',' ',text)
  return text

In [9]:
data['article'] = data['article'].apply(preprocess_text)
data['highlights'] = data['highlights'].apply(preprocess_text)

In [10]:
data.head()

Unnamed: 0,article,highlights
195079,by steve robson and william turvill published ...,christi and dave cason from lake elsinore cali...
152903,by tom gardner published 03 06 est 3 april 201...,police investigating incident after geoff holt...
188552,southampton climbed to third in the premier le...,southampton moved up to third with their 1 0 w...
97470,by andrew levy with buttons levers and flashin...,the 61 year old harwell dekatron aka witch com...
64097,washington cnn millions of phone calls have be...,to date gov scott walker has raised 7 times th...


In [11]:
data.reset_index(drop = True,inplace = True)

In [12]:
data['final_statement'] = ""

for i in range(len(data['final_statement'])):
    data.at[i,'final_statement'] = "Summarize the following article: \n\n" + str(data.at[i,'article']) + "\nSummary: " + str(data.at[i,'highlights'])

In [13]:
data.sample()['final_statement'].iloc[0]

'Summarize the following article: \n\n cnn there are approximately 600 million catholic women in the world but none will have a direct say in who the next pope will be the 115 cardinals voting for the pope are men one of these men will be chosen to succeed benedict xvi continuing an exclusively male club or is it two movies have been made about pope joan who according to legend was a ninth century englishwoman who disguised herself under voluminous clerical robes to become a priest something women are not allowed to be in the catholic church as the story goes joan outdid all the men in her religious studies and rose in the ranks of the cardinals to become pope she then went into labor during a papal procession and the mob descended on her and her child ending her reign diarmaid macculloch a theologian and historian from oxford university said the story of pope joan is a myth nothing but satirical fiction it keeps appealing to new anxieties and new interests he said so first it s mediev

In [14]:
data.head()

Unnamed: 0,article,highlights,final_statement
0,by steve robson and william turvill published ...,christi and dave cason from lake elsinore cali...,Summarize the following article: \n\nby steve ...
1,by tom gardner published 03 06 est 3 april 201...,police investigating incident after geoff holt...,Summarize the following article: \n\nby tom ga...
2,southampton climbed to third in the premier le...,southampton moved up to third with their 1 0 w...,Summarize the following article: \n\nsouthampt...
3,by andrew levy with buttons levers and flashin...,the 61 year old harwell dekatron aka witch com...,Summarize the following article: \n\nby andrew...
4,washington cnn millions of phone calls have be...,to date gov scott walker has raised 7 times th...,Summarize the following article: \n\nwashingto...


In [15]:
data = data[['final_statement']]
data.head()

Unnamed: 0,final_statement
0,Summarize the following article: \n\nby steve ...
1,Summarize the following article: \n\nby tom ga...
2,Summarize the following article: \n\nsouthampt...
3,Summarize the following article: \n\nby andrew...
4,Summarize the following article: \n\nwashingto...


In [16]:
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-1b1")

In [17]:
def tokenize(example):
  example['input_ids'] = tokenizer(example['final_statement'],padding = 'max_length', max_length = 512,truncation = True, return_tensors = 'pt').input_ids
  example['labels'] = tokenizer(example['final_statement'],padding = 'max_length', max_length = 512,truncation = True, return_tensors = 'pt').input_ids
  return example

In [18]:
train_data = Dataset.from_pandas(data)
train_tokenized = train_data.map(tokenize,batched=True,remove_columns=train_data.column_names)
train_tokenized

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 1000
})

In [19]:
quant_config = BitsAndBytesConfig(load_in_4bit=True,
                                  bnb_4bit_quant_type = 'nf4')

model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-1b1",quantization_config = quant_config)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [20]:
peft_params = LoraConfig(lora_alpha = 16,lora_dropout = 0.1,r=8,bias='none',task_type='CAUSAL_LM')
peft_model = get_peft_model(model,peft_params)

In [21]:
peft_model.print_trainable_parameters()

trainable params: 1,179,648 || all params: 1,066,493,952 || trainable%: 0.11060990995662018


In [22]:
training_args = TrainingArguments(
    output_dir = './model_checkpoints',
    save_total_limit = 1,
    auto_find_batch_size = True,
    learning_rate = 1e-3,
    num_train_epochs = 2
)

trainer = Trainer(model=peft_model,args=training_args,train_dataset=train_tokenized)

trainer.train()

trainer.model.save_pretrained("./final_model")
tokenizer.save_pretrained("./final_model")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Currently logged in as: [33momarmohamedrandoms[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
500,3.5808


('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/tokenizer.json')

In [27]:
del peft_model
torch.cuda.empty_cache()

In [23]:
tokenizer = AutoTokenizer.from_pretrained("final_model")
trained_model = AutoModelForCausalLM.from_pretrained("final_model")

In [40]:
article = """
Summarize the following article: \n\n
L1 & L2 regularization add constraints to the optimization problem. The curve H0 is the hypothesis. The solution to this system is the set of points where the H0 meets the constraints.

Now, in the case of L2 regularization, in most cases, the the hypothesis is tangential to the ||w||_2. The point of intersection has both x1 and x2 components. On the other hand, in L1, due to the nature of ||w||_1, the viable solutions are limited to the corners, which are on one axis only - in the above case x1. Value of x2 = 0. This means that the solution has eliminated the role of x2 leading to sparsity. Extend this to higher dimensions and you can see why L1 regularization leads to solutions to the optimization problem where many of the variables have value 0.

In other words, L1 regularization leads to sparsity.

\nSummary:\n 
"""

article = preprocess_text(article)
input_ids = tokenizer(article,padding = 'max_length', max_length = 250,truncation = True, return_tensors = 'pt').input_ids

In [41]:
outputs = trained_model.generate(input_ids,max_new_tokens=50)

In [42]:
tokenizer.decode(outputs[0],skip_special_tokens=True)

' summarize the following article l1 l2 regularization add constraints to the optimization problem the curve h0 is the hypothesis the solution to this system is the set of points where the h0 meets the constraints now in the case of l2 regularization in most cases the the hypothesis is tangential to the w 2 the point of intersection has both x1 and x2 components on the other hand in l1 due to the nature of w 1 the viable solutions are limited to the corners which are on one axis only in the above case x1 value of x2 0 this means that the solution has eliminated the role of x2 leading to sparsity extend this to higher dimensions and you can see why l1 regularization leads to solutions to the optimization problem where many of the variables have value 0 in other words l1 regularization leads to sparsity summary  the l1 regularization problem is a constrained optimization problem where the hypothesis is tangential to the w 2 the point of intersection has both x1 and x2 components on the o