In [None]:
# Report the GPU attached to this runtime (driver/CUDA versions, memory usage).
!nvidia-smi

Tue Nov 30 06:18:35 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P8    27W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive into the Colab filesystem; the dataset CSV is read from
# a path under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install --quiet transformers==4.5.0
!pip install --quiet pytorch-lightning==1.2.7

[K     |████████████████████████████████| 2.1 MB 5.1 MB/s 
[K     |████████████████████████████████| 895 kB 48.5 MB/s 
[K     |████████████████████████████████| 3.3 MB 33.0 MB/s 
[K     |████████████████████████████████| 830 kB 5.3 MB/s 
[K     |████████████████████████████████| 329 kB 45.1 MB/s 
[K     |████████████████████████████████| 596 kB 34.4 MB/s 
[K     |████████████████████████████████| 132 kB 41.2 MB/s 
[K     |████████████████████████████████| 829 kB 31.6 MB/s 
[K     |████████████████████████████████| 1.1 MB 28.7 MB/s 
[K     |████████████████████████████████| 160 kB 49.7 MB/s 
[K     |████████████████████████████████| 271 kB 44.9 MB/s 
[K     |████████████████████████████████| 192 kB 43.4 MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone


In [None]:
import json
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap

from transformers import (AdamW,
                          T5ForConditionalGeneration,
                          T5TokenizerFast as T5Tokenizer)
from tqdm.auto import tqdm

In [None]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
# NOTE(review): this rebinds `rcParams`, so the pylab import above is redundant.
from matplotlib import rcParams

# Render figures inline in the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
# Default figure size (width, height) for plots created after this point.
rcParams['figure.figsize'] = 16, 10

In [None]:
# Fix the global random seed (42) so that repeated runs are comparable.
pl.seed_everything(42)

Global seed set to 42


42

In [None]:
# Load the WikiHow CSV from the mounted Drive folder and preview the first rows.
# NOTE(review): the absolute Drive path is specific to the author's account.
wiki = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NLP/Project/Text Summarization/wikihowAll.csv')

wiki.head()

Unnamed: 0,headline,title,text
0,"\nKeep related supplies in the same area.,\nMa...",How to Be an Organized Artist1,"If you're a photographer, keep all the necess..."
1,\nCreate a sketch in the NeoPopRealist manner ...,How to Create a Neopoprealist Art Work,See the image for how this drawing develops s...
2,"\nGet a bachelor’s degree.,\nEnroll in a studi...",How to Be a Visual Effects Artist1,It is possible to become a VFX artist without...
3,\nStart with some experience or interest in ar...,How to Become an Art Investor,The best art investors do their research on t...
4,"\nKeep your reference materials, sketches, art...",How to Be an Organized Artist2,"As you start planning for a project or work, ..."


In [None]:
# Narrow the frame to the article body and its headline, in that order.
wiki = wiki.loc[:, ['text', 'headline']]
wiki.head()

Unnamed: 0,text,headline
0,"If you're a photographer, keep all the necess...","\nKeep related supplies in the same area.,\nMa..."
1,See the image for how this drawing develops s...,\nCreate a sketch in the NeoPopRealist manner ...
2,It is possible to become a VFX artist without...,"\nGet a bachelor’s degree.,\nEnroll in a studi..."
3,The best art investors do their research on t...,\nStart with some experience or interest in ar...
4,"As you start planning for a project or work, ...","\nKeep your reference materials, sketches, art..."


In [None]:
# Relabel the two columns as ('text', 'summary'). The labels must be a flat
# list: a nested list such as [['text', 'summary']] makes pandas build a
# MultiIndex, after which plain lookups like row['text'] no longer address a
# single column.
wiki.columns = ['text', 'summary']
# Discard rows where either the article or its summary is missing.
wiki = wiki.dropna()
wiki.head()

Unnamed: 0,text,summary
0,"If you're a photographer, keep all the necess...","\nKeep related supplies in the same area.,\nMa..."
1,See the image for how this drawing develops s...,\nCreate a sketch in the NeoPopRealist manner ...
2,It is possible to become a VFX artist without...,"\nGet a bachelor’s degree.,\nEnroll in a studi..."
3,The best art investors do their research on t...,\nStart with some experience or interest in ar...
4,"As you start planning for a project or work, ...","\nKeep your reference materials, sketches, art..."


In [None]:
# Draw a random subset (fraction 0.02333 of the rows) and cast every value to str.
wiki_dataset = wiki.sample(frac=.02333).astype(str)

len(wiki_dataset), wiki_dataset.dtypes

(5024, text        object
 headline    object
 dtype: object)

In [None]:
# Hold out 10% of the sampled rows; the rest is used for training.
# NOTE(review): no random_state is passed, so the split presumably depends on
# the global seed set by pl.seed_everything above — confirm before relying on
# a specific row position in `test`.
train, test = train_test_split(wiki_dataset, test_size=0.1)
train.shape, test.shape

((4521, 2), (503, 2))

In [None]:
# Give the training split a fresh positional index and flat column labels.
train = train.reset_index(drop=True)
train.columns = ['text', 'summary']

# Same treatment for the held-out split.
test = test.reset_index(drop=True)
test.columns = ['text', 'summary']

train.columns, test.columns

(Index(['text', 'summary'], dtype='object'),
 Index(['text', 'summary'], dtype='object'))

In [None]:
class WikiHowDataset(Dataset):
    """Dataset that tokenizes (text, summary) rows of a DataFrame on access.

    Each item is produced from one row of ``data``; the frame must expose
    ``text`` and ``summary`` columns holding strings.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        text_max_token_len: int = 512,
        summary_max_token_len: int = 128
    ):
        """Store the frame, the tokenizer and the two length limits.

        Args:
            data: Frame with ``text`` and ``summary`` columns.
            tokenizer: Tokenizer used for both the article and the summary.
            text_max_token_len: Articles are padded/truncated to this many tokens.
            summary_max_token_len: Summaries are padded/truncated to this many tokens.
        """
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def _encode(self, text: str, max_length: int):
        """Tokenize ``text`` to exactly ``max_length`` tokens (padded or truncated)."""
        # Use the tokenizer handed to the constructor, not a notebook-level
        # global, so the dataset works with whichever tokenizer it was given.
        return self.tokenizer(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

    def __getitem__(self, index: int):
        """Return the raw strings and the encoded tensors for row ``index``.

        Returns:
            dict with keys ``text``, ``summary``, ``text_input_ids``,
            ``text_attention_mask``, ``labels`` and ``labels_attention_mask``;
            the four tensor entries are flattened to one dimension.
        """
        data_row = self.data.iloc[index]
        text = data_row['text']
        summary = data_row['summary']

        text_encoding = self._encode(text, self.text_max_token_len)
        summary_encoding = self._encode(summary, self.summary_max_token_len)

        # Replace padding positions in the target with -100, the label value
        # the model's loss skips, so padding does not contribute to training.
        labels = summary_encoding['input_ids']
        labels[labels == self.tokenizer.pad_token_id] = -100

        return dict(
            text=text,
            summary=summary,
            text_input_ids=text_encoding['input_ids'].flatten(),
            text_attention_mask=text_encoding['attention_mask'].flatten(),
            labels=labels.flatten(),
            labels_attention_mask=summary_encoding['attention_mask'].flatten()
        )

In [None]:
class WikiHowDataModule(pl.LightningDataModule):
    """Lightning data module serving WikiHow train and held-out loaders.

    The held-out frame (``test_df``) backs both the validation and the test
    loader.
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        text_max_token_len: int = 512,
        summary_max_token_len: int = 128
    ):
        """Remember the frames, tokenizer, batch size and token limits."""
        super().__init__()
        self.train_df, self.test_df = train_df, test_df

        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def _build_dataset(self, frame):
        """Wrap ``frame`` in a WikiHowDataset using this module's settings."""
        return WikiHowDataset(
            frame,
            self.tokenizer,
            self.text_max_token_len,
            self.summary_max_token_len
        )

    def _build_loader(self, dataset, shuffle):
        """Create a two-worker DataLoader over ``dataset``."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=2
        )

    def setup(self, stage=None):
        """Instantiate the train and held-out datasets."""
        self.train_dataset = self._build_dataset(self.train_df)
        self.test_dataset = self._build_dataset(self.test_df)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._build_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Unshuffled loader over the held-out dataset (used for validation)."""
        return self._build_loader(self.test_dataset, shuffle=False)

    def test_dataloader(self):
        """Unshuffled loader over the held-out dataset (used for testing)."""
        return self._build_loader(self.test_dataset, shuffle=False)

In [None]:
# Name of the pretrained checkpoint; the tokenizer below and the model built
# in WikiHowModel are both loaded from it.
MODEL_NAME = "t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

In [None]:
# text_token_counts, summary_token_counts = [], []

# for _, row in train.iterrows():
#     text_token_count = len( tokenizer.encode(str(row['text'])) )
#     text_token_counts.append(text_token_count)

#     summary_token_count = len(tokenizer.encode(str(row['summary'])))
#     summary_token_counts.append(summary_token_count)


In [None]:
# fig, (ax1, ax2) = plt.subplots(1, 2)

# sns.histplot(text_token_counts, ax=ax1)
# ax1.set_title("full text token counts")

# sns.histplot(summary_token_counts, ax=ax2)
# ax2.set_title("full summary token counts")

In [None]:
# Training schedule: number of passes over the data and examples per batch.
N_EPOCHS = 3
BATCH_SIZE = 4

# Token-length limits are left at the data module's defaults.
data_module = WikiHowDataModule(
    train_df=train,
    test_df=test,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
)

In [None]:
class WikiHowModel(pl.LightningModule):
    """LightningModule that fine-tunes a pretrained T5 model for summarization."""

    def __init__(self):
        """Load the pretrained weights named by ``MODEL_NAME``."""
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``.

        Args:
            input_ids: Token ids of the source text.
            attention_mask: Attention mask for ``input_ids``.
            decoder_attention_mask: Attention mask for the target sequence.
            labels: Target token ids; passed through to the wrapped model.

        Returns:
            Tuple of the ``loss`` and ``logits`` attributes of the model output.
        """
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )

        return output.loss, output.logits

    def _shared_step(self, batch, stage):
        """Compute the loss for ``batch`` and log it as ``'<stage>_loss'``."""
        loss, _ = self(
            input_ids=batch['text_input_ids'],
            attention_mask=batch['text_attention_mask'],
            decoder_attention_mask=batch['labels_attention_mask'],
            labels=batch['labels']
        )

        self.log(f'{stage}_loss', loss, prog_bar=True, logger=True)

        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss; logged under ``train_loss``."""
        return self._shared_step(batch, 'train')

    def validation_step(self, batch, batch_idx):
        """Return the validation loss; logged under ``val_loss``."""
        return self._shared_step(batch, 'val')

    def test_step(self, batch, batch_idx):
        """Return the test loss; logged under ``test_loss``."""
        return self._shared_step(batch, 'test')

    def configure_optimizers(self):
        """Optimize all parameters with AdamW at a learning rate of 1e-4."""
        return AdamW(self.parameters(), lr=0.0001)

In [None]:
# Instantiate the LightningModule; its constructor loads the pretrained
# weights named by MODEL_NAME.
model = WikiHowModel()

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
# Keep only the single best checkpoint, judged by the lowest logged `val_loss`.
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

logger = TensorBoardLogger('lightning_logs', name='wikihow')

trainer = pl.Trainer(
    logger=logger,
    # Callback instances are registered through `callbacks`; handing a
    # ModelCheckpoint object to `checkpoint_callback=` is deprecated in Lightning.
    callbacks=[checkpoint_callback],
    max_epochs=N_EPOCHS,
    gpus=1,
    progress_bar_refresh_rate=1
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [None]:
# Fine-tune the model; training and validation batches come from data_module.
trainer.fit(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 1124: val_loss reached 2.18688 (best 2.18688), saving model to "/content/checkpoints/best-checkpoint.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 2249: val_loss reached 2.17171 (best 2.17171), saving model to "/content/checkpoints/best-checkpoint.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, step 3374: val_loss was not in top 1


1

In [None]:
# `load_from_checkpoint` is a classmethod that returns a NEW module with the
# saved weights, so it is called on the class and its result is kept.
trained_model = WikiHowModel.load_from_checkpoint("/content/checkpoints/best-checkpoint.ckpt")
# Inference only from here on: freeze the parameters and switch to eval mode.
trained_model.freeze()
trained_model

WikiHowModel(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseReluDense(
                (wi): Linear(in_features=768, out_features=3072, bias=False)
                (wo): Linear(in_featu

In [None]:
def summarize(text):
    """Generate a summary of ``text`` with the fine-tuned model.

    Relies on the notebook-level ``tokenizer`` and ``trained_model`` objects.

    Args:
        text: The article to summarize.

    Returns:
        The decoded generated sequence(s), concatenated into a single string.
    """
    # NOTE(review): the input limit here (1000 tokens) is larger than the
    # 512-token default the dataset uses for training inputs — confirm that
    # this is intended.
    text_encoding = tokenizer(
        text,
        max_length=1000,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )

    # Put the inputs on the same device as the model so generation works
    # whether the model sits on CPU or GPU.
    device = trained_model.device
    generated_ids = trained_model.model.generate(
        input_ids=text_encoding["input_ids"].to(device),
        attention_mask=text_encoding["attention_mask"].to(device),
        max_length=500,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=0.05,
        early_stopping=True
    )

    # Decode each generated sequence, dropping special tokens and tidying up
    # tokenization spacing.
    preds = [
        tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for gen_id in generated_ids
    ]

    return "".join(preds)

# Summarize one held-out article (row 188 of the test split). The column is
# selected by label; integer keys on a string-labelled row only work through
# pandas' positional fallback.
sample_row = test.iloc[188]['text']
model_summary = summarize(sample_row)
model_summary

  next_indices = next_tokens // vocab_size


'you can use a brush or a washcloth to apply your mask. Take a hot shower before applying your mask. If you do not have a brush, then use your fingertips., After applying your mask, remove any makeup and oil from your face., If using a store-bought mask, read the instructions carefully., If using a homemade mask, read the instructions carefully., When using a homemade mask, follow the directions carefully., For best results, use an eye'

In [None]:
# Show the generated summary next to the reference summary of the same row,
# selecting the reference by column label rather than by integer position.
print("MODEL SUMMARY\n",
      model_summary,
      "GROUND TRUTH\n",
      test.iloc[188]['summary'],
      sep='\n')

MODEL SUMMARY

you can use a brush or a washcloth to apply your mask. Take a hot shower before applying your mask. If you do not have a brush, then use your fingertips., After applying your mask, remove any makeup and oil from your face., If using a store-bought mask, read the instructions carefully., If using a homemade mask, read the instructions carefully., When using a homemade mask, follow the directions carefully., For best results, use an eye
GROUND TRUTH


Assess your skin.,
Prepare your mask.,
Get a brush.,
Cut some cucumber (optional).,
Refrigerate your items.,
Wash your face.,
Exfoliate.,
Open your pores.,
Apply the mask.,
Set cucumber slices on your eyes (optional).,
Set a timer and wait.,
Remove the mask.,
Follow with a toner and a moisturizer.,
Repeat the process once per week.
