In [None]:
!pip install torch==1.13.1 --quiet
!pip install torchdata==0.5.1 --quiet
!pip install transformers==4.27.2 --quiet
!pip install datasets==2.11.0 --quiet

## Import Libraries

In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig
from transformers import set_seed

## Load Dataset

In [None]:
# Hugging Face Hub identifier of the dialogue-summarisation dataset.
huggingface_dataset_name = "knkarthick/dialogsum"

# Download the dataset (train / test / validation splits); later calls reuse
# the copy cached on disk.
dataset = load_dataset(path=huggingface_dataset_name)

Downloading readme:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

Downloading and preparing dataset csv/knkarthick--dialogsum to /root/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-c8fac5d84cd35861/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-c8fac5d84cd35861/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Rows of the test split to preview.
example_indices = [40, 200]

# Horizontal separator: 99 dashes, the same string the previous
# `'-'.join('' for x in range(100))` expression produced.
dash_line = '-' * 99

for i, index in enumerate(example_indices):
    # Fetch the row once and reuse it for both fields.
    example = dataset['test'][index]

    print(dash_line)
    print("Ex", i + 1)  # 1-based label for the reader
    print(dash_line)
    print('Input')
    print(example['dialogue'])
    print('Human summary')
    print(example['summary'])

---------------------------------------------------------------------------------------------------
Ex 1
---------------------------------------------------------------------------------------------------
Input
#Person1#: What time is it, Tom?
#Person2#: Just a minute. It's ten to nine by my watch.
#Person1#: Is it? I had no idea it was so late. I must be off now.
#Person2#: What's the hurry?
#Person1#: I must catch the nine-thirty train.
#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there.
Human summary
#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.
---------------------------------------------------------------------------------------------------
Ex 2
---------------------------------------------------------------------------------------------------
Input
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Pers

## Load Model & Tokenizer

In [None]:
# Checkpoint name; the tokenizer cell below loads from the same name.
model_name = "google/flan-t5-base"

# Load the pretrained sequence-to-sequence model for that checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Load the tokenizer that belongs to `model_name`, asking for the fast
# implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
# Tokenizer demo: encode a single sentence into PyTorch tensors.
sentence = "What time is it?"
sentence_encoded = tokenizer(
    sentence,
    return_tensors="pt",
)

# Bare last expression so the notebook displays the encoding
# (`input_ids` and `attention_mask`).
sentence_encoded

{'input_ids': tensor([[363,  97,  19,  34,  58,   1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [None]:
# Decode the ids of the first (and only) sequence in the batch back into
# text, dropping special tokens.
first_sequence_ids = sentence_encoded["input_ids"][0]
sentence_decoded = tokenizer.decode(first_sequence_ids, skip_special_tokens=True)

In [None]:
# Display the round-tripped text; it should read the same as `sentence`.
sentence_decoded

'What time is it?'

## Inference

In [None]:
# Select one example from the test split.
# A plain integer index returns a single row whose fields are strings; the
# previous one-element list (`[40]`) returned a batch, so the dialogue and the
# summary were printed as Python lists with escaped newlines.
example_index = 40
example = dataset['test'][example_index]

# Model input and the human-written reference summary.
dialogue = example['dialogue']
summary = example['summary']

# Sampling is stochastic (`do_sample=True`); seed the RNGs so that re-running
# this cell on the same setup repeats the same generation.
set_seed(42)

# Generation settings: at most 50 new tokens, sampled at temperature 0.7.
generation_config = GenerationConfig(max_new_tokens=50,
                                     do_sample=True,
                                     temperature=0.7)

# Encode the input. The raw dialogue is passed as-is, without any instruction
# text around it.
inputs_encoded = tokenizer(dialogue, return_tensors='pt')

# Generate, passing the attention mask explicitly alongside the input ids,
# and keep the first (only) sequence of the returned batch.
model_output = model.generate(
    input_ids=inputs_encoded["input_ids"],
    attention_mask=inputs_encoded["attention_mask"],
    generation_config=generation_config,
)[0]

# Decode the generated ids back into text, dropping special tokens.
output = tokenizer.decode(model_output, skip_special_tokens=True)


print("Input: ", dialogue)
print(dash_line)
print( "Human summary: ", summary)
print(dash_line)
print("Model Output: ", output)

Input:  ["#Person1#: What time is it, Tom?\n#Person2#: Just a minute. It's ten to nine by my watch.\n#Person1#: Is it? I had no idea it was so late. I must be off now.\n#Person2#: What's the hurry?\n#Person1#: I must catch the nine-thirty train.\n#Person2#: You've plenty of time yet. The railway station is very close. It won't take more than twenty minutes to get there."]
---------------------------------------------------------------------------------------------------
Human summary:  ['#Person1# is in a hurry to catch a train. Tom tells #Person1# there is plenty of time.']
---------------------------------------------------------------------------------------------------
Model Output:  #Person1#: I'm sorry, Tom. The train leaves at 10 and I'm on my way home.
