In [1]:
# !pip install accelerate -U
# !pip install evaluate
# !pip install protobuf==3.20.0
# ! pip install datasets
# !pip install wandb -U
# !pip uninstall torch-gpu
# !pip install sentencepiece

In [4]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from transformers import AutoTokenizer
import evaluate
import nltk
import tqdm as tqdm
import torch
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\papu_\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Uses pegasus model trained by google for text summarization task. Trained by Masked Language Modelling and Gap-Sentence Prediction methods. https://huggingface.co/google/pegasus-xsum

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [6]:
from transformers import AutoModelForSeq2SeqLM

# model_name = 'google/pegasus-xsum' # This is large model, trying smaller models

model_name = 'google/mt5-small'
model_name = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_mt = AutoModelForSeq2SeqLM.from_pretrained(model_name).to('cpu')

In [7]:
# Finetune on samsum dataset
data = load_dataset('samsum')

Found cached dataset samsum (C:/Users/papu_/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 31.99it/s]


In [8]:
data

# To set on gpu
# ds.set_format("pt")

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [9]:
print('Features in dataset: {}'.format(data['train'].column_names))
print('Dialogue: {}'.format(data['train'][10]['dialogue']))
print('Summary: {}'.format(data['train'][10]['summary']))



Features in dataset: ['id', 'dialogue', 'summary']
Dialogue: Lucas: Hey! How was your day?
Demi: Hey there! 
Demi: It was pretty fine, actually, thank you!
Demi: I just got promoted! :D
Lucas: Whoa! Great news!
Lucas: Congratulations!
Lucas: Such a success has to be celebrated.
Demi: I agree! :D
Demi: Tonight at Death & Co.?
Lucas: Sure!
Lucas: See you there at 10pm?
Demi: Yeah! See you there! :D
Summary: Demi got promoted. She will celebrate that with Lucas at Death & Co at 10 pm.


## Use of text_target in tokenizer: This is because the languages you're translating between (Engish and German in this case) have different tokenization vocabularies. This implies that tokens will get tokenized differently. MarianMT models have seq2seq (encoder-decoder) architectures, and both the encoder and decoder each have their own embedding matrix. This means that the encoder will have an embedding vector for the token '▁doctor', whereas the decoder will learn an embedding vector for the token '▁do', an embedding vector for the token 'ctor', etc.

In [10]:
# Tokenize the data by passing in batches

def tokenize_text_batches(batch_data):
    input_encodings = tokenizer(text = batch_data['dialogue'], padding=True, truncation=True, max_length=1024)
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(text = batch_data['summary'], padding=True, truncation=True, max_length=512)
    
    # Need to pass target encodings within the dict as labels key for the transformers model input
    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }



In [11]:
# with tokenizer.as_target_tokenizer():
#     a = tokenizer( text = data['train'][10]['dialogue'], padding=True, truncation=True, max_length=1024)
# a    

In [12]:
data_enc = data.map(tokenize_text_batches, batched=True, batch_size = 500)


  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "
                                                                                                                                                 

In [13]:
print(type(data_enc), data_enc)


<class 'datasets.dataset_dict.DatasetDict'> DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})


In [14]:
rouge = evaluate.load("rouge")

## Training model

## Use Data Collector Seq2Seq for transforming data into batches
Data collators are objects that will form a batch by using a list of dataset elements as input. These elements are of the same type as the elements of train_dataset or eval_dataset.

To be able to build batches, data collators may apply some processing (like padding). Some of them (like DataCollatorForLanguageModeling) also apply some random data augmentation (like random masking) on the formed batch.


In [15]:
from transformers import DataCollatorForSeq2Seq

seq2seq_dc = DataCollatorForSeq2Seq(tokenizer, model = model_mt)
seq2seq_dc

DataCollatorForSeq2Seq(tokenizer=T5TokenizerFast(name_or_path='t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id

## Use hugging face to train and deploy model

In [6]:
import sagemaker
import boto3






<botocore.client.IAM at 0x274ad32a548>

In [1]:
from sagemaker.huggingface import HuggingFace

In [2]:
hyperparameters = {'output_dir': 'mt5_model_files', 'num_train_epochs': 1, 'warmup_steps': 100, 
                                  'per_device_train_batch_size': 10, 'per_device_eval_batch_size': 10, 'weight_decay': 0.02,
                                 'logging_steps': 10, 'gradient_accumulation_steps': 20}

In [3]:
HuggingFace

sagemaker.huggingface.estimator.HuggingFace

In [22]:
from transformers import Seq2SeqTrainingArguments, Trainer

training_args = Seq2SeqTrainingArguments(output_dir = 'mt5_model_files', num_train_epochs = 1, warmup_steps = 100, 
                                  per_device_train_batch_size = 10, per_device_eval_batch_size = 10, weight_decay = 0.02,
                                 logging_steps = 10, gradient_accumulation_steps=20, 
                                 evaluation_strategy = 'epoch',
                                  predict_with_generate= True)


In [23]:
trainer = Trainer(model = model_mt.to('cpu'), args = training_args, tokenizer = tokenizer, data_collator = seq2seq_dc, 
                 train_dataset = data_enc['test'], eval_dataset = data_enc['validation'])

In [24]:
trainer.train()


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt



### Evaluate: Custom way of inference

In [34]:
trainer.model

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
          

In [83]:
def compute_metrics(dataset, metric, model, tokenizer, device, batch_size = 20, max_len=200):
    content_data = []
    target_data = []
    print(len(dataset['id']))
    
    for i in range(0, len(dataset['id']), batch_size):
        content_data.append(dataset['dialogue'][i:i+batch_size])
        target_data.append(dataset['summary'][i:i+batch_size])
        
    for content, target in tqdm.tqdm(zip(content_data, target_data), total = len(content_data)):
        content_enc = tokenizer(text = content, padding=True, truncation=True, max_length=1024, return_tensors = 'pt')
#         target_enc = tokenizer(target_text = target, padding=True, truncation=True, max_length=512)
        
        prediction_tokens = model.generate(input_ids = content_enc['input_ids'].to(device),
                                           attention_mask = content_enc['attention_mask'].to(device),
                                           length_penalty = 0.6, max_length = max_len)
        
        # To decode the generated tokens to words
        prediction_summary = [tokenizer.decode(token, skip_special_tokens=True, clean_up_tokenization_spaces = True) 
                             for token in prediction_tokens]
        
        # Compute ROUGE score
        metric.add_batch(predictions = prediction_summary, references=target)
        
    result = metric.compute(use_stemmer=True)
    return result 

In [84]:
compute_metrics(data['validation'][:100], rouge, model_mt, tokenizer, device = 'cpu', batch_size = 20, max_len = 200)


5



  0%|                                                                                                                      | 0/5 [00:00<?, ?it/s][A
 20%|██████████████████████                                                                                        | 1/5 [00:12<00:50, 12.72s/it][A
 40%|████████████████████████████████████████████                                                                  | 2/5 [00:20<00:30, 10.02s/it][A
 60%|██████████████████████████████████████████████████████████████████                                            | 3/5 [00:29<00:18,  9.27s/it][A
 80%|████████████████████████████████████████████████████████████████████████████████████████                      | 4/5 [00:38<00:09,  9.11s/it][A
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:44<00:00,  8.94s/it][A


{'rouge1': 0.005072661600525378,
 'rouge2': 0.0009090909090909091,
 'rougeL': 0.004950136035275355,
 'rougeLsum': 0.004953325827938831}

### Pipeline way of inference

In [1]:
from transformers import pipeline
summarizer = pipeline("summarization", model="google/mt5-small")

  from .autonotebook import tqdm as notebook_tqdm
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"


In [5]:
import os
os.chdir("../")


In [6]:
pwd

'C:\\Users\\papu_\\OneDrive\\Desktop\\University Courses\\3rd Semester\\AWS ML Speciality\\Text-Summarizer-AWS-Deployment'

In [14]:
Path('artifacts/t5_model_files')

WindowsPath('artifacts/t5_model_files')

In [25]:
from pathlib import Path
from transformers import pipeline
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'

pipe = pipeline(task='summarization', model = 'artifacts/t5_model_files',
                        tokenizer = 'artifacts/t5_model_files/tokenizer', 
                        device = device, length_penalty = 0.6,
                        batch_size = 8)


In [26]:
text = """Nathan: Hey, Alicia?   Alicia: Oh hey, I didn’t see you there. Did you already get a table?   Nathan: Yeah, right over here.   Alicia: I’m glad we had time to meet up.   Nathan: Me too. So, what’s going on?   Alicia: Oh, not much. You?   Nathan: Not much. Hey, how did your interview go? Wasn’t that today?   Alicia: Oh, yeah. I think it went well. I don’t know if I got the job yet, but they said they would call in a few days.   Nathan: Well, I’m sure you did great. Good luck.   Alicia: Thanks. I’m just happy that it’s over. I was really nervous about it.   Nathan: I can understand that. I get nervous before interviews, too.   Alicia: Well, thanks for being supportive. I appreciate it.   Nathan: Sure, no problem. """



In [29]:
pipe(text)[0]['summary_text']

'Alicia: hey, I didn’t see you there. Did you already get a table? Nathan: Yeah, right over here. I think it went well. I don’t know if I got the job yet, but they said they would call in a few days .'

In [2]:
for i in data['test']['dialogue'][:10]:
    print('Dialogue: {}, Summary: {} '.format(i, summarizer(i)[0]['summary_text']))

NameError: name 'data' is not defined

In [93]:
summarizer(i)

[{'summary_text': '<extra_id_0>'}]

In [39]:
import numpy as np
from sklearn import datasets
import pandas as pd

iris = datasets.load_iris(as_frame =True)
df = pd.DataFrame(iris['data'])
df['target'] = iris['target']

df = df.reset_index(drop=True)

In [43]:
def get_grouped_mean(data):
     



In [44]:
get_grouped_mean(df)


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  
[0 1 2]


In [66]:
# print(df.head())
print(np.unique(df['target']))  
all_cats = np.unique(df['target'])
mean_cat = []

def means_cat():
    for cat in all_cats:
        mean_cat.append( np.mean(df.iloc[np.where(df.target.isin([cat]))]['sepal width (cm)']) )

    return mean_cat

[0 1 2]


In [67]:
means_cat()

[3.428, 2.7700000000000005, 2.974]

In [106]:
import re

text = "Today, is the greatest day everrrr aaaaa!"
l = text.split(' ')

# remove non alpha
l = [re.sub('[^a-z A-Z 0-9]', '', i) for i in l]
print(l)

min_length = -1
output = -1

for word in l:
    rep = {}
    if(len(word) >= min_length):
        for char in word:
            if(not rep.get(char)):
                rep[char] = 1
            else:
                rep[char]+=1
        
        if(rep[max(rep, key = rep.get)] > min_length):
            output = word
            min_length = rep[max(rep, key = rep.get)]
    
    



['Today', 'is', 'the', 'greatest', 'day', 'everrrr', 'aaaaa']


In [107]:
output

'aaaaa'

In [None]:
1. RUN pip install
2. REDIS in-memory data store
3. Hash
4. git cherry pick
5. git rebase


