In [12]:
import os, sys
from pathlib import Path
import urllib
from textSummarizer.logging import logger
from zipfile import ZipFile
from dataclasses import dataclass
import numpy as np

In [13]:
pwd

'C:\\Users\\papu_\\OneDrive\\Desktop\\University Courses\\3rd Semester\\AWS ML Speciality\\Text-Summarizer-AWS-Deployment\\research'

In [14]:
os.chdir("../")

In [15]:
pwd

'C:\\Users\\papu_\\OneDrive\\Desktop\\University Courses\\3rd Semester\\AWS ML Speciality\\Text-Summarizer-AWS-Deployment'

In [6]:
# Entity.yaml
from dataclasses import dataclass
import numpy as np

@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    source_dir: str
    local_data_file: Path
    unzip_dir: Path


In [6]:
ad = DataIngestionConfig('a', 'b', 'c', 'd')
ad.unzip_dir

'd'

In [7]:
# Configuration manager
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories, get_size

@dataclass
class ConfigurationManager:
    config_file_path: Path = CONFIG_FILE_PATH
    params_file_path: Path = PARAMS_FILE_PATH
    
    # Calling this to not override default init and post init is called as the last function of default init
    def __post_init__(self):
        self.config = read_yaml(self.config_file_path) # CONFIG_FILE_PATH taken from constants
        self.params = read_yaml(self.params_file_path) # PARAMS_FILE_PATH taken from constants
        
        create_directories([self.config.artifacts_path, self.config.data_ingestion.root_dir])
        
    def getDataIngestionConfig(self) -> DataIngestionConfig:
        
        data_ingestion_config = self.config.data_ingestion
        
        dic = DataIngestionConfig(root_dir = data_ingestion_config.root_dir, 
                                 source_dir = data_ingestion_config.source_dir, 
                                 local_data_file = data_ingestion_config.local_data_file, 
                                 unzip_dir = data_ingestion_config.unzip_dir)
        
        return dic
    


In [12]:
# Components creation

class DataIngestion:
    def __init__(self, dic : DataIngestionConfig):
        self.dic = dic
        
    # Download the data from url and save to the local file name and extract the zip contents, save them all in artifacts folder
    def download_data(self):
        if(not os.path.exists(self.dic.local_data_file)):
            filename, header = urllib.request.urlretrieve(url = self.dic.source_dir, filename = self.dic.local_data_file)
            logger.info('Data file {} downloaded, with return header {}'.format(filename, header))
        else:
            size = get_size(Path(self.dic.local_data_file))
            logger.info('Data file {} already exists, with {}'.format(self.dic.local_data_file, size))
    # Extract
    def extract_zip(self):
        with ZipFile(self.dic.local_data_file, 'r') as zObject:
            zObject.extractall(path=self.dic.unzip_dir)
        logger.info('Zip file extracted at {}'.format(self.dic.unzip_dir))
        
        

In [13]:
# Update the pipeline
try:
    config = ConfigurationManager()
    dic = config.getDataIngestionConfig()
    di = DataIngestion(dic)

    di.download_data()
    di.extract_zip()
except Exception as e:
    raise e
    

[2023-08-16 15:57:33,438] @ [INFO] : common : Path to YAML config\config.yaml loaded correctly
[2023-08-16 15:57:33,440] @ [INFO] : common : Path to YAML params.yaml loaded correctly
[2023-08-16 15:57:33,441] @ [INFO] : common : Directory artifacts created correctly
[2023-08-16 15:57:33,443] @ [INFO] : common : Directory artifacts/data_ingestion created correctly
[2023-08-16 15:57:33,444] @ [INFO] : <ipython-input-12-532bbc926797> : Data file artifacts/data_ingestion/summarizer-data.zip already exists, with File size: 7718.353515625 kB
[2023-08-16 15:57:33,604] @ [INFO] : <ipython-input-12-532bbc926797> : Zip file extracted at artifacts/data_ingestion


In [60]:
from dataclasses import dataclass
from ensure import ensure_annotations

@dataclass(frozen=True)
class DataValidationConfig:
    root_dir: Path
    data_dir: Path
    status_dir: str
    required_files: list


In [61]:
# Configuration manager
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories, get_size
from pathlib import Path

@dataclass
class ConfigurationManager:
    config_file_path: Path = CONFIG_FILE_PATH
    params_file_path: Path = PARAMS_FILE_PATH
    
    # Calling this to not override default init and post init is called as the last function of default init
    def __post_init__(self):
        self.config = read_yaml(self.config_file_path) # CONFIG_FILE_PATH taken from constants
        self.params = read_yaml(self.params_file_path) # PARAMS_FILE_PATH taken from constants
        
        create_directories([self.config.artifacts_path, self.config.data_ingestion.root_dir])

    def getDataValidationConfig(self) -> DataValidationConfig:
        
        data_validation_config = self.config.data_validation  
        create_directories([data_validation_config.root_dir])
        
        dic = DataValidationConfig(root_dir = Path(data_validation_config.root_dir), 
                                   data_dir = Path(data_validation_config.data_dir),
                                 status_dir = str(data_validation_config.status_dir), 
                                 required_files = data_validation_config.required_files)
        
        return dic

In [68]:

class DataValidation:
    def __init__(self, dic : DataValidationConfig):
        self.dic = dic
    
    def validate_data(self):
        try:
            # Validate files in folder, status is set to true only when all the required files are found
            # Further validation should include column level, handling missing values, row level error fixing
            list_of_files = os.listdir(os.path.join(self.dic.data_dir, 'samsum_dataset'))
            data_validation_status = True

            with open(self.dic.status_dir, 'w') as f:
                f.write('Data validation status: {}'.format(data_validation_status))
            
#             print(self.dic.required_files)
            for req_file in self.dic.required_files:
                if(req_file not in list_of_files):
                    print(req_file)
                    data_validation_status = False
                    with open(self.dic.status_dir, 'w') as f:
                        f.write('Data validation status: {}'.format(data_validation_status))

                logger.info('Data Validation status: {}'.format(data_validation_status))
                return data_validation_status
            
            logger.info('Data Validation status: {}'.format(data_validation_status))
            
        except Exception as e:
            logger.error(e)
            raise e
    
    

In [69]:
try:
    config = ConfigurationManager()
    dic = config.getDataValidationConfig()
    di = DataValidation(dic)
    di.validate_data()
    
except Exception as e:
    raise e

[2023-08-16 22:09:39,308] @ [INFO] : common : Path to YAML config\config.yaml loaded correctly
[2023-08-16 22:09:39,312] @ [INFO] : common : Path to YAML params.yaml loaded correctly
[2023-08-16 22:09:39,316] @ [INFO] : common : Directory artifacts created correctly
[2023-08-16 22:09:39,316] @ [INFO] : common : Directory artifacts/data_ingestion created correctly
[2023-08-16 22:09:39,321] @ [INFO] : common : Directory artifacts/data_validation created correctly
[2023-08-16 22:09:39,324] @ [INFO] : <ipython-input-68-a47d829e1e0e> : Data Validation status: True


In [70]:
@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_dir: Path
    model_name: str


In [111]:

@dataclass
class ConfigurationManager:
    config_file_path: Path = CONFIG_FILE_PATH
    params_file_path: Path = PARAMS_FILE_PATH
    
    # Calling this to not override default init and post init is called as the last function of default init
    def __post_init__(self):
        self.config = read_yaml(self.config_file_path) # CONFIG_FILE_PATH taken from constants
        self.params = read_yaml(self.params_file_path) # PARAMS_FILE_PATH taken from constants
        
        create_directories([self.config.artifacts_path, self.config.data_ingestion.root_dir])
        
    def getDataTransformationConfig(self) -> DataTransformationConfig:

        data_transformation_config = self.config.data_transformation

        create_directories([data_transformation_config.root_dir])

        dc = DataTransformationConfig(root_dir = Path(data_transformation_config.root_dir), 
                                   data_dir = Path(data_transformation_config.data_dir),
                                 model_name = str(data_transformation_config.model_name))

        return dc

In [119]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk, Dataset
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
import evaluate
import nltk
import tqdm as tqdm
import torch
nltk.download('punkt')


class DataTransformation:
    def __init__(self, dc : DataTransformationConfig):
        self.dc = dc
        self.tokenizer = AutoTokenizer.from_pretrained(self.dc.model_name)
    
    
    def tokenize_text_batches(self, batch_data):
        input_encodings = self.tokenizer(text = batch_data['dialogue'], padding=True, truncation=True, max_length=1024)
        with self.tokenizer.as_target_tokenizer():
            target_encodings = self.tokenizer(text = batch_data['summary'], padding=True, truncation=True, max_length=512)

        # Need to pass target encodings within the dict as labels key for the transformers model input
        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids']
        }


    def transform_data(self):
        try:
            # Convert text to word encodings using autotokenizer of the pre-trained model
            # dataset = Dataset.from_file(Path(self.dc.data_dir))
            dataset = load_from_disk(Path(self.dc.data_dir))
            data_enc = dataset.map(self.tokenize_text_batches, batched=True, batch_size = 500)
            logger.info('Details about loaded data: {}'.format(data_enc))
            return data_enc
            
        except Exception as e:
            logger.error(e)
            raise e
    
    

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\papu_\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [120]:
try:
    config = ConfigurationManager()
    dc = config.getDataTransformationConfig()
    di = DataTransformation(dc)
    di.transform_data()
    
except Exception as e:
    raise e

[2023-08-17 02:53:21,992] @ [INFO] : common : Path to YAML config\config.yaml loaded correctly
[2023-08-17 02:53:21,994] @ [INFO] : common : Path to YAML params.yaml loaded correctly
[2023-08-17 02:53:21,995] @ [INFO] : common : Directory artifacts created correctly
[2023-08-17 02:53:21,997] @ [INFO] : common : Directory artifacts/data_ingestion created correctly
[2023-08-17 02:53:21,998] @ [INFO] : common : Directory artifacts/data_transformation created correctly


Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

[2023-08-17 02:53:31,388] @ [INFO] : <ipython-input-119-d9942e7eb007> : Details about loaded data: DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})


In [13]:
@dataclass(frozen=True)
class ModelTrainerConfig:
    model_name: str
    output_dir: str
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    weight_decay: float
    logging_steps: int
    gradient_accumulation_steps: int
    evaluation_strategy: str
    predict_with_generate: bool
            

In [14]:
# Configuration manager
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories, get_size
from pathlib import Path

@dataclass
class ConfigurationManager:
    config_file_path: Path = CONFIG_FILE_PATH
    params_file_path: Path = PARAMS_FILE_PATH
    
    # Calling this to not override default init and post init is called as the last function of default init
    def __post_init__(self):
        self.config = read_yaml(self.config_file_path) # CONFIG_FILE_PATH taken from constants
        self.params = read_yaml(self.params_file_path) # PARAMS_FILE_PATH taken from constants
        
        create_directories([self.config.artifacts_path, self.config.data_ingestion.root_dir])

    def getModelTrainerConfig(self) -> ModelTrainerConfig:
        
        # Take model name from config file
        # Take other training arguments from params file
        
        model_trainer_params = self.params.TrainingArguments  
        create_directories([Path(model_trainer_params.output_dir)])
        
        mtc = ModelTrainerConfig(model_name = self.config.model_name,
                                  output_dir= Path(model_trainer_params.output_dir),
                                  num_train_epochs= int(model_trainer_params.num_train_epochs),
                                  warmup_steps = model_trainer_params.warmup_steps,
                                  per_device_train_batch_size= model_trainer_params.per_device_train_batch_size,
                                  per_device_eval_batch_size= model_trainer_params.per_device_eval_batch_size,
                                  weight_decay= model_trainer_params.weight_decay,
                                  logging_steps= model_trainer_params.logging_steps,
                                  gradient_accumulation_steps= model_trainer_params.gradient_accumulation_steps,
                                  evaluation_strategy = model_trainer_params.evaluation_strategy,
                                  predict_with_generate= model_trainer_params.predict_with_generate)
        
        return mtc

In [17]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk, Dataset
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq            
import evaluate
import nltk
import tqdm as tqdm
import torch
nltk.download('punkt')


class ModelTrainer:
    def __init__(self, mtc : ModelTrainerConfig):
        self.mtc = mtc
        self.tokenizer = AutoTokenizer.from_pretrained(self.mtc.model_name)
        self.model = AutoModelForSeq2SeqLM(self.mtc.model_name)
        self.metric = evaluate.load('rouge')
    
    
    # Define metric
    def compute_metrics(self, predictions):
        # To decode the generated tokens to words
        preds, labels = predictions
        
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces = True)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces = True)
        
        # rougeLSum expects newline after each sentence
        decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
        decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

#         prediction_summary = [self.tokenizer.decode(token, skip_special_tokens=True, clean_up_tokenization_spaces = True) 
#                              for token in predictions]
        
        # Compute ROUGE score
        return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    
    
    def train_model(self, data_transformed):
        
        training_args = TrainingArguments(output_dir = self.mtc.output_dir, 
                                          num_train_epochs = self.mtc.num_train_epochs,
                                          warmup_steps = self.mtc.warmup_steps, 
                                          per_device_train_batch_size = self.mtc.per_device_train_batch_size,
                                          per_device_eval_batch_size = self.mtc.per_device_eval_batch_size,
                                          weight_decay = self.mtc.weight_decay,
                                          logging_steps = self.mtc.logging_steps,
                                          gradient_accumulation_steps= self.mtc.gradient_accumulation_steps,
                                          evaluation_strategy = self.mtc.evaluation_strategy,
                                          predict_with_generate= self.mtc.predict_with_generate)

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        
        # Data collators are objects that will form a batch by using a list of dataset elements as input.
        # These elements are of the same type as the elements of train_dataset or eval_dataset. 
        # To be able to build batches, data collators may apply some processing (like padding).
        # Some of them (like DataCollatorForLanguageModeling) also apply some random data augmentation 
        # (like random masking) on the formed batch.
        
        seq2seq_dc = DataCollatorForSeq2Seq(tokenizer, model = model_mt)

        trainer = Trainer(model = self.model.to(device), args = training_args, tokenizer = self.tokenizer,
                          data_collator = seq2seq_dc, train_dataset = data_transformed['test'], 
                          eval_dataset = data_transformed['validation'], compute_metrics = self.compute_metrics)
        
#         trainer.train()

        ## Save model after fine-tuning
        self.model.save_pretrained(Path(self.mtc.output_dir))
    
        ## Save tokenizer after making it train on the fine tune dataset
        self.tokenizer.save_pretrained(os.path.join(self.mtc.output_dir, 'tokenizer'))
    

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\papu_\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# data_transformed to train_model


In [None]:
try:
    config = ConfigurationManager()
    dc = config.getModelTrainerConfig()
    di = ModelTrainer(dc)
    di.train_model(data_transformed)
    
except Exception as e:
    raise e

In [21]:
from dataclasses import dataclass

@dataclass(frozen=True)
class ModelEvaluationConfig:
    model_name: str
    data_dir: Path
    model_dir: Path
    tokenizer_dir: Path
    length_penalty: float
    max_length: int
    batch_size: int

        
        

In [29]:
# Configuration manager
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories, get_size
from pathlib import Path

@dataclass
class ConfigurationManager:
    config_file_path: Path = CONFIG_FILE_PATH
    params_file_path: Path = PARAMS_FILE_PATH
    
    # Calling this to not override default init and post init is called as the last function of default init
    def __post_init__(self):
        self.config = read_yaml(self.config_file_path) # CONFIG_FILE_PATH taken from constants
        self.params = read_yaml(self.params_file_path) # PARAMS_FILE_PATH taken from constants
        
        create_directories([self.config.artifacts_path, self.config.data_ingestion.root_dir])

    def getModelEvaluationConfig(self) -> ModelEvaluationConfig:
        
        # Take model name from config file
        # Take other training arguments from params file
        
        model_eval_config = self.config.model_evaluation  
        
        mec = ModelEvaluationConfig(
                                model_name = model_eval_config.model_name,
                                data_dir = Path(model_eval_config.data_dir),
                                model_dir = Path(model_eval_config.model_dir),
                                tokenizer_dir = Path(model_eval_config.tokenizer_dir),
                                length_penalty = model_eval_config.length_penalty,
                                max_length = model_eval_config.max_length,
                                batch_size = model_eval_config.batch_size)
        
        return mec

In [55]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk, Dataset
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq            
import evaluate
import nltk
import tqdm as tqdm
import torch
nltk.download('punkt')


class ModelEvaluation:
    def __init__(self, mec : ModelEvaluationConfig):
        self.mec = mec
        self.tokenizer = AutoTokenizer.from_pretrained(self.mec.tokenizer_dir) # Load tokenizer from path not model name
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.mec.model_dir)  # Load model from path not model name
        self.metric = evaluate.load('rouge')
    
    def eval_model(self):
        dataset = load_from_disk(self.mec.data_dir)['test'][:10]

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        content_data = []
        target_data = []
#         print(dataset)
        
        for i in range(0, len(dataset['id']), self.mec.batch_size):
            content_data.append(dataset['dialogue'][i: i+self.mec.batch_size])
            target_data.append(dataset['summary'][i: i+self.mec.batch_size])

        for content, target in tqdm.tqdm(zip(content_data, target_data), total = len(content_data)):
            content_enc = self.tokenizer(text = content, padding=True, truncation=True, max_length=1024, return_tensors = 'pt')
                
            prediction_tokens = self.model.generate(input_ids = content_enc['input_ids'].to(device),
                                           attention_mask = content_enc['attention_mask'].to(device),
                                           length_penalty = self.mec.length_penalty, max_length = self.mec.max_length)
            
            
            # To decode the generated tokens to words
            prediction_summary = [self.tokenizer.decode(token, skip_special_tokens=True, clean_up_tokenization_spaces = True) 
                                 for token in prediction_tokens]
            
         
            print('origg preds: ', prediction_summary, '\n')
            print('orig labels: ',target, '\n')
            
            # rougeLSum expects newline after each sentence
            decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in prediction_summary]
            decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in target]
            
            print('decoded preds: ', decoded_preds, '\n')
            print('decoded_labels: ',decoded_labels, '\n')
            
            # Compute ROUGE score
            self.metric.add_batch(predictions = decoded_preds, references= decoded_labels)
            
        result = self.metric.compute(use_stemmer=True)
        print(result)
        return result
            

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\papu_\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [56]:
try:
    config = ConfigurationManager()
    dc = config.getModelEvaluationConfig()
    me = ModelEvaluation(dc)
    me.eval_model()
    
except Exception as e:
    raise e

[2023-08-20 02:45:37,910] @ [INFO] : common : Path to YAML config\config.yaml loaded correctly
[2023-08-20 02:45:37,913] @ [INFO] : common : Path to YAML params.yaml loaded correctly
[2023-08-20 02:45:37,914] @ [INFO] : common : Directory artifacts created correctly
[2023-08-20 02:45:37,915] @ [INFO] : common : Directory artifacts/data_ingestion created correctly


  0%|                                                                                                                      | 0/1 [00:00<?, ?it/s]

origg preds:  [': Urgh.. Alright Hannah: Bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye bye', ": MACHINE! Rob: That's so gr8! Rob: I know! Rob: I know! Rob: I know! Rob: I know! Rob: I know! Rob: I know! Rob: I know! Rob: I know! Rob: I know! Rob: I know! Rob: I know! Rob: I know! Rob: I know! Rob: I know! Rob: I know! Rob: I'll check. Rob: I'll watch them now! Rob: I'll watch them", "Lenny: Babe, can you help me with something? Bob: What's up? Bob: Send me photos Lenny: file_photo> Lenny: file_photo> Lenny: file_photo> Lenny: file_photo> Bob: I like the first ones best Lenny: But I already have purple trousers.", ",,,,, what do you want for dinner tonight? Emma: gah, don't worry about it tonight Will: what do you mean? Emma: gah, don't worry about it tonight Will: what do you mean? Emma: not really, but it's ok, don't worry about cook

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:15<00:00, 15.65s/it]

[2023-08-20 02:45:54,878] @ [INFO] : rouge_scorer : Using default tokenizer.
{'rouge1': 0.17579455149472636, 'rouge2': 0.025573192239858912, 'rougeL': 0.13901103445030416, 'rougeLsum': 0.164741275343337}



