In [1]:
import os

In [2]:
%pwd

'd:\\ML\\spelling correction\\Spelling-correction-project\\resreach'

In [3]:
# Move the working directory up one level, to the parent of the current directory.
# Not idempotent: every re-run of this cell climbs one more level, so run it once
# per kernel session.
os.chdir("../")

In [4]:
%pwd

'd:\\ML\\spelling correction\\Spelling-correction-project'

In [5]:
from dataclasses import*
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable set of values needed to configure a model-training run.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # --- filesystem / checkpoint locations ---
    root_dir: Path
    data_path: Path
    model_ckpt: Path

    # --- evaluation schedule ---
    eval_strategy: str
    eval_steps: int

    # --- batch sizes and training length ---
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    num_train_epochs: int

    # --- checkpointing and logging cadence ---
    save_steps: int
    save_total_limit: int
    logging_steps: int

    # --- boolean switches ---
    predict_with_generate: bool
    fp16: bool

In [6]:
from SpellingCorrection.constants import*
from SpellingCorrection.utils.common import read_yaml, create_directories

In [7]:
import os
import yaml
from dataclasses import dataclass 
from pathlib import Path


class ConfigurationManager:
    """Loads the config and params YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read both YAML files and hand the artifacts root to ``create_directories``.

        Args:
            config_filepath: Path of the config YAML; defaults to ``CONFIG_FILE_PATH``.
            params_filepath: Path of the params YAML; defaults to ``PARAMS_FILE_PATH``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble a ``ModelTrainerConfig`` from the loaded YAML contents.

        Path-like values come from the ``model_trainer`` section of the config
        file; training hyper-parameters come from the
        ``Seq2SeqTrainingArguments`` section of the params file. The trainer's
        ``root_dir`` is passed to ``create_directories`` before the config
        object is built.

        Returns:
            The populated ``ModelTrainerConfig``.
        """
        trainer_section = self.config.model_trainer
        training_params = self.params.Seq2SeqTrainingArguments
        create_directories([trainer_section.root_dir])

        return ModelTrainerConfig(
            # values from config.yaml -> model_trainer
            root_dir=trainer_section.root_dir,
            data_path=trainer_section.data_path,
            model_ckpt=trainer_section.model_ckpt,
            # values from params.yaml -> Seq2SeqTrainingArguments
            eval_strategy=training_params.eval_strategy,
            eval_steps=training_params.eval_steps,
            per_device_train_batch_size=training_params.per_device_train_batch_size,
            per_device_eval_batch_size=training_params.per_device_eval_batch_size,
            num_train_epochs=training_params.num_train_epochs,
            save_steps=training_params.save_steps,
            save_total_limit=training_params.save_total_limit,
            logging_steps=training_params.logging_steps,
            predict_with_generate=training_params.predict_with_generate,
            fp16=training_params.fp16,
        )

In [8]:
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from datasets import Dataset
import pandas as pd

from transformers import pipeline, set_seed, AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk, load_metric
import matplotlib.pyplot as plt
import pandas as pd
import torch
from tqdm import tqdm
import os


[2024-06-10 18:10:28,345 : INFO : config : PyTorch version 2.3.0 available.]


In [9]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel; quote the extras spec so the shell does not interpret the brackets.
%pip install "transformers[torch]"



In [10]:
# Use the %pip magic so the upgrade applies to the environment of the running kernel.
%pip install -U accelerate



In [11]:
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
class ModelTrainer:
    """Fine-tunes a seq2seq checkpoint according to a ``ModelTrainerConfig``."""

    def __init__(self, config: ModelTrainerConfig):
        """Keep the configuration that ``train`` reads its settings from.

        Args:
            config: Paths and training hyper-parameters for this run.
        """
        self.config = config

    def train(self):
        """Load the tokenizer, model and dataset, run training, then save the results.

        The tokenizer and model are loaded from ``config.model_ckpt`` and the
        dataset from ``config.data_path``; its ``"train"`` and ``"validation"``
        splits are used for training and evaluation. After training, the model
        and tokenizer are saved into separate sub-directories of
        ``config.root_dir``.
        """
        # Prefer the GPU when one is visible to torch, otherwise fall back to CPU.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        dataset_spelling_correction = load_from_disk(self.config.data_path)

        training_args = Seq2SeqTrainingArguments(
            output_dir=self.config.root_dir,
            eval_strategy=self.config.eval_strategy,
            # Forward the configured evaluation interval; it was previously loaded
            # into the config but never handed to the training arguments.
            eval_steps=self.config.eval_steps,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size,
            predict_with_generate=self.config.predict_with_generate,
            num_train_epochs=self.config.num_train_epochs,
            save_steps=self.config.save_steps,
            save_total_limit=self.config.save_total_limit,
            logging_steps=self.config.logging_steps,
            fp16=self.config.fp16,
        )

        trainer = Seq2SeqTrainer(
            model=model,
            tokenizer=tokenizer,
            args=training_args,
            data_collator=seq2seq_data_collator,
            train_dataset=dataset_spelling_correction["train"],
            eval_dataset=dataset_spelling_correction["validation"],
        )
        trainer.train()

        # Save the fine-tuned model and its tokenizer side by side under root_dir.
        model.save_pretrained(os.path.join(self.config.root_dir, "bartpho-spelling-correction"))
        tokenizer.save_pretrained(os.path.join(self.config.root_dir, "bartpho-spelling-correction-tokenizer"))


In [12]:
# Build the trainer configuration, wrap it in a ModelTrainer and start training.
# Exceptions propagate unchanged, so no try/except wrapper is needed, and the
# trainer gets its own name instead of rebinding the config variable.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2024-06-10 18:10:44,093 : INFO : common : Reading the yaml file from the path: config\config.yaml loaded successfully]
[2024-06-10 18:10:44,093 : INFO : common : Reading the yaml file from the path: params.yaml loaded successfully]




  0%|          | 0/7053 [00:00<?, ?it/s]

: 