In [None]:
import os

# Move the kernel to the repository root before the project-local imports run
# (presumably so that `src.jigsaw` and any relative paths in the config resolve
# from there — confirm). JIGSAW_PROJECT_ROOT lets another machine point at its
# own checkout; the original location stays as the fallback so existing setups
# behave exactly as before.
os.chdir(
    os.environ.get(
        "JIGSAW_PROJECT_ROOT",
        "/Users/morizin/Documents/Code/jigsaw-competition",
    )
)
from src.jigsaw.config.config import ConfigurationManager
from src.jigsaw.entity.config_entity import ModelTrainingConfig

cfg = ConfigurationManager()

# Training configuration object consumed by every later cell.
config = cfg.get_model_training_config()

In [None]:
import os
import pandas as pd
from typeguard import typechecked
from src.jigsaw import logger
from pandas.core.frame import DataFrame
from src.jigsaw.utils.common import load_csv

# @typechecked
def get_train_test_split(config: ModelTrainingConfig) -> tuple[DataFrame, DataFrame | None]:
    """Load every training file named in ``config.schemas`` and split it by fold.

    For each dataset schema the selected columns are its ``features`` plus its
    ``target`` (and ``'fold'`` when ``config.fold >= 0``). Files that lack any
    of those columns are skipped with an error log entry; the remaining frames
    are concatenated row-wise.

    Args:
        config: Training configuration. This function reads ``schemas``
            (each with ``name``, ``features``, ``target`` and ``train``),
            ``indir.path`` and ``fold``.

    Returns:
        ``(train_data, valid_data)``. When ``config.fold`` is non-negative and
        not larger than the biggest fold id present, rows whose ``fold`` equals
        ``config.fold`` become ``valid_data`` and all other rows become
        ``train_data``. Otherwise the full frame is returned as ``train_data``
        and ``valid_data`` is ``None``.

    Raises:
        ValueError: If no file provided all the required columns, so there is
            nothing to concatenate.
    """
    use_folds = config.fold >= 0
    frames = []

    for dataset in config.schemas:
        # Work on a copy so the schema's own feature list is never mutated.
        features = dataset.features.copy()

        if dataset.target not in features:
            features.append(dataset.target)

        # A schema that already lists 'fold' must not get it twice: duplicate
        # column labels would make the fold lookup below return a DataFrame.
        if use_folds and "fold" not in features:
            features.append("fold")

        for file in dataset.train:
            path = config.indir.path / dataset.name / file
            frame = load_csv(path)

            missing = [col for col in features if col not in frame.columns]
            if missing:
                # Best-effort: report the offending file and carry on.
                logger.error(
                    f"The dataset {path} can't be included as it is missing "
                    f"columns {missing}; available columns: {list(frame.columns)}"
                )
                continue

            frames.append(frame[features])

    if not frames:
        # pd.concat([]) raises the same exception type with an opaque message.
        raise ValueError(
            "No training data could be loaded: no file contained all the required columns"
        )

    data = pd.concat(frames, axis=0)

    if use_folds and config.fold <= data["fold"].max():
        is_valid = data["fold"] == config.fold
        return data[~is_valid], data[is_valid]

    # Negative fold, or a fold id beyond those present: train on everything.
    return data, None

train_data, valid_data = get_train_test_split(config)
# valid_data is None when no validation fold is held out, so guard the
# attribute access instead of letting the cell fail with AttributeError.
train_data.shape, (None if valid_data is None else valid_data.shape)

In [None]:
from src.jigsaw.components.dataset.classfier_dataset import ClassifierDataset

# Wrap the training split in the project's ClassifierDataset; the resulting
# object is what gets passed to the Trainer as `train_dataset`.
# Only train_data is wrapped here — valid_data is not turned into a dataset.
train_dataset = ClassifierDataset(config, train_data)

In [None]:
from src.jigsaw.components.models.classifier_model import get_deberta_model
# Build the model from the training config via the project's factory helper.
# NOTE(review): presumably a DeBERTa-based classifier, judging by the helper's
# name — confirm against classifier_model.py.
model = get_deberta_model(config)

In [None]:
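# NOTE(review): disabled alternative that builds `model` inline (pretrained
# sequence-classification model with a single-output linear head and the
# dropout attribute removed) instead of calling get_deberta_model(config).
# Kept for reference only.
# import torch.nn as nn
# from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# model = AutoModelForSequenceClassification.from_pretrained(config.model_name, trust_remote_code = True)
# model.classifier = nn.Linear(model.classifier.in_features, 1)
# del model.dropout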

In [None]:
from transformers import TrainingArguments, Trainer

# Training-only run: every tunable value is read from the config object, while
# the fixed switches below turn off experiment reporting and checkpoint saving.
training_args = TrainingArguments(
    # --- fixed switches ---
    do_train=True,
    overwrite_output_dir=True,
    report_to="none",
    save_strategy="no",
    # --- output location ---
    output_dir=config.outdir.path,
    # --- batching ---
    per_device_train_batch_size=config.engine.train_batch_size,
    gradient_accumulation_steps=config.engine.gradient_accumulation_steps,
    # --- optimisation schedule ---
    learning_rate=config.engine.learning_rate,
    weight_decay=config.engine.weight_decay,
    warmup_ratio=config.engine.warmup_ratio,
    num_train_epochs=config.engine.nepochs,
)

# No eval_dataset is supplied, so this Trainer is only set up for training.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)