In [64]:
import inspect
import pickle
import random

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

In [65]:
# Path of the formatted CSV, relative to the notebook's working directory.
data_path = 'taskcsv/formatted_data.csv'

# Load the whole file into a DataFrame; later cells read its columns by name.
df = pd.read_csv(filepath_or_buffer=data_path)

In [66]:
# Remove leading/trailing whitespace from the header names so that column
# lookups by exact name work even if the CSV header was padded.
stripped_names = df.columns.str.strip()
df.columns = stripped_names

# Only the first 10 rows are kept from here on.
df = df.head(n=10)

In [67]:
# Keep only rows where BOTH columns are present, dropping them jointly so the
# two lists stay index-aligned. pandas loads empty CSV cells as NaN (a float),
# which cannot be passed to the tokenizer as text.
pairs = df[['Input', 'Output']].dropna()

# Cast to str so cells that pandas parsed as numbers are still handed to the
# tokenizer as text.
inputs = pairs['Input'].astype(str).tolist()
outputs = pairs['Output'].astype(str).tolist()

In [68]:
# Hub identifier of the pretrained model, and the device the model is moved to.
checkpoint, device = "HuggingFaceTB/SmolLM2-360M-Instruct", "cpu"

In [69]:
# Tokenizer and causal-LM weights are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model = AutoModelForCausalLM.from_pretrained(checkpoint)
model = model.to(device)  # move the loaded weights onto the selected device


In [70]:
class TaskDataset(torch.utils.data.Dataset):
    """Prompt/completion pairs tokenized for causal-LM fine-tuning.

    Each item is ONE sequence made of the prompt tokens followed by the
    completion tokens, truncated on the right and right-padded to
    ``max_length``. ``labels`` mirrors ``input_ids`` but is set to -100 on the
    prompt and on padding, so only the completion tokens contribute to the loss.

    The labels are deliberately NOT shifted here: Hugging Face causal-LM models
    shift the labels internally when computing the loss, so a manual shift
    would misalign every target by one position.

    No EOS token is appended; include it in the output text if the model
    should learn where to stop.

    Args:
        inputs: Sequence of prompt strings.
        outputs: Sequence of completion strings, same length as ``inputs``.
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``
            (e.g. a Hugging Face tokenizer).
        max_length: Fixed length of every returned tensor.

    Raises:
        ValueError: If ``inputs`` and ``outputs`` differ in length.
    """

    # Label value that the cross-entropy loss skips.
    IGNORE_INDEX = -100

    def __init__(self, inputs, outputs, tokenizer, max_length=512):
        if len(inputs) != len(outputs):
            raise ValueError(
                f"inputs and outputs must have the same length, "
                f"got {len(inputs)} and {len(outputs)}"
            )
        self.inputs = inputs
        self.outputs = outputs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of prompt/completion pairs."""
        return len(self.inputs)

    def _encode(self, text, add_special_tokens):
        """Tokenize ``text`` into a plain list of token ids (no padding)."""
        encoded = self.tokenizer(
            text,
            add_special_tokens=add_special_tokens,
            truncation=True,
            max_length=self.max_length,
        )
        return list(encoded["input_ids"])

    def _pad_token_id(self):
        """Return the id used for padding, falling back to EOS and then 0."""
        for attr in ("pad_token_id", "eos_token_id"):
            token_id = getattr(self.tokenizer, attr, None)
            if token_id is not None:
                return token_id
        return 0

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one pair.

        All three are 1-D int64 tensors of length ``max_length``. If the prompt
        alone fills ``max_length``, every label is -100 for that item.
        """
        prompt_ids = self._encode(self.inputs[idx], add_special_tokens=True)
        # No special tokens for the completion: a tokenizer that prepends BOS
        # would otherwise insert one in the middle of the sequence.
        target_ids = self._encode(self.outputs[idx], add_special_tokens=False)

        # Single sequence: prompt followed by completion, truncated on the right.
        token_ids = (prompt_ids + target_ids)[: self.max_length]
        # Loss is computed on completion tokens only; prompt positions are ignored.
        label_ids = ([self.IGNORE_INDEX] * len(prompt_ids) + target_ids)[: self.max_length]

        # Right-pad to a fixed length; padded positions get attention 0 and are
        # excluded from the loss.
        n_pad = self.max_length - len(token_ids)
        input_ids = token_ids + [self._pad_token_id()] * n_pad
        attention_mask = [1] * len(token_ids) + [0] * n_pad
        labels = label_ids + [self.IGNORE_INDEX] * n_pad

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long),
        }

In [71]:
# Hold out 30% of the pairs for evaluation; the fixed seed keeps the split reproducible.
split = train_test_split(inputs, outputs, test_size=0.3, random_state=42)
train_inputs, test_inputs, train_outputs, test_outputs = split

# Wrap each partition in a dataset that tokenizes its pairs on access.
train_dataset, test_dataset = (
    TaskDataset(texts, targets, tokenizer)
    for texts, targets in (
        (train_inputs, train_outputs),
        (test_inputs, test_outputs),
    )
)


In [72]:
# transformers renamed the `evaluation_strategy` argument to `eval_strategy`:
# the old name is deprecated and rejected by releases that removed it, while
# older releases only know the old name. Use whichever keyword the installed
# version accepts so this cell runs on both.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=1000,
    save_total_limit=1,
    **{eval_strategy_kwarg: "epoch"},  # Evaluate at the end of each epoch
)



In [73]:
# Collect the Trainer's inputs first so each one is easy to inspect or swap.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # held-out split used for evaluation
)
trainer = Trainer(**trainer_kwargs)


: 

In [None]:
# Start training with the model, arguments and datasets passed to the Trainer above.
trainer.train()

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Write the tokenizer first, then the model, into the same directory.
model_save_path = 'task_subtask_model'
for artifact in (tokenizer, model):
    artifact.save_pretrained(model_save_path)