In [None]:
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

In [2]:
# Location of the instruction/output pairs used for fine-tuning.
data_path = 'taskcsv/formatted_data.csv'

# Read the whole CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# Coerce both text columns to `str` so every cell is a string before tokenization.
for column in ('instruction', 'output'):
    data[column] = data[column].astype(str)

In [4]:
# Pull the prompt and response columns out as plain Python lists (same row order).
input_texts, target_texts = (
    data[column].tolist() for column in ('instruction', 'output')
)

In [5]:
# Device the base model is placed on right after loading.
device = "cpu"

# Hub identifier of the base model and its tokenizer.
checkpoint = "HuggingFaceTB/SmolLM2-135M-Instruct"

In [6]:
# Load the tokenizer that matches the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the causal-LM weights, then place the model on the configured device.
model = AutoModelForCausalLM.from_pretrained(checkpoint)
model = model.to(device)

In [7]:
# Padding is requested further down, so make sure a pad token exists;
# reuse the end-of-sequence token when the tokenizer defines none.
needs_pad_token = tokenizer.pad_token is None
if needs_pad_token:
    tokenizer.pad_token = tokenizer.eos_token

In [8]:
# Snapshot the pretrained weights BEFORE training.
# `state_dict()` only returns references to the live parameter tensors, and the
# optimizer updates those tensors in place, so an un-cloned snapshot would
# silently turn into the fine-tuned weights by the time it is written to disk.
original_model_weights = {
    name: tensor.detach().clone()
    for name, tensor in model.state_dict().items()
}

In [None]:
# Batch-tokenize every prompt and every response, padding each batch to its longest item.
# These encodings are only consumed by the plain-dict `train_dataset` built in the next cell.
input_encodings = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")

# The deprecated `as_target_tokenizer()` context manager is intentionally not used:
# a decoder-only tokenizer has no separate target mode, so a direct call yields the
# same ids without relying on an API scheduled for removal.
target_encodings = tokenizer(target_texts, padding=True, truncation=True, return_tensors="pt")

In [10]:
# Bundle the batch encodings into one mapping.
# NOTE: `train_dataset` is rebound to a `TaskDataset` further down, so this
# dict is not the object that the Trainer ends up receiving.
train_dataset = dict(
    input_ids=input_encodings.input_ids,
    attention_mask=input_encodings.attention_mask,
    labels=target_encodings.input_ids,
)

In [11]:
class TaskDataset(Dataset):
    """Instruction/response pairs encoded for causal-LM fine-tuning.

    A causal language model computes its loss by comparing the logits at
    position ``i`` of ``input_ids`` with ``labels[i + 1]``, so ``labels`` must
    describe the *same* token sequence as ``input_ids``.  Each item is therefore
    built from the prompt tokens followed by the response tokens (plus EOS when
    the tokenizer defines one).  Prompt and padding positions are labelled
    ``IGNORE_INDEX`` so that only response tokens contribute to the loss.

    Args:
        inputs: Sequence of prompt strings.
        targets: Sequence of response strings, aligned with ``inputs``.
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``;
            ``pad_token_id`` / ``eos_token_id`` attributes are used when present.
        max_length: Fixed length every returned tensor is truncated/padded to.

    Raises:
        ValueError: If ``inputs`` and ``targets`` differ in length.
    """

    # Label value skipped by the cross-entropy loss.
    IGNORE_INDEX = -100

    def __init__(self, inputs, targets, tokenizer, max_length=512):
        if len(inputs) != len(targets):
            raise ValueError(
                f"inputs and targets must have the same length "
                f"(got {len(inputs)} and {len(targets)})"
            )
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of prompt/response pairs."""
        return len(self.inputs)

    def _token_ids(self, text):
        """Tokenize ``text`` into a plain list of ids, without padding or special tokens."""
        return list(self.tokenizer(text, add_special_tokens=False)["input_ids"])

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors of length ``max_length``."""
        prompt_ids = self._token_ids(self.inputs[idx])
        target_ids = self._token_ids(self.targets[idx])

        # Teach the model where a response ends.
        eos_id = getattr(self.tokenizer, "eos_token_id", None)
        if eos_id is not None:
            target_ids.append(eos_id)

        # Truncate the response first, then give the prompt whatever room is
        # left, so an over-long prompt can never crowd out every supervised token.
        target_ids = target_ids[: self.max_length]
        prompt_ids = prompt_ids[: self.max_length - len(target_ids)]

        input_ids = prompt_ids + target_ids
        labels = [self.IGNORE_INDEX] * len(prompt_ids) + target_ids
        attention_mask = [1] * len(input_ids)

        # Right-pad to the fixed length; padded positions are neither attended
        # to nor scored.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = eos_id if eos_id is not None else 0
        pad_len = self.max_length - len(input_ids)
        input_ids = input_ids + [pad_id] * pad_len
        attention_mask = attention_mask + [0] * pad_len
        labels = labels + [self.IGNORE_INDEX] * pad_len

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
        }

In [12]:
# Hold out 30% of the pairs for evaluation; the fixed seed keeps the split reproducible.
(train_inputs, test_inputs,
 train_targets, test_targets) = train_test_split(
    input_texts,
    target_texts,
    test_size=0.3,
    random_state=42,
)

In [13]:
# Wrap each split in a TaskDataset (default max_length); these are the
# datasets handed to the Trainer.
train_dataset = TaskDataset(
    inputs=train_inputs,
    targets=train_targets,
    tokenizer=tokenizer,
)
test_dataset = TaskDataset(
    inputs=test_inputs,
    targets=test_targets,
    tokenizer=tokenizer,
)

In [None]:
# transformers renamed `evaluation_strategy` to `eval_strategy`; newer releases
# reject the old keyword while older ones do not know the new one.  Use
# whichever field the installed TrainingArguments dataclass actually declares.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',            # checkpoints are written here
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=100,
    save_steps=1000,
    save_total_limit=1,                # keep only the most recent checkpoint
    **{eval_strategy_kwarg: "epoch"},  # evaluate at the end of every epoch
)

In [15]:
# Wire the model, hyper-parameters and both dataset splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Fine-tune `model` on `train_dataset`, evaluating on `test_dataset` as
# configured in `training_args`. Left as the cell's last expression so the
# returned value is displayed.
trainer.train()

In [17]:
# Directory that receives the fine-tuned artefacts.
fine_tuned_model_path = 'fine_tuned_model'

# Write the tokenizer files first, then the model weights/config, side by side.
tokenizer.save_pretrained(save_directory=fine_tuned_model_path)
model.save_pretrained(save_directory=fine_tuned_model_path)

In [18]:
# Load a fresh copy of the base checkpoint, independent of the `model` that was trained.
pretrained_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=checkpoint,
)

In [19]:
# Reload the fine-tuned weights from the directory they were saved to ...
fine_tuned_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=fine_tuned_model_path,
)

# ... and expose them as a name -> tensor mapping.
fine_tuned_state_dict = fine_tuned_model.state_dict()

In [20]:
# Name -> tensor mapping of the freshly loaded base model; its entries are
# replaced by name in the following cell.
pretrained_state_dict = pretrained_model.state_dict()

In [21]:
# Replace every base-model entry that also exists in the fine-tuned checkpoint
# with the fine-tuned tensor; names unknown to the base model are skipped.
pretrained_state_dict.update(
    (key, tensor)
    for key, tensor in fine_tuned_state_dict.items()
    if key in pretrained_state_dict
)

In [None]:
# Load the overwritten mapping back into the base model object. Kept as the
# cell's last expression so the call's return value is shown as output.
pretrained_model.load_state_dict(pretrained_state_dict)

In [None]:
# Directory that receives the merged model.
merged_model_path = 'merged_model'

# Model first, tokenizer last (the tokenizer call stays the cell's final expression).
pretrained_model.save_pretrained(save_directory=merged_model_path)
tokenizer.save_pretrained(save_directory=merged_model_path)

In [24]:
# Persist the weight mapping captured in `original_model_weights` to a .pth file.
torch.save(
    obj=original_model_weights,
    f='original_pretrained_model_weights.pth',
)

In [None]:
# Final status message for the notebook run.
print("Merged model has been saved.")