In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
from typing import List
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          TrainingArguments, Trainer, DataCollatorWithPadding)
from sklearn.utils import shuffle
from datasets import Dataset, load_dataset

In [None]:

# Locations of the full corpus CSV and of the sample CSV.
full_file = "data/IBC/ibc.csv"
sample_file = "data/IBC/sample_ibc.csv"
ibc, sample = (pd.read_csv(csv_path) for csv_path in (full_file, sample_file))

# Every sentence that appears in the sample file is excluded from the
# fine-tuning frame, so the two sets share no sentence text.
dsq = sample["sentence"].to_list()
in_sample = ibc["sentence"].isin(dsq)

# Shuffle with a fixed seed so the row order (and any split derived from it)
# is the same on every run.
ft_ibc = shuffle(ibc.loc[~in_sample, :].copy(), random_state=1)
ft_ibc.iloc[0]

# Class names in label-id order (0, 1, 2) as encoded by add_to_dataset.
options = ["liberal", "neutral", "conservative"]

def add_to_dataset(dataset, sentence, label):
    """Append one ``{"sentence", "label"}`` record to ``dataset`` in place.

    Args:
        dataset: List that receives the new record.
        sentence: Text stored under the ``"sentence"`` key.
        label: Class name. ``'liberal'`` is encoded as 0 and ``'neutral'``
            as 1; any other value (including differently-cased spellings)
            is encoded as 2.

    Returns:
        None. The record is appended to ``dataset``.
    """
    # The comparison is exact, so callers are expected to lower-case first.
    class_id = 0 if label == 'liberal' else 1 if label == 'neutral' else 2
    dataset.append({"sentence": sentence, "label": class_id})

sample_dataset = []

# Build the sample records from `sample` itself. Indexing `ft_ibc` with
# positions taken from range(len(sample)) would instead copy the first rows of
# the fine-tuning frame, and would fail outright if `sample` were the longer
# of the two.
# NOTE(review): assumes sample_ibc.csv has a string `label` column like
# ibc.csv does -- TODO confirm against the file.
for sentence, label in zip(sample["sentence"], sample["label"]):
    add_to_dataset(sample_dataset, sentence, label.lower())

# Peek at the first encoded record as a sanity check.
sample_ex = sample_dataset[0]
sample_ex

dataset = []

# Iterate the two columns in lockstep instead of doing two `.iloc[index]`
# row lookups per record; the records and their order are unchanged.
for sentence, label in zip(ft_ibc["sentence"], ft_ibc["label"]):
    add_to_dataset(dataset, sentence, label.lower())

example = dataset[0]
example

# The first 10% of the (already shuffled) records form the test split and the
# remainder the training split.
test_split = int(0.1 * len(dataset))
test_set = dataset[:test_split]
train_set = dataset[test_split:]

print(f"Size of test set: {len(test_set)}, size of train set: {len(train_set)}, no overlap: {len(train_set)+len(test_set)==len(dataset)}")

# load into Datasets
train_ds = Dataset.from_pandas(pd.DataFrame(data=train_set))
test_ds = Dataset.from_pandas(pd.DataFrame(data=test_set))

test_ds

In [None]:

MODEL_NAME = "meta-llama/Llama-3.2-1B"
# Token budget per sentence. Without an explicit value, padding="max_length"
# pads every row to the tokenizer's model_max_length.
# NOTE(review): 128 is a working cap, not derived from the data -- adjust if
# sentences are being truncated.
MAX_LENGTH = 128

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# The Llama tokenizer defines no pad token, and padding raises without one;
# reuse EOS (padded positions are masked out of the loss below).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)


def preprocess_function(examples, max_length=MAX_LENGTH):
    """Tokenize a batch of sentences for causal-LM fine-tuning.

    Args:
        examples: Batch mapping with a ``"sentence"`` list, as supplied by
            ``Dataset.map(..., batched=True)``.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) plus a
        ``labels`` entry: a copy of ``input_ids`` in which padded positions
        are replaced by -100 so they do not contribute to the loss.
    """
    encoded = tokenizer(
        examples["sentence"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    encoded["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded


# Tokenize both splits. The raw columns are dropped because the integer
# `label` class id would otherwise be handed to the model as `labels`, in
# place of the token-level targets built above.
# NOTE(review): the class ids are therefore unused by this objective; if
# sentence classification was the goal, AutoModelForSequenceClassification
# is the matching model head.
tokenized_dataset = train_ds.map(
    preprocess_function, batched=True, remove_columns=train_ds.column_names
)
tokenized_eval_dataset = test_ds.map(
    preprocess_function, batched=True, remove_columns=test_ds.column_names
)

# Fine-tuning arguments
training_args = TrainingArguments(
    output_dir="./llama-finetuned",
    eval_strategy="epoch",  # current name of the former `evaluation_strategy`
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_dir="./logs",
)

# `train_ds.map` returns a single Dataset (not a DatasetDict), so the splits
# are passed as the two separately tokenized datasets rather than by key.
# NOTE(review): newer transformers releases name the `tokenizer` argument
# `processing_class` -- confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

In [None]:
# Tokenize a probe sentence and move the tensors to the model's device; after
# training the model may live on an accelerator while fresh tokenizer output
# is always created on CPU.
inputs = tokenizer("First, government plays an important role.", return_tensors="pt").to(model.device)
# max_length caps prompt + continuation at 50 tokens.
outputs = model.generate(**inputs, max_length=50)
print(tokenizer.decode(outputs[0]))