In [1]:
import torch
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# One checkpoint identifier shared by the tokenizer and the model, so the two
# are always loaded from the same repository.
MODEL_NAME = "meta-llama/Llama-3.2-3B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): this loads a causal-LM head; the labels prepared later in this
# notebook are per-sentence sentiment values — confirm this is the intended head.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.55s/it]


In [3]:
# Use CUDA when PyTorch reports an available GPU, otherwise stay on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [4]:
# Load the preprocessed CSV and pick out the input and label columns.
dataset_path = '../../preprocessed_data_11.csv'
df = pd.read_csv(dataset_path)
text, target = df['sentence'], df['sentiment']

# Stage 1: keep 70% of the rows for training and hold out the remaining 30%,
# stratified on the label so class proportions are preserved.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    text,
    target,
    test_size=0.3,
    shuffle=True,
    stratify=target,
    random_state=42,
)

# Stage 2: divide the hold-out into validation (67%) and test (33%), again
# stratified, with the same seed for reproducibility.
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout,
    y_holdout,
    test_size=0.33,
    shuffle=True,
    stratify=y_holdout,
    random_state=42,
)

# Convert each pandas Series to a plain Python list for the later cells.
train_texts, train_labels = x_train.tolist(), y_train.tolist()
val_texts, val_labels = x_val.tolist(), y_val.tolist()
test_texts, test_labels = x_test.tolist(), y_test.tolist()

In [5]:
# Reuse the end-of-sequence token for padding; the padded tokenization below
# needs the tokenizer to have a pad token set.
tokenizer.pad_token = tokenizer.eos_token

# Fixed length (in tokens) that every example is padded/truncated to.
# Without an explicit max_length, padding='max_length' pads each example up to
# the tokenizer's model_max_length, which for long-context checkpoints produces
# enormous tensors per sentence and can exhaust memory.
# NOTE(review): 128 is an assumed budget for sentence-level inputs — tune it
# against the actual token-length distribution of the dataset.
MAX_SEQ_LENGTH = 128


def tokenize_function(texts, max_length=MAX_SEQ_LENGTH):
    """Tokenize a list of strings into fixed-length PyTorch tensors.

    Args:
        texts: List of input strings to encode.
        max_length: Number of tokens each example is padded or truncated to.
            Defaults to ``MAX_SEQ_LENGTH``.

    Returns:
        The tokenizer's output with ``return_tensors='pt'``; every field is
        padded/truncated to ``max_length`` tokens per example.
    """
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )


train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)
test_encodings = tokenize_function(test_texts)

: 

In [None]:
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with one label per example.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from field name to
            an indexable collection of per-example values, either a tensor or
            a nested list.
        labels: Sequence with one label per example; each label must be
            convertible with ``torch.tensor``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``.

        ``torch.tensor(existing_tensor)`` emits a UserWarning on every call,
        so tensors are copied with ``clone().detach()`` instead; anything else
        (lists, numbers) still goes through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at ``idx`` plus ``'labels'``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

# Build one dataset per split, in train / validation / test order, by pairing
# each split's encodings with its labels.
train_dataset, val_dataset, test_dataset = (
    SentimentDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)