In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate 
import torch 
import numpy as np



Dataset source: https://huggingface.co/datasets/FinGPT/fingpt-sentiment-train


In [None]:
# Load the training data from a local parquet file; the relative path means the
# file must sit in the notebook's working directory.
# NOTE(review): presumably downloaded from the FinGPT sentiment dataset page — confirm.
df = pd.read_parquet('train-00000-of-00001-dabab110260ac909.parquet')

In [None]:
# Preview the first rows to inspect the raw columns.
df.head()

In [None]:
# Distinct values of the 'output' column, which is renamed to 'label' further down.
df["output"].unique()

In [None]:
# Drop the prompt column and adopt the column names the rest of the notebook
# uses: 'input' becomes 'text' and 'output' becomes 'label'.
df = (
    df
    .drop(columns=["instruction"])
    .rename(columns={'input': 'text', 'output': 'label'})
)

# Swap the positions of 'text' and 'label'; every other column keeps its place.
columns = list(df.columns)
text_pos, label_pos = columns.index('text'), columns.index('label')
columns[text_pos], columns[label_pos] = columns[label_pos], columns[text_pos]

# Reorder the DataFrame once (the original repeated this statement twice).
df = df[columns]

In [None]:
# Preview again to confirm the new column names and order.
df.head()

In [None]:
# Length of the longest entry in 'text', measured with len() — i.e. characters,
# not tokens, assuming every value is a string (TODO confirm there are no nulls).
df['text'].apply(len).max()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Wrap each pandas split in a Hugging Face Dataset ...
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

# ... and bundle both under their split names.
dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

In [None]:
# Remove the '__index_level_0__' column (the pandas index carried over by
# Dataset.from_pandas) from every split. remove_columns does this directly,
# without running a per-example map, and the membership check lets the cell be
# re-run after the column is already gone.
INDEX_COLUMN = '__index_level_0__'
dataset = DatasetDict({
    split_name: (
        split.remove_columns([INDEX_COLUMN])
        if INDEX_COLUMN in split.column_names
        else split
    )
    for split_name, split in dataset.items()
})

# Display the splits to verify the column is removed
dataset

Creamos un modelo que clasifica en base a las categorías de nuestro DataFrame.

In [None]:
model_checkpoint = "distilbert-base-uncased"

# The nine sentiment classes, listed in class-id order (position == id).
# NOTE(review): the ids do not look sorted by sentiment intensity (e.g.
# "negative" sits between "moderately negative" and "mildly negative"). That is
# harmless for plain classification, but do not treat the ids as ordinal.
class_names = [
    "strong negative",
    "moderately negative",
    "negative",
    "mildly negative",
    "neutral",
    "mildly positive",
    "positive",
    "moderately positive",
    "strong positive",
]

# Both lookup directions are derived from the single list above.
id2label = dict(enumerate(class_names))
label2id = {name: class_id for class_id, name in id2label.items()}

# Build a sequence-classification model on top of model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)



In [None]:
# Tokenizer that matches the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    add_prefix_space=True,
)

# If the checkpoint ships without a padding token, register one and resize the
# model's token embeddings to the new vocabulary size.
needs_pad_token = tokenizer.pad_token is None
if needs_pad_token:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

def tokenize_function(examples):
    """Tokenize a batch of examples and attach integer class ids.

    Args:
        examples: Batch dict as passed by ``Dataset.map(..., batched=True)``.
            Reads ``examples['text']`` (the texts) and ``examples['label']``
            (label names that must be keys of ``label2id``).

    Returns:
        The tokenizer output with an extra ``"labels"`` entry holding one
        integer class id per example.

    Raises:
        ValueError: If any label is not a key of ``label2id``. Failing here is
            clearer than emitting an out-of-range class id that only breaks
            later inside the loss.
    """
    # Truncate to 512 tokens but do NOT pad here: the DataCollatorWithPadding
    # used for training pads each batch to its longest member, which is far
    # cheaper than padding every example to 512 up front.
    tokenized_inputs = tokenizer(
        examples['text'],
        truncation=True,
        max_length=512,
    )

    # Convert label names to integer ids, rejecting anything unmapped.
    unknown = sorted({str(label) for label in examples['label'] if label not in label2id})
    if unknown:
        raise ValueError(f"Labels missing from label2id: {unknown}")

    tokenized_inputs["labels"] = [label2id[label] for label in examples['label']]

    return tokenized_inputs

# Tokenize every split of `dataset`. With batched=True, tokenize_function
# receives a dict of column lists (many rows at once) instead of a single row.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset



In [None]:

# Drop the raw text and the string label from both splits; the integer class
# ids were stored under 'labels' during tokenization.
tokenized_dataset = DatasetDict({
    split_name: tokenized_dataset[split_name].remove_columns(['text', 'label'])
    for split_name in ('train', 'test')
})

tokenized_dataset

The data collator pads the examples in each batch dynamically, which is more computationally efficient.

In [None]:
# Batch collator built around the tokenizer; it is handed to the Trainer below
# to pad the examples of each batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Evaluation Metrics

In [None]:
# Accuracy metric from the `evaluate` library; compute_metrics below calls it.
accuracy = evaluate.load("accuracy")

#Evaluation function to pass into trainer later
def compute_metrics(p):
    """Compute accuracy for the Trainer's evaluation loop.

    Args:
        p: Pair ``(predictions, labels)``: the model logits with one row per
            example, and the reference class ids.

    Returns:
        The flat dict produced by the accuracy metric, ``{"accuracy": value}``.
        The original wrapped this dict in a second ``{"accuracy": ...}`` layer,
        which the Trainer cannot log as a scalar.
    """
    predictions, labels = p
    # The predicted class is the index of the largest logit in each row,
    # e.g. index 1 means "moderately negative".
    predictions = np.argmax(predictions, axis=1)

    return accuracy.compute(predictions=predictions, references=labels)


Testing the untrained model

In [None]:

# define list of examples
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")
# Pure inference: disable autograd so no computation graph is built for these
# forward passes.
with torch.no_grad():
    for text in text_list:
        # tokenize text into a tensor of input ids
        inputs = tokenizer.encode(text, return_tensors="pt")
        # compute logits
        logits = model(inputs).logits
        # the predicted class is the index of the largest logit
        predicted_id = torch.argmax(logits, dim=-1).item()

        print(text + " - " + id2label[predicted_id])

Fine-tuning with LoRA

In [None]:
# LoRA configuration for the sequence-classification model.
peft_config = LoraConfig(task_type="SEQ_CLS",  # sequence classification
                        r = 4, # rank of the trainable low-rank update matrices
                        lora_alpha=32, # LoRA scaling factor (the update is scaled by lora_alpha / r); not a learning rate
                        lora_dropout= 0.01, # dropout probability applied in the LoRA layers
                        target_modules=['q_lin'])  # adapt only modules named 'q_lin' (presumably DistilBERT's attention query projections — confirm)
                         

In [None]:
# Wrap the base model with the adapters described by peft_config, then report
# how many parameters are trainable.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell
# wraps an already-wrapped model — use Restart & Run All instead of re-executing it.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Training the model 

In [None]:
# Hyperparameters
lr, batch_size, num_epochs = 1e-3, 10, 2

# Checkpoints are written to a directory named after the base checkpoint.
output_dir = f"{model_checkpoint}-lora-text-classification"

# Training configuration: evaluate and save once per epoch, and reload the best
# checkpoint when training ends.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Assemble the Trainer from the LoRA-wrapped model, the training arguments, the
# tokenized train/test splits, the padding collator and the accuracy callback.
# The 'test' split doubles as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()