## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from datasets import Dataset
import evaluate

In [None]:
# Directories holding the classification CSVs (open-source models and GPT-4/OpenAI runs).
data_path_open_source = "/home/neemias/PerceptSent-LLM-approach/data/"
data_path_open_ai = "/home/neemias/PerceptSent-LLM-approach/data/gpt4-openai-classify/"

# Collect every CSV, open-source directory first, then the OpenAI directory.
data_paths = [
    os.path.join(folder, filename)
    for folder in (data_path_open_source, data_path_open_ai)
    for filename in os.listdir(folder)
    if filename.endswith('.csv')
]

# NOTE(review): os.listdir returns entries in arbitrary order, so which file sits at
# index 6 is not guaranteed to be stable across machines/runs — confirm which file is
# meant to be excluded and filter it by name instead.
del data_paths[6]
data_paths[:10]

In [None]:
# Load the dataset CSV that the rest of the notebook works on (passed to prepare_data below).
df = pd.read_csv("/home/neemias/PerceptSent-LLM-approach/data/percept_dataset_alpha5_p5.csv")
# Preview the first rows.
df.head()

In [None]:
# Number of rows per value of the `sentiment` column.
df.sentiment.value_counts()

In [5]:
def process_prompt(row):
    """Render one record as ``"<text> - <sentiment>"``.

    Args:
        row: mapping-like object exposing ``'text'`` and ``'sentiment'`` keys.

    Returns:
        The two fields joined by ``" - "``.
    """
    description = row['text']
    label = row['sentiment']
    return "{} - {}".format(description, label)

def prepare_data(dataframe: pd.DataFrame):
    """Split ``dataframe`` 80/20 per sentiment class and build prompt-formatted datasets.

    Each sentiment id in 0..4 is split on its own (``test_size=0.2``,
    ``random_state=42``) and the per-class parts are concatenated. The raw
    description is preserved in a new ``input`` column, while ``text`` is
    rewritten as the instruction prompt:

    * train rows: ``prompt + text + '=' + str(sentiment)`` (answer included)
    * test rows:  ``prompt + text + '='`` (answer left for the model to generate)

    Args:
        dataframe: frame with a ``text`` column and a ``sentiment`` column
            whose values are compared against the ids 0, 1, 2, 3, 4.

    Returns:
        A tuple ``(train_data, test_data, X_test, y_test)`` where the first two
        are ``datasets.Dataset`` objects built from the train/test frames,
        ``X_test`` is the test frame without the ``sentiment`` column, and
        ``y_test`` is the test frame's ``sentiment`` column.
    """
    X_train = []
    X_test = []

    for sentiment in [0, 1, 2, 3, 4]:
        # Split the frame that was passed in. The previous version read the
        # notebook-global `df` here, silently ignoring the `dataframe` argument.
        train, test = train_test_split(dataframe[dataframe.sentiment == sentiment],
                                       test_size=0.2,
                                       random_state=42)
        X_train.append(train)
        X_test.append(test)

    X_train = pd.concat(X_train)
    X_test = pd.concat(X_test)

    prompt = """What is the sentiment of this description? Please choose an answer from 
        {
            "Positive": 4,
            "SlightlyPositive": 3,
            "Neutral": 2,
            "SlightlyNegative": 1,
            "Negative": 0
        }
    """

    # Keep the untouched description next to the prompt-formatted text.
    X_train["input"] = X_train["text"]
    X_test["input"] = X_test["text"]

    # Vectorized string concatenation instead of a row-wise apply.
    X_train["text"] = prompt + X_train["input"] + '=' + X_train["sentiment"].astype(str)
    X_test["text"] = prompt + X_test["input"] + '='

    train_data = Dataset.from_pandas(X_train)
    test_data = Dataset.from_pandas(X_test)

    return train_data, test_data, X_test.drop(["sentiment"], axis=1), X_test["sentiment"]

In [6]:
# Build the train/test Datasets plus the pandas test frame and its labels from `df`.
train_data, test_data, X_test, y_test = prepare_data(df)

In [None]:
# Display the training Dataset returned by prepare_data.
train_data

In [None]:
# Preview the test frame (the `sentiment` column was dropped by prepare_data).
X_test.head()

In [None]:
# Inspect the first fully formatted test prompt (it ends with '=').
X_test["text"].iloc[0]

In [10]:
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig)
import torch

In [30]:
# def init_model(model_name="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"):
# def init_model(model_name="meta-llama/Llama-3.2-1B"):
# def init_model(model_name="meta-llama/Llama-2-7b-chat-hf"):
def init_model(model_name="nvidia/Llama3-ChatQA-1.5-8B"):
    """Load ``model_name`` as a 4-bit (NF4) quantized causal LM with its tokenizer.

    Args:
        model_name: Hugging Face checkpoint id passed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model has ``use_cache`` disabled and
        ``pretraining_tp`` set to 1; the tokenizer pads on the right using the
        EOS token as pad token.
    """
    # Another checkpoint left commented out by the author: "nvidia/NV-Embed-v2".
    torch.cuda.reset_peak_memory_stats()

    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False,
        llm_int8_enable_fp32_cpu_offload=True,
    )

    llm = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_cfg,
        device_map="auto",
        trust_remote_code=True,
    )
    llm.config.use_cache = False
    llm.config.pretraining_tp = 1

    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # Reuse EOS as the padding token and pad on the right.
    tok.pad_token = tok.eos_token
    tok.padding_side = "right"

    return llm, tok

In [None]:
# Load the default checkpoint and its tokenizer.
model, tokenizer = init_model()

In [13]:
from transformers import TrainingArguments
from trl import SFTTrainer
from peft import LoraModel, LoraConfig

In [30]:
# Define the metric computation function
def compute_metrics(eval_pred):
    """Compute the weighted F1 score for a ``(logits, labels)`` evaluation pair.

    Handles two layouts:

    * classification: logits ``(N, C)`` and labels ``(N,)`` — scored as-is;
    * causal LM: logits ``(N, T, V)`` and labels ``(N, T)`` — predictions are
      shifted by one position so position ``t`` is compared with token
      ``t + 1``, then flattened.

    In both cases positions whose label is ``-100`` (the ignore index used for
    padding/masked tokens) are excluded before scoring.

    Args:
        eval_pred: pair ``(logits, labels)``; ``logits`` may also be a tuple
            whose first element holds the logits.

    Returns:
        ``{"f1": <weighted F1>}``.
    """
    f1_metric = evaluate.load("f1", use_auth_token=False)
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]

    predictions = np.asarray(logits).argmax(axis=-1)
    labels = np.asarray(labels)

    if predictions.ndim > 1 and predictions.shape == labels.shape:
        # Token-level case: align next-token predictions with their targets.
        predictions = predictions[..., :-1]
        labels = labels[..., 1:]

    # Boolean indexing both drops ignored positions and flattens to 1-D,
    # which is the shape the F1 metric expects.
    keep = labels != -100
    f1 = f1_metric.compute(predictions=predictions[keep],
                           references=labels[keep],
                           average="weighted")
    return {"f1": f1["f1"]}

def train(model, tokenizer, train_data, eval_data):
    """Fine-tune ``model`` with LoRA adapters through ``SFTTrainer`` and save the result.

    Args:
        model: causal LM to fine-tune.
        tokenizer: tokenizer matching ``model``.
        train_data: dataset whose ``text`` field holds the training prompts.
        eval_data: dataset evaluated at the end of every epoch.

    Returns:
        None. Logs go to ``../logs/log`` and the trained model is written to
        ``../logs/results/trained_model``.
    """
    lora_settings = LoraConfig(
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
    )

    hf_args = TrainingArguments(
        # Output / logging
        output_dir="../logs/log",
        report_to="tensorboard",
        logging_steps=25,
        save_steps=0,
        # Schedule
        num_train_epochs=100,
        max_steps=-1,
        evaluation_strategy="epoch",
        # Batching
        per_device_train_batch_size=16,
        per_device_eval_batch_size=8,
        eval_accumulation_steps=2,
        group_by_length=True,
        # Optimisation
        optim="paged_adamw_32bit",
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        weight_decay=0.001,
        max_grad_norm=0.3,
        # Precision / memory
        fp16=True,
        bf16=False,
        gradient_checkpointing=True,
    )

    # compute_metrics is deliberately not passed (it was left disabled by the author).
    sft_trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=hf_args,
        peft_config=lora_settings,
        train_dataset=train_data,
        eval_dataset=eval_data,
        dataset_text_field="text",
        max_seq_length=1064,
        packing=False,
    )

    sft_trainer.train()
    sft_trainer.save_model("../logs/results/trained_model")

In [None]:
# Fine-tune the model.
# NOTE(review): the held-out test split is passed as the evaluation set here — confirm this is intended.
train(model, tokenizer, train_data, test_data)

In [32]:
from transformers import pipeline
from tqdm.notebook import tqdm

def inference(pipe, prompt):
    """Run ``pipe`` on ``prompt`` and return the text after the last ``'='``.

    Args:
        pipe: callable returning a list of dicts with a ``'generated_text'`` key.
        prompt: prompt string handed to ``pipe``.

    Returns:
        The substring of the first result's ``generated_text`` that follows its
        final ``'='`` (the whole text if it contains no ``'='``).
    """
    generated = pipe(prompt)[0]['generated_text']
    pieces = generated.split("=")
    return pieces[-1]

def predict(X_test, model, tokenizer):
    """Generate one integer prediction for every prompt in ``X_test['text']``.

    A text-generation pipeline producing a single new token is built once, and
    each row's prompt is sent through ``inference``; the returned answer is
    converted with ``int``.

    Args:
        X_test: DataFrame with a ``text`` column of prompts.
        model: model used by the generation pipeline.
        tokenizer: tokenizer used by the generation pipeline.

    Returns:
        List of ints, one per row of ``X_test`` in row order.

    Raises:
        ValueError: if an answer cannot be parsed by ``int``.
    """
    generator = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=1,
        temperature=0.01,
        do_sample=True,
    )
    return [
        int(inference(generator, X_test.iloc[row_idx]["text"]))
        for row_idx in tqdm(range(len(X_test)))
    ]

In [None]:
# Run the model over every test prompt.
y_pred = predict(X_test, model, tokenizer)

In [None]:
# Sanity-check the first prediction.
y_pred[0]

In [35]:
# Label name -> integer id, the same mapping that is spelled out in the prompt.
sentiment = dict(
    Positive=4,
    SlightlyPositive=3,
    Neutral=2,
    SlightlyNegative=1,
    Negative=0,
)

In [None]:
# Which distinct values the model actually predicted.
np.unique(y_pred)

In [37]:
from sklearn.metrics import classification_report, f1_score

In [56]:
# y_test and y_pred hold integer class ids here (prepare_data keeps the labels
# numeric and predict() returns ints), so indexing the name -> id `sentiment`
# dict with them raised KeyError. `.get(v, v)` converts label names to ids when
# names are present and passes integer ids through unchanged.
y_true_ids = [sentiment.get(v, v) for v in y_test.to_list()]
y_pred_ids = [sentiment.get(v, v) for v in y_pred]
print(f"{f1_score(y_true=y_true_ids, y_pred=y_pred_ids, average='weighted'):.2f}")

0.65


In [None]:
# Full-precision weighted F1. f1_score defaults to average='binary', which raises
# ValueError on this five-class target, so the averaging mode must be explicit.
f1_score(y_test, y_pred, average="weighted")

In [None]:
# Per-class precision / recall / F1 breakdown.
print(classification_report(y_true=y_test, y_pred=y_pred))