In [1]:
# !pip install -q -U git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e
# !pip install -q -U git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f

In [2]:
import os

# Restrict CUDA to device 0 and turn off the tokenizers library's parallelism.
# This cell runs ahead of the torch/transformers imports so both settings are
# already in the environment when those libraries initialise.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)

[2024-04-12 04:29:29,326] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


2024-04-12 04:29:29.925141: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-12 04:29:29.961416: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-12 04:29:29.961446: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-12 04:29:29.962341: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-12 04:29:29.968240: I tensorflow/core/platform/cpu_feature_guar

In [5]:
# Prefer the first CUDA device and fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"working on {device}")

working on cuda:0


In [6]:
# Containers for the raw sentences (X_*) and their labels (y_*), one pair per
# split. They start empty and are filled by the loading cells further down.
X_train, y_train = [], []
X_eval, y_eval = [], []
X_test, y_test = [], []

In [7]:
# Instruction text shared by the training and inference prompts.
# The continuation-line indentation and the trailing spaces after "brackets,"
# and "answer as" are part of the prompt the model is trained on, so they are
# written out explicitly: an editor that strips trailing whitespace cannot
# silently change the prompt, and both prompt builders are guaranteed to agree.
_PROMPT_HEADER = (
    "Analyze the sentiment of the news headline enclosed in square brackets, \n"
    "            determine if it is positive, neutral, or negative, and return the answer as \n"
    "            the corresponding sentiment label \"positive\" or \"neutral\" or \"negative\".\n"
    "\n"
    "            "
)


def _prompt_prefix(text):
    """Return the instruction followed by ``[text] = `` (no label, not stripped)."""
    return f"{_PROMPT_HEADER}[{text}] = "


def generate_prompt(text, lab):
    """Build a labelled training/validation prompt.

    Args:
        text: the sentence to classify; it is placed inside square brackets.
        lab: the gold sentiment label appended after ``" = "``.

    Returns:
        The instruction, the bracketed sentence and the label as one string,
        with leading and trailing whitespace stripped.
    """
    return f"{_prompt_prefix(text)}{lab}".strip()


def generate_test_prompt(text):
    """Build an inference prompt that stops right after the ``=`` sign.

    Args:
        text: the sentence to classify; it is placed inside square brackets.

    Returns:
        The same string ``generate_prompt`` produces up to and including ``=``,
        so the model's continuation is the predicted label.
    """
    return _prompt_prefix(text).strip()


In [8]:
# Load the raw training split; each element of `lines` is one line of the file
# with its line terminator still attached.
with open('engspa/train.txt', encoding='UTF-8') as rf:
    lines = list(rf)

In [9]:



# Parse the training split: column 0 is the sentence, column 1 the label.
# The lists are rebuilt from scratch so that re-running this cell does not
# append the same examples a second time.
X_train, y_train = [], []
for lin in lines:
    if not lin.strip():
        continue  # tolerate blank lines (e.g. a trailing newline at end of file)
    da = lin.split('\t')
    lab = da[1].strip()
    X_train.append(da[0])
    y_train.append(lab)

In [10]:
# Sanity check: number of training sentences and the first raw sentence.
len(X_train),X_train[0]

(12194,
 "After this I 'm just gonna go home drink summ hot chocolate con bolillo and sleep")

In [11]:
# Load the raw validation split. Note that this rebinds `lines`, which until
# now held the training lines.
with open('engspa/validation.txt', encoding='UTF-8') as rf:
    lines = list(rf)

In [12]:
# Parse the validation split: column 0 is the sentence, column 1 the label.
# The lists are rebuilt from scratch so that re-running this cell does not
# append the same examples a second time.
X_eval, y_eval = [], []
for lin in lines:
    if not lin.strip():
        continue  # tolerate blank lines (e.g. a trailing newline at end of file)
    da = lin.split('\t')
    lab = da[1].strip()
    X_eval.append(da[0])
    y_eval.append(lab)

In [13]:
# Sanity check: number of validation sentences and the first raw sentence.
len(X_eval),X_eval[0]

(1859, 'estopp I blashhh lol jk but aww thanks haha x')

In [14]:
# Load the raw test split. Note that this rebinds `lines`, which until now
# held the validation lines.
with open('engspa/test.txt', encoding='UTF-8') as rf:
    lines = list(rf)

In [15]:
# Parse the test split into parallel lists (sentence, label).
# Every line is consumed, none skipped, so the i-th prediction written later
# corresponds to the i-th line of the test file.
X_test, y_test = [], []
for raw_line in lines:
    fields = raw_line.split('\t')
    label = fields[1].strip()  # read first: a malformed line fails before either list grows
    X_test.append(fields[0])
    y_test.append(label)

In [16]:
# Start with an empty list; the next cell fills it with training prompts.
train_data = list()

In [17]:
# Build one labelled prompt per training example.
# The list is rebuilt here (rather than appended to) so the cell can be re-run
# safely, including after `train_data` has been rebound to a Dataset further down.
train_data = [
    generate_prompt(sentence, label)
    for sentence, label in zip(X_train, y_train)
]

In [18]:
# One-column frame ('text') holding the training prompts.
train_df = pd.DataFrame(data=train_data, columns=['text'])

In [19]:
# Wrap the prompt frame in a `datasets.Dataset` for the trainer.
# Note: this rebinds `train_data`, which was a plain list of prompt strings up to here.
train_data = Dataset.from_pandas(train_df)

In [20]:
# Build one labelled prompt per validation example.
eval_data = [
    generate_prompt(X_eval[idx], y_eval[idx])
    for idx in range(len(X_eval))
]
    

In [21]:
# One-column frame ('text') of validation prompts, then wrapped as a Dataset.
# `eval_data` is rebound from the list of prompt strings to the Dataset.
eval_df = pd.DataFrame(data=eval_data, columns=['text'])
eval_data = Dataset.from_pandas(eval_df)

In [22]:
# Display the validation Dataset summary (features and row count).
eval_data

Dataset({
    features: ['text'],
    num_rows: 1859
})

In [23]:
# Build one unlabelled prompt per test sentence; the model supplies the label.
test_data = [generate_test_prompt(sentence) for sentence in X_test]
    

In [24]:
# One-column frame ('text') holding the inference prompts.
test_df = pd.DataFrame(data=test_data, columns=['text'])


In [25]:
# Number of test prompts.
len(test_df)

4736

In [26]:
def evaluate(y_true, y_pred):
    """Print overall accuracy, per-class accuracy, a classification report and a confusion matrix.

    Args:
        y_true: sequence of gold label strings.
        y_pred: sequence of predicted label strings, aligned with ``y_true``.

    Labels are encoded as negative=0, neutral=1, positive=2. Any string that is
    not one of the three known labels is counted as neutral (1).

    Returns:
        None. All results are printed.
    """
    labels = ['positive', 'neutral', 'negative']
    mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
    # Class ids and names in ascending id order, shared by every metric below
    # so the report rows and the confusion-matrix rows always line up.
    class_ids = sorted(mapping.values())
    class_names = sorted(labels, key=mapping.get)

    def map_func(x):
        return mapping.get(x, 1)

    y_true = np.array([map_func(x) for x in y_true], dtype=int)
    y_pred = np.array([map_func(x) for x in y_pred], dtype=int)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Accuracy restricted to the examples of each gold class (classes that do
    # not occur in y_true are skipped).
    for label in class_ids:
        mask = y_true == label
        if not mask.any():
            continue
        accuracy = accuracy_score(y_true[mask], y_pred[mask])
        print(f'Accuracy for label {label}: {accuracy:.3f}')

    # Generate classification report, with rows named after the sentiment labels
    class_report = classification_report(
        y_true=y_true,
        y_pred=y_pred,
        labels=class_ids,
        target_names=class_names,
    )
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix (rows = gold class, columns = predicted class)
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=class_ids)
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [27]:
# Base checkpoint to fine-tune. An empty identifier makes both from_pretrained()
# calls below fail, so the Llama-2 7B id that had been commented out here is
# restored. Point this at a local directory to use a downloaded copy instead.
model_name = "meta-llama/Llama-2-7b-hf"

compute_dtype = torch.float16

# 4-bit NF4 quantisation with double quantisation; matmuls run in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=compute_dtype,
    quantization_config=bnb_config,
)

# The KV cache is switched off for training (gradient checkpointing is enabled
# in the training arguments below).
model.config.use_cache = False
model.config.pretraining_tp = 1

# NOTE(review): trust_remote_code lets the checkpoint repository run its own
# Python code. It is only passed for the tokenizer, not the model. Confirm it is
# actually needed for the chosen checkpoint and drop it otherwise.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# Reuse EOS as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [28]:
output_dir = "trained_weigths"

# LoRA adapter: rank 64, alpha 16, 10% dropout, attached to every linear layer.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

training_arguments = TrainingArguments(
    # --- output / reporting ---
    output_dir=output_dir,                    # where the trainer writes its files
    report_to="tensorboard",                  # send metrics to TensorBoard
    logging_steps=25,                         # log training metrics every 25 steps
    evaluation_strategy="epoch",              # evaluate on eval_dataset at the end of every epoch
    save_steps=0,                             # NOTE(review): presumably disables periodic checkpoints; confirm for the installed transformers version
    # --- schedule ---
    num_train_epochs=5,                       # number of passes over the training set
    max_steps=-1,                             # -1: training length is governed by num_train_epochs
    lr_scheduler_type="cosine",               # cosine learning-rate decay
    warmup_ratio=0.03,                        # author's note: taken from the QLoRA paper
    # --- batching / memory ---
    per_device_train_batch_size=16,           # sequences per forward/backward pass on each device
    gradient_accumulation_steps=8,            # batches accumulated per optimizer update (16 * 8 = 128 sequences per device)
    gradient_checkpointing=True,              # trade compute for memory
    group_by_length=True,                     # batch sequences of similar length together
    # --- optimizer ---
    optim="paged_adamw_32bit",
    learning_rate=1e-4,                       # author's note: chosen with reference to the QLoRA paper
    weight_decay=0.001,
    max_grad_norm=0.3,                        # author's note: taken from the QLoRA paper
    # --- precision ---
    fp16=True,
    bf16=False,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=train_data,
    eval_dataset=eval_data,
    dataset_text_field="text",                # column of the datasets that holds the prompt
    max_seq_length=1024,
    packing=False,                            # one prompt per sequence, no concatenation
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    },
)

Map:   0%|          | 0/12194 [00:00<?, ? examples/s]

Map:   0%|          | 0/1859 [00:00<?, ? examples/s]

In [29]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,0.9728,0.938554
1,0.8771,0.877728
2,0.8247,0.869866


TrainOutput(global_step=285, training_loss=0.9590214377955386, metrics={'train_runtime': 1916.3523, 'train_samples_per_second': 19.089, 'train_steps_per_second': 0.149, 'total_flos': 6.354312176118989e+16, 'train_loss': 0.9590214377955386, 'epoch': 2.99})

In [30]:
# Persist the trained weights and the tokenizer side by side in `output_dir`
# (the same directory the trainer was configured with).
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

('trained_weigths/tokenizer_config.json',
 'trained_weigths/special_tokens_map.json',
 'trained_weigths/tokenizer.model',
 'trained_weigths/added_tokens.json',
 'trained_weigths/tokenizer.json')

In [31]:
from peft import AutoPeftModelForCausalLM

finetuned_model = "./trained_weigths/"
compute_dtype = torch.float16

# Load the tokenizer that was saved next to the fine-tuned weights instead of a
# hard-coded hub id. This keeps the exact tokenizer configuration used during
# training (including pad_token = eos_token) and needs no hub access.
tokenizer = AutoTokenizer.from_pretrained(finetuned_model)

# Load base model + LoRA adapter in float16 (not quantised) so the adapter
# weights can be folded into the base weights.
model = AutoPeftModelForCausalLM.from_pretrained(
     finetuned_model,
     torch_dtype=compute_dtype,
     return_dict=False,
     low_cpu_mem_usage=True,
     device_map=device,
)

# Fold the adapter into the base model and write a standalone checkpoint
# (safetensors, sharded at 2GB) together with its tokenizer.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("./merged_model",safe_serialization=True, max_shard_size="2GB")
tokenizer.save_pretrained("./merged_model")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('./merged_model/tokenizer_config.json',
 './merged_model/special_tokens_map.json',
 './merged_model/tokenizer.model',
 './merged_model/added_tokens.json',
 './merged_model/tokenizer.json')

In [32]:
# y_pred = predict(test_df, merged_model, tokenizer)
# evaluate(y_true, y_pred)

In [33]:
# Display the frame of inference prompts (pandas truncates long frames when rendering).
test_df

Unnamed: 0,text
0,Analyze the sentiment of the news headline enc...
1,Analyze the sentiment of the news headline enc...
2,Analyze the sentiment of the news headline enc...
3,Analyze the sentiment of the news headline enc...
4,Analyze the sentiment of the news headline enc...
...,...
4731,Analyze the sentiment of the news headline enc...
4732,Analyze the sentiment of the news headline enc...
4733,Analyze the sentiment of the news headline enc...
4734,Analyze the sentiment of the news headline enc...


In [34]:
 y_pred = []
# for i in tqdm(range(len(test_data))):
#     prompt = test_data[i]
#     pipe = pipeline(task="text-generation", 
#                         model=merged_model, 
#                         tokenizer=tokenizer, 
#                         max_new_tokens = 1, 
#                         temperature = 0.0,
#                        )
#     result = pipe(prompt)
#     answer = result[0]['generated_text'].split("=")[-1]
#     if "positive" in answer:
#         y_pred.append("positive")
#     elif "negative" in answer:
#          y_pred.append("negative")
#     else:
#         y_pred.append("neutral")

In [35]:
# Number of inference prompts about to be scored.
len(test_data)

4736

In [36]:
 y_pred = []
for i in tqdm(range(len(test_data))):
    prompt = test_data[i]
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    max_new_tokens = 1
    temperature = 0.0
    output = merged_model.generate(input_ids=inputs.input_ids, 
                               max_length=len(inputs.input_ids[0]) + max_new_tokens, 
                               temperature=temperature, 
                               pad_token_id=tokenizer.eos_token_id,
                               eos_token_id=tokenizer.pad_token_id)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    answer = generated_text.split("=")[-1]
    if "positive" in answer:
       y_pred.append("positive")
    elif "negative" in answer:
       y_pred.append("negative")
    else:
       y_pred.append("neutral")

100%|███████████████████████████████████████| 4736/4736 [02:10<00:00, 36.16it/s]


In [37]:
# from sklearn.metrics import f1_score

# weighted_f1 = f1_score(y_test, y_pred, average='weighted')

In [38]:
# weighted_f1

In [39]:
# Write one predicted label per line (no newline after the last label).
with open("sa_spa_eng", "w") as writer:
    predictions_text = '\n'.join(y_pred)
    writer.write(predictions_text)

In [40]:
# Number of predictions; should equal the number of test prompts.
len(y_pred)

4736

In [41]:
# Display the predicted labels. NOTE(review): this prints the entire list;
# a slice such as y_pred[:10] would keep the saved notebook output short.
y_pred

['positive',
 'positive',
 'positive',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'positive',
 'positive',
 'positive',
 'neutral',
 'positive',
 'neutral',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'negative',
 'neutral',
 'positive',
 'negative',
 'positive',
 'neutral',
 'positive',
 'negative',
 'neutral',
 'negative',
 'neutral',
 'negative',
 'negative',
 'negative',
 'neutral',
 'negative',
 'positive',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'positive',
 'positive',
 'neutral',
 'negative',
 'neutral',
 'positive',
 'negative',
 'neutral',
 'positive',
 'negative',
 'negative',
 'neutral',
 'negative',
 'positive',
 'neutral',
 'positive',
 'positive',
 'neutral',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'neutral',
 'positive',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'nega

In [42]:
# Final check of the prediction count.
len(y_pred)

4736