In [3]:
!pip install -q accelerate==0.21
!pip install -q peft==0.4.0
!pip install -q bitsandbytes==0.40.2
!pip install -q transformers==4.33.1 
!pip install -q trl==0.4.7

In [4]:
#!pip install tensorflow_probability===0.20.0

In [5]:
#!pip install simpletransformers "transformers==4.30.2"

In [6]:
import numpy as np
import pandas as pd
import re
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split



In [7]:
df_train = pd.read_csv('/kaggle/input/input-data/HODI_2023_train_subtaskA.tsv', sep='\t')
df_test = pd.read_csv('/kaggle/input/input-data/HODI_2023_test_subtaskA.csv', sep='\t')

In [8]:
def pre_process(text):
    text = text.lower()
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('#', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('RT|rt', '', text)
    text = text.strip()
    return text

In [9]:
df_train["text"] = df_train["text"].apply(pre_process)
df_test["text"] = df_test["text"].apply(pre_process)

In [10]:
print(df_train)

        id                                               text  homotransphobic
0        1  hanno rotto le palle con i €compleanni a doppi...                0
1        2  avete rotto,come potete prendere se le pubblic...                0
2        3  guevara si è rotto il cazzo e ha mandato a far...                0
3        4  se io volessi entrare in un bar con una trombe...                0
4        5  ok nessuno discute ciò ma  passetti avanti e  ...                0
...    ...                                                ...              ...
4995  4996  questa volta perderete molti consensi voi che ...                1
4996  4997  quando la russia ti piglia per il culo e la ue...                0
4997  4998  stai zitto coglione e fatti bombardare da puti...                1
4998  4999  lesbiche? non lo so. per me sono solo delle sq...                0
4999  5000  vi giuro una non me la dimenticherò mai era ve...                0

[5000 rows x 3 columns]


In [11]:
ground_truth_labels = pd.read_csv("/kaggle/input/input-data/HODI_2023_test_GOLD.tsv", sep="\t")

In [12]:
df_test = pd.merge(df_test, ground_truth_labels[['id', 'homotransphobic']], on='id', how='left')

In [13]:
print(df_test)

       id                                               text  homotransphobic
0    5001                  quasi quasi è meglio femminiello!                1
1    5002  sotto quel tweet, c'è un ammasso di ritardati ...                0
2    5003  brava mara come sempre puntatona.. sei la zia ...                0
3    5004  copio i gay cagatemi !!! se vi chiedessi di de...                0
4    5005       io: è bono eh papà: ma non eri frocia te 🤡🤡🤡                0
..    ...                                                ...              ...
995  5996  quindi tu dai dei piglianculo alla gente e non...                0
996  5997  quel “sono a lavoro andré” che suona un po’ co...                0
997  5998  gol e  assist in  presenze. nella top  dei mig...                0
998  5999        ma non rompermi le balle culattone di merda                1
999  6000  vedo che anche oggi ti sei svegliata lesbica p...                0

[1000 rows x 3 columns]


In [14]:
def generate_prompt(data_point):
    return f"""
            ### Instruction: 
            Analyze the Italian sentence enclosed in square brackets in the Input 
            classify whether it is homotransphobic or not.
            Return in the Response "0" if it isn't homotransphobic and "1" if it is.
            
            ### Input: 
            [{data_point["text"]}]
            
            ### Response:
            {data_point['homotransphobic']}
            """
    
def generate_test_prompt(data_point):
    return f"""
            ### Instruction: 
            Analyze the Italian sentence enclosed in square brackets in the Input 
            classify whether it is homotransphobic or not.
            Return in the Response "0" if it isn't homotransphobic and "1" if it is.
            
            ### Input: 
            [{data_point["text"]}]
            
            ### Response:
            """

In [15]:
train_data, eval_data = train_test_split(df_train, test_size=0.2, random_state=42)

# Apply pre-processing to the text column
train_data["text"] = train_data["text"].apply(pre_process)
eval_data["text"] = eval_data["text"].apply(pre_process)

# Apply the generate_prompt function to the training dataset
train_data['text'] = train_data.apply(generate_prompt, axis=1)

# Apply the generate_prompt function to the evaluation dataset
eval_data['text'] = eval_data.apply(generate_prompt, axis=1)

df_test["text"] = df_test.apply(generate_test_prompt, axis=1)

y_true = df_test['homotransphobic'].astype(str)

In [16]:
#df_test['homotransphobic'] = df_test['homotransphobic'].map({0: 'negative', 1: 'positive'})
#train_data['homotransphobic'] = train_data['homotransphobic'].map({0: 'negative', 1: 'positive'})
#eval_data['homotransphobic'] = eval_data['homotransphobic'].map({0: 'negative', 1: 'positive'})

In [17]:
def evaluate(y_true, y_pred):
    # Convert string labels to numeric
    y_true = np.array(y_true, dtype=int)
    y_pred = np.array(y_pred, dtype=int)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    unique_labels = set(y_true)

    # Generate accuracy report
    for label in unique_labels:
        label_indices = np.where(y_true == label)[0]
        label_y_true = y_true[label_indices]
        label_y_pred = y_pred[label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')

    # Generate classification report with specified labels
    class_report = classification_report(y_true=y_true, y_pred=y_pred, target_names=['1', '0'])
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [18]:
#!huggingface-cli login

In [19]:
model_name = ("meta-llama/Llama-2-7b-chat-hf")

In [20]:
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token='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',
    device_map="auto",
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          token='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',
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [21]:
def predict(samples, model, tokenizer):
    y_pred = []
    print(len(samples))  # Add this line
    for _, row in tqdm(samples.iterrows(), total=len(samples)):
        prompt = row["text"]

        pipe = pipeline(task="text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_new_tokens=1
                       )
        result = pipe(prompt)

        generated_text = result[0]['generated_text']

        # Find the position of '### Response:'
        response_index = generated_text.find('### Response:')

        # Extract the text after '### Response:'
        response_text = generated_text[response_index + len('### Response:'):].strip()

        y_pred.append(response_text)

    return y_pred

In [22]:
# Maybe implement few shot learning here? 

In [23]:
#y_pred = predict(df_test, model, tokenizer)

In [24]:
#evaluate(y_true, y_pred)

# Fine Tuning

In [25]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [34]:
training_arguments = TrainingArguments(
    output_dir="logs",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8, # 4
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    evaluation_strategy="epoch"
)

In [27]:
train_dataset = Dataset.from_pandas(train_data)
eval_dataset = Dataset.from_pandas(eval_data)

  if _pandas_api.is_sparse(col):


In [35]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    max_seq_length=1024,
)



  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [36]:
# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained("trained-model1")

You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.


Epoch,Training Loss,Validation Loss
1,0.6787,1.128597
2,0.6378,1.082965
3,0.6294,1.078393


In [37]:
y_pred = predict(df_test, model, tokenizer)
evaluate(y_true, y_pred)

1000


100%|██████████| 1000/1000 [07:36<00:00,  2.19it/s]

Accuracy: 0.661
Accuracy for label 0: 0.834
Accuracy for label 1: 0.495

Classification Report:
              precision    recall  f1-score   support

           1       0.61      0.83      0.71       489
           0       0.76      0.50      0.60       511

    accuracy                           0.66      1000
   macro avg       0.69      0.66      0.65      1000
weighted avg       0.69      0.66      0.65      1000


Confusion Matrix:
[[408  81]
 [258 253]]





In [39]:
'''

The following code will create a Pandas DataFrame called evaluation containing the text,
true labels, and predicted labels from the test set. This is expectially useful for understanding
 the errors that the fine-tuned model makes, and gettting insights on how to improve the prompt.

'''

evaluation = pd.DataFrame({'text': df_test["text"],
                           'y_true':y_true,
                           'y_pred': y_pred},
                         )
evaluation.to_csv("test_predictions.csv", index=False)