In [1]:
!pip install transformers datasets accelerate evaluate trl "tensorboard==2.15" optuna scikit-learn --upgrade --quiet
!huggingface-cli login --token "..."

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
# Reused from https://github.com/pkasela/DESIRE-ME/blob/main/src/model/utils.py
import logging
import os
import random

import numpy as np
import torch
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split

logger = logging.getLogger(__name__)


def seed_everything(seed: int):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: integer used for every random number generator touched here.

    Side effects:
        - seeds ``random``, ``numpy.random`` and ``torch`` (CPU and all CUDA
          devices);
        - sets the ``PYTHONHASHSEED`` environment variable;
        - switches cuDNN to deterministic mode and disables its autotuner.
    """
    logger.info(f'Setting global random seed to {seed}')
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # works against the deterministic flag above, so it must be off.
    torch.backends.cudnn.benchmark = False

In [3]:
# Fix all RNG seeds before any data shuffling or model initialisation happens
seed_everything(42)

In [4]:
import pandas as pd
import numpy as np

# Modeling
#import tensorflow as tf
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline, DataCollatorWithPadding

# Huggingface access token
from huggingface_hub import login
from huggingface_hub import HfFolder
#from trl import SFTConfig, SFTTrainer


# Hugging Face Dataset
from datasets import Dataset

from optuna.samplers import TPESampler

# Model performance evaluation
import evaluate

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [5]:
import pandas as pd
# Read the unbalanced binary data.
# Training source: exactly one CSV is active; the commented-out paths are the
# alternative files that can be swapped in.

#train_unb = pd.read_csv('/workspace/phi3_direct_query_generation_processed_5k.csv')
#train_unb = pd.read_csv('/workspace/phi3_topic_in_generation_5k_unique_final.csv')

train_unb = pd.read_csv('/workspace/gpt4o_direction_query_generation_processed_5k_reprocessed.csv')
#train_unb = pd.read_csv('/workspace/gpt4o_topic_in_generation_5k_unique_final.csv')

#train_unb = pd.read_csv('/workspace/llama3.1_direct_generation_5k_final.csv')
#train_unb = pd.read_csv('/workspace/llama3.1_topic_in_generation_unique_5k_final.csv')


# Test source: the ClariQ train, test and dev CSVs stacked row-wise
# (no separate dev frame is built; the dev split is part of test_unb).
_clariq_split_paths = (
    '/workspace/clariq_unbalanced/train_binary.csv',
    '/workspace/clariq_unbalanced/test_binary.csv',
    '/workspace/clariq_unbalanced/dev_binary.csv',
)
test_unb = pd.concat([pd.read_csv(csv_path) for csv_path in _clariq_split_paths], axis=0)

# Keep only the request text and its label, drop duplicate rows and rows with
# missing values, then renumber the index from 0.
train_clariq_unb = (
    train_unb[['initial_request', 'binary_label']]
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)
test_clariq_unb = (
    test_unb[['initial_request', 'binary_label']]
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)

In [6]:
# Preview the de-duplicated training frame (displayed as the cell's last expression)
train_clariq_unb

Unnamed: 0,initial_request,binary_label
0,What are the latest recommendations for managi...,0
1,What are the benefits of incorporating mindful...,0
2,What are the major policy changes proposed in ...,0
3,What are the health benefits of incorporating ...,0
4,What are effective techniques for improving my...,0
...,...,...
4995,Is it hot?,1
4996,Who’s in charge?,1
4997,Turn that on.,1
4998,He took her to the zoo.,1


In [None]:
# Add an integer `label` column derived from `binary_label`
train_clariq_unb = train_clariq_unb.assign(
    label=train_clariq_unb['binary_label'].astype(int)
)
test_clariq_unb = test_clariq_unb.assign(
    label=test_clariq_unb['binary_label'].astype(int)
)


# Convert the (text, label) columns of each dataframe to a Hugging Face arrow dataset
hg_train_data = Dataset.from_pandas(train_clariq_unb.loc[:, ['initial_request', 'label']])
hg_test_data = Dataset.from_pandas(test_clariq_unb.loc[:, ['initial_request', 'label']])

print("Length of hg_train_data", len(hg_train_data))
print("Length of hg_test_data", len(hg_test_data))

In [None]:
# Load the ambig test CSV, keep question + label, and rename the text column
# to `initial_request` so it matches the column the tokenizer step reads.
ambig_test = (
    pd.read_csv('/workspace/ambig_test.csv')[['question', 'label']]
    .rename(columns={'question': 'initial_request'})
)
hg_ambig_test_data = Dataset.from_pandas(ambig_test)
hg_ambig_test_data

In [None]:
# Load the INSCIT test CSV, keep request + binary label, and rename the label
# column to `label` so it matches the other datasets.
inscit_test = (
    pd.read_csv('/workspace/INSCIT_test_furtherprocessed.csv')[['initial_request', 'binary_label']]
    .rename(columns={'binary_label': 'label'})
)
hg_inscit_test_data = Dataset.from_pandas(inscit_test)
hg_inscit_test_data


In [None]:
# Fraction of ClariQ test rows whose label is 1
clariq_positive_rows = test_clariq_unb[test_clariq_unb['label'] == 1]
len(clariq_positive_rows) / len(test_clariq_unb)

In [None]:
# Fraction of ambig test rows whose label is 1
ambig_positive_rows = ambig_test[ambig_test['label'] == 1]
len(ambig_positive_rows) / len(ambig_test)

In [None]:
# Fraction of INSCIT test rows whose label is 1
inscit_positive_rows = inscit_test[inscit_test['label'] == 1]
len(inscit_positive_rows) / len(inscit_test)

In [None]:
# Tokenizer from a pretrained model.
# Other checkpoint names noted by the author: "distilbert-base-uncased", "roberta-base"
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Function to tokenize data
def tokenize_dataset(data):
    """Tokenize the ``initial_request`` field of ``data`` with the notebook's tokenizer.

    Args:
        data: mapping that exposes an ``'initial_request'`` entry; in this
            notebook it is called through ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts, with
        inputs truncated to at most 512 tokens.
    """
    requests = data['initial_request']
    # No padding is requested here; a DataCollatorWithPadding is created
    # further down in this cell to pad batches dynamically.
    return tokenizer(requests, truncation=True, max_length=512)

# Tokenize every split with the same function (batched for speed);
# no dev split is tokenized.
dataset_train, dataset_test, dataset_test_ambig, dataset_test_inscit = (
    split.map(tokenize_dataset, batched=True)
    for split in (hg_train_data, hg_test_data, hg_ambig_test_data, hg_inscit_test_data)
)


# Pad each batch dynamically to the longest sequence it contains
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the pretrained encoder with a 2-class classification head and move it
# to the selected device.
# Other checkpoint names noted by the author: "distilbert-base-uncased", "roberta-base"
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    return_dict=True,
)
model = model.to(device)

# Training hyper-parameters. Eval batch size and weight decay are left at
# their library defaults.
training_args = TrainingArguments(
    output_dir=f"...",
    num_train_epochs=3,
    per_device_train_batch_size=64,
    learning_rate=5e-5,
    seed=42,
)

# Trainer without an eval dataset, metric function or early stopping:
# evaluation is done explicitly in later cells.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [None]:
# Fine-tune the model on dataset_train with the arguments configured above
trainer.train()

In [None]:
# Predicted class id per ClariQ test example: argmax over axis 1 of the predictions
clariq_prediction_output = trainer.predict(dataset_test)
np.argmax(clariq_prediction_output.predictions, axis=1)

In [None]:
# Run the Trainer's built-in evaluation on the ClariQ test set
# (no compute_metrics function was passed to the Trainer)
trainer.evaluate(dataset_test)

In [17]:
from sklearn.metrics import balanced_accuracy_score
def compute_metrics(predictions, labels):
    """Score hard class predictions against reference labels.

    Args:
        predictions: predicted class ids, one per example.
        labels: reference class ids, aligned one-to-one with ``predictions``.

    Returns:
        dict with the keys ``bacc`` (balanced accuracy), ``accuracy``,
        ``precision`` and ``recall`` (both weighted averages),
        ``f1_weighted`` and ``f1_macro``. The misspelled key ``f1_marco``
        is also returned with the same value as ``f1_macro`` so existing
        readers of the result keep working.
    """
    # Metric modules are loaded through `evaluate` on every call
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")
    f1_metric = evaluate.load("f1")
    accuracy_metric = evaluate.load("accuracy")

    # Calculate accuracy and balanced accuracy
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    bacc = balanced_accuracy_score(y_true=labels, y_pred=predictions)

    # Calculate precision, recall, and F1-score
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")["recall"]
    f1_weighted = f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    f1_macro = f1_metric.compute(predictions=predictions, references=labels, average="macro")["f1"]

    return {
        'bacc': bacc,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_weighted': f1_weighted,
        'f1_macro': f1_macro,
        # Backward-compatible alias for the original misspelled key
        'f1_marco': f1_macro,
    }

In [None]:
# Metrics on the ClariQ test set: argmax class ids vs. the frame's labels
clariq_pred_classes = np.argmax(trainer.predict(dataset_test).predictions, axis=1)
compute_metrics(clariq_pred_classes, test_clariq_unb['label'])

In [None]:
# Metrics on the ambig test set: argmax class ids vs. the frame's labels
ambig_pred_classes = np.argmax(trainer.predict(dataset_test_ambig).predictions, axis=1)
compute_metrics(ambig_pred_classes, ambig_test['label'])

In [None]:
# Metrics on the INSCIT test set: argmax class ids vs. the frame's labels
inscit_pred_classes = np.argmax(trainer.predict(dataset_test_inscit).predictions, axis=1)
compute_metrics(inscit_pred_classes, inscit_test['label'])