<a href="https://colab.research.google.com/github/masoudcharkhabi/ML-from-Expert-Preferences/blob/abstractions/abstractions/colab_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Setup**

### Packages

In [None]:
from google.colab import drive
import os
from getpass import getpass
import wandb

# Quiet TensorFlow's native logging: level '3' hides INFO, WARNING and ERROR messages.
# NOTE(review): presumably this only takes effect if set before TensorFlow is first imported — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


### Hugging Face login

In [None]:
# Ask for the Hugging Face API token interactively; getpass hides the typed value,
# so the token never appears in the notebook source or its output.
huggingface_token = getpass("Please enter your Hugging Face API token: ")

# Keep the token in this process's environment under HUGGINGFACE_TOKEN.
os.environ["HUGGINGFACE_TOKEN"] = huggingface_token

# Authenticate against the Hugging Face Hub with the token just entered.
from huggingface_hub import login
login(token=huggingface_token)

Please enter your Hugging Face API token: ··········


### Make sure you have a GPU and a high-RAM runtime

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Wed Dec  4 05:05:50 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0              30W /  70W |   2735MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### Clone the GitHub repo (requires an SSH key pair named `id_colab` / `id_colab.pub` stored in `MyDrive/ssh_keys/` on Google Drive)

In [None]:
drive.mount('/content/drive', force_remount=True)

# Create symbolic links to the SSH keys in Drive
!ln -s /content/drive/MyDrive/ssh_keys/id_colab ~/.ssh/id_colab
!ln -s /content/drive/MyDrive/ssh_keys/id_colab.pub ~/.ssh/id_colab.pub

# Set the correct permissions for SSH keys
!chmod 600 ~/.ssh/id_colab
!chmod 644 ~/.ssh/id_colab.pub

# Start SSH agent and add key
!eval "$(ssh-agent -s)"
!ssh-add ~/.ssh/id_colab

# Create SSH config
ssh_config = """
Host github.com
  HostName github.com
  User git
  IdentityFile ~/.ssh/id_colab
  StrictHostKeyChecking no
"""

# Create the .ssh directory if it doesn't exist
os.makedirs(os.path.expanduser("~/.ssh"), exist_ok=True)

with open(os.path.expanduser("~/.ssh/config"), "w") as f:
    f.write(ssh_config)

# Test SSH connection
!ssh -T git@github.com

Mounted at /content/drive
ln: failed to create symbolic link '/root/.ssh/id_colab': File exists
ln: failed to create symbolic link '/root/.ssh/id_colab.pub': File exists
Agent pid 44810
Could not open a connection to your authentication agent.
Hi masoudcharkhabi! You've successfully authenticated, but GitHub does not provide shell access.


In [None]:
# SSH remote of the project repository and the branch to check out.
repo_ssh_url = "git@github.com:masoudcharkhabi/ML-from-Expert-Preferences.git"
branch_name = "colab"
# Clone only that branch; the {name} placeholders interpolate the Python variables above
# into the shell command.
!git clone -b {branch_name} --single-branch {repo_ssh_url}

Cloning into 'ML-from-Expert-Preferences'...
remote: Enumerating objects: 261, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 261 (delta 9), reused 13 (delta 6), pack-reused 233 (from 1)[K
Receiving objects: 100% (261/261), 348.16 KiB | 1.32 MiB/s, done.
Resolving deltas: 100% (126/126), done.


In [None]:
repo_name = "ML-from-Expert-Preferences"
# Enter the cloned repository so later relative paths (requirements.txt, ./data/output/...)
# resolve against it. Skip the chdir when the working directory is already the repository,
# so re-running this cell does not raise FileNotFoundError.
if os.path.basename(os.getcwd()) != repo_name:
    os.chdir(repo_name)

In [None]:
# Sanity check: show the checked-out branch, then list the repository contents
# in long format sorted by modification time, oldest first.
!git branch
!ls -ltr

* [32mcolab[m
total 20
-rw-r--r-- 1 root root 2680 Dec  4 04:17 README.md
drwxr-xr-x 3 root root 4096 Dec  4 04:17 baseline
drwxr-xr-x 4 root root 4096 Dec  4 04:17 cs329h-project
drwxr-xr-x 5 root root 4096 Dec  4 04:17 data
-rw-r--r-- 1 root root  155 Dec  4 04:17 requirements.txt


In [None]:
!pip install -r requirements.txt



# Data prep

In [None]:
# data_prep.py

from datasets import load_dataset

class DataPreparation:
    """Load a Hugging Face dataset and turn it into tokenized seq2seq examples.

    Attributes:
        dataset_path: identifier passed to ``load_dataset``.
        max_length: length every input and target sequence is truncated/padded to.
        dataset: the loaded dataset (split name -> split), ``None`` until
            ``load_data()`` has been called.
    """

    # Label value that is excluded from the loss; padding positions are set to it.
    LABEL_IGNORE_INDEX = -100

    def __init__(self, dataset_path: str, max_length: int = 512):
        """Remember where the data lives; nothing is loaded yet.

        Args:
            dataset_path: identifier passed to ``load_dataset``.
            max_length: sequence length used when tokenizing (default 512).
        """
        self.dataset_path = dataset_path
        self.max_length = max_length
        self.dataset = None

    def load_data(self):
        """Load dataset from Hugging Face, store it on the instance and return it."""
        self.dataset = load_dataset(self.dataset_path)
        return self.dataset

    def preprocess(self, example):
        """Map one raw example (``inputs``/``targets``) to ``input_text``/``target_text``."""
        return {
            "input_text": example['inputs'],
            "target_text": example['targets'],
        }

    @classmethod
    def _mask_padding(cls, label_ids, pad_token_id):
        """Replace padding ids in ``label_ids`` with ``LABEL_IGNORE_INDEX``.

        Accepts a single sequence or a batch (list of sequences). Returns the
        input unchanged when the tokenizer defines no padding id.
        """
        if pad_token_id is None:
            return label_ids

        def mask(sequence):
            return [cls.LABEL_IGNORE_INDEX if token == pad_token_id else token
                    for token in sequence]

        if label_ids and isinstance(label_ids[0], (list, tuple)):
            return [mask(sequence) for sequence in label_ids]
        return mask(label_ids)

    def tokenize_function(self, examples, tokenizer):
        """Tokenize input and target text.

        Args:
            examples: mapping with ``input_text`` and ``target_text`` entries.
            tokenizer: callable tokenizer returning a mapping with ``input_ids``.

        Returns:
            The tokenized inputs plus a ``labels`` entry holding the target ids.
            Padding positions in ``labels`` are set to -100 so that they do not
            contribute to the training loss.
        """
        model_inputs = tokenizer(
            examples["input_text"],
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
        )
        labels = tokenizer(
            examples["target_text"],
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
        )
        # Without this mask the loss is averaged over every padding position as well.
        model_inputs["labels"] = self._mask_padding(
            labels["input_ids"], getattr(tokenizer, "pad_token_id", None)
        )
        return model_inputs

    def _prepare_split(self, split_name, tokenizer):
        """Preprocess and tokenize one split of the loaded dataset."""
        split = self.dataset[split_name].map(self.preprocess)
        return split.map(lambda batch: self.tokenize_function(batch, tokenizer), batched=True)

    def prepare_dataset(self, tokenizer, return_eval: bool = False):
        """Prepare the dataset for training.

        Args:
            tokenizer: tokenizer forwarded to ``tokenize_function``.
            return_eval: when True, also prepare the ``validation`` split.

        Returns:
            The tokenized train split by default. With ``return_eval=True`` a
            ``(train_dataset, eval_dataset)`` tuple, where ``eval_dataset`` is
            ``None`` if the dataset has no ``validation`` split.

        Raises:
            ValueError: if ``load_data()`` has not been called yet.
        """
        if self.dataset is None:
            raise ValueError("Dataset is not loaded. Please call load_data() first.")

        train_dataset = self._prepare_split("train", tokenizer)
        if not return_eval:
            # The validation split is only tokenized when the caller asks for it.
            return train_dataset

        eval_dataset = (
            self._prepare_split("validation", tokenizer)
            if "validation" in self.dataset
            else None
        )
        return train_dataset, eval_dataset

    def dataset_info(self):
        """Print information about the dataset, such as the size"""
        if self.dataset:
            for split in self.dataset.keys():
                print(f"Split: {split}, Number of examples: {len(self.dataset[split])}")
        else:
            print("Dataset is not loaded. Please call load_data() first.")

In [None]:
from transformers import AutoTokenizer
# from data_prep import DataPreparation

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")  # Replace "t5-small" with the appropriate model name

# Create instance of DataPreparation (the constructor only stores the path; nothing is loaded yet)
data_preparation = DataPreparation(dataset_path="ai2-adapt-dev/flan_v2_converted")

# Load the dataset
dataset = data_preparation.load_data()

# Preprocess and tokenize; prepare_dataset() returns only the train split
train_dataset = data_preparation.prepare_dataset(tokenizer=tokenizer)

# Now train_dataset is ready for training
# Print the number of examples in every split of the loaded dataset
data_preparation.dataset_info()

Repo card metadata block was not found. Setting CardData to empty.


Split: train, Number of examples: 89982


# Fine-tune

### This will require a Weights and Biases API key for logging

In [None]:
# train.py

import os
import wandb
import datetime
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer

class ModelTrainer:
    """Fine-tune a pretrained seq2seq model with the Hugging Face ``Trainer``.

    Creating an instance loads the model and opens a Weights & Biases run in the
    "active-llm" project; ``train_model`` closes that run again.

    Attributes:
        model_name: name or path passed to ``AutoModelForSeq2SeqLM.from_pretrained``.
        model: the loaded model.
        trainer: the ``Trainer`` built by ``setup_training`` (``None`` before that).
        experiment_id: creation timestamp (``YYYYMMDD_HHMMSS``) identifying the run.
        output_dir: directory the fine-tuned model is saved to.
    """

    def __init__(self, model_name: str):
        """Load ``model_name`` and start the W&B run for this experiment."""
        self.model_name = model_name
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.trainer = None
        self.experiment_id = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        self.output_dir = f"./models/fine_tuned_model_{self.experiment_id}"
        wandb.init(project="active-llm", name=f"fine_tune_{self.experiment_id}", resume="allow")

    def setup_training(
        self,
        train_dataset,
        eval_dataset=None,
        tokenizer=None,
        num_train_epochs: float = 0.0005,
        learning_rate: float = 2e-5,
        batch_size: int = 2,
        weight_decay: float = 0.01,
    ):
        """Set up training arguments and Trainer.

        Args:
            train_dataset: tokenized dataset to train on.
            eval_dataset: optional tokenized dataset; when given, evaluation
                runs once per epoch, otherwise evaluation is disabled.
            tokenizer: tokenizer handed to the ``Trainer``.
            num_train_epochs: epochs to train for. The default (0.0005) is a
                tiny smoke-test value; pass a realistic value for actual
                fine-tuning.
            learning_rate: optimizer learning rate.
            batch_size: per-device batch size for both training and evaluation.
            weight_decay: weight decay passed to the optimizer.
        """
        # Ensure the output directory exists
        os.makedirs(self.output_dir, exist_ok=True)

        training_args = TrainingArguments(
            output_dir=self.output_dir,
            eval_strategy="epoch" if eval_dataset is not None else "no",
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            report_to=["wandb"],  # Log training statistics to Weights & Biases
            run_name="model_training"
        )

        # NOTE(review): recent transformers releases appear to warn that `tokenizer=` is
        # deprecated in favour of `processing_class=` — switch once the minimum
        # supported transformers version allows it.
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
        )

    def train_model(self, save_model: bool = True):
        """Train the model and close the W&B run.

        Args:
            save_model: when True, save the trained model to ``output_dir``.

        Raises:
            ValueError: if ``setup_training`` has not been called.
        """
        if self.trainer is None:
            raise ValueError("Trainer is not initialized. Please call setup_training first.")

        try:
            self.trainer.train()
            if save_model:
                # Ensure the output directory exists before saving
                os.makedirs(self.output_dir, exist_ok=True)
                self.trainer.save_model(self.output_dir)
                print(f"Model saved to: {self.output_dir}")
        except BaseException:
            # Close the run as failed (also on interrupt) instead of leaving it open,
            # then let the original error propagate.
            wandb.finish(exit_code=1)
            raise
        wandb.finish()


In [None]:
# Import necessary classes
from transformers import AutoTokenizer
# from data_prep import DataPreparation
# from train import ModelTrainer

# Step 1: Load tokenizer
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 2: Load and prepare the dataset
# (repeats the loading and tokenization already done in the "Data prep" section)
data_preparation = DataPreparation(dataset_path="ai2-adapt-dev/flan_v2_converted")
dataset = data_preparation.load_data()
train_dataset = data_preparation.prepare_dataset(tokenizer=tokenizer)

# Step 3: Initialize ModelTrainer and setup training
# No eval_dataset is passed, so setup_training disables evaluation (eval_strategy="no").
trainer = ModelTrainer(model_name=model_name)
trainer.setup_training(train_dataset=train_dataset, tokenizer=tokenizer)

# Step 4: Train the model (saves it to trainer.output_dir and finishes the W&B run)
trainer.train_model()


Repo card metadata block was not found. Setting CardData to empty.


  self.trainer = Trainer(


Step,Training Loss


Model saved to: ./models/fine_tuned_model_20241204_050641


0,1
train/epoch,▁
train/global_step,▁

0,1
total_flos,6225722867712.0
train/epoch,0.00051
train/global_step,23.0
train_loss,17.81372
train_runtime,5.6087
train_samples_per_second,8.022
train_steps_per_second,4.101


# Serve

In [None]:
# serve.py

import torch

class ModelServer:
    """Run text-to-text inference with a pretrained seq2seq model.

    Attributes:
        model_name: name or path passed to ``AutoModelForSeq2SeqLM.from_pretrained``.
        model: the loaded model.
        tokenizer: tokenizer set through ``set_tokenizer`` (``None`` until then).
    """

    def __init__(self, model_name: str):
        """Load the model; the tokenizer has to be supplied separately."""
        self.model_name = model_name
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.tokenizer = None

    def set_tokenizer(self, tokenizer):
        """Set the tokenizer for the model"""
        self.tokenizer = tokenizer

    def run_inference(self, input_text: str):
        """Generate output for a given input text.

        Args:
            input_text: the prompt to feed to the model.

        Returns:
            The decoded first generated sequence, without special tokens.

        Raises:
            ValueError: if no tokenizer has been set.
        """
        if self.tokenizer is None:
            raise ValueError("Tokenizer not set. Please use set_tokenizer method.")

        inputs = self.tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
        outputs = self.model.generate(**inputs)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def store_output(self, input_text: str, output_path: str):
        """Generate output for ``input_text`` and write it to ``output_path``.

        Missing parent directories are created, and the file is written as UTF-8
        so non-ASCII output does not depend on the platform's default encoding.

        Args:
            input_text: the prompt to feed to the model.
            output_path: file the generated text is written to (overwritten).

        Returns:
            The generated text, so callers do not need a second inference pass.
        """
        output = self.run_inference(input_text)
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as file:
            file.write(output)
        return output



In [None]:
from transformers import AutoTokenizer
# from serve import ModelServer

# Step 1: pick the checkpoint and load its tokenizer
model_name = "t5-small"  # TODO: Replace with fine-tuned model later
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Steps 2-3: build the server for the same checkpoint and hand it the tokenizer
model_server = ModelServer(model_name=model_name)
model_server.set_tokenizer(tokenizer)

# Step 4: generate a completion for a sample prompt
input_text = "Translate the following sentence to French: 'The weather is nice today.'"
output_text = model_server.run_inference(input_text)

# Step 5: show what the model produced
print("Generated Output:", output_text)

# Optional: persist the generation under ./data/output/, keyed by experiment id
experiment_id = "a1005"
output_path = f"./data/output/{experiment_id}_output.txt"
model_server.store_output(input_text, output_path)
print(f"Output stored in: {output_path}")




Generated Output: « Le temps est agréable aujourd'hui ».
Output stored in: ./data/output/a1005_output.txt


# Eval

In [None]:
# eval.py

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoModelForSeq2SeqLM
from evaluate import load
import torch
import wandb
import numpy as np
import datetime
from random import sample

class ModelEvaluator:
    """Score a seq2seq model on text pairs and log the metrics to Weights & Biases.

    Creating an instance loads the model and the BLEU/ROUGE metrics and opens a
    W&B run in the "active-llm" project; ``evaluate`` logs to that run and closes it.

    Attributes:
        model_name: name or path passed to ``AutoModelForSeq2SeqLM.from_pretrained``.
        model: the loaded model.
        tokenizer: tokenizer used for encoding inputs/targets and decoding outputs.
        bleu_metric, rouge_metric: metrics loaded through ``evaluate.load``.
        experiment_id: creation timestamp (``YYYYMMDD_HHMMSS``) identifying the run.
    """

    def __init__(self, model_name: str, tokenizer):
        """Load the model and metrics, and start the W&B run for this evaluation."""
        self.model_name = model_name
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.tokenizer = tokenizer
        self.bleu_metric = load("bleu")
        self.rouge_metric = load("rouge")
        self.experiment_id = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        wandb.init(project="active-llm", name=f"eval_{self.experiment_id}", resume="allow")

    @staticmethod
    def _select_examples(dataset, sample_size, seed=None):
        """Return at most ``sample_size`` examples of ``dataset`` as a list of rows.

        Rows are fetched one by one with integer indexing, so the result is a
        list of per-example mappings whether ``dataset`` is a plain list or a
        dataset object whose slices return columns. Only the selected rows are
        read; the dataset is never materialised as a whole.

        Args:
            dataset: sized, integer-indexable collection of examples.
            sample_size: maximum number of examples; ``None`` keeps all of them.
            seed: optional seed for a reproducible sample. With ``None`` the
                module-level random state is used.
        """
        import random

        total = len(dataset)
        if sample_size is not None and total > sample_size:
            rng = random if seed is None else random.Random(seed)
            indices = rng.sample(range(total), sample_size)
        else:
            indices = range(total)
        return [dataset[index] for index in indices]

    def evaluate(self, dataset, sample_size=100, batch_size=8, max_length=100, seed=None):
        """Evaluate model performance on a subset of the dataset using batches.

        Args:
            dataset: examples with ``input_text`` and ``target_text`` entries.
            sample_size: number of randomly drawn examples to evaluate.
            batch_size: examples per forward/generate call.
            max_length: token limit for inputs, targets and generated text.
            seed: optional seed that makes the sampled subset reproducible.

        Returns:
            Dict with ``accuracy``, ``f1_score``, ``precision``, ``recall``
            (exact string match between prediction and reference), ``bleu``,
            ``rouge`` and ``perplexity``.

        Raises:
            ValueError: if ``dataset`` is empty.
        """
        # Sample a subset of the dataset to speed up evaluation
        examples = self._select_examples(dataset, sample_size, seed)
        if not examples:
            raise ValueError("Cannot evaluate on an empty dataset.")

        pad_token_id = getattr(self.tokenizer, "pad_token_id", None)
        predictions = []
        references = []
        total_loss = 0.0
        num_tokens = 0

        for start in range(0, len(examples), batch_size):
            batch = examples[start:start + batch_size]
            input_texts = [example["input_text"] for example in batch]
            target_texts = [example["target_text"] for example in batch]

            inputs = self.tokenizer(input_texts, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length)
            labels = self.tokenizer(target_texts, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length).input_ids
            if pad_token_id is not None:
                # Padding must not count towards the loss: -100 is ignored by it.
                labels = labels.masked_fill(labels == pad_token_id, -100)
            batch_tokens = int((labels != -100).sum().item())

            with torch.no_grad():
                outputs = self.model.generate(**inputs, max_length=max_length)
                predictions.extend(
                    self.tokenizer.decode(output, skip_special_tokens=True) for output in outputs
                )
                references.extend(target_texts)

                # The model returns the mean loss over the scored target tokens;
                # weight it by their count so the average is a true per-token mean.
                if batch_tokens > 0:
                    batch_loss = self.model(**inputs, labels=labels).loss.item()
                    total_loss += batch_loss * batch_tokens
                    num_tokens += batch_tokens

        # Calculate traditional metrics (each distinct string is treated as a class)
        accuracy = accuracy_score(references, predictions)
        f1 = f1_score(references, predictions, average='weighted', zero_division=0)
        precision = precision_score(references, predictions, average='weighted', zero_division=0)
        recall = recall_score(references, predictions, average='weighted', zero_division=0)

        # Calculate LLM and NLP specific metrics
        bleu_score = self.bleu_metric.compute(predictions=predictions, references=[[ref] for ref in references])
        rouge_score = self.rouge_metric.compute(predictions=predictions, references=references)

        # Calculate perplexity = exp(mean per-token loss)
        if num_tokens > 0:
            perplexity = torch.exp(torch.tensor(total_loss / num_tokens)).item()
        else:
            perplexity = float("nan")

        results = {
            "accuracy": accuracy,
            "f1_score": f1,
            "precision": precision,
            "recall": recall,
            "bleu": bleu_score,
            "rouge": rouge_score,
            "perplexity": perplexity,
        }

        # Log the results to W&B
        wandb.log(results)
        wandb.finish()

        return results


In [None]:
from transformers import AutoTokenizer
# from data_prep import DataPreparation
# from eval import ModelEvaluator

# Step 1: Load the tokenizer
model_name = "t5-small"

# TODO: Replace with the fine-tuned model path (including the correct timestamp) later.
# ModelTrainer saves to "./models/fine_tuned_model_<YYYYMMDD_HHMMSS>", for example:
# model_name = "./models/fine_tuned_model_20231124_123456"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 2: Create instance of DataPreparation and load the dataset
data_preparation = DataPreparation(dataset_path="ai2-adapt-dev/flan_v2_converted")
dataset = data_preparation.load_data()

# Step 3: Prepare evaluation dataset
# NOTE: prepare_dataset() returns the *train* split, so the metrics below are measured
# on training data, not on held-out examples.
eval_dataset = data_preparation.prepare_dataset(tokenizer=tokenizer)

# Step 4: Initialize ModelEvaluator
evaluator = ModelEvaluator(model_name=model_name, tokenizer=tokenizer)

# Step 5: Run evaluation (default sample_size=100, batch_size=8)
evaluation_results = evaluator.evaluate(eval_dataset)

# Step 6: Print the evaluation results
print("Evaluation Results:", evaluation_results)


Repo card metadata block was not found. Setting CardData to empty.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
accuracy,▁
f1_score,▁
perplexity,▁
precision,▁
recall,▁

0,1
accuracy,0.0
f1_score,0.0
perplexity,76779512.0
precision,0.0
recall,0.0


Evaluation Results: {'accuracy': 0.0, 'f1_score': 0.0, 'precision': 0.0, 'recall': 0.0, 'bleu': {'bleu': 0.03550836856172761, 'precisions': [0.11895388076490439, 0.03905120046282904, 0.023435182438445563, 0.014602981442044418], 'brevity_penalty': 1.0, 'length_ratio': 1.2912127814088599, 'translation_length': 3556, 'reference_length': 2754}, 'rouge': {'rouge1': 0.10575423600418363, 'rouge2': 0.03249340922976353, 'rougeL': 0.0822570979963348, 'rougeLsum': 0.08913628656954073}, 'perplexity': 76779512.0}
