In [1]:
from finetune_utils import create_mistral_finetune_data
from baseline import transform_test_data
from api_utils import run_inference
import json
import together
from eval import evaluate_clusters_iou, count_matches, evaluate_clusters_entropy
import numpy as np

In [2]:
!pip install -q torch
!pip install -q git+https://github.com/huggingface/transformers #huggingface transformers for downloading models weights
!pip install -q datasets #huggingface datasets to download and manipulate datasets
!pip install -q peft #Parameter efficient finetuning - for qLora Finetuning
!pip install -q bitsandbytes #For Model weights quantisation
!pip install -q trl #Transformer Reinforcement Learning - For Finetuning using Supervised Fine-tuning
!pip install -q wandb -U #Used to monitor the model score during training

In [25]:
!pip install accelerate

In [14]:
!pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.2-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.10 (from ipywidgets)
  Downloading widgetsnbextension-4.0.10-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.10 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.10-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.2-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.4/139.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jupyterlab_widgets-3.0.10-py3-none-any.whl (215 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.0/215.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading widgetsnbextension-4.0.10-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: widgetsnbextension, jupyterlab-widgets, ipy

In [3]:
import json
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## ONLY RUN THESE DATA UPLOAD CELLS ONCE!

In [19]:
with open('data/splits/connections_train.json', 'r') as file:
    train_data = json.load(file)
with open('data/splits/connections_val.json', 'r') as file:
    val_data = json.load(file)

In [None]:
train_df = create_mistral_finetune_data(train_data, "train")
val_df = create_mistral_finetune_data(val_data, "val")
print(train_df)

['<s>[INST] Given 16 items, find groups of four items that share something in common.\n\nCategory Examples\nFish: BASS, FLOUNDER, SALMON, TROUT\nFire ___: ANT, DRILL, ISLAND, OPAL\nCategories will always be more specific than “5-letter-words,” “Names” or “Verbs.”\nEach puzzle has exactly one solution. Watch out for words that seem to belong to multiple categories!\n\nNow answer for these 16 words. Follow these restrictions for the output:\n1. Respond in 4 lines, with a group of 4 words on each line. ONLY include the 16 words given. There should be NO OTHER WORDS.\n2. DO NOT include the descriptions. DO NOT have any "Descriptions:" text.\n3. DO NOT include any preceding text, like "Answers:", or line numbers, like "1.".\n\nNECK, PENNY, AGAIN, DIME, CLOVER, NICKEL, LEAD, SO, MOON, HALF, RAINBOW, TIN, QUARTER, HORSESHOE, ZINC, IRON [/INST]\nDIME, NICKEL, PENNY, QUARTER\nIRON, LEAD, TIN, ZINC\nCLOVER, HORSESHOE, MOON, RAINBOW\nAGAIN, HALF, NECK, SO </s>', '<s>[INST] Given 16 items, find gr

### Fine-tuning

In [20]:
dataset = load_dataset('json', data_files={'train': "./data/splits/connections_train.jsonl",
                                              'val': "./data/splits/connections_val.jsonl"})

In [22]:
new_model = "connections-mistral" #set the name of the new model

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./finetuned-mistral"

# Number of training epochs
num_train_epochs = 10

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 25

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [27]:
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# Load the base model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0}
)

base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

# Load MitsralAi tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

RuntimeError: No GPU found. A GPU is needed for quantization.

In [None]:
# Set LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    save_strategy='epoch',
    max_grad_norm=max_grad_norm,
    # max_steps=100, # the total number of training steps to perform
    evaluation_strategy="epoch",
    do_eval=True,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="wandb",
    push_to_hub=True
)

# Initialize the SFTTrainer for fine-tuning
trainer = SFTTrainer(
    model=model_name,
    train_dataset=dataset['train'],
    eval_dataset=dataset['val'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,  # You can specify the maximum sequence length here
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

In [None]:
if torch.cuda.is_available():
    torch.cuda.empty_cache()
torch.cuda.empty_cache()

In [None]:
wandb.login(key="191ee4748b3456523ee47c1c1ec1e3f476ade698")

In [None]:
# Start the training process
trainer.train()

# Save the fine-tuned model
trainer.model.save_pretrained(new_model)

In [None]:
trainer.push_to_hub()
wandb.finish()
model.config.use_cache = True

## Inference/Eval

In [21]:
with open('data/splits/connections_test.json', 'r') as file:
    data = json.load(file)
test_data = transform_test_data(data, False)
#read from json file

with open('data/predictions/finetuned_results.json', 'r') as file:
    predicted_answers = json.load(file)

In [22]:

ious = []
for i in range(len(test_data)):
    pred_clusters = predicted_answers[i]
    true_clusters = test_data[i]['solutions']
    print(pred_clusters)
    print(true_clusters)
    iou = evaluate_clusters_iou(pred_clusters=pred_clusters, true_clusters=true_clusters)
    ious.append(iou)
avg_iou = sum(ious) / len(ious)
print("average IOU loss in test set: ", avg_iou)

[['DRESS', 'HARVEST', 'SAILOR', 'STYLE'], ['MANNER', 'SAME', 'SAME', 'SAME'], ['BLUE', 'LIKEWISE', 'LOOK', 'SECOND'], ['DITTO', 'SIGHT', 'SMELL', 'TOUCH']]
[['SIGHT', 'SMELL', 'TASTE', 'TOUCH'], ['DRESS', 'LOOK', 'MANNER', 'STYLE'], ['DITTO', 'LIKEWISE', 'SAME', 'SECOND'], ['BLUE', 'HARVEST', 'NEW', 'SAILOR']]
[['FUZZ', 'HAIL', 'STATIC', 'NOISE'], ['BUTT', 'INTENSE', 'RAM', 'VIOLENT'], ['EXTRA', 'OVER', 'PROUD', 'VIRGIN'], ['BLOODY', 'DEEP', 'EXTREME', 'SNOW']]
[['DEEP', 'EXTREME', 'FIERCE', 'INTENSE'], ['BUMP', 'BUTT', 'KNOCK', 'RAM'], ['FUZZ', 'NOISE', 'SNOW', 'STATIC'], ['BLOODY', 'HAIL', 'PROUD', 'VIRGIN']]
[['BALL', 'BUMPER', 'HAZE', 'TRAIL'], ['FLIPPER', 'ICE', 'TRACK', 'TAIL'], ['CLOUD', 'CLOSE', 'FOG', 'MIST'], ['BUMP', 'CONFRONT', 'COMET', 'EMPIRE']]
[['CLOUD', 'FOG', 'HAZE', 'MIST'], ['SHADOW', 'TAIL', 'TRACK', 'TRAIL'], ['BALL', 'BUMPER', 'FLIPPER', 'PLUNGER'], ['FIN', 'ICE', 'IRE', 'NETHER']]
[['DINKY', 'LITTLE', 'MINUTE', 'PINKY'], ['APPENDIX', 'INDEX', 'PREFACE', 'RING'],

In [25]:
matches = np.zeros((len(test_data),4))
entropy = np.zeros(len(test_data))

for i in range(len(test_data)):
    pred_clusters = predicted_answers[i]
    true_clusters = test_data[i]['solutions']
    print(pred_clusters)
    print(true_clusters)
    matches[i] = count_matches(pred_clusters, true_clusters)
    entropy[i] = evaluate_clusters_entropy(pred_clusters, true_clusters)

average_match_per_difficulty = np.mean(matches, axis=0)
print("average matches per difficulty: ", average_match_per_difficulty)
matches_per_puzzle = np.sum(matches, axis=1)
average_matches = np.mean(matches_per_puzzle)
print("average matches per puzzle: ", average_matches)
perfect_matches = np.sum(matches_per_puzzle == 4) 
print("number of perfectly matched puzzle: ", perfect_matches, "out of ", len(test_data), "; percentage: ", perfect_matches/len(test_data) * 100, "%")
average_entropy = np.mean(entropy)
print("average cluster entropy: ", average_entropy)

[['DRESS', 'HARVEST', 'SAILOR', 'STYLE'], ['MANNER', 'SAME', 'SAME', 'SAME'], ['BLUE', 'LIKEWISE', 'LOOK', 'SECOND'], ['DITTO', 'SIGHT', 'SMELL', 'TOUCH']]
[['SIGHT', 'SMELL', 'TASTE', 'TOUCH'], ['DRESS', 'LOOK', 'MANNER', 'STYLE'], ['DITTO', 'LIKEWISE', 'SAME', 'SECOND'], ['BLUE', 'HARVEST', 'NEW', 'SAILOR']]
[['FUZZ', 'HAIL', 'STATIC', 'NOISE'], ['BUTT', 'INTENSE', 'RAM', 'VIOLENT'], ['EXTRA', 'OVER', 'PROUD', 'VIRGIN'], ['BLOODY', 'DEEP', 'EXTREME', 'SNOW']]
[['DEEP', 'EXTREME', 'FIERCE', 'INTENSE'], ['BUMP', 'BUTT', 'KNOCK', 'RAM'], ['FUZZ', 'NOISE', 'SNOW', 'STATIC'], ['BLOODY', 'HAIL', 'PROUD', 'VIRGIN']]
[['BALL', 'BUMPER', 'HAZE', 'TRAIL'], ['FLIPPER', 'ICE', 'TRACK', 'TAIL'], ['CLOUD', 'CLOSE', 'FOG', 'MIST'], ['BUMP', 'CONFRONT', 'COMET', 'EMPIRE']]
[['CLOUD', 'FOG', 'HAZE', 'MIST'], ['SHADOW', 'TAIL', 'TRACK', 'TRAIL'], ['BALL', 'BUMPER', 'FLIPPER', 'PLUNGER'], ['FIN', 'ICE', 'IRE', 'NETHER']]
[['DINKY', 'LITTLE', 'MINUTE', 'PINKY'], ['APPENDIX', 'INDEX', 'PREFACE', 'RING'],