In [None]:
import os
from together import Together
import random
from pathlib import Path
from datasets import load_from_disk, Dataset
from prompts import MODIFIED_INSTRUCTION_GENERATION_PROMPT, JUDGEMENT_ANNOTATION_PROMPT
import logging
import json
from groq import Groq
from typing import Any, Dict
import time
from sklearn.model_selection import train_test_split
from self_taught_evaluator import (
    load_data,
    create_eval_set,
    evaluate_model_as_judge,
    generate_response,
    generate_bad_response,
    generate_judgements,
    annotate_data,
    launch_finetuning_job,
)

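# Re-import launch_finetuning_job (intended to refresh it after edits). NOTE: repeating a
# `from ... import` does not reload an already-imported module; use importlib.reload() on the
# module (or %autoreload) to pick up source changes.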
from self_taught_evaluator import launch_finetuning_job


# Logging setup: INFO level; each record carries timestamp, logger name and severity.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Notebook-wide constants.
INFERENCE_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # used for the iteration-1 annotation pass
NUM_ITERATIONS = 4
NUM_JUDGEMENTS = 5

In [None]:
# --- Credentials ---
# Prefer the key stored in keys.json. If the file (or its TOGETHER_API_KEY entry)
# is missing, fall back to a TOGETHER_API_KEY already present in the environment
# instead of failing with FileNotFoundError/KeyError before the check below runs.
keys = {}
keys_path = Path("keys.json")
if keys_path.exists():
    with keys_path.open("r") as f:
        keys = json.load(f)
    if keys.get("TOGETHER_API_KEY"):
        os.environ["TOGETHER_API_KEY"] = keys["TOGETHER_API_KEY"]

api_key = os.environ.get("TOGETHER_API_KEY")

if not api_key:
    raise ValueError("Please set all required environment variables: TOGETHER_API_KEY")

# Initialize clients
together_client = Together(api_key=api_key)

# Load dataset (a dataset directory previously saved to disk)
data_path = "processed_data/"
full_dataset = load_from_disk(data_path)

# Create the directory that later cells write annotation/evaluation files into
output_dir = Path("finetuning_data")
output_dir.mkdir(parents=True, exist_ok=True)

print(f"Dataset loaded with {len(full_dataset)} examples")

In [None]:
# --- Fine-tuning hyperparameters ---
base_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"
suffix = "self_taught_eval_ft_v3_sanity"
n_epochs, n_checkpoints = 3, 1
lora = True

# --- Dataset sizing ---
num_samples = 200
eval_set_size = 50

# Hold out an evaluation split: `eval_set_size` rows, capped at 20% of the dataset.
indices = list(range(len(full_dataset)))
n_eval = min(eval_set_size, int(len(full_dataset) * 0.2))
train_indices, eval_indices = train_test_split(indices, test_size=n_eval, random_state=42)

# Keep at most `num_samples` training indices (slicing clamps to the available length).
train_indices = train_indices[:num_samples]
eval_dataset = full_dataset.select(eval_indices)
dataset = full_dataset.select(train_indices)

print(f"Training on {len(dataset)} samples")
print(f"Evaluating on {len(eval_dataset)} samples")

In [None]:
# Iteration 1 — annotate the training split using the stock inference model.
iteration = 1
current_model = base_model
model_for_annotation, current_client = INFERENCE_MODEL, together_client
output_filename = output_dir / f"annotations_iter{iteration}.jsonl"

print(f"Starting iteration {iteration} - Annotation")
print(f"Using model: {model_for_annotation}")

# `output_filename` is handed to annotate_data (presumably the annotation output file — confirm).
annotated_data = annotate_data(
    dataset,
    model_for_annotation,
    current_client,
    iteration,
    output_filename,
)
print(f"Annotated {len(annotated_data)} data points")

In [None]:
# Iteration 1 — launch a fine-tuning job on this iteration's annotation file.
suffix_iter = f"{suffix}_iter{iteration}"
print(f"Starting iteration {iteration} - Fine-tuning")
print(f"Suffix: {suffix_iter}")

# Launch fine-tuning.
# Start from `base_model` explicitly (as the iteration 2-4 launch cells do) rather
# than `current_model`: the two are equal on a fresh top-to-bottom run, but
# `current_model` is reassigned by later cells, so re-running this cell out of
# order would otherwise fine-tune from an already fine-tuned model.
ft_resp = launch_finetuning_job(
    output_filename,
    base_model,
    suffix_iter,
    n_epochs,
    n_checkpoints,
    lora,
)

print(f"Fine-tuning job launched: {ft_resp.id}")

In [None]:
# Poll the iteration-1 fine-tuning job every 30 seconds until it reaches a terminal status.
job_to_monitor_id = ft_resp.id
new_fine_tuned_model_id = None  # only set when the job reports 'completed'

print("Monitoring fine-tuning job...")
while True:
    try:
        status_response = together_client.fine_tuning.retrieve(job_to_monitor_id)
        job_status = status_response.status
        print(f"Job {job_to_monitor_id}: Status = {job_status}")

        if job_status == 'completed':
            # The model name comes from the launch response, not the status response.
            new_fine_tuned_model_id = ft_resp.output_name
            print(f"Fine-tuning completed! New model: {new_fine_tuned_model_id}")
            break
        if job_status in ('error', 'failed', 'cancelled'):
            print(f"Fine-tuning failed with status: {job_status}")
            break
    except Exception as e:
        # Errors while polling are reported and the loop keeps retrying.
        print(f"Error checking status: {e}")

    time.sleep(30)

In [None]:
# Adopt the newly fine-tuned model as `current_model`, but only if the monitoring
# loop recorded one (the id stays None when the job did not complete).
if new_fine_tuned_model_id:
    current_model = new_fine_tuned_model_id
    print(f"Updated current model to: {current_model}")

In [None]:
# Manually pin the iteration-1 model id; this overwrites whatever `current_model`
# held after the preceding cells.
# NOTE(review): hard-coded id, presumably copied from a finished job — confirm before re-running.
current_model = "nikilrav/Mixtral-8x7B-Instruct-v0.1-self_taught_eval_ft_v3_sanity_iter1-223a1d08-a5fdb2b8" # dedicated endpoint

In [None]:
# Evaluate the iteration-1 model on the held-out eval split via evaluate_model_as_judge.
print("Evaluating model...")
iter1_eval_path = output_dir / f"{suffix}_iter1_model_eval.jsonl"
first_iter_model_results = evaluate_model_as_judge(
    current_model, together_client, eval_dataset, iter1_eval_path
)
print(f"First iteration model results: {first_iter_model_results}")

In [None]:
# Iteration 2 — re-annotate the training split, this time with the current (fine-tuned) model.
iteration = 2
model_for_annotation, current_client = current_model, together_client
output_filename = output_dir / f"annotations_iter{iteration}.jsonl"

print(f"Starting iteration {iteration} - Annotation")
print(f"Using model: {model_for_annotation}")

# Run the annotation pass for this iteration.
annotated_data = annotate_data(
    dataset,
    model_for_annotation,
    current_client,
    iteration,
    output_filename,
)
print(f"Annotated {len(annotated_data)} data points")

In [None]:
# Iteration 2 — launch fine-tuning on this iteration's annotation file.
print(f"Starting iteration {iteration} - Fine-tuning")
suffix_iter = f"{suffix}_iter{iteration}"
print(f"Suffix: {suffix_iter}")

# Submit the job; training starts from the base model, not the previous fine-tune.
ft_resp = launch_finetuning_job(
    output_filename, base_model, suffix_iter, n_epochs, n_checkpoints, lora
)

print(f"Fine-tuning job launched: {ft_resp.id}")

In [None]:
# Poll the iteration-2 fine-tuning job every 30 seconds until it reaches a terminal status.
job_to_monitor_id = ft_resp.id
new_fine_tuned_model_id = None  # only set when the job reports 'completed'

print("Monitoring fine-tuning job...")
while True:
    try:
        status_response = together_client.fine_tuning.retrieve(job_to_monitor_id)
        job_status = status_response.status
        print(f"Job {job_to_monitor_id}: Status = {job_status}")

        if job_status == 'completed':
            # The model name comes from the launch response, not the status response.
            new_fine_tuned_model_id = ft_resp.output_name
            print(f"Fine-tuning completed! New model: {new_fine_tuned_model_id}")
            break
        if job_status in ('error', 'failed', 'cancelled'):
            print(f"Fine-tuning failed with status: {job_status}")
            break
    except Exception as e:
        # Errors while polling are reported and the loop keeps retrying.
        print(f"Error checking status: {e}")

    time.sleep(30)

# Promote the new model only if the job completed.
if new_fine_tuned_model_id:
    current_model = new_fine_tuned_model_id
    print(f"Updated current model to: {current_model}")

In [None]:
# Manually pin the iteration-2 model id; this overwrites whatever `current_model`
# held after the preceding cells.
# NOTE(review): hard-coded id, presumably copied from a finished job — confirm before re-running.
current_model = "nikilrav/Mixtral-8x7B-Instruct-v0.1-self_taught_eval_ft_v3_sanity_iter2-978d1376-ae723cfc"

In [None]:
# Evaluate the iteration-2 model on the held-out eval split via evaluate_model_as_judge.
print("Evaluating model...")
iter2_eval_path = output_dir / f"{suffix}_iter2_model_eval.jsonl"
second_iter_model_results = evaluate_model_as_judge(
    current_model, together_client, eval_dataset, iter2_eval_path
)
print(f"Second iteration model results: {second_iter_model_results}")

In [None]:
# Iteration 3 — re-annotate the training split with the current (fine-tuned) model.
iteration = 3
model_for_annotation, current_client = current_model, together_client
output_filename = output_dir / f"annotations_iter{iteration}.jsonl"

print(f"Starting iteration {iteration} - Annotation")
print(f"Using model: {model_for_annotation}")

# Run the annotation pass for this iteration.
annotated_data = annotate_data(
    dataset,
    model_for_annotation,
    current_client,
    iteration,
    output_filename,
)
print(f"Annotated {len(annotated_data)} data points")


In [None]:
# Iteration 3 — launch fine-tuning on this iteration's annotation file.
print(f"Starting iteration {iteration} - Fine-tuning")
suffix_iter = f"{suffix}_iter{iteration}"
print(f"Suffix: {suffix_iter}")

# Submit the job; training starts from the base model, not the previous fine-tune.
ft_resp = launch_finetuning_job(
    output_filename, base_model, suffix_iter, n_epochs, n_checkpoints, lora
)

print(f"Fine-tuning job launched: {ft_resp.id}")


In [None]:
# Poll the iteration-3 fine-tuning job every 30 seconds until it reaches a terminal status.
job_to_monitor_id = ft_resp.id
new_fine_tuned_model_id = None  # only set when the job reports 'completed'

print("Monitoring fine-tuning job...")
while True:
    try:
        status_response = together_client.fine_tuning.retrieve(job_to_monitor_id)
        job_status = status_response.status
        print(f"Job {job_to_monitor_id}: Status = {job_status}")

        if job_status == 'completed':
            # The model name comes from the launch response, not the status response.
            new_fine_tuned_model_id = ft_resp.output_name
            print(f"Fine-tuning completed! New model: {new_fine_tuned_model_id}")
            break
        if job_status in ('error', 'failed', 'cancelled'):
            print(f"Fine-tuning failed with status: {job_status}")
            break
    except Exception as e:
        # Errors while polling are reported and the loop keeps retrying.
        print(f"Error checking status: {e}")

    time.sleep(30)

# Promote the new model only if the job completed.
if new_fine_tuned_model_id:
    current_model = new_fine_tuned_model_id
    print(f"Updated current model to: {current_model}")


In [None]:
# Manually pin the iteration-3 model id; this overwrites whatever `current_model`
# held after the preceding cells.
# NOTE(review): hard-coded id, presumably copied from a finished job — confirm before re-running.
current_model = "nikilrav/Mixtral-8x7B-Instruct-v0.1-self_taught_eval_ft_v3_sanity_iter3-c7c58d52-e448820e"

In [None]:
# Evaluate the iteration-3 model on the held-out eval split via evaluate_model_as_judge.
print("Evaluating third iteration model...")
iter3_eval_path = output_dir / f"{suffix}_iter3_model_eval.jsonl"
third_iter_model_results = evaluate_model_as_judge(
    current_model, together_client, eval_dataset, iter3_eval_path
)
print(f"Third iteration model results: {third_iter_model_results}")

In [None]:
# Iteration 4 — re-annotate the training split with the current (fine-tuned) model.
iteration = 4
model_for_annotation, current_client = current_model, together_client
output_filename = output_dir / f"annotations_iter{iteration}.jsonl"

print(f"Starting iteration {iteration} - Annotation")
print(f"Using model: {model_for_annotation}")

# Run the annotation pass for this iteration.
annotated_data = annotate_data(
    dataset,
    model_for_annotation,
    current_client,
    iteration,
    output_filename,
)
print(f"Annotated {len(annotated_data)} data points")

In [None]:
# Iteration 4 — launch fine-tuning on this iteration's annotation file.
print(f"Starting iteration {iteration} - Fine-tuning")
suffix_iter = f"{suffix}_iter{iteration}"
print(f"Suffix: {suffix_iter}")

# Submit the job; training starts from the base model, not the previous fine-tune.
ft_resp = launch_finetuning_job(
    output_filename, base_model, suffix_iter, n_epochs, n_checkpoints, lora
)

print(f"Fine-tuning job launched: {ft_resp.id}")

In [None]:
# Poll the iteration-4 fine-tuning job every 30 seconds until it reaches a terminal status.
job_to_monitor_id = ft_resp.id
new_fine_tuned_model_id = None  # only set when the job reports 'completed'

print("Monitoring fine-tuning job...")
while True:
    try:
        status_response = together_client.fine_tuning.retrieve(job_to_monitor_id)
        job_status = status_response.status
        print(f"Job {job_to_monitor_id}: Status = {job_status}")

        if job_status == 'completed':
            # The model name comes from the launch response, not the status response.
            new_fine_tuned_model_id = ft_resp.output_name
            print(f"Fine-tuning completed! New model: {new_fine_tuned_model_id}")
            break
        if job_status in ('error', 'failed', 'cancelled'):
            print(f"Fine-tuning failed with status: {job_status}")
            break
    except Exception as e:
        # Errors while polling are reported and the loop keeps retrying.
        print(f"Error checking status: {e}")

    time.sleep(30)

# Promote the new model only if the job completed.
if new_fine_tuned_model_id:
    current_model = new_fine_tuned_model_id
    print(f"Updated current model to: {current_model}")


In [None]:
# Manually pin the iteration-4 model id; this overwrites whatever `current_model`
# held after the preceding cells.
# NOTE(review): hard-coded id, presumably copied from a finished job — confirm before re-running.
current_model = "nikilrav/Mixtral-8x7B-Instruct-v0.1-self_taught_eval_ft_v3_sanity_iter4-e670c500-3ac461bb"

In [None]:
# Evaluate the iteration-4 model on the held-out eval split via evaluate_model_as_judge.
print("Evaluating fourth iteration model...")
# Store the results under a name that matches the iteration being evaluated.
fourth_iter_model_results = evaluate_model_as_judge(
    current_model,
    together_client,
    eval_dataset,
    output_dir / f"{suffix}_iter4_model_eval.jsonl"
)
# The comparison cell later in this notebook reads the final-model metrics from
# `third_iter_model_results`, so keep that legacy name bound to these results.
third_iter_model_results = fourth_iter_model_results
print(f"Fourth iteration model results: {fourth_iter_model_results}")

In [None]:
# Baseline: evaluate the untouched base model on the same held-out eval split.
print("Evaluating base model...")
base_eval_path = output_dir / "base_model_eval.jsonl"
base_model_results = evaluate_model_as_judge(
    base_model, together_client, eval_dataset, base_eval_path
)
print(f"Base model results: {base_model_results}")

In [None]:
# Display the model id currently bound to `current_model`.
current_model

In [None]:
# Report base vs. final model accuracy side by side, then persist the comparison as JSON.
# NOTE(review): the final-model metrics are read from `third_iter_model_results`, which the
# fourth-iteration evaluation cell reassigns — confirm that is the intended source.
final_results = third_iter_model_results
accuracy_gain = final_results['accuracy'] - base_model_results['accuracy']
comparison_path = output_dir / "evaluation_comparison.json"
heavy_rule = "=" * 50

print("\n" + heavy_rule)
print("EVALUATION COMPARISON")
print(heavy_rule)
print(f"Base model:   {base_model}")
print(f"Final model:  {current_model}")
print("-" * 50)
print(f"Base model accuracy:  {base_model_results['accuracy']:.2f}%")
print(f"Final model accuracy: {final_results['accuracy']:.2f}%")
print(f"Improvement: {accuracy_gain:.2f}%")
print(heavy_rule)

# Assemble the record that gets written to disk.
comparison = {
    "base_model": {"name": base_model, "results": base_model_results},
    "final_model": {"name": current_model, "results": final_results},
    "improvement": accuracy_gain,
}

with open(comparison_path, 'w') as f:
    json.dump(comparison, f, indent=2)

print(f"Saved comparison to {comparison_path}")