Ensure you are logged in to Azure before running this notebook: `az login --use-device-code`

Ensure you change the notebook kernel to "Python 3.10 - AzureML"

In [None]:
pip install python-dotenv

In [None]:
pip install datasets tqdm -U

In [None]:
from datasets import load_dataset
from abc import ABC

In [None]:
class InputDataset(ABC):
    """Base class that carries the file names of the train, test and eval data."""

    def __init__(self):
        super().__init__()
        # All three file names start out unset.
        self.train_data_file_name = None
        self.test_data_file_name = None
        self.eval_data_file_name = None

In [None]:
class CQnAHuggingFaceInputDataset(InputDataset):
    """
    Loads fixed-size samples of a HuggingFace dataset's splits.

    Construction is inherited unchanged from InputDataset.
    """

    def load_hf_dataset(
        self,
        dataset_name,
        train_sample_size=10,
        val_sample_size=10,
        test_sample_size=10,
        train_split_name="train",
        val_split_name="validation",
        test_split_name="test",
    ):
        """
        Load `dataset_name` and return leading samples of its splits.

        Args:
            dataset_name: Name passed to `load_dataset`.
            train_sample_size: Number of rows to take for the train sample.
            val_sample_size: Number of rows to take for the validation sample.
            test_sample_size: Number of rows to take for the test sample.
            train_split_name: Key of the train split in the loaded dataset.
            val_split_name: Key of the validation split, or None to carve the
                validation sample out of the train split instead.
            test_split_name: Key of the test split in the loaded dataset.

        Returns:
            Tuple `(train_data, val_data, test_data)`.
        """
        splits = load_dataset(dataset_name)
        train_split = splits[train_split_name]

        if val_split_name is None:
            # No dedicated validation split: take train + validation rows from
            # the train split, then cut that pool into two disjoint ranges.
            pool_size = train_sample_size + val_sample_size
            pool = train_split.select(range(pool_size))
            train_data = pool.select(range(train_sample_size))
            val_data = pool.select(range(train_sample_size, pool_size))
        else:
            train_data = train_split.select(range(train_sample_size))
            val_data = splits[val_split_name].select(range(val_sample_size))

        # The test sample is taken the same way in both cases.
        test_data = splits[test_split_name].select(range(test_sample_size))

        return train_data, val_data, test_data

In [None]:
# Sample sizes for the train and validation subsets loaded below.
train_sample_size = 100
val_sample_size = 100

# Sample notebook using the dataset: https://huggingface.co/datasets/tau/commonsense_qa
dataset_name = "tau/commonsense_qa"
input_dataset = CQnAHuggingFaceInputDataset()

# Note: split names can vary by dataset, so they are passed as arguments to load_hf_dataset.
# If val_split_name is None, load_hf_dataset takes the validation sample from the train split instead.
# The test sample (third return value) is not used in this notebook.
train, val, _ = input_dataset.load_hf_dataset(
    dataset_name=dataset_name,
    train_sample_size=train_sample_size,
    val_sample_size=val_sample_size,
    train_split_name="train",
    val_split_name="validation",
)

print(f"Len of train data sample is {len(train)}")
print(f"Len of validation data sample is {len(val)}")

In [None]:
! mkdir -p data

In [None]:
# Path of the JSONL file that receives the chat-formatted prompts built from the train sample.
train_data_path = "data/train_original_data.jsonl"

In [None]:
import json

In [None]:
system_prompt = "You are a helpful assistant. Your output should only be one of the five choices: 'A', 'B', 'C', 'D', or 'E'."
user_prompt_template = "Answer the following multiple-choice question by selecting the correct option.\n\nQuestion: {question}\nAnswer Choices:\n{answer_choices}"

# Write one chat-formatted record (system + user message) per training row.
# The file is opened once in "w" mode so that re-running this cell replaces the
# file instead of appending a second copy of every record.
with open(train_data_path, "w", encoding="utf-8") as f:
    for row in train:
        question, choices = row["question"], row["choices"]

        # Render the choices as "(A) text" lines, pairing each label with its text.
        answer_choices = "\n".join(
            f"({label}) {text}"
            for label, text in zip(choices["label"], choices["text"])
        )

        data = {
            "messages": [
                {"role": "system", "content": system_prompt},
                {
                    "role": "user",
                    "content": user_prompt_template.format(
                        question=question, answer_choices=answer_choices
                    ),
                },
            ]
        }
        f.write(json.dumps(data) + "\n")

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file.
# The TEACHER_MODEL_* settings read later via os.getenv are expected to be defined there.
print("Loading environment variables from .env file...")
load_dotenv()

In [None]:
# Teacher model settings for Azure AI Foundry, read from the environment.
# Each value is None when its variable is not set.
teacher_model_name = os.environ.get('TEACHER_MODEL_NAME')
teacher_model_endpoint_url = os.environ.get('TEACHER_MODEL_ENDPOINT')
teacher_model_api_key = os.environ.get('TEACHER_MODEL_KEY')

# Debug output (remove in production)
print(f"Teacher Model Name: {teacher_model_name}")
print(f"Teacher Model Endpoint: {teacher_model_endpoint_url}")
# The API key itself is never printed; only whether a value was found.
key_loaded = 'Yes' if teacher_model_api_key else 'No'
print(f"Teacher Model API Key loaded: {key_loaded}")

In [None]:
pip install azure-ai-inference

In [None]:
import os
import json
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from azure.core.credentials import AzureKeyCredential
from tqdm.notebook import tqdm

In [None]:
def _question_text_or_placeholder(question_data):
    """Return the content of the record's second message, or "Error" if it cannot be read."""
    try:
        return question_data["messages"][1]["content"]
    except (KeyError, IndexError, TypeError):
        return "Error"


def process_question(question_data):
    """
    Send one chat-formatted record to the teacher model and collect its answer.

    Uses the module-level `client` and `model_name`.

    Args:
        question_data: Dict with a "messages" list of {"role", "content"} dicts.
            Only "system" and "user" roles are forwarded; the second message is
            reported back as the question.

    Returns:
        Dict with keys "question", "response" and "full_response". On any
        failure nothing is raised: "response" holds "Error: <message>" and
        "full_response" is None, so callers can tell failures apart.
    """
    try:
        messages = []
        for msg in question_data["messages"]:
            if msg["role"] == "system":
                messages.append(SystemMessage(content=msg["content"]))
            elif msg["role"] == "user":
                messages.append(UserMessage(content=msg["content"]))

        response = client.complete(
            messages=messages,
            model=model_name,
            max_tokens=4000  # Upper bound only; the expected answer is a single letter (A-E)
        )

        return {
            "question": question_data["messages"][1]["content"],
            "response": response.choices[0].message.content,
            "full_response": response
        }
    except Exception as e:
        # Deliberately broad: one failing question must not abort the whole run.
        # The question text is looked up defensively so that a malformed record
        # cannot make this handler raise in turn.
        return {
            "question": _question_text_or_placeholder(question_data),
            "response": f"Error: {str(e)}",
            "full_response": None
        }

In [None]:
# Short aliases for the teacher-model settings; process_question reads model_name.
endpoint, model_name, key = (
    teacher_model_endpoint_url,
    teacher_model_name,
    teacher_model_api_key,
)

# Chat-completions client for the teacher endpoint, authenticated with the API key.
client = ChatCompletionsClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
)

In [None]:
# Read the JSONL file and process each question
results = []

# Read the file once, keeping each non-empty line together with its 1-based
# line number so that error messages still point at the right place.
with open(train_data_path, 'r', encoding='utf-8') as file:
    numbered_lines = [
        (line_number, line)
        for line_number, line in enumerate(file, start=1)
        if line.strip()
    ]

total_lines = len(numbered_lines)
print(f"Processing {total_lines} questions from {train_data_path}")

# Using the progress bar as a context manager closes it even if the run is
# interrupted part-way through.
with tqdm(total=total_lines, desc="Processing questions", unit="question") as progress_bar:
    for line_number, line in numbered_lines:
        try:
            question_data = json.loads(line)
            result = process_question(question_data)
            results.append(result)

            # Update progress bar description with latest result
            progress_bar.set_description(f"Latest answer: {result['response']}")
        except json.JSONDecodeError as e:
            print(f"Error parsing line {line_number}: {str(e)}")
        except Exception as e:
            print(f"Error processing line {line_number}: {str(e)}")
        finally:
            # Every line counts as processed, whether it succeeded or failed.
            progress_bar.update(1)

In [None]:
output_file_path = "./data/train_data.jsonl"

# Keep only questions the teacher model actually answered. process_question
# marks a failed request with full_response=None and an "Error: ..." response
# string, which must not end up in the training file as an answer.
successful_results = [r for r in results if r.get("full_response") is not None]
skipped_count = len(results) - len(successful_results)

print(f"Writing {len(successful_results)} processed questions to {output_file_path}")
if skipped_count:
    print(f"Skipping {skipped_count} questions whose teacher request failed")

with open(output_file_path, 'w', encoding='utf-8') as f:
    for result in tqdm(successful_results, desc="Writing results", unit="record"):
        # The full prompt (instruction, question and answer choices) is stored
        # as the question; the analysis cells later look for the "Question: "
        # marker inside it.
        question_text = result["question"]

        # Simplified output format: one question/answer pair per record
        output_line = {
            "Question": question_text,
            "Answer": result["response"]
        }

        # Write as JSONL (one JSON object per line)
        f.write(json.dumps(output_line, ensure_ascii=False) + '\n')

In [None]:
# Visualize the distillation results
import matplotlib.pyplot as plt
import pandas as pd
import re

def _extract_answer_letter(response_text):
    """Return the answer letter ('A'-'E') found in a model response, or None.

    Accepted forms are an uppercase letter that stands alone or is wrapped in
    punctuation ("A", "A.", "A)", "(A)", "(A) text"), and a response that
    consists of just one letter in either case ("b", "(b)", "b.").
    Lowercase letters inside longer text are ignored so that the article "a"
    in an explanation is not counted as answer A.
    """
    if not isinstance(response_text, str):
        return None
    text = response_text.strip()
    lone_letter = re.fullmatch(r'\(?([A-Ea-e])[).]?', text)
    if lone_letter:
        return lone_letter.group(1).upper()
    match = re.search(r'(?:^|\(|\s)([A-E])(?:\)|\.|\s|$)', text)
    return match.group(1) if match else None


# Count occurrences of each answer
answer_counts = {}
other_responses = []

for result in results:
    # A missing (non-string) response is treated as an empty, unparseable answer
    answer = result['response']
    answer = answer.strip() if isinstance(answer, str) else ''

    letter = _extract_answer_letter(answer)
    if letter is not None:
        answer_counts[letter] = answer_counts.get(letter, 0) + 1
    else:
        # For non-standard answers
        answer_counts['Other'] = answer_counts.get('Other', 0) + 1
        # Store problematic responses for analysis
        other_responses.append(answer)

# Create a DataFrame for visualization
df = pd.DataFrame(list(answer_counts.items()), columns=['Answer', 'Count'])
df = df.sort_values('Count', ascending=False)

# Create a bar chart
plt.figure(figsize=(10, 6))
bars = plt.bar(df['Answer'], df['Count'], color='skyblue')
plt.title('Distribution of Answers in Distilled Dataset')
plt.xlabel('Answer Choice')
plt.ylabel('Count')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Add count labels on top of each bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + 1,
            f'{height}', ha='center', va='bottom')

# Calculate and display success rate (valid answers are A-E)
total_questions = sum(answer_counts.values())
valid_answers = sum(answer_counts.get(ans, 0) for ans in ['A', 'B', 'C', 'D', 'E'])
success_rate = (valid_answers / total_questions) * 100 if total_questions > 0 else 0

plt.figtext(0.5, 0.01, f'Success Rate: {success_rate:.1f}% ({valid_answers}/{total_questions} questions with valid answers)',
           ha='center', fontsize=12)

plt.tight_layout(rect=[0, 0.05, 1, 0.95])
plt.show()

# Display sample of 'Other' responses if they exist
if other_responses:
    print("\nSamples of 'Other' responses (showing up to 10):")
    for i, resp in enumerate(other_responses[:10]):
        print(f"  {i+1}. '{resp}'")

    # Analyze if there are patterns in the 'Other' responses
    other_total = len(other_responses)
    lowercase_letters = sum(1 for r in other_responses if r.lower() in ['a', 'b', 'c', 'd', 'e'])
    has_period = sum(1 for r in other_responses if re.search(r'[A-Ea-e]\.', r))
    has_explanation = sum(1 for r in other_responses if len(r) > 5)  # Simple heuristic for explanations

    print(f"\nAnalysis of {other_total} 'Other' responses:")
    print(f"  - Lowercase letters (a-e): {lowercase_letters} ({lowercase_letters/other_total*100:.1f}%)")
    print(f"  - Answers with periods (A.): {has_period} ({has_period/other_total*100:.1f}%)")
    print(f"  - Likely explanations (>5 chars): {has_explanation} ({has_explanation/other_total*100:.1f}%)")

In [None]:
# Analyze potential model bias in answer distribution

answer_letters = ['A', 'B', 'C', 'D', 'E']

# Expected distribution (ideally uniform for multiple choice)
expected_prob = 0.2  # 20% chance for each of A,B,C,D,E in a uniform distribution

# The expectation is based on the number of valid A-E answers only. Responses
# counted under 'Other' have no letter, so including them would inflate every
# expected count relative to the actual counts it is compared with.
total_valid_answers = sum(answer_counts.get(letter, 0) for letter in answer_letters)
expected_counts = {letter: total_valid_answers * expected_prob for letter in answer_letters}

# Create a DataFrame for comparing actual vs expected
comparison_data = []
for letter in answer_letters:
    actual = answer_counts.get(letter, 0)
    expected = expected_counts[letter]
    difference = actual - expected
    # With no valid answers at all the expected count is 0; report 0% then.
    percent_diff = (difference / expected) * 100 if expected > 0 else 0
    comparison_data.append({
        'Answer': letter,
        'Actual Count': actual,
        'Expected Count': expected,
        'Difference': difference,
        'Percent Difference': percent_diff
    })

comparison_df = pd.DataFrame(comparison_data)
print("\nAnalysis of potential answer bias:")
display(comparison_df)

# Create a visual comparison: actual and expected bars side by side per letter
plt.figure(figsize=(12, 6))
x = range(len(comparison_df))
width = 0.35

plt.bar([i - width/2 for i in x], comparison_df['Actual Count'], width, label='Actual', color='skyblue')
plt.bar([i + width/2 for i in x], comparison_df['Expected Count'], width, label='Expected', color='lightgreen')

plt.xlabel('Answer Choice')
plt.ylabel('Count')
plt.title('Actual vs Expected Answer Distribution')
plt.xticks(x, comparison_df['Answer'])
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

In [None]:
# Create a pseudo-confusion matrix to visualize bias in the answer distribution
import seaborn as sns
from sklearn.metrics import confusion_matrix
import numpy as np

# No ground truth labels are used in this cell. Two options remain:
# 1. Compare the actual answer distribution with a uniform one
# 2. Look for patterns in how the model responds to different question types

# Approach 1: a "pseudo-confusion matrix" showing bias toward certain answers
answer_letters = ['A', 'B', 'C', 'D', 'E']
total_valid_answers = sum(answer_counts.get(ans, 0) for ans in answer_letters)
expected_prop = 0.2  # Diagonal value under a uniform distribution (20% each)

# Share of the valid answers that went to each letter (0 when there are none)
proportions = [
    answer_counts.get(letter, 0) / total_valid_answers if total_valid_answers > 0 else 0
    for letter in answer_letters
]

# Row i holds letter i's share on the diagonal; the remaining probability
# mass of that row is spread evenly over the four off-diagonal cells.
pseudo_cm = np.array([
    [
        proportions[row] if row == col else (1 - proportions[row]) / 4
        for col in range(5)
    ]
    for row in range(5)
])

# Plot the pseudo-confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    pseudo_cm,
    annot=True,
    fmt='.3f',
    cmap='Blues',
    xticklabels=answer_letters,
    yticklabels=answer_letters,
)
plt.title('Answer Distribution Bias Matrix')
plt.xlabel('Predicted Answer')
plt.ylabel('Expected Uniform Distribution')
plt.tight_layout()

# Caption explaining how to read the matrix
caption = (
    'This matrix shows model bias toward certain answers.\n'
    'Diagonal values represent the proportion of each answer in the results.\n'
    'In an unbiased model, all diagonal values would be close to 0.2 (20%)'
)
plt.figtext(0.5, 0.01, caption, ha='center', fontsize=11,
            bbox=dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.8))

# Re-run the layout with room reserved at the bottom for the caption
plt.tight_layout(rect=[0, 0.05, 1, 0.95])
plt.show()

# Approach 2: with ground truth or additional features about the questions,
# an actual confusion matrix could be created here

In [None]:
# Analyze answer patterns by question type and question length (no ground-truth answer key is available)
import json
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Load the distilled question/answer records written earlier in this notebook.
# NOTE(review): this rebinds train_data_path, which earlier cells use for the
# original prompt file; re-run the cell that defines it before re-running them.
train_data_path = "./data/train_data.jsonl"

ANSWER_LETTERS = ['A', 'B', 'C', 'D', 'E']

# Length groups in display order (shortest first)
LENGTH_GROUPS = ['Very Short (<10 words)', 'Short (10-14 words)',
                 'Medium (15-19 words)', 'Long (20+ words)']

# Question type assigned from the first word of the question
QUESTION_TYPE_BY_FIRST_WORD = {
    'what': 'What/Which',
    'which': 'What/Which',
    'where': 'Where',
    'who': 'Who',
    'how': 'How',
    'why': 'Why',
}


def _extract_answer_letter(answer_text):
    """Return the answer letter ('A'-'E') found in a model answer, or None.

    The system prompt asks for a bare letter, so besides "(A)" this accepts an
    uppercase letter that stands alone or is wrapped in punctuation ("A",
    "A.", "A)", "(A) text"), and an answer that is just one letter in either
    case ("b", "(b)"). Lowercase letters inside longer text are ignored so
    that the article "a" is not read as answer A.
    """
    if not isinstance(answer_text, str):
        return None
    text = answer_text.strip()
    lone_letter = re.fullmatch(r'\(?([A-Ea-e])[).]?', text)
    if lone_letter:
        return lone_letter.group(1).upper()
    match = re.search(r'(?:^|\(|\s)([A-E])(?:\)|\.|\s|$)', text)
    return match.group(1) if match else None


def _extract_question(question_text):
    """Return the bare question from a stored prompt, or None without a "Question: " marker."""
    if not isinstance(question_text, str) or "Question: " not in question_text:
        return None
    return question_text.split("Question: ")[1].split("\nAnswer Choices:")[0]


def _classify_question_type(actual_question):
    """Classify a question by its first word; anything unrecognized (or empty) is 'Other'."""
    words = actual_question.split()
    first_word = words[0].lower() if words else ''
    return QUESTION_TYPE_BY_FIRST_WORD.get(first_word, 'Other')


def _classify_question_length(actual_question):
    """Return the length group of a question based on its word count."""
    word_count = len(actual_question.split())
    if word_count < 10:
        return 'Very Short (<10 words)'
    if word_count < 15:
        return 'Short (10-14 words)'
    if word_count < 20:
        return 'Medium (15-19 words)'
    return 'Long (20+ words)'


def _count_answers_by_group(groups, letters):
    """Count, per group, how often each answer letter occurs in the paired lists."""
    counts = {}
    for group, letter in zip(groups, letters):
        group_counts = counts.setdefault(group, dict.fromkeys(ANSWER_LETTERS, 0))
        group_counts[letter] += 1
    return counts


def _build_group_matrix(group_names, counts_by_group):
    """Return (counts, row-normalized counts) matrices of shape (groups, letters)."""
    matrix = np.zeros((len(group_names), len(ANSWER_LETTERS)))
    for i, group in enumerate(group_names):
        for j, letter in enumerate(ANSWER_LETTERS):
            matrix[i, j] = counts_by_group.get(group, {}).get(letter, 0)

    row_sums = matrix.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1  # Avoid division by zero
    return matrix, matrix / row_sums


def _plot_group_heatmap(matrix_norm, row_labels, figsize, cmap, title, ylabel, caption):
    """Draw a heatmap of answer distribution per group; skipped when there are no rows."""
    if len(row_labels) == 0:
        print(f"No parsable answers available; skipping '{title}'.")
        return

    plt.figure(figsize=figsize)
    sns.heatmap(matrix_norm, annot=True, fmt='.2f', cmap=cmap,
                xticklabels=ANSWER_LETTERS,
                yticklabels=row_labels)
    plt.title(title)
    plt.xlabel('Answer Choice')
    plt.ylabel(ylabel)

    # Caption explaining the visualization
    plt.figtext(0.5, 0.01, caption,
                ha='center', fontsize=11,
                bbox=dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.8))

    plt.tight_layout(rect=[0, 0.05, 1, 0.95])
    plt.show()


# Parallel lists filled in a single pass over the file. A record is appended
# to a pair of lists only when both of its values are known, so each pair
# stays the same length and can be zipped safely.
true_labels = []
predicted_labels = []
question_types = []
question_type_predicted = []
question_lengths = []
length_predicted = []

print("Loading ground truth data from", train_data_path)
with open(train_data_path, 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            continue

        try:
            data = json.loads(line)
            question_text = data['Question']
            model_answer = data['Answer']
        except json.JSONDecodeError as e:
            print(f"Error parsing line: {str(e)}")
            continue
        except (KeyError, TypeError) as e:
            print(f"Error processing line: {str(e)}")
            continue

        predicted_letter = _extract_answer_letter(model_answer)
        if predicted_letter is None:
            # No recognizable answer letter: the record cannot be used below.
            continue

        predicted_labels.append(predicted_letter)
        # This file carries no answer key, so true_labels only mirrors the
        # prediction as a placeholder. To measure accuracy, match against the
        # dataset's real answer key instead.
        true_labels.append(predicted_letter)

        actual_question = _extract_question(question_text)
        if actual_question is None:
            continue

        question_types.append(_classify_question_type(actual_question))
        question_type_predicted.append(predicted_letter)

        question_lengths.append(_classify_question_length(actual_question))
        length_predicted.append(predicted_letter)

# 1. Answer distribution by question type
q_type_counts = _count_answers_by_group(question_types, question_type_predicted)
unique_types = sorted(set(question_types))
q_type_matrix, q_type_matrix_norm = _build_group_matrix(unique_types, q_type_counts)

_plot_group_heatmap(
    q_type_matrix_norm,
    unique_types,
    figsize=(12, 10),
    cmap='YlGnBu',
    title='Answer Distribution by Question Type (Normalized)',
    ylabel='Question Type',
    caption=(
        'This matrix shows how answer patterns vary by question type.\n'
        'Each row shows the distribution of answers for a specific question type.\n'
        'An unbiased model would show similar distributions across question types.'
    ),
)

# 2. Answer distribution by question length
length_counts = _count_answers_by_group(question_lengths, length_predicted)
# Keep the fixed short-to-long order, dropping groups that never occurred
unique_lengths = [l for l in LENGTH_GROUPS if l in length_counts]
length_matrix, length_matrix_norm = _build_group_matrix(unique_lengths, length_counts)

_plot_group_heatmap(
    length_matrix_norm,
    unique_lengths,
    figsize=(12, 8),
    cmap='YlOrRd',
    title='Answer Distribution by Question Length (Normalized)',
    ylabel='Question Length Group',
    caption=(
        'This matrix shows how answer patterns vary by question length.\n'
        'Differences between rows may indicate biases in how the model handles questions of different complexity.'
    ),
)