In [2]:
pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install datasets -U

Note: you may need to restart the kernel to use updated packages.


In [4]:
from datasets import load_dataset
from abc import ABC

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
class InputDataset(ABC):
    """Common base for input datasets.

    Holds the file names of the train, test and eval data; each one is
    ``None`` until it is assigned.
    """

    def __init__(self):
        super().__init__()
        # A freshly created dataset has no data files associated with it yet.
        self.train_data_file_name = None
        self.test_data_file_name = None
        self.eval_data_file_name = None

In [6]:
class CQnAHuggingFaceInputDataset(InputDataset):
    """
    Loads a HuggingFace dataset and returns fixed-size train / validation /
    test samples taken from the start of the requested splits.
    """

    def __init__(self):
        super().__init__()

    @staticmethod
    def _head(split, sample_size):
        """Return the first ``sample_size`` rows of ``split``, capped at its length."""
        # Capping avoids the IndexError that select() raises for out-of-range
        # indices when more rows are requested than the split contains.
        return split.select(range(min(sample_size, len(split))))

    def load_hf_dataset(
        self,
        dataset_name,
        train_sample_size=10,
        val_sample_size=10,
        test_sample_size=10,
        train_split_name="train",
        val_split_name="validation",
        test_split_name="test",
    ):
        """
        Load ``dataset_name`` and sample its splits.

        Args:
            dataset_name: name passed to ``load_dataset``.
            train_sample_size: number of leading rows to take for training.
            val_sample_size: number of rows to take for validation.
            test_sample_size: number of leading rows to take for testing.
            train_split_name: key of the training split.
            val_split_name: key of the validation split. When ``None``, the
                validation rows are the ``val_sample_size`` rows that follow
                the training rows inside the training split.
            test_split_name: key of the test split.

        Returns:
            Tuple ``(train_data, val_data, test_data)``. A sample is shorter
            than requested when its source split does not hold enough rows.
        """
        full_dataset = load_dataset(dataset_name)
        train_split = full_dataset[train_split_name]

        if val_split_name is not None:
            train_data = self._head(train_split, train_sample_size)
            val_data = self._head(full_dataset[val_split_name], val_sample_size)
        else:
            # No dedicated validation split: carve train and validation out of
            # the same leading slice so the two samples never overlap.
            train_val_data = self._head(
                train_split, train_sample_size + val_sample_size
            )
            train_count = min(train_sample_size, len(train_val_data))
            train_data = train_val_data.select(range(train_count))
            val_data = train_val_data.select(range(train_count, len(train_val_data)))

        # The test sample is taken the same way in both cases.
        test_data = self._head(full_dataset[test_split_name], test_sample_size)

        return train_data, val_data, test_data

In [7]:
# Number of rows to sample for the train and validation sets.
# The test sample size is not passed below, so load_hf_dataset uses its default for it.
train_sample_size = 100
val_sample_size = 100

# Dataset used by this sample notebook: https://huggingface.co/datasets/tau/commonsense_qa
dataset_name = "tau/commonsense_qa"
input_dataset = CQnAHuggingFaceInputDataset()

# Split names differ between datasets, so they are passed explicitly to load_hf_dataset.
# With val_split_name=None the validation rows would instead be taken from the train split.
# The returned test sample is not used here.
train, val, _ = input_dataset.load_hf_dataset(
    dataset_name=dataset_name,
    train_sample_size=train_sample_size,
    val_sample_size=val_sample_size,
    train_split_name="train",
    val_split_name="validation",
)

print(f"Len of train data sample is {len(train)}")
print(f"Len of validation data sample is {len(val)}")

Len of train data sample is 100
Len of validation data sample is 100


In [8]:
! mkdir -p data

In [9]:
# JSONL file that receives the chat-formatted prompts built from the train sample
# (one JSON object per line); it is read back when querying the teacher model.
train_data_path = "data/train_original_data.jsonl"

In [10]:
import json

In [11]:
system_prompt = "You are a helpful assistant. Your output should only be one of the five choices: 'A', 'B', 'C', 'D', or 'E'."
user_prompt_template = "Answer the following multiple-choice question by selecting the correct option.\n\nQuestion: {question}\nAnswer Choices:\n{answer_choices}"

# Open the file once, in write mode, so that re-running this cell regenerates the
# prompts instead of appending a second copy of every row to the existing file.
with open(train_data_path, "w", encoding="utf-8") as f:
    for row in train:
        question, choices = row["question"], row["choices"]
        # "label" and "text" are parallel lists: one entry per answer option.
        labels, choice_list = choices["label"], choices["text"]
        answer_choices = "\n".join(
            "({}) {}".format(label, text) for label, text in zip(labels, choice_list)
        )

        # One chat-style record per question: system instruction + user prompt.
        data = {
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt,
                },
                {
                    "role": "user",
                    "content": user_prompt_template.format(
                        question=question, answer_choices=answer_choices
                    ),
                },
            ]
        }
        f.write(json.dumps(data) + "\n")

In [12]:
from dotenv import load_dotenv
# Make settings from a .env file available as environment variables before they are
# looked up with os.getenv further down.
# NOTE(review): presumably the .env file sits in the working directory or a parent — confirm.
load_dotenv()

True

In [13]:
import os

In [14]:
# Teacher-model connection settings, read from the environment.
# Each value is None when its variable is not set.
teacher_model_name, teacher_model_endpoint_url, teacher_model_api_key = (
    os.getenv(variable_name)
    for variable_name in (
        "TEACHER_MODEL_NAME",
        "TEACHER_MODEL_ENDPOINT",
        "TEACHER_MODEL_KEY",
    )
)

In [15]:
pip install azure-ai-inference

Note: you may need to restart the kernel to use updated packages.


In [16]:
import os
import json
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from azure.core.credentials import AzureKeyCredential

In [17]:
def _question_text_or_placeholder(question_data):
    """Return the content of the second message, or "Error" when it cannot be read."""
    try:
        return question_data["messages"][1]["content"]
    except (KeyError, IndexError, TypeError):
        return "Error"


def process_question(question_data):
    """Send one chat-formatted question to the teacher model and return its reply.

    Uses the module-level ``client`` and ``model_name``.

    Args:
        question_data: dict with a "messages" list of ``{"role", "content"}``
            dicts. Messages with role "system" or "user" are forwarded; other
            roles are ignored. The second message is reported as the question.

    Returns:
        dict with keys:
            "question": content of the second message ("Error" on a failed
                call whose input has no readable second message),
            "response": text of the first completion choice, or
                "Error: <message>" when anything went wrong,
            "full_response": the raw completion response, or None on failure.

    Never raises for ordinary failures: any ``Exception`` (bad input, request
    error, timeout) is converted into the error-shaped result above.
    """
    try:
        messages = []
        for msg in question_data["messages"]:
            if msg["role"] == "system":
                messages.append(SystemMessage(content=msg["content"]))
            elif msg["role"] == "user":
                messages.append(UserMessage(content=msg["content"]))

        response = client.complete(
            messages=messages,
            model=model_name,
            max_tokens=100  # Reduced since we just need short answers like A, B, C, D, or E
        )

        return {
            "question": question_data["messages"][1]["content"],
            "response": response.choices[0].message.content,
            "full_response": response
        }
    except Exception as e:
        # Look the question up defensively: the input may be what caused the
        # failure, and the error result must not raise a second exception.
        return {
            "question": _question_text_or_placeholder(question_data),
            "response": f"Error: {str(e)}",
            "full_response": None
        }

In [18]:
# Module-level names used when querying the teacher model:
# process_question reads `client` and `model_name`.
model_name = teacher_model_name
endpoint = teacher_model_endpoint_url
key = teacher_model_api_key

# Authenticate against the endpoint with the API key.
credential = AzureKeyCredential(key)
client = ChatCompletionsClient(endpoint=endpoint, credential=credential)

In [19]:
# Send every prompt stored in the JSONL file to the teacher model and collect the results.
results = []
with open(train_data_path, 'r', encoding='utf-8') as file:
    print(f"Processing questions from {train_data_path}")
    # Line numbers are 1-based and count blank lines too.
    for line_number, line in enumerate(file, start=1):
        if not line.strip():
            continue  # nothing to process on an empty line
        try:
            question_data = json.loads(line)
            print(f"Processing question {line_number}...")
            result = process_question(question_data)
            results.append(result)
            print(f"Question {line_number} response: {result['response']}")
        except json.JSONDecodeError as e:
            print(f"Error parsing line {line_number}: {str(e)}")
        except Exception as e:
            print(f"Error processing line {line_number}: {str(e)}")

Processing questions from data/train_original_data.jsonl
Processing question 1...
Question 1 response: (A) ignore
Processing question 2...
Question 2 response: (B) populated areas
Processing question 3...
Question 3 response: (B) neck
Processing question 4...
Question 4 response: (D) atlas
Processing question 5...
Question 5 response: (C) natural habitat
Processing question 6...
Question 6 response: (D) television
Processing question 7...
Question 7 response: (E) airport
Processing question 8...
Question 8 response: Error: (Timeout) The operation was timeout.
Code: Timeout
Message: The operation was timeout.
Processing question 9...
Question 9 response: (E) blotter
Processing question 10...
Question 10 response: (D) business sector
Processing question 11...
Question 11 response: (B) grocery cart
Processing question 12...
Question 12 response: (C) bitterness
Processing question 13...
Question 13 response: (C) garden
Processing question 14...
Question 14 response: (A) loss of heat
Proces

In [20]:
# Second pass over the JSONL file: starts from an empty `results` list and queries the
# teacher model again for every prompt, replacing whatever an earlier run collected.
results = []
with open(train_data_path, 'r', encoding='utf-8') as file:
    print(f"Processing questions from {train_data_path}")
    # Line numbers are 1-based and count blank lines too.
    for line_number, line in enumerate(file, start=1):
        if not line.strip():
            continue  # nothing to process on an empty line
        try:
            question_data = json.loads(line)
            print(f"Processing question {line_number}...")
            result = process_question(question_data)
            results.append(result)
            print(f"Question {line_number} response: {result['response']}")
        except json.JSONDecodeError as e:
            print(f"Error parsing line {line_number}: {str(e)}")
        except Exception as e:
            print(f"Error processing line {line_number}: {str(e)}")

Processing questions from data/train_original_data.jsonl
Processing question 1...
Question 1 response: (A) ignore
Processing question 2...
Question 2 response: (B) populated areas
Processing question 3...
Question 3 response: (B) neck
Processing question 4...
Question 4 response: (D) atlas
Processing question 5...
Question 5 response: (C) natural habitat
Processing question 6...
Question 6 response: (D) television
Processing question 7...
Question 7 response: (E) airport
Processing question 8...
Question 8 response: (B) refrigerator
Processing question 9...
Question 9 response: (E) blotter
Processing question 10...
Question 10 response: (D) business sector
Processing question 11...
Question 11 response: (B) grocery cart
Processing question 12...
Question 12 response: (C) bitterness
Processing question 13...
Question 13 response: (C) garden
Processing question 14...
Question 14 response: (A) loss of heat
Processing question 15...
Question 15 response: (C) kitchen
Processing question 16.

In [21]:
output_file_path = "./data/train_data.jsonl"

# A failed teacher call is recorded with "full_response" set to None and an
# "Error: ..." string as its response. Writing those rows would store the error
# text as the Answer, so only successful results are kept.
successful_results = [
    result for result in results if result["full_response"] is not None
]

with open(output_file_path, 'w', encoding='utf-8') as f:
    for result in successful_results:
        # The stored question is the complete user prompt (instruction, question
        # and answer choices); it is written unchanged.
        question_text = result["question"]

        # Create the simplified output format
        output_line = {
            "Question": question_text,
            "Answer": result["response"]
        }

        # Write as JSONL (one JSON object per line)
        f.write(json.dumps(output_line, ensure_ascii=False) + '\n')

# Make dropped rows visible instead of silently shrinking the output file.
skipped_count = len(results) - len(successful_results)
if skipped_count:
    print(f"Skipped {skipped_count} failed teacher response(s) when writing {output_file_path}")