In [1]:
!git clone https://github.com/mhizterpaul/leetcode-questions-dataset.git
!mv leetcode-questions-dataset/data .

Cloning into 'leetcode-questions-dataset'...
remote: Enumerating objects: 202, done.[K
remote: Counting objects: 100% (202/202), done.[K
remote: Compressing objects: 100% (116/116), done.[K
remote: Total 202 (delta 98), reused 167 (delta 77), pack-reused 0 (from 0)[K
Receiving objects: 100% (202/202), 2.96 MiB | 6.04 MiB/s, done.
Resolving deltas: 100% (98/98), done.


In [2]:
import pandas as pd

# Locations of the four raw CSVs inside the data/ folder.
leetcode_csv_path = "data/leetcode_dataset/all_questions_details_cumulative.csv"
interview_csv_path = "data/interview_dataset/SoftwareQuestions.csv"
hackerrank_algo_csv_path = "data/hackerrank_dataset/algorithms_questions.csv"
hackerrank_ds_csv_path = "data/hackerrank_dataset/data_structures_questions.csv"

# Only the interview file is read with an explicit latin1 encoding; the others
# use pandas' default.
leetcode_dataset = pd.read_csv(leetcode_csv_path)
interview_dataset = pd.read_csv(interview_csv_path, encoding="latin1")
hackerrank_algo_dataset = pd.read_csv(hackerrank_algo_csv_path)
hackerrank_ds_dataset = pd.read_csv(hackerrank_ds_csv_path)

In [3]:
# Process LeetCode dataset: pick the columns of interest and map them onto the
# target schema in one chain, leaving `leetcode_dataset` itself untouched.
leetcode_column_map = {
    'Question ID': 'id',
    'Question Title': 'question_title',
    'Question Text': 'question_text',
    'Topic Tagged text': 'tags',
    'Difficulty Level': 'difficulty',
}

leetcode_processed = (
    leetcode_dataset[list(leetcode_column_map)]
    .rename(columns=leetcode_column_map)
    # The question is the title, a newline, then the full statement.
    .assign(question=lambda frame: frame['question_title'] + "\n" + frame['question_text'])
    # Title and text are now part of `question`, so the separate columns are dropped.
    .drop(columns=['question_title', 'question_text'])
)

# Preview the processed LeetCode data
display(leetcode_processed.head())

Unnamed: 0,id,tags,difficulty,question
0,1,"Array,Hash Table",Easy,Two Sum\nGiven an array of integers nums and a...
1,2,"Linked List,Math,Recursion",Medium,Add Two Numbers\nYou are given two non-empty l...
2,3,"Hash Table,String,Sliding Window",Medium,Longest Substring Without Repeating Characters...
3,4,"Array,Binary Search,Divide and Conquer",Hard,Median of Two Sorted Arrays\nGiven two sorted ...
4,5,"String,Dynamic Programming",Medium,Longest Palindromic Substring\nGiven a string ...


In [4]:
# The five categories kept from the interview dataset
target_categories = [
    'Algorithms',
    'Data Structures',
    'Database Systems',
    'System Design',
    'Security',
]

# Source column -> target schema column
interview_column_map = {
    'Question Number': 'id',
    'Question': 'question',
    'Answer': 'correct',
    'Category': 'category',
    'Difficulty': 'difficulty',
}

# Process Interview dataset
interview_processed = (
    interview_dataset[list(interview_column_map)]
    # Keep only rows whose category is one of the target categories.
    .loc[lambda frame: frame['Category'].isin(target_categories)]
    .rename(columns=interview_column_map)
    # The interview data has no tags or a/b/c/d option columns; they are added
    # empty here and are meant to be generated by the LLM later.
    .assign(tags=None, a=None, b=None, c=None, d=None)
)

# Preview the processed Interview data
display(interview_processed.head())

Unnamed: 0,id,question,correct,category,difficulty,tags,a,b,c,d
10,11,What is the difference between an array and a ...,An array has fixed size and stores elements in...,Data Structures,Easy,,,,,
11,12,Explain the time complexity of an algorithm.,Time complexity measures the time an algorithm...,Data Structures,Hard,,,,,
12,13,Describe the difference between a binary searc...,"A binary search tree is hierarchical, maintain...",Data Structures,Medium,,,,,
13,14,What is a linked list and how does it work?,A linked list is a series of nodes each contai...,Data Structures,Medium,,,,,
14,15,Explain the concept of recursion.,Recursion is when a function calls itself to s...,Data Structures,Medium,,,,,


In [5]:
# Process HackerRank Algorithms dataset
hackerrank_algo_processed = (
    hackerrank_algo_dataset[['url', 'question_text']]
    # The challenge URL serves as a temporary ID.
    .rename(columns={'url': 'id', 'question_text': 'question'})
    # Columns still to be generated by the LLM start out empty; the category is
    # assigned from the dataset the rows came from.
    .assign(
        tags=None,
        a=None,
        b=None,
        c=None,
        d=None,
        correct=None,
        category='Algorithms',
    )
)

# Preview the processed HackerRank Algorithms data
display(hackerrank_algo_processed.head())

Unnamed: 0,id,question,tags,a,b,c,d,correct,category
0,https://www.hackerrank.com/challenges/count-st...,A regular expression is used to describe a set...,,,,,,,Algorithms
1,https://www.hackerrank.com/challenges/kingdom-...,It has been a prosperous year for King Charles...,,,,,,,Algorithms
2,https://www.hackerrank.com/challenges/morgan-a...,Jack and Daniel are friends. Both of them like...,,,,,,,Algorithms
3,https://www.hackerrank.com/challenges/red-knig...,"In ordinary chess, the pieces are only of two ...",,,,,,,Algorithms
4,https://www.hackerrank.com/challenges/two-robo...,You have a warehouse with\ncontainers filled w...,,,,,,,Algorithms


In [6]:
# Process HackerRank Data Structures dataset
hackerrank_ds_processed = (
    hackerrank_ds_dataset[['url', 'question_text']]
    # The challenge URL serves as a temporary ID.
    .rename(columns={'url': 'id', 'question_text': 'question'})
    # Columns still to be generated by the LLM start out empty; the category is
    # assigned from the dataset the rows came from.
    .assign(
        tags=None,
        a=None,
        b=None,
        c=None,
        d=None,
        correct=None,
        category='Data Structures',
    )
)

# Preview the processed HackerRank Data Structures data
display(hackerrank_ds_processed.head())

Unnamed: 0,id,question,tags,a,b,c,d,correct,category
0,https://www.hackerrank.com/challenges/box-oper...,Alice purchased an array of\nwooden boxes that...,,,,,,,Data Structures
1,https://www.hackerrank.com/challenges/is-binar...,"For the purposes of this challenge, we define ...",,,,,,,Data Structures
2,https://www.hackerrank.com/challenges/array-an...,Given two numbers\nand\n.\nindicates the numbe...,,,,,,,Data Structures
3,https://www.hackerrank.com/challenges/subseque...,A subsequence of a sequence is a sequence whic...,,,,,,,Data Structures
4,https://www.hackerrank.com/challenges/company-...,The\nLRT Company\nhas\nemployees. Each employe...,,,,,,,Data Structures


In [7]:
import os

# Create the interim directory if it doesn't exist
os.makedirs("data/interim", exist_ok=True)

# List of processed dataframes
processed_dfs = [
    leetcode_processed,
    interview_processed,
    hackerrank_algo_processed,
    hackerrank_ds_processed
]

# Combine all processed dataframes
combined_df = pd.concat(processed_dfs, ignore_index=True)

# Drop rows that have no question text before deduplicating. drop_duplicates
# treats all NaN questions as one value and keeps the first, so without this
# filter a single text-less row would survive into the CSV and fail later when
# the prompt builder calls .strip() on it.
question_text = combined_df['question']
has_question = question_text.notna() & question_text.astype(str).str.strip().ne('')

# Deduplicate based on the 'question' column
combined_df_deduplicated = combined_df[has_question].drop_duplicates(subset=['question'])

# Select only the 'id' and 'question' columns
output_df = combined_df_deduplicated.reindex(columns=['id', 'question'])

# Define the output path
OUTPUT_CSV = "data/interim/filtered_dataset.csv"

# Save the combined dataframe to a CSV file
output_df.to_csv(OUTPUT_CSV, index=False)

print(f"Combined and deduplicated dataset with only 'id' and 'question' saved to {OUTPUT_CSV}")
display(output_df.head())

Combined and deduplicated dataset with only 'id' and 'question' saved to data/interim/filtered_dataset.csv


Unnamed: 0,id,question
0,1,Two Sum\nGiven an array of integers nums and a...
1,2,Add Two Numbers\nYou are given two non-empty l...
2,3,Longest Substring Without Repeating Characters...
3,4,Median of Two Sorted Arrays\nGiven two sorted ...
4,5,Longest Palindromic Substring\nGiven a string ...


In [None]:
# Step 1: Install the Hugging Face Hub tools
!pip install -q huggingface_hub transformers accelerate bitsandbytes pandas tqdm

# Step 2: Import and login using your token
from huggingface_hub import login

# Paste your token here (get it from https://huggingface.co/settings/tokens)
login("YOUR_KEY")

import os
import json
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from huggingface_hub import snapshot_download

# --- Config ---
# Hub repository id passed to snapshot_download below.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
# Directory the snapshot is downloaded into and the tokenizer/model are loaded from.
LOCAL_MODEL_DIR = "/content/Mistral-7B" # Changed to absolute path
# CSV providing the 'id' and 'question' columns fed to the model.
INPUT_CSV = "data/interim/filtered_dataset.csv"
# Destination CSV for the generated options, answer, tags and category.
OUTPUT_CSV = "data/dist/combined_questions_with_llm_features.csv"
# Number of attempts made for one question before it is skipped.
MAX_RETRIES = 3
# Sampling settings forwarded to the text-generation pipeline.
TEMPERATURE = 0.7
TOP_P = 0.9
# Forwarded to the pipeline as max_new_tokens.
MAX_TOKENS = 512
# Number of dataset rows grouped into one pipeline call.
BATCH_SIZE = 3

# --- Step 1: Download Model ---
# The presence of the local directory is the only cache check: if it exists the
# download is skipped, otherwise the whole repository snapshot is fetched into it.
if os.path.exists(LOCAL_MODEL_DIR):
    print(f"✅ Using cached model from {LOCAL_MODEL_DIR}")
else:
    print(f"⬇️ Downloading Mistral-7B to {LOCAL_MODEL_DIR}")
    snapshot_download(
        repo_id=MODEL_NAME,
        local_dir=LOCAL_MODEL_DIR,
        local_dir_use_symlinks=False,
    )

# --- Step 2: Load Model ---
print("🔧 Loading Mistral-7B...")

# 4-bit quantization settings: NF4 quant type, double quantization enabled,
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_DIR, trust_remote_code=True)

# device_map="auto" leaves device placement to the library; the quantization
# config above is applied while the weights are loaded.
model_load_options = {
    "trust_remote_code": True,
    "device_map": "auto",
    "quantization_config": bnb_config,
}
model = AutoModelForCausalLM.from_pretrained(LOCAL_MODEL_DIR, **model_load_options)

# Text-generation pipeline used by the inference loop.
llm = pipeline("text-generation", tokenizer=tokenizer, model=model)

# --- Step 3: Prompt Template ---
def build_prompt(question: str, chat_tokenizer=None) -> str:
    """Render the MCQ-generation prompt for one question as a chat-formatted string.

    The task instructions and the question are sent together as a single
    ``user`` turn. The Mistral-7B-Instruct-v0.1 chat template expects strictly
    alternating user/assistant turns and can reject a leading ``system``
    message, so the instructions are not passed under a separate system role.

    Args:
        question: Question text; surrounding whitespace is stripped.
        chat_tokenizer: Optional object exposing ``apply_chat_template``.
            Defaults to the notebook-level ``tokenizer``.

    Returns:
        The untokenized prompt string, ending with the generation prompt.
    """
    instructions = (
        "You are an expert tutor that generates multiple-choice questions for technical interviews. "
        "Your job is to create **difficult** questions with 4 realistic but incorrect options "
        "(a, b, c, d), 1 correct answer, relevant tags, and a category. "
        "The correct answer must NOT appear in the options (a, b, c, d). "
        "Tags must be inferred based on the nature of the question. "
        "Categories must be chosen from only one of these: [algorithms, data structures, database systems, system design, security]."
    )
    request = f"""
Question:
\"\"\"{question.strip()}\"\"\"

Expected JSON:
{{
  "a": "...",
  "b": "...",
  "c": "...",
  "d": "...",
  "correct": "...",
  "tags": ["...", "..."],
  "category": "..."
}}
"""
    # The global tokenizer is only looked up when no override is supplied.
    active_tokenizer = tokenizer if chat_tokenizer is None else chat_tokenizer
    return active_tokenizer.apply_chat_template(
        [{"role": "user", "content": instructions + "\n" + request}],
        tokenize=False,
        add_generation_prompt=True,
    )

# --- Step 4: Load Data ---
# Empty CSV fields are read back as NaN, which is not a string and cannot be
# turned into a prompt, so rows without a question are removed up front. This
# also keeps the progress-bar total in line with the rows actually processed.
df = pd.read_csv(INPUT_CSV)[['id', 'question']].dropna(subset=['question'])

# dirname() is '' for a bare filename and makedirs('') raises, so fall back to
# the current directory in that case.
os.makedirs(os.path.dirname(OUTPUT_CSV) or ".", exist_ok=True)

# Accumulates one dict per successfully generated question.
output = []

# --- [NEW] Batched Inference Loop ---
from itertools import islice

def batched(iterable, batch_size):
    """Yield consecutive lists of up to ``batch_size`` items from ``iterable``.

    The last list may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    # Two-argument iter() keeps calling the lambda until it returns the
    # sentinel, i.e. an empty list once the source is exhausted.
    for chunk in iter(lambda: list(islice(source, batch_size)), []):
        yield chunk

# --- Step 5: Inference Loop ---
# Column order of the saved CSV. Passing it explicitly also gives the file a
# header when no question could be generated.
OUTPUT_COLUMNS = ["id", "question", "a", "b", "c", "d", "correct", "tags", "category"]

# return_full_text=False makes the pipeline return only the completion. With the
# default, the prompt is echoed back and the first '{' found would belong to the
# "Expected JSON" template inside the prompt rather than to the model's answer.
generation_kwargs = dict(
    max_new_tokens=MAX_TOKENS,
    temperature=TEMPERATURE,
    top_p=TOP_P,
    do_sample=True,
    return_full_text=False,
)


def _completion_text(result):
    """Return the generated string from one pipeline result.

    The pipeline returns a list of candidate dicts per prompt (and a list of
    such lists for a list of prompts); a bare dict is accepted as well.
    """
    candidate = result[0] if isinstance(result, list) else result
    return candidate['generated_text']


def _first_json_object(text):
    """Decode the first JSON value starting at the first '{' in ``text``.

    Anything following that value is ignored. Raises ValueError (of which
    json.JSONDecodeError is a subclass) when no decodable object is present.
    """
    start = text.find('{')
    if start == -1:
        raise ValueError("no JSON object found in model output")
    parsed, _ = json.JSONDecoder().raw_decode(text, start)
    return parsed


print("🚀 Generating MCQs...")
for batch_rows in tqdm(batched(df.iterrows(), BATCH_SIZE), total=(len(df) + BATCH_SIZE - 1) // BATCH_SIZE):
    prompts = []
    metadata = []

    for _, row in batch_rows:
        question = row["question"]
        # A missing question is read from CSV as NaN (a float); it cannot be
        # stripped or prompted, so the row is reported and skipped.
        if not isinstance(question, str) or not question.strip():
            print(f"❌ Skipped ID {row['id']}: no question text")
            continue
        prompts.append(build_prompt(question))
        metadata.append((row["id"], question))

    if not prompts:
        continue

    try:
        results = llm(prompts, **generation_kwargs)
    except Exception as e:
        # Best effort: one failing batch must not abort the whole run.
        print(f"❌ Batch failed: {e}")
        continue

    for prompt, (qid, question), res in zip(prompts, metadata, results):
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                if attempt > 1:
                    # Re-sample for this question only: parsing the same
                    # completion again can never change the outcome.
                    res = llm(prompt, **generation_kwargs)

                parsed = _first_json_object(_completion_text(res))

                options = [parsed['a'], parsed['b'], parsed['c'], parsed['d']]
                correct = parsed['correct']

                if correct in options:
                    print(f"⚠️ Correct leaked into options for ID {qid}, retrying...")
                    continue

                # The model may return tags as a list or as one string; joining a
                # string would split it into single characters.
                tags = parsed.get("tags") or []
                if isinstance(tags, str):
                    tags_text = tags
                else:
                    tags_text = ", ".join(str(tag) for tag in tags)

                output.append({
                    "id": qid,
                    "question": question,
                    "a": parsed['a'],
                    "b": parsed['b'],
                    "c": parsed['c'],
                    "d": parsed['d'],
                    "correct": correct,
                    "tags": tags_text,
                    "category": parsed["category"]
                })
                break

            except Exception as e:
                print(f"❌ Error parsing result for ID {qid}, attempt {attempt}: {e}")
        else:
            # Reached only when no attempt hit `break`, i.e. every attempt failed.
            print(f"❌ Skipped ID {qid} after {MAX_RETRIES} failed attempts")

# --- Step 6: Save Final Output ---
print("💾 Saving dataset...")
pd.DataFrame(output, columns=OUTPUT_COLUMNS).to_csv(OUTPUT_CSV, index=False)
print(f"✅ Final output saved to {OUTPUT_CSV}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━