## Host LLM on Kaggle/Colab for local inference

Starts a local server to serve a large language model (Qwen2.5-Coder-7B-Instruct), and uses ngrok to make the server accessible over the internet.

Sensitive values (the ngrok auth token and URL) are replaced with `***`.

In [None]:
!pip install vllm transformers pyngrok
!ngrok config add-authtoken ***
!vllm serve Qwen/Qwen2.5-Coder-7B-Instruct --port 11434 --dtype=half --max-seq-len 4096 --tensor-parallel-size 2 & ngrok http 11434 --host-header="localhost:11434" --log stdout --url ***

## Baseline

Directly call the model.

Use LangChain's [Structured outputs](https://python.langchain.com/docs/concepts/structured_outputs/) to make sure results are returned in a structured format for further use. Queries are batched for faster inference.

In [None]:
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
import os
import pandas as pd
from tqdm import tqdm

df = pd.read_csv('b6_test_data.csv')

# Collect the task ids that already have an answer so they can be skipped.
# On a first run submission.csv does not exist yet; in that case nothing has
# been answered and every row of the test set is processed.
if os.path.exists('submission.csv'):
    df_yet = pd.read_csv('submission.csv')
    task_id_list = df_yet['task_id'].tolist()
else:
    df_yet = pd.DataFrame(columns=['task_id', 'answer'])
    task_id_list = []


def format_choices_with_letters(choices_str):
    """Turn a stringified list of choices into lettered lines.

    Args:
        choices_str: Text of a Python list literal, e.g. ``"['foo', 'bar']"``.

    Returns:
        One line per choice in the form ``"A. foo"``, joined with newlines.

    Raises:
        ValueError, SyntaxError: If ``choices_str`` is not a Python literal.
        IndexError: If there are more than 26 choices.
    """
    import ast
    import string

    # literal_eval only accepts literals, so text coming from the CSV can
    # never execute code the way eval() would.
    choices = ast.literal_eval(choices_str)
    letters = string.ascii_uppercase
    return "\n".join(f"{letters[i]}. {choice}" for i, choice in enumerate(choices))


# Point the OpenAI-compatible client at the vLLM server published through ngrok.
# The API key is not used, but LangChain requires one to be set.
os.environ.update({
    "OPENAI_API_BASE": "https://glad-bass-wrongly.ngrok-free.app/v1",
    "OPENAI_API_KEY": "sk-fake-key",
})

# Structured-output schema (Pydantic): the model's reply is parsed into this
# class, so callers read the chosen option from `.answer`.
class ExtractedInfo(BaseModel):
    # Letter of the selected choice.
    # NOTE(review): the description only lists A-E, while
    # format_choices_with_letters can label more than five choices - confirm.
    answer: str = Field(description="The output of the given code as one of A, B, C, D, or E")
    # Optional explanation field, currently disabled:
    # reason: str = Field(description="A short reason explaining why the output is correct")

# Build the chat client against the vLLM endpoint and immediately wrap it so
# every reply is parsed from JSON into an ExtractedInfo instance.
llm = ChatOpenAI(
    model_name="Qwen/Qwen2.5-Coder-7B-Instruct",
    temperature=0,
    openai_api_base=os.environ["OPENAI_API_BASE"],
).with_structured_output(ExtractedInfo, method="json_mode")

# Function to analyze code in batches and return structured output
def analyze_code_batch(batch):
    """Ask the model to answer a batch of multiple-choice questions.

    Args:
        batch: Iterable of ``(question, formatted_choices)`` string pairs.

    Returns:
        The list produced by ``llm.batch``: one structured result per prompt.
    """
    system_message = ("system", "You are a helpful assistant that excels in answering multiple-choice programming questions. Let's explain step by step.")
    prompts = [
        [
            system_message,
            ("human",
             "Analyze the following question and choices."
             "Choose the correct answer from the following choices and return JSON with 'answer'.\n\n"
             f"Question:\n{question}\n\n"
             "Which choice is correct?\n\n"
             f"Choices:\n{formatted_choices}\n\n"),
        ]
        for question, formatted_choices in batch
    ]
    # LangChain runnables expose `batch`; there is no `batch_invoke` method,
    # so the previous call raised AttributeError for every batch.
    return llm.batch(prompts)

# Example usage
if __name__ == "__main__":
    batch_size = 10  # Number of questions sent to the model per batch
    results = []  # Every answer produced in this run, kept for inspection

    # Only rows whose task_id is not already answered need processing
    rows_to_process = df[~df['task_id'].isin(task_id_list)]

    # Process in batches
    for i in tqdm(range(0, len(rows_to_process), batch_size)):
        batch = rows_to_process.iloc[i:i + batch_size]
        batch_prompts = [
            (row['question'], format_choices_with_letters(row['choices']))
            for _, row in batch.iterrows()
        ]

        try:
            batch_results = analyze_code_batch(batch_prompts)
            batch_rows = [
                {'task_id': task_id, 'answer': result.answer}
                for task_id, result in zip(batch['task_id'], batch_results)
            ]
        except Exception as e:
            # Best effort: report the failed batch and carry on with the next one
            print(f"Error processing batch starting at index {i}: {e}")
            continue

        if not batch_rows:
            continue

        results.extend(batch_rows)
        # Append right away so an interrupted run keeps the batches it already
        # finished; the next run skips them via task_id_list.
        pd.DataFrame(batch_rows).to_csv(
            'submission.csv',
            index=False,
            mode='a',
            header=not os.path.exists('submission.csv'),
        )

## Few-shot prompting

Find similar questions in the training data for a given test question and include them all as examples at inference time. The functions below do this.

This can be further improved by generating the reasoning for each answer, but I didn't have much time to do that.

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_data(train_path: str, test_path: str):
    """Read the train and test CSV files.

    The training frame gets an extra ``full_text`` column made of the question,
    a space, and the choices text (missing choices count as an empty string).

    Returns:
        ``(train_df, test_df)``.
    """
    train_df, test_df = (pd.read_csv(path) for path in (train_path, test_path))
    choices_text = train_df['choices'].fillna('')
    train_df['full_text'] = train_df['question'] + ' ' + choices_text
    return train_df, test_df

def compute_embeddings(train_df, test_df, model_name='intfloat/multilingual-e5-large-instruct'):
    """Embed training texts and test questions with one sentence encoder.

    Training rows are embedded from ``full_text``; test rows from ``question``.
    Both calls request normalized embeddings returned as tensors.

    Returns:
        ``(train_embeddings, test_embeddings)``.
    """
    encoder = SentenceTransformer(model_name)
    encode_options = {"convert_to_tensor": True, "normalize_embeddings": True}

    corpus = train_df['full_text'].tolist()
    queries = test_df['question'].tolist()

    train_embeddings = encoder.encode(corpus, **encode_options)
    test_embeddings = encoder.encode(queries, **encode_options)
    return train_embeddings, test_embeddings

def precompute_similarity(train_embeddings, test_embeddings):
    """Return the test-by-train dot-product matrix scaled by 100.

    Tensor results are moved to the CPU and converted to a NumPy array;
    anything else is returned as computed.
    """
    scores = test_embeddings @ train_embeddings.T
    scores = scores * 100
    if torch.is_tensor(scores):
        return scores.cpu().numpy()
    return scores

def retrieve_top_k(similarity_matrix, train_df, k=3):
    """Return, for each row of the matrix, the task ids of its k best columns.

    Columns are ranked by descending similarity; the result has one row per
    matrix row and ``k`` task ids per row, best first.
    """
    ranked_columns = np.argsort(-similarity_matrix, axis=1)
    best_columns = ranked_columns[:, :k]
    train_task_ids = train_df['task_id'].values
    return train_task_ids[best_columns]

def find_top_k_for_test_id(test_task_id, test_df, top_k_task_ids):
    """Look up the retrieved training task ids for one test task.

    Args:
        test_task_id: Value to search for in ``test_df['task_id']``.
        test_df: Test frame whose row order matches the rows of ``top_k_task_ids``.
        top_k_task_ids: Per-row retrieval result, as returned by ``retrieve_top_k``.

    Returns:
        The task ids for the first matching row as a list, or ``[]`` when the
        task id is not present.
    """
    # Use the row *position*, not the index label: the two differ as soon as
    # test_df has been filtered, sorted, or given a non-default index.
    positions = np.flatnonzero((test_df['task_id'] == test_task_id).to_numpy())
    if positions.size == 0:
        return []
    return top_k_task_ids[positions[0]].tolist()


Few-shot prompting

In [None]:
from pydantic import BaseModel, Field

# Pydantic container for a single extracted answer.
class ExtractedInfo(BaseModel):
    # Letter of the selected choice.
    # NOTE(review): the description lists A-E only; confirm that no question
    # has more than five choices.
    answer: str = Field(description="The output of the given code as one of A, B, C, D, or E")

def _format_few_shot_example(number, example):
    """Render one training row as a numbered, already-answered example."""
    return (
        f"{number}. Question: {example['question']}\n"
        f"Choices:\n{example['choices']}\n"
        f"Answer: {example['answer']}"
    )


def _extract_choice_letter(response):
    """Return the first standalone capital letter of a reply, else the stripped reply."""
    import re

    text = response.strip()
    match = re.search(r"\b[A-Z]\b", text)
    return match.group(0) if match else text


def generate_answers(test_df, train_df, top_k_task_ids, model_name="Qwen/Qwen2.5-Coder-7B-Instruct", batch_size=10):
    """Answer every test question with few-shot prompting and write submission.csv.

    Each prompt contains the answering instructions, the retrieved training
    examples for that question, and the question itself.

    Args:
        test_df: Test frame with ``task_id``, ``question`` and ``choices`` columns.
        train_df: Training frame with ``task_id``, ``question``, ``choices`` and
            ``answer`` columns.
        top_k_task_ids: Row ``i`` holds the training task ids retrieved for the
            ``i``-th row of ``test_df`` (by position).
        model_name: Hugging Face model id to load.
        batch_size: Number of prompts generated per forward batch.

    Returns:
        None. Results are written to ``submission.csv`` (overwriting it).
    """
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # A decoder-only model continues from the last token of each row, so a
    # batch has to be padded on the left or shorter prompts end in pad tokens.
    tokenizer.padding_side = "left"

    # The examples are inserted between these two parts; the earlier template
    # carried literal "{question1}"-style placeholders that were never filled.
    instructions = (
        "You are a helpful AI assistant tasked with answering multiple-choice questions about coding.\n\n"
        "I will provide you with questions and their possible answer choices. "
        "I will also give you several examples. For each question:\n"
        "1. Read the question carefully.\n"
        "2. Examine all answer options.\n"
        "3. Select the most accurate answer.\n"
        "4. Respond ONLY with the **letter** of the correct choice (e.g., 'A', 'B', 'C', etc.).\n"
        "5. Do not include any explanations, text, or anything other than the letter of the correct answer.\n\n"
        "Here are some examples:"
    )
    closing = (
        "**Your answer must be only a single letter corresponding to the correct choice, with no additional content.**\n"
        "Now, please answer the following question:"
    )

    # Index the training rows once (first row per task id) instead of scanning
    # the whole frame three times for every example.
    train_by_id = train_df.drop_duplicates(subset='task_id').set_index('task_id')

    answers = []
    for start in range(0, len(test_df), batch_size):
        batch = test_df.iloc[start:start + batch_size]
        prompts = []

        for offset, (_, row) in enumerate(batch.iterrows()):
            # top_k_task_ids is aligned with test_df by position, not by index label
            example_ids = top_k_task_ids[start + offset]
            examples = "\n\n".join(
                _format_few_shot_example(number, train_by_id.loc[tid])
                for number, tid in enumerate(example_ids, start=1)
            )

            prompt = (
                f"{instructions}\n\n{examples}\n\n{closing}\n"
                f"Question:\n{row['question']}\nChoices:\n{row['choices']}"
            )
            messages = [{"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
                        {"role": "user", "content": prompt}]

            text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            prompts.append(text)

        # Tokenize the batch
        model_inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)

        # Generate responses for the batch
        generated_ids = model.generate(**model_inputs, max_new_tokens=10)

        # generate() returns prompt + continuation; keep only the new tokens so
        # the prompt text is not mistaken for the answer.
        prompt_length = model_inputs["input_ids"].shape[1]
        responses = tokenizer.batch_decode(generated_ids[:, prompt_length:], skip_special_tokens=True)

        # The model is asked for a bare letter, not JSON, so pull the letter out
        # of the text instead of parsing it as a JSON document.
        for task_id, response in zip(batch['task_id'], responses):
            answers.append((task_id, _extract_choice_letter(response)))

    # Save the results to a CSV file
    submission_df = pd.DataFrame(answers, columns=['task_id', 'answer'])
    submission_df.to_csv('submission.csv', index=False)

# Few-shot pipeline: load data -> embed -> score -> retrieve examples -> generate.
if __name__ == "__main__":
    # Training rows also get the `full_text` column used for embedding
    train_df, test_df = load_data('b6_train_data.csv', 'b6_test_data.csv')
    train_embeddings, test_embeddings = compute_embeddings(train_df, test_df)
    # One row per test question, one column per training row
    similarity_matrix = precompute_similarity(train_embeddings, test_embeddings)
    # Default k=3 training task ids per test question
    top_k_task_ids = retrieve_top_k(similarity_matrix, train_df)
    # Writes the answers to submission.csv
    generate_answers(test_df, train_df, top_k_task_ids)

## Code running for executable questions

Use regex to find executable questions and extract their code snippets.

In [None]:
import csv
import re

def extract_questions(input_csv, output_csv):
    """Copy the code part of every "what is the output" question to a new CSV.

    A question qualifies when it starts with one of these phrasings:
    "Question: What will be output", "Question: What will be the output",
    "Question: What is the output" or "Question: What would be the output".

    Args:
        input_csv: Path of a CSV with ``task_id`` and ``question`` columns.
        output_csv: Path of the CSV to write, with ``task_id`` and
            ``extracted_text`` columns (overwritten if it exists).
    """
    # The four accepted openings folded into one pattern, compiled once.
    prompt_pattern = re.compile(
        r"^Question: What (?:will be|will be the|is the|would be the) output\??"
    )

    with open(input_csv, newline='', encoding='utf-8') as infile, \
         open(output_csv, mode='w', newline='', encoding='utf-8') as outfile:

        reader = csv.DictReader(infile)
        writer = csv.DictWriter(outfile, fieldnames=['task_id', 'extracted_text'])
        writer.writeheader()

        for row in reader:
            question = row['question'].strip()
            match = prompt_pattern.match(question)
            if match is None:
                continue

            # The prompt sentence normally runs up to the first '?'
            # ("... output of the following code?"). If there is no '?', cut
            # right after the matched opening instead of keeping the whole
            # question, prompt sentence included.
            question_mark = question.find('?')
            cut = question_mark + 1 if question_mark != -1 else match.end()
            writer.writerow({
                'task_id': row['task_id'],
                'extracted_text': question[cut:].strip(),
            })

# Run the extraction on the test set; matching rows go to b6_code_extracted.csv.
extract_questions('b6_test_data.csv', 'b6_code_extracted.csv')