In [None]:
import os
from dotenv import load_dotenv

import warnings

# load_dotenv() runs before the lookups below; it is expected to copy entries from a
# local .env file into the process environment (see python-dotenv docs).
load_dotenv()

api_key_google = os.getenv("GEMINI_API_KEY")
api_key_openai = os.getenv("OPENAI_API_KEY")
api_key_claude = os.getenv("CLAUDE_API_KEY")
api_key_fireworks = os.getenv("FIREWORKS_API_KEY")

# os.getenv returns None for an unset variable. Surface that immediately: the call_*
# helpers in this notebook convert every exception into an "Error: ..." string, so a
# missing key would otherwise only be visible inside the saved result files.
_missing_keys = [
    env_name
    for env_name, value in (
        ("GEMINI_API_KEY", api_key_google),
        ("OPENAI_API_KEY", api_key_openai),
        ("CLAUDE_API_KEY", api_key_claude),
        ("FIREWORKS_API_KEY", api_key_fireworks),
    )
    if not value
]
if _missing_keys:
    warnings.warn("Missing API key(s) in environment/.env: " + ", ".join(_missing_keys))

In [None]:
## function for gemini api call
import asyncio
import google.generativeai as genai
from typing import List
from datasets import load_dataset
import random
from tqdm.asyncio import tqdm
import nest_asyncio
import sys
# Patch asyncio so the asyncio.run(...) calls further down can be issued from a
# notebook cell (presumably an event loop is already running in the kernel).
nest_asyncio.apply()


# Register the Gemini key with the SDK first, then build the model handle that
# call_gemini reads as a module-level global.
genai.configure(api_key=api_key_google)
model = genai.GenerativeModel('gemini-2.5-flash')  

async def call_gemini(prompt: str, pbar) -> str:
    """Send one prompt to the module-level Gemini ``model`` and return the reply text.

    Args:
        prompt: Text handed to ``model.generate_content_async``.
        pbar: Progress bar (any object exposing ``update(n)``). It is advanced by
            exactly one step per call, after the request has finished or failed.

    Returns:
        ``response.text`` on success, otherwise the string ``"Error: <message>"``.
        Exceptions are not propagated, so a single failing prompt does not abort
        the surrounding ``asyncio.gather`` batch.
    """
    try:
        response = await model.generate_content_async(prompt)
        return response.text
    except Exception as e:
        return f"Error: {str(e)}"
    finally:
        # Tick once per prompt regardless of outcome, so the bar never runs past
        # its total when a request fails.
        pbar.update(1)
        

async def parallel_gemini(prompts: List[str], max_concurrency: int = 10) -> List[str]:
    """Run ``call_gemini`` over all prompts concurrently, capped by a semaphore.

    Args:
        prompts: Prompt strings to send.
        max_concurrency: Maximum number of ``call_gemini`` calls in flight at once.

    Returns:
        The replies, in the same order as ``prompts``.
    """
    with tqdm(total=len(prompts), desc="Calling Gemini") as progress:
        slots = asyncio.Semaphore(max_concurrency)

        async def throttled(text):
            # Hold a slot for the whole duration of the request.
            async with slots:
                return await call_gemini(text, progress)

        return await asyncio.gather(*(throttled(text) for text in prompts))

In [None]:
## function for gpt api call
import asyncio
from typing import List
from tqdm.asyncio import tqdm
from openai import AsyncOpenAI
import nest_asyncio
nest_asyncio.apply()


# Provider-specific names. Other cells in this notebook rebind the generic
# `client` / `model_name` globals for Claude and DeepSeek, and a function resolves
# its globals at call time -- so reading the generic names from call_openai would
# route "OpenAI" requests to whichever provider cell was executed last.
OPENAI_MODEL_NAME = "o4-mini"
openai_client = AsyncOpenAI(api_key=api_key_openai)

# Generic aliases kept so code that reads them right after this cell still works.
model_name = OPENAI_MODEL_NAME
client = openai_client

async def call_openai(prompt: str, pbar) -> str:
    """Send one prompt to OpenAI chat completions and return the stripped reply.

    Args:
        prompt: Sent as the single ``user`` message.
        pbar: Progress bar (any object exposing ``update(n)``). It is advanced by
            exactly one step per call, after the request has finished or failed.

    Returns:
        The first choice's message content with surrounding whitespace removed,
        or ``"Error: <message>"`` if anything raised. Exceptions are not propagated.
    """
    try:
        response = await openai_client.chat.completions.create(
            model=OPENAI_MODEL_NAME,
            messages=[
                {"role": "user", "content": prompt}
            ],
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        return f"Error: {str(e)}"
    finally:
        # Tick once per prompt regardless of outcome, so the bar never runs past
        # its total when a request fails.
        pbar.update(1)
    
async def parallel_openai(prompts: List[str], max_concurrency: int = 5) -> List[str]:
    """Run ``call_openai`` over all prompts concurrently, capped by a semaphore.

    Args:
        prompts: Prompt strings to send.
        max_concurrency: Maximum number of ``call_openai`` calls in flight at once.

    Returns:
        The replies, in the same order as ``prompts``.
    """
    # Label matches the model configured in this cell ("o4-mini"); it previously
    # named a different model, "GPT-4o-mini".
    with tqdm(total=len(prompts), desc="Calling OpenAI o4-mini") as pbar:
        semaphore = asyncio.Semaphore(max_concurrency)

        async def limited_call(prompt):
            # Hold a slot for the whole duration of the request.
            async with semaphore:
                return await call_openai(prompt, pbar)

        tasks = [limited_call(prompt) for prompt in prompts]
        return await asyncio.gather(*tasks)


In [None]:
## function for claude api call
import asyncio
from typing import List
from tqdm.asyncio import tqdm
from anthropic import AsyncAnthropic, HUMAN_PROMPT, AI_PROMPT
import nest_asyncio
nest_asyncio.apply()

# Claude Client Setup
# Provider-specific names. Other cells in this notebook rebind the generic
# `client` / `model_name` globals (OpenAI before this cell, DeepSeek after it), and a
# function resolves its globals at call time -- so reading the generic names from
# call_claude would call `.messages.create` on whichever client was assigned last.
claude_client = AsyncAnthropic(api_key=api_key_claude)

CLAUDE_MODEL_NAME = "claude-sonnet-4-20250514"

# Generic aliases kept so code that reads them right after this cell still works.
client = claude_client
model_name = CLAUDE_MODEL_NAME


async def call_claude(prompt: str, pbar) -> str:
    """Send one prompt to the Claude Messages API (non-streaming) and return the reply.

    Args:
        prompt: Sent as the single ``user`` message.
        pbar: Progress bar (any object exposing ``update(n)``). It is advanced by
            exactly one step per call, after the request has finished or failed.

    Returns:
        The text of the first content block with surrounding whitespace removed,
        or ``"Error: <message>"`` if anything raised. Exceptions are not propagated.
    """
    try:
        response = await claude_client.messages.create(
            model=CLAUDE_MODEL_NAME,
            max_tokens=8192,
            messages=[{"role": "user", "content": prompt}]
        )

        return response.content[0].text.strip()
    except Exception as e:
        return f"Error: {str(e)}"
    finally:
        # Tick once per prompt regardless of outcome, so the bar never runs past
        # its total when a request fails.
        pbar.update(1)

async def parallel_claude(prompts: List[str], max_concurrency: int = 5) -> List[str]:
    """Run ``call_claude`` over all prompts concurrently, capped by a semaphore.

    Args:
        prompts: Prompt strings to send.
        max_concurrency: Maximum number of ``call_claude`` calls in flight at once.

    Returns:
        The replies, in the same order as ``prompts``.
    """
    with tqdm(total=len(prompts), desc="Calling Claude") as progress:
        slots = asyncio.Semaphore(max_concurrency)

        async def throttled(text):
            # Hold a slot for the whole duration of the request.
            async with slots:
                return await call_claude(text, progress)

        return await asyncio.gather(*(throttled(text) for text in prompts))


In [None]:
## function for deepseek api call
import asyncio
from typing import List
from tqdm.asyncio import tqdm
from openai import AsyncOpenAI
import nest_asyncio
nest_asyncio.apply()


# Provider-specific names. The OpenAI and Claude cells in this notebook assign the
# same generic `client` / `model_name` globals, and a function resolves its globals
# at call time -- so reading the generic names from call_deepseek would silently
# switch provider if one of those cells were re-run after this one.
DEEPSEEK_MODEL_NAME = "accounts/fireworks/models/deepseek-r1-basic"

deepseek_client = AsyncOpenAI(
    base_url="https://api.fireworks.ai/inference/v1",
    api_key=api_key_fireworks,
)

# Generic aliases kept so code that reads them right after this cell still works.
model_name = DEEPSEEK_MODEL_NAME
client = deepseek_client

async def call_deepseek(prompt: str, pbar) -> str:
    """Stream one prompt through DeepSeek R1 on Fireworks and return the full reply.

    Args:
        prompt: Sent as the single ``user`` message.
        pbar: Progress bar (any object exposing ``update(n)``). It is advanced by
            exactly one step per call, after the stream has finished or failed.

    Returns:
        The concatenated ``delta.content`` of all streamed chunks with surrounding
        whitespace removed, or ``"Error: <message>"`` if anything raised (partial
        output is discarded in that case). Exceptions are not propagated.
    """
    try:
        stream = await deepseek_client.chat.completions.create(
            model=DEEPSEEK_MODEL_NAME,
            messages=[
                {"role": "user", "content": prompt}
            ],
            # NOTE(review): far larger than any context window; presumably relies on
            # the server clamping the value to "as many tokens as allowed" -- confirm
            # against the Fireworks API docs.
            max_tokens=5000000000000,
            stream=True
        )

        pieces = []
        async for chunk in stream:
            # Some chunks carry no choices or an empty delta; skip those.
            if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content:
                pieces.append(chunk.choices[0].delta.content)

        return "".join(pieces).strip()
    except Exception as e:
        return f"Error: {str(e)}"
    finally:
        # Tick once per prompt regardless of outcome, so the bar never runs past
        # its total when a request fails.
        pbar.update(1)
    
async def parallel_deepseek(prompts: List[str], max_concurrency: int = 5) -> List[str]:
    """Run ``call_deepseek`` over all prompts concurrently, capped by a semaphore.

    Args:
        prompts: Prompt strings to send.
        max_concurrency: Maximum number of ``call_deepseek`` calls in flight at once.

    Returns:
        The replies, in the same order as ``prompts``.
    """
    with tqdm(total=len(prompts), desc="Calling DeepSeek R1") as progress:
        slots = asyncio.Semaphore(max_concurrency)

        async def throttled(text):
            # Hold a slot for the whole duration of the request.
            async with slots:
                return await call_deepseek(text, progress)

        return await asyncio.gather(*(throttled(text) for text in prompts))

## Regular Inputs

In [None]:
import json

with open("../ExecBench/Livecodebench_regular_inputs.json", "r", encoding="utf-8") as f:
    inputs = json.load(f)

# Three index-aligned lists consumed by the provider cells below: the prompt text,
# the recorded execution output, and the question id of each benchmark entry.
expected_outputs = []
prompts = []
q_ids = []
for i, sample in enumerate(tqdm(inputs)):
    # Prompt = the entry's code, a blank line, then the fixed instruction text.
    prompt = f"""{sample['code']}

run the code above and infer the output from the code and provided input, including a detailed, step-by-step explanation of the thinking process and each step of the data flow and control flow that led to your conclusion. print the final output in the last line in the following format: output=answer"""
    prompts.append(prompt)
    expected_outputs.append(sample['execution_output'])
    q_ids.append(sample['q_id'])

### Gemini

In [None]:
import json 

async def main():
    """Send every entry of `prompts` to Gemini with at most 5 calls in flight."""
    return await parallel_gemini(prompts, max_concurrency=5)

responses = asyncio.run(main())

# Pair each reply with the question id, prompt and expected output at the same index.
records = []
for idx, llm_output in enumerate(responses):
    records.append({
        "question_id": str(q_ids[idx]),
        "prompt": prompts[idx],
        "llm_output": llm_output,
        "expected_output": expected_outputs[idx],
    })

with open("llm_reasoning_gemini_livecodebench_regular.json", "w", encoding="utf-8") as out_file:
    json.dump(records, out_file, indent=4, ensure_ascii=False)


### GPT

In [None]:
async def main():
    """Send every entry of `prompts` to OpenAI with at most 5 calls in flight."""
    return await parallel_openai(prompts, max_concurrency=5)

responses = asyncio.run(main())

# Pair each reply with the question id, prompt and expected output at the same index.
records = []
for idx, llm_output in enumerate(responses):
    records.append({
        "question_id": str(q_ids[idx]),
        "prompt": prompts[idx],
        "llm_output": llm_output,
        "expected_output": expected_outputs[idx],
    })

with open("llm_reasoning_gpt_livecodebench_regular.json", "w", encoding="utf-8") as out_file:
    json.dump(records, out_file, indent=4, ensure_ascii=False)


### Claude

In [None]:
async def main():
    """Send every entry of `prompts` to Claude with at most 5 calls in flight."""
    return await parallel_claude(prompts, max_concurrency=5)

responses = asyncio.run(main())

# Pair each reply with the question id, prompt and expected output at the same index.
records = []
for idx, llm_output in enumerate(responses):
    records.append({
        "question_id": str(q_ids[idx]),
        "prompt": prompts[idx],
        "llm_output": llm_output,
        "expected_output": expected_outputs[idx],
    })

with open("llm_reasoning_claude_livecodebench_regular.json", "w", encoding="utf-8") as out_file:
    json.dump(records, out_file, indent=4, ensure_ascii=False)


### DeepSeek

In [None]:
async def main():
    """Send every entry of `prompts` to DeepSeek with at most 5 calls in flight."""
    return await parallel_deepseek(prompts, max_concurrency=5)

responses = asyncio.run(main())

# Pair each reply with the question id, prompt and expected output at the same index.
records = []
for idx, llm_output in enumerate(responses):
    records.append({
        "question_id": str(q_ids[idx]),
        "prompt": prompts[idx],
        "llm_output": llm_output,
        "expected_output": expected_outputs[idx],
    })

with open("llm_reasoning_deepseek_livecodebench_regular.json", "w", encoding="utf-8") as out_file:
    json.dump(records, out_file, indent=4, ensure_ascii=False)


## Edge Inputs

In [None]:
import json

with open("../ExecBench/Livecodebench_edge_inputs.json", "r", encoding="utf-8") as f:
    inputs = json.load(f)

# Rebuild the three index-aligned lists (prompt text, recorded execution output,
# question id) from the edge-input file; this replaces the lists of the previous section.
expected_outputs = []
prompts = []
q_ids = []
for i, sample in enumerate(tqdm(inputs)):
    # Prompt = the entry's code, a blank line, then the fixed instruction text.
    prompt = f"""{sample['code']}

run the code above and infer the output from the code and provided input, including a detailed, step-by-step explanation of the thinking process and each step of the data flow and control flow that led to your conclusion. print the final output in the last line in the following format: output=answer"""
    prompts.append(prompt)
    expected_outputs.append(sample['execution_output'])
    q_ids.append(sample['q_id'])

### Gemini

In [None]:
import json 

async def main():
    """Send every entry of `prompts` to Gemini with at most 5 calls in flight."""
    return await parallel_gemini(prompts, max_concurrency=5)

responses = asyncio.run(main())

# Pair each reply with the question id, prompt and expected output at the same index.
records = []
for idx, llm_output in enumerate(responses):
    records.append({
        "question_id": str(q_ids[idx]),
        "prompt": prompts[idx],
        "llm_output": llm_output,
        "expected_output": expected_outputs[idx],
    })

with open("llm_reasoning_gemini_livecodebench_edge.json", "w", encoding="utf-8") as out_file:
    json.dump(records, out_file, indent=4, ensure_ascii=False)


### GPT

In [None]:
async def main():
    """Send every entry of `prompts` to OpenAI with at most 5 calls in flight."""
    return await parallel_openai(prompts, max_concurrency=5)

responses = asyncio.run(main())

# Pair each reply with the question id, prompt and expected output at the same index.
records = []
for idx, llm_output in enumerate(responses):
    records.append({
        "question_id": str(q_ids[idx]),
        "prompt": prompts[idx],
        "llm_output": llm_output,
        "expected_output": expected_outputs[idx],
    })

with open("llm_reasoning_gpt_livecodebench_edge.json", "w", encoding="utf-8") as out_file:
    json.dump(records, out_file, indent=4, ensure_ascii=False)


### Claude

In [None]:
async def main():
    """Send every entry of `prompts` to Claude with at most 5 calls in flight."""
    return await parallel_claude(prompts, max_concurrency=5)

responses = asyncio.run(main())

# Pair each reply with the question id, prompt and expected output at the same index.
records = []
for idx, llm_output in enumerate(responses):
    records.append({
        "question_id": str(q_ids[idx]),
        "prompt": prompts[idx],
        "llm_output": llm_output,
        "expected_output": expected_outputs[idx],
    })

with open("llm_reasoning_claude_livecodebench_edge.json", "w", encoding="utf-8") as out_file:
    json.dump(records, out_file, indent=4, ensure_ascii=False)


### DeepSeek

In [None]:
async def main():
    """Send every entry of `prompts` to DeepSeek with at most 5 calls in flight."""
    return await parallel_deepseek(prompts, max_concurrency=5)

responses = asyncio.run(main())

# Pair each reply with the question id, prompt and expected output at the same index.
records = []
for idx, llm_output in enumerate(responses):
    records.append({
        "question_id": str(q_ids[idx]),
        "prompt": prompts[idx],
        "llm_output": llm_output,
        "expected_output": expected_outputs[idx],
    })

with open("llm_reasoning_deepseek_livecodebench_edge.json", "w", encoding="utf-8") as out_file:
    json.dump(records, out_file, indent=4, ensure_ascii=False)


## Invalid Inputs

In [None]:
import json

with open("../ExecBench/Livecodebench_invalid_inputs.json", "r", encoding="utf-8") as f:
    inputs = json.load(f)

# Only prompts and question ids are rebuilt here; this cell does not read an
# 'execution_output' field and leaves `expected_outputs` untouched.
prompts = []
q_ids = []
for i, sample in enumerate(tqdm(inputs)):
    # Prompt = the entry's code, a blank line, then the fixed instruction text.
    prompt = f"""{sample['code']}

run the code above and infer the output from the code and provided input, including a detailed, step-by-step explanation of the thinking process and each step of the data flow and control flow that led to your conclusion. print the final output in the last line in the following format: output=answer"""
    prompts.append(prompt)
    q_ids.append(sample['q_id'])

### Gemini

In [None]:
import json 

async def main():
    """Send every entry of `prompts` to Gemini with at most 5 calls in flight."""
    return await parallel_gemini(prompts, max_concurrency=5)

responses = asyncio.run(main())

# Pair each reply with its question id and prompt; the expected output of every
# record in this section is the literal string "Error".
records = []
for idx, llm_output in enumerate(responses):
    records.append({
        "question_id": str(q_ids[idx]),
        "prompt": prompts[idx],
        "llm_output": llm_output,
        "expected_output": "Error",
    })

with open("llm_reasoning_gemini_livecodebench_invalid.json", "w", encoding="utf-8") as out_file:
    json.dump(records, out_file, indent=4, ensure_ascii=False)


### GPT

In [None]:
async def main():
    """Send every entry of `prompts` to OpenAI with at most 5 calls in flight."""
    return await parallel_openai(prompts, max_concurrency=5)

responses = asyncio.run(main())

# Pair each reply with its question id and prompt; the expected output of every
# record in this section is the literal string "Error".
records = []
for idx, llm_output in enumerate(responses):
    records.append({
        "question_id": str(q_ids[idx]),
        "prompt": prompts[idx],
        "llm_output": llm_output,
        "expected_output": "Error",
    })

with open("llm_reasoning_gpt_livecodebench_invalid.json", "w", encoding="utf-8") as out_file:
    json.dump(records, out_file, indent=4, ensure_ascii=False)


### Claude

In [None]:
async def main():
    """Send every entry of `prompts` to Claude with at most 5 calls in flight."""
    return await parallel_claude(prompts, max_concurrency=5)

responses = asyncio.run(main())

# Pair each reply with its question id and prompt; the expected output of every
# record in this section is the literal string "Error".
records = []
for idx, llm_output in enumerate(responses):
    records.append({
        "question_id": str(q_ids[idx]),
        "prompt": prompts[idx],
        "llm_output": llm_output,
        "expected_output": "Error",
    })

with open("llm_reasoning_claude_livecodebench_invalid.json", "w", encoding="utf-8") as out_file:
    json.dump(records, out_file, indent=4, ensure_ascii=False)


### DeepSeek

In [None]:
async def main():
    """Send every entry of `prompts` to DeepSeek with at most 5 calls in flight."""
    return await parallel_deepseek(prompts, max_concurrency=5)

responses = asyncio.run(main())

# Pair each reply with its question id and prompt; the expected output of every
# record in this section is the literal string "Error".
records = []
for idx, llm_output in enumerate(responses):
    records.append({
        "question_id": str(q_ids[idx]),
        "prompt": prompts[idx],
        "llm_output": llm_output,
        "expected_output": "Error",
    })

with open("llm_reasoning_deepseek_livecodebench_invalid.json", "w", encoding="utf-8") as out_file:
    json.dump(records, out_file, indent=4, ensure_ascii=False)
