In [4]:
import itertools
from fractions import Fraction
import random
from itertools import product
import json
import time
from openai import AsyncOpenAI
from openai import (
    APIConnectionError,
    APIError,
    BadRequestError,
    RateLimitError,
    Timeout,
)
import asyncio
import os
from tqdm import tqdm
import logging
from typing import List, Dict, Any

In [5]:


# ------------------- PROMPT TEMPLATES -------------------

def template_sum_condition(num_dice, sides, condition, target):
    return f"What is the probability that the sum of {num_dice} {sides}-sided dice is {condition} {target}?"

def template_at_least_one_value(num_dice, sides, target_value):
    return f"What is the probability that at least one die shows a {target_value} when rolling {num_dice} {sides}-sided dice?"

def template_all_different(num_dice, sides):
    return f"What is the probability that all {num_dice} dice show different values when rolling {sides}-sided dice?"

def template_at_least_two_same(num_dice, sides):
    return f"What is the probability that at least two dice show the same number when rolling {num_dice} {sides}-sided dice?"

def template_sum_in_range(num_dice, sides, min_sum, max_sum):
    return f"What is the probability that the sum is between {min_sum} and {max_sum}, inclusive, when rolling {num_dice} {sides}-sided dice?"

# ------------------- SOLVERS -------------------

def solve_sum_condition(num_dice, sides, condition, target):
    """
    Solve the probability that the sum of dice satisfies a condition (e.g., '>= 20')
    """
    total_outcomes = sides ** num_dice
    favorable_outcomes = 0

    # For small dice counts, we can enumerate all possibilities
    if num_dice <= 8:  # Adjust based on memory constraints
        for roll in product(range(1, sides + 1), repeat=num_dice):
            total = sum(roll)
            if condition == '=' and total == target:
                favorable_outcomes += 1
            elif condition == '>' and total > target:
                favorable_outcomes += 1
            elif condition == '<' and total < target:
                favorable_outcomes += 1
            elif condition == '>=' and total >= target:
                favorable_outcomes += 1
            elif condition == '<=' and total <= target:
                favorable_outcomes += 1
    else:
        # For larger dice counts, use sampling
        samples = 100000  # Adjust based on desired accuracy
        for _ in range(samples):
            roll = [random.randint(1, sides) for _ in range(num_dice)]
            total = sum(roll)
            if condition == '=' and total == target:
                favorable_outcomes += 1
            elif condition == '>' and total > target:
                favorable_outcomes += 1
            elif condition == '<' and total < target:
                favorable_outcomes += 1
            elif condition == '>=' and total >= target:
                favorable_outcomes += 1
            elif condition == '<=' and total <= target:
                favorable_outcomes += 1
        return Fraction(favorable_outcomes, samples)

    return Fraction(favorable_outcomes, total_outcomes)

def solve_at_least_one_value(num_dice, sides, target_value):
    """
    Calculate probability that at least one die shows a specific value
    """
    # Probability of NOT getting the target value on a single die
    p_not_target = (sides - 1) / sides
    # Probability of NOT getting the target value on any of the dice
    p_none = p_not_target ** num_dice
    # Probability of getting at least one target value
    p_at_least_one = 1 - p_none
    return Fraction(int(p_at_least_one * (sides ** num_dice)), sides ** num_dice)

def solve_all_different(num_dice, sides):
    """
    Calculate probability that all dice show different values
    """
    if num_dice > sides:
        return Fraction(0, 1)  # Impossible to have all different values
    
    # Number of ways to select num_dice different values from sides values
    favorable = 1
    for i in range(num_dice):
        favorable *= (sides - i)
    
    # Total number of possible outcomes
    total = sides ** num_dice
    
    return Fraction(favorable, total)

def solve_at_least_two_same(num_dice, sides):
    """
    Calculate probability that at least two dice show the same value
    """
    return Fraction(1) - solve_all_different(num_dice, sides)

def solve_sum_in_range(num_dice, sides, min_sum, max_sum):
    """
    Calculate probability that the sum of dice is within a range
    """
    # For small dice counts, we can enumerate all possibilities
    if num_dice <= 8:  # Adjust based on memory constraints
        all_rolls = itertools.product(range(1, sides + 1), repeat=num_dice)
        total, favorable = 0, 0
        for roll in all_rolls:
            s = sum(roll)
            if min_sum <= s <= max_sum:
                favorable += 1
            total += 1
        return Fraction(favorable, total)
    else:
        # For larger dice counts, use sampling
        samples = 100000  # Adjust based on desired accuracy
        favorable = 0
        for _ in range(samples):
            roll = [random.randint(1, sides) for _ in range(num_dice)]
            s = sum(roll)
            if min_sum <= s <= max_sum:
                favorable += 1
        return Fraction(favorable, samples)

# ------------------- GENERATE PROBLEMS -------------------

In [6]:



def is_answer_correct(model_answer, correct_answer):
    """
    Check if the model's answer matches the correct answer by comparing numerical values.
    Handles fractions, decimals, percentages, and LaTeX boxed expressions.
    """
    import re
    from fractions import Fraction
    
    # Convert correct_answer to a Fraction object
    try:
        if '/' in correct_answer:
            num, denom = correct_answer.split('/')
            correct_fraction = Fraction(int(num), int(denom))
        else:
            correct_fraction = Fraction(correct_answer)
        correct_float = float(correct_fraction)
    except:
        return False  # Can't parse the correct answer
    
    # Simple exact match check
    if correct_answer in model_answer:
        return True
    
    # Check for boxed LaTeX expressions first (common in formatted answers)
    boxed_pattern = r'\\boxed\{(?:\\d?frac\{(\d+)\}\{(\d+)\}|(\d+)/(\d+))\}'
    boxed_matches = re.findall(boxed_pattern, model_answer)
    
    for match in boxed_matches:
        # Process non-empty groups in the match
        for i in range(0, len(match), 2):
            if i+1 < len(match) and match[i] and match[i+1]:
                try:
                    model_fraction = Fraction(int(match[i]), int(match[i+1]))
                    if model_fraction == correct_fraction or abs(float(model_fraction) - correct_float) < 0.0001:
                        return True
                except:
                    continue
    
    # Standard fraction patterns
    fraction_pattern = r'(?:(?:the answer|probability|result|final answer) is|probability of|equals|=)\s*(\d+)/(\d+)|(\d+)\s*/\s*(\d+)|\\frac\{(\d+)\}\{(\d+)\}'
    matches = re.findall(fraction_pattern, model_answer)
    
    for match in matches:
        # Process non-empty groups in the match
        for i in range(0, len(match), 2):
            if i+1 < len(match) and match[i] and match[i+1]:
                try:
                    model_fraction = Fraction(int(match[i]), int(match[i+1]))
                    if model_fraction == correct_fraction or abs(float(model_fraction) - correct_float) < 0.0001:
                        return True
                except:
                    continue
    
    # Check for decimals
    for decimal_match in re.findall(r'(\d+\.\d+)', model_answer):
        try:
            if abs(float(decimal_match) - correct_float) < 0.001:
                return True
        except:
            continue
    
    # Check for percentages
    for percent_match in re.findall(r'(\d+(?:\.\d+)?)%', model_answer):
        try:
            if abs(float(percent_match) / 100 - correct_float) < 0.001:
                return True
        except:
            continue
    
    # Special cases for 1 and 0
    if (correct_float == 1.0 and any(phrase in model_answer.lower() for phrase in 
                                    ["probability is 1", "probability of 1", "= 1"])):
        return True
    
    if (correct_float == 0.0 and any(phrase in model_answer.lower() for phrase in 
                                    ["probability is 0", "probability of 0", "= 0"])):
        return True
    
    return False


In [None]:
openai_client = AsyncOpenAI(base_url="https://aldito1--vllm-app-serve.modal.run", api_key="super-secret-key")

In [7]:
data = json.load(open("probability_dataset.json"))



In [8]:
prompts = [data[i]["prompt"] for i in range(len(data))]

In [37]:
import asyncio
import time
from typing import List, Dict, Any, Optional
from openai import AsyncOpenAI
import os

class AsyncOpenAIClient:
    """
    A client for making asynchronous parallel requests to OpenAI API.
    """
    
    def __init__(self, api_key: Optional[str] = None, model: str = "gpt-4o", max_concurrency: int = 32):
        """
        Initialize the async OpenAI client.
        
        Args:
            api_key: OpenAI API key. If None, it will be read from OPENAI_API_KEY environment variable.
            model: The OpenAI model to use for completions.
            max_concurrency: Maximum number of concurrent requests.
        """
        
        self.client = AsyncOpenAI(base_url="https://mihirathale98--vllm-app-serve.modal.run/v1", api_key="super-secret-key")
        self.model = "Qwen/Qwen2.5-1.5B-Instruct"
        self.semaphore = asyncio.Semaphore(max_concurrency)
        
    async def get_completion(self, prompt: str, temperature: float = 0.7, max_tokens: int = 1000) -> Dict[str, Any]:
        """
        Get a completion for a single prompt.
        
        Args:
            prompt: The prompt text to send to the API.
            temperature: Controls randomness (0.0 to 1.0).
            max_tokens: Maximum number of tokens to generate.
            
        Returns:
            The API response as a dictionary.
        """
        async with self.semaphore:
            try:
                response = await self.client.chat.completions.create(
                    model=self.model,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=temperature,
                    max_tokens=max_tokens
                )
                return {
                    "prompt": prompt,
                    "completion": response.choices[0].message.content,
                    "success": True,
                    "error": None
                }
            except Exception as e:
                return {
                    "prompt": prompt,
                    "completion": None,
                    "success": False,
                    "error": str(e)
                }
    
    async def process_prompts_batch(self, prompts: List[str], temperature: float = 0.7, max_tokens: int = 1000) -> List[Dict[str, Any]]:
        """
        Process a batch of prompts in parallel.
        
        Args:
            prompts: List of prompts to send to the API.
            temperature: Controls randomness (0.0 to 1.0).
            max_tokens: Maximum number of tokens to generate.
            
        Returns:
            List of responses, one for each prompt.
        """
        tasks = []
        for prompt in prompts:
            tasks.append(self.get_completion(prompt, temperature, max_tokens))
        
        return await asyncio.gather(*tasks)

# Jupyter Notebook Usage Example

# Import the AsyncOpenAIClient class defined above
# Note: In a real notebook, you would either define the class in a previous cell
# or import it from a separate .py file
client = AsyncOpenAIClient()


# Define an async function to process the prompts
async def process_prompts(prompts):
    start_time = time.time()
    results = await client.process_prompts_batch(prompts)
    elapsed_time = time.time() - start_time
    print(f"Processed {len(prompts)} prompts in {elapsed_time:.2f} seconds")
    return results

# In Jupyter, you need to use asyncio.run_until_complete() with the event loop
# This is how you execute async code in a Jupyter cell
import nest_asyncio
nest_asyncio.apply()  # This allows running asyncio in Jupyter notebooks

# Run the async function and get the results



In [21]:

prompts = [data[i]["prompt"] for i in range(len(data))]

# results = asyncio.get_event_loop().run_until_complete(process_prompts(prompts))

In [22]:
prompts

['What is the probability that the sum is between 1 and 4, inclusive, when rolling 1 8-sided dice?',
 'What is the probability that the sum is between 14 and 22, inclusive, when rolling 5 6-sided dice?',
 'What is the probability that at least one die shows a 2 when rolling 3 12-sided dice?',
 'What is the probability that the sum of 5 10-sided dice is = 31?',
 'What is the probability that the sum is between 16 and 19, inclusive, when rolling 3 8-sided dice?',
 'What is the probability that the sum is between 10 and 15, inclusive, when rolling 2 8-sided dice?',
 'What is the probability that the sum of 3 8-sided dice is <= 9?',
 'What is the probability that the sum of 6 12-sided dice is <= 30?',
 'What is the probability that the sum of 5 20-sided dice is < 35?',
 'What is the probability that at least one die shows a 9 when rolling 2 12-sided dice?',
 'What is the probability that at least two dice show the same number when rolling 2 20-sided dice?',
 'What is the probability that t

In [23]:
results_5 = asyncio.get_event_loop().run_until_complete(process_prompts(prompts))

Processed 237 prompts in 85.09 seconds


In [54]:
results

[{'prompt': 'What is the probability that the sum is between 1 and 4, inclusive, when rolling 1 8-sided dice?',
  'completion': 'To calculate the probability that the sum is between 1 and 4 inclusive when rolling an 8-sided die, we first need to understand the range of possible sums and then count the number of outcomes that fall within the desired range.\n\n### Step 1: Determine the Range of Possible Sums\n- The smallest possible sum is 1, which occurs if we roll a 1 on both dice.\n- The largest possible sum is 8, which occurs if we roll an 8 on both dice.\n- Therefore, the possible sums range from 1 to 8, inclusive.\n\n### Step 2: Count the Total Number of Outcomes\nAn 8-sided die has 8 faces, so when rolling two such dice, the total number of possible outcomes is:\n\\[ 8 \\times 8 = 64 \\]\n\n### Step 3: Count the Favorable Outcomes\nWe need to count the number of outcomes where the sum is between 1 and 4, inclusive.\n\n1. Sum = 1: (1, 1)\n   - This is 1 outcome.\n\n2. Sum = 2: (1, 

In [24]:
x = 0
for og, gen in zip(data, results):
    correct = is_answer_correct(gen["completion"], og["correct_answer"])
    if correct:
        x += 1

In [25]:
x/len(data)

0.36416184971098264

In [31]:
x = 0
for og, gen in zip(data, results_2):
    correct = is_answer_correct(gen["completion"], og["correct_answer"])
    if correct:
        x += 1

In [32]:
x/len(data)

0.04046242774566474

In [63]:
x = 0
for og, gen in zip(data, results_3):
    correct = is_answer_correct(gen["completion"], og["correct_answer"])
    if correct:
        x += 1

In [64]:
x/len(data)

0.42196531791907516

In [15]:
x = 0
for og, gen in zip(data, results_4):
    correct = is_answer_correct(gen["completion"], og["correct_answer"])
    if correct:
        x += 1

In [16]:
x/len(data)

0.42616033755274263

In [24]:
x = 0
for og, gen in zip(data, results_5):
    correct = is_answer_correct(gen["completion"], og["correct_answer"])
    if correct:
        x += 1

In [25]:
x/len(data)

0.4388185654008439

In [None]:
client.change_model("mihirathale98/finetuned-dice-2")

In [38]:
out = await client.get_completion("What is the probability that at least two dice show the same number when rolling 3 12-sided dice?", max_tokens=32000)

In [39]:
print(out['completion'])

To determine the probability that at least two dice show the same number when rolling 3 12-sided dice, we can use the complementary probability approach. This involves calculating the probability that all three dice show different numbers and then subtracting this from 1.

### Step 1: Calculate the total number of possible outcomes
Each die has 12 faces, and since the dice are rolled independently, the total number of possible outcomes when rolling 3 dice is:
\[
12 \times 12 \times 12 = 12^3 = 1728
\]

### Step 2: Calculate the number of outcomes where all dice show different numbers
- The first die can show any of the 12 faces.
- The second die can show any of the remaining 11 faces.
- The third die can show any of the remaining 10 faces.

Thus, the number of outcomes where all three dice show different numbers is:
\[
12 \times 11 \times 10 = 1320
\]

### Step 3: Calculate the probability that all three dice show different numbers
The probability \( P(\text{all different}) \) is the n

In [33]:
out

{'prompt': 'What is the probability that the sum of 4 20-sided dice is >= 65?',
 'completion': " I have 4 dice, each can be 1-20.  What's the chance that their sum is >=65?\nI know there are 20^4=16000 possible plays.  I can do this in two ways:\n1) Compute the number of 4-tuples (a,b,c,d) with a,b,c,d each 1-20 and a+b+c+d >=65, then divide by 20^4.\n2) Compute the number of 4-tuples (a,b,c,d) with a,b,c,d each 1-20 and a+b+c+d = 65, then divide by 20^4.\nIt is easier to compute the second, since 65 is close to 4*20=80, so the number of pairs (x,y) with x+y=65 where x and y are between 1 and 20.  The number of pairs is 29, since 65-1=64 and 65-20=45, so 64 to 45 is 20 numbers, then reverse, so 21 to 20 is 14, so total 20+14=34, but wait, 65-17=48, so 48 to 20 is 23, so total 34?  Wait, but 17+48=65.  Hmm.  Alternatively, 65-1=64; the pair is (1,64), but 64 is over 20, so no.  65-2=63, but that's below 1, so no.  65-3=62, same.  65-4=61, same.  65-5=60, same.  65-6=59, now maybe?  5...