In [3]:
import random
import json
import asyncio

from datasets import load_dataset
from tqdm.asyncio import tqdm_asyncio
import pandas as pd

from predictors import *
from llm import *
from prompts import *
from tools import *

In [2]:
dataset = load_dataset("gsm8k", "main")

In [3]:
sum(len(dataset[ds_name]) for ds_name in dataset)

8792

In [4]:
df = pd.read_csv('./data/dataset_mixture_train.csv')
df

Unnamed: 0,question,dataset,gpt-4.1-nano,gpt-4.1-mini,gpt-4.1
0,Tyler had 15 dogs . Each dog had 5 puppies . H...,train,True,True,True
1,Steven wants to split a collection of cards in...,train,True,True,True
2,Katie had 85 files on her computer . She delet...,train,True,True,True
3,Luna's monthly food budget is equal to 60% of ...,train,False,False,False
4,James buys 3 dirt bikes for $150 each and 4 of...,train,False,False,False
...,...,...,...,...,...
3041,Mike had 34 peaches at his roadside fruit dish...,valid,True,True,True
3042,"Of 96 oranges, half were ripe. If 1/4 of the r...",valid,False,False,False
3043,Tim had 50 cents . He paid 45 cents for a cand...,valid,True,True,True
3044,On Sunday Alice bought 4 pints of strawberry i...,valid,True,False,False


In [5]:
qs=list(set(df['question']))
qs

['The cafeteria had 86 apples . For lunch they handed out 30 to students and decided to use the rest to make pies . If each pie takes 8 apples , how many pies could they make ?',
 'The price of a home is $98 per square foot (sq ft).  The house is 2,400 sq ft and the barn out back is 1,000 sq ft.  How much is this property?',
 'Paige was unboxing some of her old winter clothes . She found 6 boxes of clothing and inside each box there were 5 scarves and 5 mittens . How many pieces of winter clothing did Paige have total ?',
 'Holly needs to take 2 insulin pills per day, 3 blood pressure pills per day, and twice as many anticonvulsants as blood pressure pills each day. How many pills does Holly take in a week?',
 'Minnie is making a playlist of songs for a party she will be throwing. She wants the playlist to be an hour long. She has added 16 three-minute songs to the playlist. How many four-minute songs does she have to add to make sure the playlist is an hour long?',
 'Maria bought 2 ne

In [4]:
qs=list(set(item['question'] for ds_name in dataset for item in dataset[ds_name]))
len(qs)

8792

In [6]:
# Calculate OpenAI embeddings for each question
async def get_embedding(text, semaphore):
    async with semaphore:
        response = await openai_client.embeddings.create(
            input=text,
            model="text-embedding-3-large"
        )
        return text,response.data[0].embedding

async def calculate_and_store_embeddings(unique_questions, max_concurrent_requests):
    embeddings_dict = {}
    
    # Calculate embeddings for each unique question
    semaphore = asyncio.Semaphore(max_concurrent_requests)
    tasks = [get_embedding(question, semaphore) for question in unique_questions]
    for result in tqdm_asyncio.as_completed(tasks):
        question,emb = await result
        embeddings_dict[question] = emb
    
    print(f"Embeddings calculated and saved for {len(unique_questions)} questions")
    
    return embeddings_dict

In [7]:
embeddings = await calculate_and_store_embeddings(qs, max_concurrent_requests=10)

100%|██████████████████████████████████████| 3046/3046 [03:10<00:00, 16.03it/s]

Embeddings calculated and saved for 3046 questions





In [7]:
len(embeddings)

8792

In [None]:
list(embeddings.items())[0]

In [8]:
import pickle

with open('./models/embeddings2.pkl', 'wb') as f:
    pickle.dump(embeddings, f)

In [None]:
import pickle

with open('./models/embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)