In [None]:
# %% Minimal setup
# Installs the dependencies into the notebook kernel's environment; comment this line out once they are installed:
%pip install requests python-dotenv

import os, json, textwrap, re, time
import requests

# Bearer token sent in the Authorization header; read from the environment with a fallback default.
# NOTE(review): the fallback looks like a shared course key — confirm it is acceptable to keep it in the notebook.
API_KEY  = os.getenv("OPENAI_API_KEY", "cse476")
# Base URL of the OpenAI-style API; "/chat/completions" is appended to it when a request is built.
API_BASE = os.getenv("API_BASE", "http://10.4.58.53:41701/v1")  
# Model identifier used as the default `model` argument of call_model_chat_completions.
MODEL    = os.getenv("MODEL_NAME", "bens_model")              


# Goal: chain-of-thought prompting. The main change here is a larger max-token budget, so the model has room to
# actually write out its chain of thought. The end algorithm I have in mind is self-consistency: run several chains
# of thought and take a majority vote over their final answers. This should be the most accurate approach, though it
# may not be very fast.

def call_model_chat_completions(prompt: str,
                                system: str = "You are a helpful assistant. Reply with only the final answer\u2014no explanation.",
                                model: str = MODEL,
                                temperature: float = 0.0,
                                timeout: int = 60,
                                max_tokens: int = 512) -> dict:
    """
    Call an OpenAI-style /v1/chat/completions endpoint and never raise on HTTP/transport failure.

    Args:
        prompt:      Content of the user message.
        system:      Content of the system message.
        model:       Model identifier placed in the request payload.
        temperature: Sampling temperature placed in the request payload.
        timeout:     Timeout (seconds) passed to requests.post.
        max_tokens:  Completion token budget placed in the request payload
                     (defaults to 512, the value that used to be hard-coded).

    Returns:
        { 'ok': bool, 'text': str or None, 'raw': dict or None,
          'status': int, 'error': str or None, 'headers': dict }

        * HTTP 200: ok=True, text is the first choice's message content
          ("" when the response has no choices or the content is missing/null).
        * Other HTTP status: ok=False, error holds the response body (parsed JSON if
          possible, raw text otherwise).
        * Transport failure or an undecodable 200 body: ok=False, status=-1, headers={}.
    """
    url = f"{API_BASE}/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type":  "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user",   "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
        status = resp.status_code
        hdrs   = dict(resp.headers)
        if status == 200:
            data = resp.json()
            # `or` fallbacks cover both a missing key and an explicit empty list / null,
            # so an empty "choices" array no longer raises IndexError.
            choices = data.get("choices") or [{}]
            message = choices[0].get("message") or {}
            text = message.get("content") or ""
            return {"ok": True, "text": text, "raw": data, "status": status, "error": None, "headers": hdrs}

        # Non-200: best-effort attempt to surface the server's error body.
        try:
            err_text = resp.json()
        except Exception:
            err_text = resp.text
        return {"ok": False, "text": None, "raw": None, "status": status, "error": str(err_text), "headers": hdrs}
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decode failures on requests versions whose decode error
        # is not a RequestException subclass.
        return {"ok": False, "text": None, "raw": None, "status": -1, "error": str(e), "headers": {}}


Note: you may need to restart the kernel to use updated packages.


In [None]:
# Chain-of-thought implementation: prompt the model to solve the problem in steps, so that the sequence of sub-problems forms a chain of thought.

def ChainOfThought(question, temperature=0.7):
    """
    Ask the model for a short step-by-step solution to `question`.

    The prompt requests at most 4 short steps followed by a last line of the
    form "Final answer: <integer>".

    Returns the 'text' field of the call_model_chat_completions result, i.e. the
    model's reply, or None when that call reports a failure.
    """
    system_message = "You are an expert contest math solver. Always follow the user's formatting instructions exactly."

    # The question is interpolated verbatim under the "Problem:" heading.
    user_prompt = f"""Solve the problem step by step in plain text.
    Use at most 4 short steps, each on its own line.
    After the steps, on the very last line, write EXACTLY:

    Final answer: <integer>

    Replace <integer> with the final numeric answer and nothing else.

    Problem:
    {question}
    """

    response = call_model_chat_completions(
        prompt=user_prompt,
        system=system_message,
        temperature=temperature,
    )
    return response["text"]


def extract_integer_final(text: str):
    """
    Pull the final integer answer out of a model reply.

    Lookup order:
      1. The integer following the LAST "Final answer:" label (matched
         case-insensitively, so "Final Answer:" also counts).
      2. Otherwise, the last integer that appears anywhere in the text.

    Args:
        text: The model's reply; None or "" is accepted.

    Returns:
        The integer as a string (with a leading "-" if negative), or None when
        `text` is empty/None or contains no integer.
    """
    # A failed model call yields None instead of text; treat it as "no answer"
    # rather than letting re.search raise TypeError.
    if not text:
        return None

    # The prompt asks for the answer on the very last line, so prefer the last labelled match.
    labelled = re.findall(r"final answer:\s*(-?\d+)", text, flags=re.IGNORECASE)
    if labelled:
        return labelled[-1]

    # No labelled answer: fall back to the last number printed anywhere.
    numbers = re.findall(r"-?\d+", text)
    return numbers[-1] if numbers else None




In [28]:
# Self-consistency: run the chain of thought several times, keep a list of all the final answers, and simulate a vote
# by taking the most common answer in the list. If all the answers are different, one option would be to keep running
# until an answer that already exists in the list comes up again, and return that one.

def SelfConsistency(question, attempts=5,temperature=0.7):
    """
    Self-consistency voting: sample several chain-of-thought solutions and
    return the most frequent final answer.

    Args:
        question:    Problem text passed to ChainOfThought.
        attempts:    Maximum number of chain-of-thought samples to draw.
        temperature: Sampling temperature forwarded to ChainOfThought.

    Returns:
        The winning answer as returned by extract_integer_final (a string), or
        None if no attempt produced an answer. Ties go to the answer that was
        seen first.
    """
    from collections import Counter  # local import keeps this cell self-contained

    votes = Counter()
    for _ in range(attempts):
        reply = ChainOfThought(question, temperature=temperature)
        if not reply:
            # Failed request or empty reply: skip this sample instead of aborting the whole vote.
            continue

        answer = extract_integer_final(reply)
        if answer is None:
            continue

        votes[answer] += 1

        # Once an answer holds a strict majority of all attempts, no other answer can
        # catch up, so the remaining model calls cannot change the outcome.
        if votes[answer] * 2 > attempts:
            return answer

    if not votes:
        return None

    # most_common(1) returns the first-encountered answer among equal counts.
    return votes.most_common(1)[0][0]

In [33]:

# Demo: run self-consistency on one contest problem.
# The problem statement is written as adjacent string literals, one sentence per piece.
question = (
    "A tennis player computes her win ratio by dividing the number of matches she has won by the total number of matches she has played. "
    "At the start of a weekend, her win ratio is exactly $0.500$ . "
    "During the weekend, she plays four matches, winning three and losing one. "
    "At the end of the weekend, her win ratio is greater than $.503$ . "
    "What's the largest number of matches she could've won before the weekend began?"
)

# Seven samples at temperature 0.7; `text` holds the majority-vote answer (or None).
text = SelfConsistency(
    question,
    attempts=7,
    temperature=0.7,
)
print("Self Consistency Answer: ", text)

Self Consistency Answer:  164


In [None]:
# Smoke test for the packaged solver: reload the module first so that edits made to
# finalProject_NilayKumar are picked up by the running kernel.
import importlib
import finalProject_NilayKumar

importlib.reload(finalProject_NilayKumar)
from finalProject_NilayKumar import solveQuestion

# Each question is a dict with the problem text under the "input" key.
q_math_easy = {"input": "What is 17 + 28?"}
q_math_hard = {"input": "What is the product of the real roots of the equation x^2 + 18x + 30 = 2 sqrt(x^2 + 18x + 45)?"}

# Print one labelled line per question, easy first.
for label, sample in (("EASY:", q_math_easy), ("HARD:", q_math_hard)):
    print(label, solveQuestion(sample))

TEST1
TEST2
TEST3
TEST
EASY: 45
TEST1
TEST2
TEST3
TEST
HARD: 20
