# Model evaluations

In [1]:
# Setup
import pandas as pd
import numpy as np
import sys, os
# Add the parent directory to the import path so the `utils` package imported
# below can be resolved (presumably it lives one level above this notebook).
sys.path.append('..')

from utils.eval_utils import evaluate_experiment
from utils.model_utils import async_converse_llm
from utils.data_utils import process_data_table

# Reproducibility
# Seed NumPy's global RNG, then draw one value and compare it with a recorded
# constant as a sanity check that the seeded stream is the expected one.
# NOTE: this draw advances the RNG state, so it also influences which prompts
# are sampled below; removing or reordering it changes the sample.
np.random.seed(1996)
assert np.random.random() == 0.6193051167657804

# Read, process, sample
raw_data = pd.read_json('../data/train.json')
all_prompts = process_data_table(raw_data)
# np.random.choice samples WITH replacement by default, so the 100 sampled
# prompts may contain duplicates.
sample_prompts = np.random.choice(all_prompts, 100)

# Async OpenAI-compatible client pointed at the DeepSeek endpoint. The API key
# is read from the DEEPSEEK_API environment variable (None if it is unset).
from openai import AsyncOpenAI
async_client = AsyncOpenAI(
    api_key=os.environ.get('DEEPSEEK_API'),
    base_url="https://api.deepseek.com",
)

# Used by the experiment cells below to time and schedule the LLM calls.
from time import time
import asyncio

## Experiment 1: Prompt 1

> Asks for evaluated maths

1. Uses `deepseek-chat`
2. Uses full context
3. Uses straight question
4. Does not use example fewshots
5. Uses SYSTEM_PROMPT_V1

In [3]:
# Print the system prompt used by Experiment 1 so it is recorded next to the results.
from utils.prompts import SYSTEM_PROMPT_V1

print(SYSTEM_PROMPT_V1)


    Act as a financial analysis specialist. Your responses must:
    1. Strictly use only the contextual information provided by the user
    2. Deliver the final answer in this exact format:
        - Unitless numerical value
        - Enclosed in \boxed{} LaTeX formatting

    Never reference external knowledge or assumptions. 
    Convert all scaled values to absolute numbers during calculations,
    but omit units in the final answer.
    


In [4]:
if 'exp1.json' not in os.listdir('../results/'):

    start_time = time()

    tasks = []
    for prompt in sample_prompts:
        res = asyncio.create_task(
            async_converse_llm(
                processed_data_entry=prompt,
                client=async_client,
                model_name='deepseek-chat',

                use_short_context=False,
                question_to_use='question',
                answers_to_use='answer',
                example_shots='',
                sys_prompt=SYSTEM_PROMPT_V1,
                use_structured_outputs=False,
                )
        )
        tasks.append(res)

    results_exp1 = await asyncio.gather(*tasks)
    
    total_time = time() - start_time
    minutes = int(total_time // 60)
    seconds = total_time % 60
    print(f"\nExecution time: {minutes} minutes and {seconds:.0f} seconds")

    pd.Series(results_exp1).to_json('../results/exp1.json')

if 'exp1.json' in os.listdir('../results/'):
    metrics_exp1 = evaluate_experiment('../results/exp1.json')

Metrics:
            % correct: 0.451
            % correct (percentage fix): 0.451
        


## Experiment 2: Prompt 2

> Asks for evaluated maths + reasoning

1. Uses `deepseek-chat`
2. Uses full context
3. Uses straight question
4. Does not use example fewshots
5. Uses SYSTEM_PROMPT_V2

In [5]:
# Print the system prompt used by Experiment 2 so it is recorded next to the results.
from utils.prompts import SYSTEM_PROMPT_V2

print(SYSTEM_PROMPT_V2)


    Act as a financial analysis specialist. Your responses must:
    1. Strictly use only the contextual information provided by the user
    2. Explicitly show your logical reasoning process through sequential step-by-step explanations
    3. Deliver the final answer in this exact format:
        - Unitless numerical value
        - Enclosed in \boxed{} LaTeX formatting

    Never reference external knowledge or assumptions. 
    Convert all scaled values to absolute numbers during calculations, 
    but omit units in the final answer.
    


In [6]:
if 'exp2.json' not in os.listdir('../results/'):

    start_time = time()

    tasks = []
    for prompt in sample_prompts:
        res = asyncio.create_task(
            async_converse_llm(
                processed_data_entry=prompt,
                client=async_client,
                model_name='deepseek-chat',

                use_short_context=False,
                question_to_use='question',
                answers_to_use='answer',
                example_shots='',
                sys_prompt=SYSTEM_PROMPT_V2,
                use_structured_outputs=False,
                )
        )
        tasks.append(res)

    results_exp2 = await asyncio.gather(*tasks)
    
    total_time = time() - start_time
    minutes = int(total_time // 60)
    seconds = total_time % 60
    print(f"\nExecution time: {minutes} minutes and {seconds:.0f} seconds")

    pd.Series(results_exp2).to_json('../results/exp2.json')

if 'exp2.json' in os.listdir('../results/'):
    metrics_exp2 = evaluate_experiment('../results/exp2.json')

Metrics:
            % correct: 0.534
            % correct (percentage fix): 0.548
        


## Experiment 3: Prompt 3
> Uses maths operators + no evaluation

1. Uses `deepseek-chat`
2. Uses full context
3. Uses straight question
4. Does not use example fewshots
5. Uses SYSTEM_PROMPT_V3

In [7]:
# Print the system prompt used by Experiment 3 so it is recorded next to the results.
from utils.prompts import SYSTEM_PROMPT_V3

print(SYSTEM_PROMPT_V3)


    Act as a financial computation engine. Required behavior:
    1. Input Processing:
    - Use ONLY context provided in the query
    - Never incorporate external data or assumptions
    2. Calculation Methodology:
    - Perform and display calculations by using ONLY these Python-style operators:
        - add(a, b) → a + b
        - subtract(a, b) → a - b
        - multiply(a, b) → a * b
        - divide(a, b) → a / b
        - power(a, b) → a^b
    - Each operator must have EXACTLY two arguments
    3. Output Requirements:
    - Final answer must be:
        - A nested combination of allowed operators
        - In unevaluated functional form
        - Expressed as \boxed{operator(...)} LaTeX
    - Include intermediate unit normalization calculations

    Example: For "Revenue per share - Cost per share = (5,000,000 revenue / 2,000,000 shares) - $5"
    Acceptable: \boxed{subtract(divide(5000000, 2000000), 5)}
    Unacceptable: \boxed{2.5 - 5} or \boxed{-2.5}



In [8]:
if 'exp3.json' not in os.listdir('../results/'):

    start_time = time()

    tasks = []
    for prompt in sample_prompts:
        res = asyncio.create_task(
            async_converse_llm(
                processed_data_entry=prompt,
                client=async_client,
                model_name='deepseek-chat',

                use_short_context=False,
                question_to_use='question',
                answers_to_use='answer',
                example_shots='',
                sys_prompt=SYSTEM_PROMPT_V3,
                use_structured_outputs=False,
                )
        )
        tasks.append(res)

    results_exp3 = await asyncio.gather(*tasks)
    
    total_time = time() - start_time
    minutes = int(total_time // 60)
    seconds = total_time % 60
    print(f"\nExecution time: {minutes} minutes and {seconds:.0f} seconds")

    pd.Series(results_exp3).to_json('../results/exp3.json')

if 'exp3.json' in os.listdir('../results/'):
    metrics_exp3 = evaluate_experiment('../results/exp3.json')

Metrics:
            % correct: 0.408
            % correct (percentage fix): 0.465
        


## Experiment 4: Prompt 3

> Uses broken-down questions (step-by-step)

1. Uses `deepseek-chat`
2. Uses full context
4. Does not use example fewshots
5. Uses SYSTEM_PROMPT_V3
6. Uses maths operators + no evaluation
7. Uses step by step questions rather than the straight question

In [9]:
# Print the system prompt used by Experiment 4 so it is recorded next to the results.
from utils.prompts import SYSTEM_PROMPT_V3

print(SYSTEM_PROMPT_V3)


    Act as a financial computation engine. Required behavior:
    1. Input Processing:
    - Use ONLY context provided in the query
    - Never incorporate external data or assumptions
    2. Calculation Methodology:
    - Perform and display calculations by using ONLY these Python-style operators:
        - add(a, b) → a + b
        - subtract(a, b) → a - b
        - multiply(a, b) → a * b
        - divide(a, b) → a / b
        - power(a, b) → a^b
    - Each operator must have EXACTLY two arguments
    3. Output Requirements:
    - Final answer must be:
        - A nested combination of allowed operators
        - In unevaluated functional form
        - Expressed as \boxed{operator(...)} LaTeX
    - Include intermediate unit normalization calculations

    Example: For "Revenue per share - Cost per share = (5,000,000 revenue / 2,000,000 shares) - $5"
    Acceptable: \boxed{subtract(divide(5000000, 2000000), 5)}
    Unacceptable: \boxed{2.5 - 5} or \boxed{-2.5}



In [10]:
if 'exp4.json' not in os.listdir('../results/'):

    start_time = time()

    tasks = []
    for prompt in sample_prompts:
        res = asyncio.create_task(
            async_converse_llm(
                processed_data_entry=prompt,
                client=async_client,
                model_name='deepseek-chat',

                use_short_context=False,
                question_to_use='step_by_step_questions',
                answers_to_use='step_by_step_answers',
                example_shots='',
                sys_prompt=SYSTEM_PROMPT_V3,
                use_structured_outputs=False,
                )
        )
        tasks.append(res)

    results_exp4 = await asyncio.gather(*tasks)
    
    total_time = time() - start_time
    minutes = int(total_time // 60)
    seconds = total_time % 60
    print(f"\nExecution time: {minutes} minutes and {seconds:.0f} seconds")

    pd.Series(results_exp4).to_json('../results/exp4.json')

if 'exp4.json' in os.listdir('../results/'):
    metrics_exp4 = evaluate_experiment('../results/exp4.json')


Execution time: 42 minutes and 3 seconds
Metrics:
            % correct: 0.56
            % correct (percentage fix): 0.625
        


## Experiment 5: Prompt 4

> Uses structured outputs and broken-down questions (step-by-step)

1. Uses `deepseek-chat`
2. Uses full context
4. Does not use example fewshots
5. Uses SYSTEM_PROMPT_V4
6. Uses maths operators + no evaluation
7. Uses step by step questions rather than the straight question
8. Uses structured output

In [11]:
# Print the system prompt used by Experiment 5 so it is recorded next to the results.
from utils.prompts import SYSTEM_PROMPT_V4

print(SYSTEM_PROMPT_V4)


    Act as a financial computation engine that outputs valid JSON. Required behavior:
    1. Input Processing:
    - Use ONLY context provided in the query
    - Never incorporate external data or assumptions
    2. Calculation Methodology:
    - Perform and display calculations by using ONLY these Python-style operators:
        - add(a, b) → a + b
        - subtract(a, b) → a - b
        - multiply(a, b) → a * b
        - divide(a, b) → a / b
        - power(a, b) → a^b
    - Each operator must have EXACTLY two arguments
    3. JSON Output Requirements:
    - Structure response as valid JSON with this schema:
        {
            "user_question": "string",
            "user_context": "string",
            "reasoning": ["step1", "step2", ..., "stepN"],
            "final_answer": "boxed_expression"
        }
    - Maintain atomic values in JSON (no complex objects)
    - Escape special characters properly
    - final_answer must use: \boxed{operator(...)} format
    
    4. Complian

In [None]:
if 'exp5.json' not in os.listdir('../results/'):

    start_time = time()

    tasks = []
    for prompt in sample_prompts:
        res = asyncio.create_task(
            async_converse_llm(
                processed_data_entry=prompt,
                client=async_client,
                model_name='deepseek-chat',

                use_short_context=False,
                question_to_use='step_by_step_questions',
                answers_to_use='step_by_step_answers',
                example_shots='', # sys_prompt already has one-shot example
                sys_prompt=SYSTEM_PROMPT_V4,
                use_structured_outputs=True
                )
        )
        tasks.append(res)

    results_exp5 = await asyncio.gather(*tasks)
    
    total_time = time() - start_time
    minutes = int(total_time // 60)
    seconds = total_time % 60
    print(f"\nExecution time: {minutes} minutes and {seconds:.0f} seconds")

    pd.Series(results_exp5).to_json('../results/exp5.json')

if 'exp5.json' in os.listdir('../results/'):
    metrics_exp5 = evaluate_experiment('../results/exp5.json')


Execution time: 30 minutes and 45 seconds
Metrics:
            % correct: 0.0
            % correct (percentage fix): 0.0
        


## Experiment 6: Prompt 4

> Uses structured outputs and straight questions

1. Uses `deepseek-chat`
2. Uses full context
4. Does not use example fewshots
5. Uses SYSTEM_PROMPT_V4
6. Uses maths operators + no evaluation
7. Uses straight question
8. Uses structured output

In [13]:
# Print the system prompt used by Experiment 6 so it is recorded next to the results.
from utils.prompts import SYSTEM_PROMPT_V4

print(SYSTEM_PROMPT_V4)


    Act as a financial computation engine that outputs valid JSON. Required behavior:
    1. Input Processing:
    - Use ONLY context provided in the query
    - Never incorporate external data or assumptions
    2. Calculation Methodology:
    - Perform and display calculations by using ONLY these Python-style operators:
        - add(a, b) → a + b
        - subtract(a, b) → a - b
        - multiply(a, b) → a * b
        - divide(a, b) → a / b
        - power(a, b) → a^b
    - Each operator must have EXACTLY two arguments
    3. JSON Output Requirements:
    - Structure response as valid JSON with this schema:
        {
            "user_question": "string",
            "user_context": "string",
            "reasoning": ["step1", "step2", ..., "stepN"],
            "final_answer": "boxed_expression"
        }
    - Maintain atomic values in JSON (no complex objects)
    - Escape special characters properly
    - final_answer must use: \boxed{operator(...)} format
    
    4. Complian

In [None]:
if 'exp6.json' not in os.listdir('../results/'):

    start_time = time()

    tasks = []
    for prompt in sample_prompts:
        res = asyncio.create_task(
            async_converse_llm(
                processed_data_entry=prompt,
                client=async_client,
                model_name='deepseek-chat',

                use_short_context=False,
                question_to_use='question',
                answers_to_use='answer',
                example_shots='', # sys_prompt already has one-shot example
                sys_prompt=SYSTEM_PROMPT_V4,
                use_structured_outputs=True
                )
        )
        tasks.append(res)

    results_exp6 = await asyncio.gather(*tasks)
    
    total_time = time() - start_time
    minutes = int(total_time // 60)
    seconds = total_time % 60
    print(f"\nExecution time: {minutes} minutes and {seconds:.0f} seconds")

    pd.Series(results_exp6).to_json('../results/exp6.json')

if 'exp6.json' in os.listdir('../results/'):
    metrics_exp6 = evaluate_experiment('../results/exp6.json')


Execution time: 1 minutes and 20 seconds
Metrics:
            % correct: 0.0
            % correct (percentage fix): 0.0
        


## Experiment 7: Prompt 3

> Uses `deepseek-reasoner`, WITHOUT structured outputs, straight questions

**`deepseek-reasoner` does not support structured outputs as of now**

See: https://github.com/deepseek-ai/DeepSeek-R1/issues/9

1. Uses full context
2. Does not use example fewshots
4. Uses SYSTEM_PROMPT_V3
5. Uses maths operators + no evaluation
6. Uses straight question
7. Does not use structured output

In [15]:
# Print the system prompt used by Experiment 7 so it is recorded next to the results.
from utils.prompts import SYSTEM_PROMPT_V3

print(SYSTEM_PROMPT_V3)


    Act as a financial computation engine. Required behavior:
    1. Input Processing:
    - Use ONLY context provided in the query
    - Never incorporate external data or assumptions
    2. Calculation Methodology:
    - Perform and display calculations by using ONLY these Python-style operators:
        - add(a, b) → a + b
        - subtract(a, b) → a - b
        - multiply(a, b) → a * b
        - divide(a, b) → a / b
        - power(a, b) → a^b
    - Each operator must have EXACTLY two arguments
    3. Output Requirements:
    - Final answer must be:
        - A nested combination of allowed operators
        - In unevaluated functional form
        - Expressed as \boxed{operator(...)} LaTeX
    - Include intermediate unit normalization calculations

    Example: For "Revenue per share - Cost per share = (5,000,000 revenue / 2,000,000 shares) - $5"
    Acceptable: \boxed{subtract(divide(5000000, 2000000), 5)}
    Unacceptable: \boxed{2.5 - 5} or \boxed{-2.5}



In [16]:
if 'exp7.json' not in os.listdir('../results/'):

    start_time = time()

    tasks = []
    for prompt in sample_prompts:
        res = asyncio.create_task(
            async_converse_llm(
                processed_data_entry=prompt,
                client=async_client,
                model_name='deepseek-reasoner',

                use_short_context=False,
                question_to_use='question',
                answers_to_use='answer',
                example_shots='',
                sys_prompt=SYSTEM_PROMPT_V3,
                use_structured_outputs=False,
                )
        )
        tasks.append(res)

    results_exp7 = await asyncio.gather(*tasks)
    
    total_time = time() - start_time
    minutes = int(total_time // 60)
    seconds = total_time % 60
    print(f"\nExecution time: {minutes} minutes and {seconds:.0f} seconds")

    pd.Series(results_exp7).to_json('../results/exp7.json')

if 'exp7.json' in os.listdir('../results/'):
    metrics_exp7 = evaluate_experiment('../results/exp7.json')


Execution time: 6 minutes and 31 seconds
Metrics:
            % correct: 0.508
            % correct (percentage fix): 0.548
        


### Experiment 8: Prompt 3

> Uses `deepseek-reasoner`, WITHOUT structured outputs, step by step questions

1. Uses full context
2. Does not use example fewshots
4. Uses SYSTEM_PROMPT_V3
5. Uses maths operators + no evaluation
6. Uses step by step questions rather than the straight question
7. Does not use structured output

In [17]:
# Print the system prompt used by Experiment 8 so it is recorded next to the results.
from utils.prompts import SYSTEM_PROMPT_V3

print(SYSTEM_PROMPT_V3)


    Act as a financial computation engine. Required behavior:
    1. Input Processing:
    - Use ONLY context provided in the query
    - Never incorporate external data or assumptions
    2. Calculation Methodology:
    - Perform and display calculations by using ONLY these Python-style operators:
        - add(a, b) → a + b
        - subtract(a, b) → a - b
        - multiply(a, b) → a * b
        - divide(a, b) → a / b
        - power(a, b) → a^b
    - Each operator must have EXACTLY two arguments
    3. Output Requirements:
    - Final answer must be:
        - A nested combination of allowed operators
        - In unevaluated functional form
        - Expressed as \boxed{operator(...)} LaTeX
    - Include intermediate unit normalization calculations

    Example: For "Revenue per share - Cost per share = (5,000,000 revenue / 2,000,000 shares) - $5"
    Acceptable: \boxed{subtract(divide(5000000, 2000000), 5)}
    Unacceptable: \boxed{2.5 - 5} or \boxed{-2.5}



In [18]:
if 'exp8.json' not in os.listdir('../results/'):

    start_time = time()

    tasks = []
    for prompt in sample_prompts:
        res = asyncio.create_task(
            async_converse_llm(
                processed_data_entry=prompt,
                client=async_client,
                model_name='deepseek-reasoner',

                use_short_context=False,
                question_to_use='step_by_step_questions',
                answers_to_use='step_by_step_answers',
                example_shots='',
                sys_prompt=SYSTEM_PROMPT_V3,
                use_structured_outputs=False
                )
        )
        tasks.append(res)

    results_exp8 = await asyncio.gather(*tasks)
    
    total_time = time() - start_time
    minutes = int(total_time // 60)
    seconds = total_time % 60
    print(f"\nExecution time: {minutes} minutes and {seconds:.0f} seconds")

    pd.Series(results_exp8).to_json('../results/exp8.json')

if 'exp8.json' in os.listdir('../results/'):
    metrics_exp8 = evaluate_experiment('../results/exp8.json')


Execution time: 6 minutes and 39 seconds
Metrics:
            % correct: 0.5
            % correct (percentage fix): 0.599
        


### Experiment 9: Prompt 3

> Uses `deepseek-reasoner`, WITHOUT structured outputs, step by step questions, short context

1. Uses short context
2. Does not use example fewshots
4. Uses SYSTEM_PROMPT_V3
5. Uses maths operators + no evaluation
6. Uses step by step questions rather than the straight question
7. Does not use structured output

In [2]:
# Print the system prompt used by Experiment 9 so it is recorded next to the results.
from utils.prompts import SYSTEM_PROMPT_V3

print(SYSTEM_PROMPT_V3)


    Act as a financial computation engine. Required behavior:
    1. Input Processing:
    - Use ONLY context provided in the query
    - Never incorporate external data or assumptions
    2. Calculation Methodology:
    - Perform and display calculations by using ONLY these Python-style operators:
        - add(a, b) → a + b
        - subtract(a, b) → a - b
        - multiply(a, b) → a * b
        - divide(a, b) → a / b
        - power(a, b) → a^b
    - Each operator must have EXACTLY two arguments
    3. Output Requirements:
    - Final answer must be:
        - A nested combination of allowed operators
        - In unevaluated functional form
        - Expressed as \boxed{operator(...)} LaTeX
    - Include intermediate unit normalization calculations

    Example: For "Revenue per share - Cost per share = (5,000,000 revenue / 2,000,000 shares) - $5"
    Acceptable: \boxed{subtract(divide(5000000, 2000000), 5)}
    Unacceptable: \boxed{2.5 - 5} or \boxed{-2.5}



In [3]:
if 'exp9.json' not in os.listdir('../results/'):

    start_time = time()

    tasks = []
    for prompt in sample_prompts:
        res = asyncio.create_task(
            async_converse_llm(
                processed_data_entry=prompt,
                client=async_client,
                model_name='deepseek-reasoner',

                use_short_context=True,
                question_to_use='step_by_step_questions',
                answers_to_use='step_by_step_answers',
                example_shots='',
                sys_prompt=SYSTEM_PROMPT_V3,
                use_structured_outputs=False
                )
        )
        tasks.append(res)

    results_exp9 = await asyncio.gather(*tasks)
    
    total_time = time() - start_time
    minutes = int(total_time // 60)
    seconds = total_time % 60
    print(f"\nExecution time: {minutes} minutes and {seconds:.0f} seconds")

    pd.Series(results_exp9).to_json('../results/exp9.json')

if 'exp9.json' in os.listdir('../results/'):
    metrics_exp9 = evaluate_experiment('../results/exp9.json')


Execution time: 11 minutes and 14 seconds
Metrics:
            % correct: 0.435
            % correct (percentage fix): 0.511
        


### Experiment 10: Prompt 3

> Uses `deepseek-reasoner`, WITHOUT structured outputs, step by step questions, golden context

1. Uses golden context
2. Does not use example fewshots
4. Uses SYSTEM_PROMPT_V3
5. Uses maths operators + no evaluation
6. Uses step by step questions rather than the straight question
7. Does not use structured output

In [2]:
# Print the system prompt used by Experiment 10 so it is recorded next to the results.
from utils.prompts import SYSTEM_PROMPT_V3

print(SYSTEM_PROMPT_V3)


    Act as a financial computation engine. Required behavior:
    1. Input Processing:
    - Use ONLY context provided in the query
    - Never incorporate external data or assumptions
    2. Calculation Methodology:
    - Perform and display calculations by using ONLY these Python-style operators:
        - add(a, b) → a + b
        - subtract(a, b) → a - b
        - multiply(a, b) → a * b
        - divide(a, b) → a / b
        - power(a, b) → a^b
    - Each operator must have EXACTLY two arguments
    3. Output Requirements:
    - Final answer must be:
        - A nested combination of allowed operators
        - In unevaluated functional form
        - Expressed as \boxed{operator(...)} LaTeX
    - Include intermediate unit normalization calculations

    Example: For "Revenue per share - Cost per share = (5,000,000 revenue / 2,000,000 shares) - $5"
    Acceptable: \boxed{subtract(divide(5000000, 2000000), 5)}
    Unacceptable: \boxed{2.5 - 5} or \boxed{-2.5}



In [3]:
if 'exp10.json' not in os.listdir('../results/'):

    start_time = time()

    tasks = []
    for prompt in sample_prompts:
        res = asyncio.create_task(
            async_converse_llm(
                processed_data_entry=prompt,
                client=async_client,
                model_name='deepseek-reasoner',

                use_gold_inds=True,
                question_to_use='step_by_step_questions',
                answers_to_use='step_by_step_answers',
                example_shots='',
                sys_prompt=SYSTEM_PROMPT_V3,
                use_structured_outputs=False
                )
        )
        tasks.append(res)

    results_exp10 = await asyncio.gather(*tasks)
    
    total_time = time() - start_time
    minutes = int(total_time // 60)
    seconds = total_time % 60
    print(f"\nExecution time: {minutes} minutes and {seconds:.0f} seconds")

    pd.Series(results_exp10).to_json('../results/exp10.json')

if 'exp10.json' in os.listdir('../results/'):
    metrics_exp10 = evaluate_experiment('../results/exp10.json')


Execution time: 9 minutes and 39 seconds
Metrics:
            % correct: 0.514
            % correct (percentage fix): 0.633
        


# Deep dive