# Model evaluations

In [None]:
# Setup
# Standard library
import asyncio
import os
import sys
from time import time

# Third party
import numpy as np
import pandas as pd
from openai import AsyncOpenAI

# `src` is imported relative to the parent directory, so it has to be on
# sys.path before the project imports below.
sys.path.append('..')

from src.eval_utils import evaluate_experiment
from src.model_utils import async_converse_llm
from src.data_utils import process_data_table

# Reproducibility
np.random.seed(1996)
# Draw OUTSIDE the assert: the draw advances the global RNG state, and an
# assert statement is removed entirely when Python runs with -O, which would
# silently change the prompts sampled below.
first_draw = np.random.random()
assert first_draw == 0.6193051167657804, "unexpected NumPy RNG stream for seed 1996"

raw_data = pd.read_json('../data/train.json')
all_prompts = process_data_table(raw_data)
# NOTE(review): np.random.choice samples WITH replacement by default
# (replace=True), so these 100 prompts may contain duplicates. Left unchanged
# because the cached files in ../results/ were produced from this exact sample.
sample_prompts = np.random.choice(all_prompts, 100)

# Async client pointed at the DeepSeek endpoint; the key is read from the
# DEEPSEEK_API environment variable (None is passed through if it is unset).
async_client = AsyncOpenAI(
    api_key=os.environ.get('DEEPSEEK_API'),
    base_url="https://api.deepseek.com",
)

# Example of a data entry

In [2]:
sample_prompts[0]

{'question_type': 'simple',
 'question': 'what is the increase observed in the interest net of capitalized interest during 2016 and 2017?',
 'answer': '9.235%',
 'short_context': 'Context:\n\n[Row 0] : interest net of capitalized interest; 2017: $ 275305; 2016: $ 252030; 2015: $ 222088\n[Row 1] : income taxes net of refunds received; 2017: $ 188946; 2016: $ -39293 ( 39293 ); 2015: $ 41108\n\n',
 'full_context': "Context:\n\nthe diluted earnings per share calculation excludes stock options, sars, restricted stock and units and performance units and stock that were anti-dilutive. shares underlying the excluded stock options and sars totaled 2.6 million, 10.3 million and 10.2 million for the years ended december 31, 2017, 2016 and 2015, respectively. for the year ended december 31, 2016, 4.5 million shares of restricted stock and restricted stock units and performance units and performance stock were excluded. 10. supplemental cash flow information net cash paid for interest and income ta

## Experiment 1: Prompt 1

> Asks for evaluated maths

1. Uses `deepseek-chat`
2. Uses full context
3. Uses straight question
4. Does not use example fewshots
5. Uses SYSTEM_PROMPT_V1

In [None]:
# Load the system prompt passed as `sys_prompt` in Experiment 1 and print its text.
from src.prompts import SYSTEM_PROMPT_V1

print(SYSTEM_PROMPT_V1)


    Act as a financial analysis specialist. Your responses must:
    1. Strictly use only the contextual information provided by the user
    2. Deliver the final answer in this exact format:
        - Unitless numerical value
        - Enclosed in \boxed{} LaTeX formatting

    Never reference external knowledge or assumptions. 
    Convert all scaled values to absolute numbers during calculations,
    but omit units in the final answer.
    


In [4]:
if 'exp1.json' not in os.listdir('../results/'):

    start_time = time()

    tasks = []
    for prompt in sample_prompts:
        res = asyncio.create_task(
            async_converse_llm(
                processed_data_entry=prompt,
                client=async_client,
                model_name='deepseek-chat',

                use_short_context=False,
                question_to_use='question',
                answers_to_use='answer',
                example_shots='',
                sys_prompt=SYSTEM_PROMPT_V1
                )
        )
        tasks.append(res)

    results_exp1 = await asyncio.gather(*tasks)
    
    total_time = time() - start_time
    minutes = int(total_time // 60)
    seconds = total_time % 60
    print(f"\nExecution time: {minutes} minutes and {seconds:.0f} seconds")

    pd.Series(results_exp1).to_json('../results/exp1.json')

if 'exp1.json' in os.listdir('../results/'):
    metrics_exp1 = evaluate_experiment('../results/exp1.json')

Experiment:
            % correct: 0.532
            % correct (percentage fix): 0.54
        


## Experiment 2: Prompt 2

> Asks for evaluated maths + reasoning

1. Uses `deepseek-chat`
2. Uses full context
3. Uses straight question
4. Does not use example fewshots
5. Uses SYSTEM_PROMPT_V2

In [None]:
# Load the system prompt passed as `sys_prompt` in Experiment 2 and print its text.
from src.prompts import SYSTEM_PROMPT_V2

print(SYSTEM_PROMPT_V2)


    Act as a financial analysis specialist. Your responses must:
    1. Strictly use only the contextual information provided by the user
    2. Explicitly show your logical reasoning process through sequential step-by-step explanations
    3. Deliver the final answer in this exact format:
        - Unitless numerical value
        - Enclosed in \boxed{} LaTeX formatting

    Never reference external knowledge or assumptions. 
    Convert all scaled values to absolute numbers during calculations, 
    but omit units in the final answer.
    


In [6]:
if 'exp2.json' not in os.listdir('../results/'):

    start_time = time()

    tasks = []
    for prompt in sample_prompts:
        res = asyncio.create_task(
            async_converse_llm(
                processed_data_entry=prompt,
                client=async_client,
                model_name='deepseek-chat',

                use_short_context=False,
                question_to_use='question',
                answers_to_use='answer',
                example_shots='',
                sys_prompt=SYSTEM_PROMPT_V2
                )
        )
        tasks.append(res)

    results_exp2 = await asyncio.gather(*tasks)
    
    total_time = time() - start_time
    minutes = int(total_time // 60)
    seconds = total_time % 60
    print(f"\nExecution time: {minutes} minutes and {seconds:.0f} seconds")

    pd.Series(results_exp2).to_json('../results/exp2.json')

if 'exp2.json' in os.listdir('../results/'):
    metrics_exp2 = evaluate_experiment('../results/exp2.json')

Experiment:
            % correct: 0.548
            % correct (percentage fix): 0.565
        


## Experiment 3: Prompt 3
> Uses maths operators + no evaluation

1. Uses `deepseek-chat`
2. Uses full context
3. Uses straight question
4. Does not use example fewshots
5. Uses SYSTEM_PROMPT_V3

In [None]:
# Load the system prompt passed as `sys_prompt` in Experiment 3 and print its text.
from src.prompts import SYSTEM_PROMPT_V3

print(SYSTEM_PROMPT_V3)


    Act as a financial computation engine. Required behavior:
    1. Input Processing:
    - Use ONLY context provided in the query
    - Never incorporate external data or assumptions
    2. Calculation Methodology:
    - Perform and display calculations by using ONLY these Python-style operators:
        - add(a, b) → a + b
        - subtract(a, b) → a - b
        - multiply(a, b) → a * b
        - divide(a, b) → a / b
        - power(a, b) → a^b
    - Each operator must have EXACTLY two arguments
    3. Output Requirements:
    - Final answer must be:
        - A nested combination of allowed operators
        - In unevaluated functional form
        - Expressed as \boxed{operator(...)} LaTeX
    - Include intermediate unit normalization calculations

    Example: For "Revenue per share - Cost per share = (5,000,000 revenue / 2,000,000 shares) - $5"
    Acceptable: \boxed{subtract(divide(5000000, 2000000), 5)}
    Unacceptable: \boxed{2.5 - 5} or \boxed{-2.5}



In [8]:
if 'exp3.json' not in os.listdir('../results/'):

    start_time = time()

    tasks = []
    for prompt in sample_prompts:
        res = asyncio.create_task(
            async_converse_llm(
                processed_data_entry=prompt,
                client=async_client,
                model_name='deepseek-chat',

                use_short_context=False,
                question_to_use='question',
                answers_to_use='answer',
                example_shots='',
                sys_prompt=SYSTEM_PROMPT_V3
                )
        )
        tasks.append(res)

    results_exp3 = await asyncio.gather(*tasks)
    
    total_time = time() - start_time
    minutes = int(total_time // 60)
    seconds = total_time % 60
    print(f"\nExecution time: {minutes} minutes and {seconds:.0f} seconds")

    pd.Series(results_exp3).to_json('../results/exp3.json')

if 'exp3.json' in os.listdir('../results/'):
    metrics_exp3 = evaluate_experiment('../results/exp3.json')

Experiment:
            % correct: 0.403
            % correct (percentage fix): 0.5
        


## Experiment 4: Prompt 3

> Uses broken-down questions (step-by-step)

1. Uses `deepseek-chat`
2. Uses full context
4. Does not use example fewshots
5. Uses SYSTEM_PROMPT_V3
6. Uses maths operators + no evaluation
7. Uses step by step questions rather than the straight question

In [None]:
# SYSTEM_PROMPT_V3 is the same name imported for Experiment 3; the import is
# repeated here, so this cell does not depend on that one having been run.
from src.prompts import SYSTEM_PROMPT_V3

print(SYSTEM_PROMPT_V3)


    Act as a financial computation engine. Required behavior:
    1. Input Processing:
    - Use ONLY context provided in the query
    - Never incorporate external data or assumptions
    2. Calculation Methodology:
    - Perform and display calculations by using ONLY these Python-style operators:
        - add(a, b) → a + b
        - subtract(a, b) → a - b
        - multiply(a, b) → a * b
        - divide(a, b) → a / b
        - power(a, b) → a^b
    - Each operator must have EXACTLY two arguments
    3. Output Requirements:
    - Final answer must be:
        - A nested combination of allowed operators
        - In unevaluated functional form
        - Expressed as \boxed{operator(...)} LaTeX
    - Include intermediate unit normalization calculations

    Example: For "Revenue per share - Cost per share = (5,000,000 revenue / 2,000,000 shares) - $5"
    Acceptable: \boxed{subtract(divide(5000000, 2000000), 5)}
    Unacceptable: \boxed{2.5 - 5} or \boxed{-2.5}



In [10]:
if 'exp4.json' not in os.listdir('../results/'):

    start_time = time()

    tasks = []
    for prompt in sample_prompts:
        res = asyncio.create_task(
            async_converse_llm(
                processed_data_entry=prompt,
                client=async_client,
                model_name='deepseek-chat',

                use_short_context=False,
                question_to_use='step_by_step_questions',
                answers_to_use='step_by_step_answers',
                example_shots='',
                sys_prompt=SYSTEM_PROMPT_V3
                )
        )
        tasks.append(res)

    results_exp4 = await asyncio.gather(*tasks)
    
    total_time = time() - start_time
    minutes = int(total_time // 60)
    seconds = total_time % 60
    print(f"\nExecution time: {minutes} minutes and {seconds:.0f} seconds")

    pd.Series(results_exp4).to_json('../results/exp4.json')

if 'exp4.json' in os.listdir('../results/'):
    metrics_exp4 = evaluate_experiment('../results/exp4.json')

Experiment:
            % correct: 0.554
            % correct (percentage fix): 0.554
        


## Experiment 5: Prompt 4

> Uses structured outputs and broken-down questions (step-by-step)

1. Uses `deepseek-chat`
2. Uses full context
4. Does not use example fewshots
5. Uses SYSTEM_PROMPT_V4
6. Uses maths operators + no evaluation
7. Uses step by step questions rather than the straight question
8. Uses structured output

In [None]:
# Load the system prompt passed as `sys_prompt` in Experiment 5 and print its text.
from src.prompts import SYSTEM_PROMPT_V4

print(SYSTEM_PROMPT_V4)


    Act as a financial computation engine that outputs valid JSON. Required behavior:
    1. Input Processing:
    - Use ONLY context provided in the query
    - Never incorporate external data or assumptions
    2. Calculation Methodology:
    - Perform and display calculations by using ONLY these Python-style operators:
        - add(a, b) → a + b
        - subtract(a, b) → a - b
        - multiply(a, b) → a * b
        - divide(a, b) → a / b
        - power(a, b) → a^b
    - Each operator must have EXACTLY two arguments
    3. JSON Output Requirements:
    - Structure response as valid JSON with this schema:
        {
            "user_question": "string",
            "user_context": "string",
            "reasoning": ["step1", "step2", ..., "stepN"],
            "final_answer": "boxed_expression"
        }
    - Maintain atomic values in JSON (no complex objects)
    - Escape special characters properly
    - final_answer must use: \boxed{operator(...)} format
    
    4. Complian

In [12]:
if 'exp5.json' not in os.listdir('../results/'):

    start_time = time()

    tasks = []
    for prompt in sample_prompts:
        res = asyncio.create_task(
            async_converse_llm(
                processed_data_entry=prompt,
                client=async_client,
                model_name='deepseek-chat',

                use_short_context=False,
                question_to_use='step_by_step_questions',
                answers_to_use='step_by_step_answers',
                example_shots='',
                sys_prompt=SYSTEM_PROMPT_V4
                )
        )
        tasks.append(res)

    results_exp5 = await asyncio.gather(*tasks)
    
    total_time = time() - start_time
    minutes = int(total_time // 60)
    seconds = total_time % 60
    print(f"\nExecution time: {minutes} minutes and {seconds:.0f} seconds")

    pd.Series(results_exp5).to_json('../results/exp5.json')

if 'exp5.json' in os.listdir('../results/'):
    metrics_exp5 = evaluate_experiment('../results/exp5.json')

Experiment:
            % correct: 0.48
            % correct (percentage fix): 0.48
        


## Experiment 6: Prompt 4

> Uses structured outputs and straight questions

1. Uses `deepseek-chat`
2. Uses full context
4. Does not use example fewshots
5. Uses SYSTEM_PROMPT_V4
6. Uses maths operators + no evaluation
7. Uses straight question
8. Uses structured output

In [None]:
# SYSTEM_PROMPT_V4 is the same name imported for Experiment 5; the import is
# repeated here, so this cell does not depend on that one having been run.
from src.prompts import SYSTEM_PROMPT_V4

print(SYSTEM_PROMPT_V4)


    Act as a financial computation engine that outputs valid JSON. Required behavior:
    1. Input Processing:
    - Use ONLY context provided in the query
    - Never incorporate external data or assumptions
    2. Calculation Methodology:
    - Perform and display calculations by using ONLY these Python-style operators:
        - add(a, b) → a + b
        - subtract(a, b) → a - b
        - multiply(a, b) → a * b
        - divide(a, b) → a / b
        - power(a, b) → a^b
    - Each operator must have EXACTLY two arguments
    3. JSON Output Requirements:
    - Structure response as valid JSON with this schema:
        {
            "user_question": "string",
            "user_context": "string",
            "reasoning": ["step1", "step2", ..., "stepN"],
            "final_answer": "boxed_expression"
        }
    - Maintain atomic values in JSON (no complex objects)
    - Escape special characters properly
    - final_answer must use: \boxed{operator(...)} format
    
    4. Complian

In [14]:
if 'exp6.json' not in os.listdir('../results/'):

    start_time = time()

    tasks = []
    for prompt in sample_prompts:
        res = asyncio.create_task(
            async_converse_llm(
                processed_data_entry=prompt,
                client=async_client,
                model_name='deepseek-chat',

                use_short_context=False,
                question_to_use='question',
                answers_to_use='answer',
                example_shots='',
                sys_prompt=SYSTEM_PROMPT_V4
                )
        )
        tasks.append(res)

    results_exp6 = await asyncio.gather(*tasks)
    
    total_time = time() - start_time
    minutes = int(total_time // 60)
    seconds = total_time % 60
    print(f"\nExecution time: {minutes} minutes and {seconds:.0f} seconds")

    pd.Series(results_exp6).to_json('../results/exp6.json')

if 'exp6.json' in os.listdir('../results/'):
    metrics_exp6 = evaluate_experiment('../results/exp6.json')

Experiment:
            % correct: 0.427
            % correct (percentage fix): 0.508
        


## Experiment 7: Prompt 4

> Uses `deepseek-reasoner`, structured outputs and straight questions (run on the first 5 sample prompts only)

1. Uses `deepseek-reasoner`
2. Uses full context
4. Does not use example fewshots
5. Uses SYSTEM_PROMPT_V4
6. Uses maths operators + no evaluation
7. Uses straight question
8. Uses structured output

In [None]:
# SYSTEM_PROMPT_V4 is the same name imported for Experiments 5 and 6; the import
# is repeated here, so this cell does not depend on those having been run.
from src.prompts import SYSTEM_PROMPT_V4

print(SYSTEM_PROMPT_V4)


    Act as a financial computation engine that outputs valid JSON. Required behavior:
    1. Input Processing:
    - Use ONLY context provided in the query
    - Never incorporate external data or assumptions
    2. Calculation Methodology:
    - Perform and display calculations by using ONLY these Python-style operators:
        - add(a, b) → a + b
        - subtract(a, b) → a - b
        - multiply(a, b) → a * b
        - divide(a, b) → a / b
        - power(a, b) → a^b
    - Each operator must have EXACTLY two arguments
    3. JSON Output Requirements:
    - Structure response as valid JSON with this schema:
        {
            "user_question": "string",
            "user_context": "string",
            "reasoning": ["step1", "step2", ..., "stepN"],
            "final_answer": "boxed_expression"
        }
    - Maintain atomic values in JSON (no complex objects)
    - Escape special characters properly
    - final_answer must use: \boxed{operator(...)} format
    
    4. Complian

In [16]:
if 'exp7.json' not in os.listdir('../results/'):

    start_time = time()

    tasks = []
    for prompt in sample_prompts[:5]:
        res = asyncio.create_task(
            async_converse_llm(
                processed_data_entry=prompt,
                client=async_client,
                model_name='deepseek-reasoner',

                use_short_context=False,
                question_to_use='question',
                answers_to_use='answer',
                example_shots='',
                sys_prompt=SYSTEM_PROMPT_V4
                )
        )
        tasks.append(res)

    results_exp7 = await asyncio.gather(*tasks)
    
    total_time = time() - start_time
    minutes = int(total_time // 60)
    seconds = total_time % 60
    print(f"\nExecution time: {minutes} minutes and {seconds:.0f} seconds")

    pd.Series(results_exp7).to_json('../results/exp7.json')

if 'exp7.json' in os.listdir('../results/'):
    metrics_exp7 = evaluate_experiment('../results/exp7.json')

ZeroDivisionError: division by zero

# Deep dive

In [None]:
# Stack the experiment-4 metrics table into a single column, expand each
# entry's fields into columns, and keep the rows whose `close_match` is 0.
flat_results = pd.json_normalize(
    pd.DataFrame(metrics_exp4).stack().reset_index(drop=True)
)
wrong_answers_4 = flat_results.loc[flat_results['close_match'].eq(0)]

In [None]:
# Stack the experiment-5 metrics table into a single column, expand each
# entry's fields into columns, and keep the rows whose `close_match` is 0.
flat_results = pd.json_normalize(
    pd.DataFrame(metrics_exp5).stack().reset_index(drop=True)
)
wrong_answers_5 = flat_results.loc[flat_results['close_match'].eq(0)]

In [None]:
# Stack the experiment-6 metrics table into a single column, expand each
# entry's fields into columns, and keep the rows whose `close_match` is 0.
flat_results = pd.json_normalize(
    pd.DataFrame(metrics_exp6).stack().reset_index(drop=True)
)
wrong_answers_6 = flat_results.loc[flat_results['close_match'].eq(0)]

In [None]:
# Row labels that are wrong in experiments 4, 5 and 6 alike.
# A set has no defined iteration order, so the labels are sorted to make
# `commonly_wrong[0]` (used by the cells below) the smallest label on every run.
# NOTE(review): the labels are positions in each flattened metrics table. The
# saved outputs below show label 23 holding a different question in
# wrong_answers_6 than in wrong_answers_4 / wrong_answers_5, so the same label
# may not identify the same question across experiments — confirm before
# relying on this intersection.
commonly_wrong = sorted(
    int(label)
    for label in (
        set(wrong_answers_5.index.values)
        & set(wrong_answers_4.index.values)
        & set(wrong_answers_6.index.values)
    )
)

In [None]:
wrong_answers_4.loc[commonly_wrong[0]]

question                                      what is the percent change?
complete_response       To calculate the **percent change** in the S&P...
model_pred                                                        98.09\%
eval_ans                                                            98.09
annot_ans                                                          0.9809
parsable                                                             True
close_match                                                             0
delta                                                             97.1091
close_match_perc_fix                                                    0
delta_perc_fix                                                    97.1091
Name: 23, dtype: object

In [None]:
wrong_answers_5.loc[commonly_wrong[0]].get('question')

'what is the percent change?'

In [None]:
wrong_answers_5.loc[commonly_wrong[0]]

question                                      what is the percent change?
complete_response       {\n    "user_question": "what is the percent c...
model_pred              multiply(divide(subtract(198.09,100.00),100.00...
eval_ans                                                            98.09
annot_ans                                                          0.9809
parsable                                                             True
close_match                                                             0
delta                                                             97.1091
close_match_perc_fix                                                    0
delta_perc_fix                                                    97.1091
Name: 23, dtype: object

In [None]:
wrong_answers_6.loc[commonly_wrong[0]]

question                      what was the profit margin in december 2016
complete_response       {\n    "user_question": "what was the profit m...
model_pred                                multiply(divide(1512,3410),100)
eval_ans                                                        44.340176
annot_ans                                                           13.3%
parsable                                                             True
close_match                                                             0
delta                                                           31.040176
close_match_perc_fix                                                    0
delta_perc_fix                                                  31.040176
Name: 23, dtype: object

In [None]:
print(wrong_answers_5.loc[commonly_wrong[0]].get('complete_response'))

{
    "user_question": "what was the sum including cash from financing activities?",
    "user_context": "[Row 0] ( in millions ): net cash provided by operating activities; year ended september 30 , 2019: $ 2310.2\n[Row 1] ( in millions ): net cash used for investing activities; year ended september 30 , 2019: $ -4579.6 ( 4579.6 )\n[Row 2] ( in millions ): net cash provided by ( used for ) financing activities; year ended september 30 , 2019: $ 1780.2",
    "reasoning": [
        "1. Extract net cash provided by operating activities in 2019: $2310.2 million",
        "2. Extract net cash used for investing activities in 2019: $4579.6 million (absolute value)",
        "3. Extract net cash provided by financing activities in 2019: $1780.2 million",
        "4. Sum all three values using add() operators in sequence"
    ],
    "final_answer": "\boxed{add(add(2310.2, 4579.6), 1780.2)}"
}


In [None]:
sample_prompts[1]

{'question_type': 'simple',
 'question': 'in 2019 what was the net change in cash in millions',
 'answer': '-489.2',
 'short_context': 'Context:\n\n[Row 0] ( in millions ): net cash provided by operating activities; year ended september 30 , 2019: $ 2310.2; year ended september 30 , 2018: $ 1931.2\n[Row 1] ( in millions ): net cash used for investing activities; year ended september 30 , 2019: $ -4579.6 ( 4579.6 ); year ended september 30 , 2018: $ -815.1 ( 815.1 )\n[Row 2] ( in millions ): net cash provided by ( used for ) financing activities; year ended september 30 , 2019: $ 1780.2; year ended september 30 , 2018: $ -755.1 ( 755.1 )\n\n',
 'full_context': 'Context:\n\ncredit facilities. as such, our foreign cash and cash equivalents are not expected to be a key source of liquidity to our domestic operations. at september 30, 2019, we had approximately $ 2.9 billion of availability under our committed credit facilities, primarily under our revolving credit facility, the majority of 