In [None]:
# 🎨 Math LLM Study Assistant Experiment Notebook
#@title 👨‍🔬 Simple Mathematical Benchmarking for Natural Language Processing Models
#@markdown This notebook is intended for research purposes only. It is based on a legacy notebook from the repository (SIMP/legacy.ipynb), which was developed together with ChatGPT by OpenAI and is credited here. This notebook guides you step by step through comparing at least three natural language processing models from Ollama on basic math questions from GSM8K (openai/gsm8k).
#@markdown <br> <br> This project is so simple that it can only benchmark simple mathematics such as GSM8K. The comparison only looks at the numbers in the generated result and checks whether the expected number is mentioned there or not.
#@markdown <br> <br> In conclusion, this notebook isn't suitable for advanced benchmarking and can only process simple mathematical prompts.
#@markdown <br> <br> Visit me on [GitHub](https://github.com/mohmirzabr)

In [None]:
#@title 0. ⚠️ MAKE SURE YOU'RE USING NVIDIA T4 AS THE GPU AND PYTHON 3 FOR THE RUNTIME TYPE
#@markdown This cell shows whether an NVIDIA GPU (ideally a T4) is attached to this runtime.
# Run the `nvidia-smi` shell command so its report about the attached GPU is shown in the cell output.
!nvidia-smi

In [None]:
#@title 1. 🧰 Install and Prepare Ollama
#@markdown In this cell, we will install the backend first. In which, we'll be using Ollama. In this cell, we'll also pull those three models to be tested in the next cells.
#@markdown <br> <br> Below are the variables of each model that you want to use. Make sure that the models are existent in Ollama to be downloaded.
model1 = "tinyllama:1.1b" # @param {"type":"string"}
model2 = "phi3:3.8b" # @param {"type":"string"}
model3 = "mistral:7b" # @param {"type":"string"}

# 🚀 Installing Ollama
!curl -fsSL https://ollama.com/install.sh | sh

# 🔥 Enabling Ollama
import subprocess
process = subprocess.Popen ("ollama serve", shell=True)

# ♻️ Pulling the models.
!ollama pull {model1}
!ollama pull {model2}
!ollama pull {model3}

# 📦 Confirming the models that are ready.
!ollama list

In [None]:
#@title 2. 📂 Load Math Questions
#@markdown This cell is where we're going to put the samples to be tested to each model. This cell will create a file named 'math_questions.txt' with lines: question<TAB>answer (except if the file existed in the first place).
#@markdown <br> <br> If you haven't prepare any datasets, you can use the template below.
templatequestions = False # @param {"type":"boolean"}

#@markdown <br> <br> Below are the variables that are going to be used for scraping the dataset from Hugging Face to 'math_questions.txt'.
#@markdown <br> <br> 1. 🧰 Which dataset you're going to refer to?
dataset = "openai/gsm8k" # @param {"type":"string"}
branch = "main" # @param {"type":"string"}
#@markdown <br> <br> 2. 📊 How many examples to use?
NUM_SAMPLES = 30 # @param {"type":"number"}

if templatequestions == False:
  from pathlib import Path

  questions_file = Path('math_questions.txt')

  if not questions_file.exists():
      # 🔧 1. Install and import the HF datasets library
      !pip install -q datasets
      from datasets import load_dataset

      # 🔍 2. Load the GSM8K training split
      gsm8k = load_dataset(dataset, branch, split="train")

      # ✂️ 3. Extract question + final answer (after "####")
      samples = []
      for ex in gsm8k.select(range(NUM_SAMPLES)):
          q_text = ex["question"].strip()
          # the answer field has steps + "#### <final>"
          raw_ans = ex["answer"]
          # split on "####" to get the last line as our truth
          truth = raw_ans.split("####")[-1].strip()
          samples.append((q_text, truth))

      # 📝 4. Save those to math_questions.txt as "question␉answer"
      with open(questions_file, 'w') as f:
          for q, a in samples:
              f.write(f"{q}\t{a}\n")

      print(f"✅ Pulled {NUM_SAMPLES} samples from {dataset} and wrote '{questions_file}'")

  # 📖 5. Now read back the file into `questions` for your experiment loop
  questions = []
  with open(questions_file) as f:
      for line in f:
          q, a = line.strip().split("\t")
          questions.append((q, a))

  print(f"✅ Loaded {len(questions)} questions for testing!")
elif templatequestions == True:
  from pathlib import Path

  questions_file = Path('math_questions.txt')
  if not questions_file.exists():
      # Create a sample file with simple math problems
      sample = [
          "12 x 7\t84",
          "17 * 24\t408",
          "256 / 8\t32",
          "13 + 49\t62"
      ]
      with open(questions_file, 'w') as f:
          for line in sample:
              f.write(line + '\n')  # 📝 writing sample questions

  # Read the questions into a list
  questions = []
  with open(questions_file) as f:
      for line in f:
          q, a = line.strip().split("\t")
          questions.append((q, a))

  print(f"✅ Loaded {len(questions)} questions for testing!")

In [None]:
#@title 🏁 3. Set Up Results Logging
#@markdown We record: question, model, answer, correct (True/False) and time in seconds. Everything goes into a CSV file that the following cells append to and then summarize.

import csv

output_file = 'results.csv'

# Column names of the results table, written once as the first CSV row.
header_row = ['question', 'model', 'answer', 'correct', 'time_s']

# Mode 'w' starts a fresh results file on every run of this cell.
with open(output_file, 'w', newline='') as handle:
    csv.writer(handle).writerow(header_row)

print(f"🗄️  Results will be saved in: {output_file}")

In [None]:
#@title 4. 🚀 Experiment is running!
#@markdown This is the cell where everything begins to run, and the free GPU RAM starts shrinking. The notebook sends every question to every model and checks each answer as it arrives!

import csv
import re
import subprocess
import time
from decimal import Decimal

models = [model1, model2, model3]  # The Ollama models to test

# One number: optional sign, digits (optionally with thousands separators), optional fraction.
# The lookbehind keeps the '-' in "10-5" from being read as a sign.
_NUMBER_RE = re.compile(r"(?<!\d)-?(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d+)?")


def _numbers_in(text):
    """Return every number written in `text` as a Decimal, ignoring thousands separators."""
    return [Decimal(token.replace(",", "")) for token in _NUMBER_RE.findall(text)]


def _is_correct(truth, answer):
    """Return True when the expected number `truth` appears as a whole number token in `answer`.

    Values are compared numerically, so '18' matches '$18.00' and '1,080' matches
    '1080', while '8' no longer matches '18' or '80'. When `truth` contains no
    number at all, a case-insensitive substring search is used instead.
    """
    expected = _numbers_in(truth)
    if not expected:
        needle = truth.strip().lower()
        return bool(needle) and needle in answer.lower()
    return expected[-1] in _numbers_in(answer)


for question, truth in questions:
    print(f"\n🔍 [Q] {question}")
    for model in models:
        # Wall-clock time of the whole CLI call (this includes loading the model).
        start = time.time()

        result = subprocess.run(
            ['ollama', 'run', model, question],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
        elapsed = time.time() - start

        if result.returncode != 0:
            print(f"⚠️ Error from {model}:")
            print(result.stderr)
            continue  # Skip logging for this model if it failed

        # Get the last non-empty line of output
        lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
        answer = lines[-1] if lines else ""

        # Check correctness by looking for the expected number among the numbers in that line
        correct = _is_correct(truth, answer)
        mark = '✅' if correct else '❌'
        print(f"🔎 Checking if '{truth}' in → {answer}")

        print(f"{model:<8} ➡️  {answer:<30} {mark}  ({elapsed:.2f}s)")

        # Save result to CSV (appended per row so partial runs are not lost)
        with open(output_file, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([question, model, answer, correct, f"{elapsed:.2f}"])

In [None]:
#@title 5. 📊 Summarize & Compare Results
#@markdown This is where everything is brought to a conclusion. Using pandas, this cell loads the CSV file and computes per-model metrics: Accuracy (%) and Average inference time (seconds).

import pandas as pd

# Load the logged rows and add a 0/1 copy of 'correct' that can be averaged.
df = pd.read_csv(output_file)
df['correct_num'] = df['correct'].astype(int)

# One row per model: share of correct answers and mean run time.
per_model = df.groupby('model')
summary = per_model.agg(
    accuracy=pd.NamedAgg(column='correct_num', aggfunc='mean'),
    avg_time=pd.NamedAgg(column='time_s', aggfunc=lambda x: x.astype(float).mean()),
)

# Present accuracy as a percentage with one decimal, time with two decimals.
summary = summary.assign(
    accuracy=summary['accuracy'].mul(100).round(1),
    avg_time=summary['avg_time'].round(2),
)

print("\n📈 Overall Performance Summary:")
print(summary)

# Also show the rich table when IPython is available (e.g. in Colab).
try:
    from IPython.display import display
    display(summary)
except ImportError:
    pass

# 🎉 End of experiment. Review the printed logs and summary to draw conclusions!