In [1]:
# 🎨 Math LLM Study Assistant Experiment Notebook

"""
Welcome! This notebook guides you step-by-step to compare three Ollama models
(GPT-2, Phi-2, Mistral) on basic math questions. Each code block is
richly commented to explain what’s happening.
"""

# 🧰 1. Install & Prepare Ollama
# --------------------------------
# (Only run this if Ollama is not yet installed)
# This shell command downloads and installs the Ollama CLI.
!curl -fsSL https://ollama.com/install.sh | sh  # 🚀 installing Ollama
import subprocess
process = subprocess.Popen("ollama serve", shell=True)

# Pull the models you’ll test: GPT-2 (tiny), Phi-2 (medium), Mistral (large)
!ollama pull tinyllama:1.1b     # 🔹 very fast, poor math
!ollama pull phi3:3.8b          # 🔹 balanced speed & reasoning
!ollama pull mistral:7b         # 🔹 best reasoning, slower

# 📦 Confirm models are ready
!ollama list

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
Error: could not connect to ollama app, is it running?
[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠸ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠼ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠦ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠦ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠧ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠏ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠏ [K[?

In [None]:
# 📂 2. Load Math Questions
# --------------------------------
# We expect a file named 'math_questions.txt' with lines: question<TAB>answer
# If it doesn't exist, we create a sample with 4 questions.

from pathlib import Path


def parse_question_line(line):
    """Parse one 'question<TAB>answer' line into a (question, answer) tuple.

    Blank lines return None so callers can skip them. The split happens on the
    LAST tab, so a question that itself contains tabs is kept intact.

    Raises:
        ValueError: if a non-blank line contains no tab separator.
    """
    text = line.strip()
    if not text:
        return None
    if "\t" not in text:
        raise ValueError(f"Expected 'question<TAB>answer', got: {text!r}")
    question, answer = text.rsplit("\t", 1)
    return question.strip(), answer.strip()


questions_file = Path('math_questions.txt')
if not questions_file.exists():
    # Create a sample file with simple math problems
    sample = [
        "12 x 7\t84",
        "17 * 24\t408",
        "256 / 8\t32",
        "13 + 49\t62"
    ]
    # Explicit UTF-8 so the file reads back the same on every platform
    with open(questions_file, 'w', encoding='utf-8') as f:
        for line in sample:
            f.write(line + '\n')  # 📝 writing sample questions

# Read the questions into a list, skipping blank lines
questions = []
with open(questions_file, encoding='utf-8') as f:
    for line in f:
        parsed = parse_question_line(line)
        if parsed is not None:
            questions.append(parsed)

print(f"✅ Loaded {len(questions)} questions for testing!")

✅ Loaded 4 questions for testing!


In [2]:
# 📂 2. Load Math Questions (GSM8K from Hugging Face)
# ---------------------------------------------------
# If 'math_questions.txt' exists, we just read it.
# Otherwise, we pull GSM8K via `datasets`, pick N examples, and save them.

from pathlib import Path

# 📊 How many examples to use?
NUM_SAMPLES = 30

questions_file = Path('math_questions.txt')

if not questions_file.exists():
    # 🔧 1. Install and import the HF datasets library
    !pip install -q datasets
    from datasets import load_dataset

    # 🔍 2. Load the GSM8K training split
    gsm8k = load_dataset("openai/gsm8k", "main", split="train")

    # ✂️ 3. Extract question + final answer (after "####")
    samples = []
    for ex in gsm8k.select(range(NUM_SAMPLES)):
        q_text = ex["question"].strip()
        # the answer field has steps + "#### <final>"
        raw_ans = ex["answer"]
        # split on "####" to get the last line as our truth
        truth = raw_ans.split("####")[-1].strip()
        samples.append((q_text, truth))

    # 📝 4. Save those to math_questions.txt as "question␉answer"
    with open(questions_file, 'w') as f:
        for q, a in samples:
            f.write(f"{q}\t{a}\n")

    print(f"✅ Pulled {NUM_SAMPLES} samples from GSM8K and wrote '{questions_file}'")

# 📖 5. Now read back the file into `questions` for your experiment loop
questions = []
with open(questions_file) as f:
    for line in f:
        q, a = line.strip().split("\t")
        questions.append((q, a))

print(f"✅ Loaded {len(questions)} questions for testing!")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

✅ Pulled 30 samples from GSM8K and wrote 'math_questions.txt'
✅ Loaded 30 questions for testing!


In [3]:
# 🏁 3. Set Up Results Logging
# --------------------------------
# Every experiment row stores five fields: the question, the model that was
# asked, the model's answer, whether it was correct (True/False), and the
# elapsed time in seconds.
import csv

output_file = 'results.csv'
header_row = ['question', 'model', 'answer', 'correct', 'time_s']

# Opening in 'w' mode starts a fresh file that contains only the header row.
with open(output_file, mode='w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header_row)

print(f"🗄️  Results will be saved in: {output_file}")

🗄️  Results will be saved in: results.csv


In [4]:
# 🧪 4. Run Every Question Through Every Model
# --------------------------------
# For each (question, truth) pair, ask each model via `ollama run`, take the
# last non-empty line of its reply as the answer, grade it numerically, and
# append one row to the results CSV.
import csv
import re
import subprocess
import time

models = ['tinyllama:1.1b', 'phi3:3.8b', 'mistral:7b']  # The Ollama models to test

# Upper bound (seconds) for one `ollama run` call, so a stuck model cannot
# block the whole experiment.
MODEL_TIMEOUT_S = 300

# Matches integers and decimals, with optional well-formed thousands separators
# ("1,000") and an optional leading minus. The lookbehind stops "48-24" from
# being read as 48 and -24, and avoids restarting a match inside "1.2.3".
NUMBER_PATTERN = re.compile(r'(?<![\d.])-?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?')


def extract_numbers(text):
    """Return every number found in `text` as a float (thousands commas removed)."""
    return [float(token.replace(',', '')) for token in NUMBER_PATTERN.findall(text)]


def is_correct(truth, answer):
    """Return True if the expected value appears as a whole number in `answer`.

    Values are compared numerically, so '10' no longer matches '100' or '10.5',
    while '72', '72.0' and '1,000' vs '1000' are treated as equal. If `truth`
    contains no number, fall back to a case-insensitive substring check.
    """
    expected = extract_numbers(truth)
    if not expected:
        return truth.strip().lower() in answer.lower()
    return expected[0] in extract_numbers(answer)


for question, truth in questions:
    print(f"\n🔍 [Q] {question}")
    for model in models:
        start = time.perf_counter()
        try:
            result = subprocess.run(
                ['ollama', 'run', model, question],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                timeout=MODEL_TIMEOUT_S,
            )
        except subprocess.TimeoutExpired:
            print(f"⚠️ {model} timed out after {MODEL_TIMEOUT_S}s, skipping")
            continue  # Skip logging for this model if it hung
        elapsed = time.perf_counter() - start

        if result.returncode != 0:
            print(f"⚠️ Error from {model}:")
            print(result.stderr)
            continue  # Skip logging for this model if it failed

        # Get the last non-empty line of output
        lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
        answer = lines[-1] if lines else ""

        # Grade: does the expected number appear (as a whole number) in that line?
        correct = is_correct(truth, answer)
        mark = '✅' if correct else '❌'
        print(f"🔎 Checking if '{truth}' in → {answer}")

        print(f"{model:<8} ➡️  {answer:<30} {mark}  ({elapsed:.2f}s)")

        # Save result to CSV. The file is reopened per row so each result is
        # on disk even if the run is interrupted part-way through.
        with open(output_file, 'a', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([question, model, answer, correct, f"{elapsed:.2f}"])


🔍 [Q] Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
🔎 Checking if '72' in → Natalia sold a total of 96 clip(s) between April (2 clips) and May (48 clips). Therefore, in April and May, she sold half the clip(s), or 48 clip(s).
tinyllama:1.1b ➡️  Natalia sold a total of 96 clip(s) between April (2 clips) and May (48 clips). Therefore, in April and May, she sold half the clip(s), or 48 clip(s). ❌  (16.65s)
🔎 Checking if '72' in → Altogether, Natalia sold 48 + 24 = 72 clips in April and May.
phi3:3.8b ➡️  Altogether, Natalia sold 48 + 24 = 72 clips in April and May. ✅  (13.17s)
🔎 Checking if '72' in → So, Natalia sold a total of 72 clips in April and May.
mistral:7b ➡️  So, Natalia sold a total of 72 clips in April and May. ✅  (24.12s)

🔍 [Q] Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?
🔎 Checking if '10' in → Ac

KeyboardInterrupt: 

In [5]:
# 📊 5. Summarize & Compare Results
# --------------------------------
# Load the results CSV with pandas and reduce it to one row per model:
#  - accuracy: share of correct answers, as a percentage
#  - avg_time: mean inference time in seconds

import pandas as pd

df = pd.read_csv(output_file)

# Map the True/False flags to 1/0 so that their mean is the fraction correct
df['correct_num'] = df['correct'].astype(int)

per_model = df.groupby('model')
summary = per_model.agg(
    accuracy=pd.NamedAgg(column='correct_num', aggfunc='mean'),
    avg_time=pd.NamedAgg(column='time_s', aggfunc=lambda x: x.astype(float).mean()),
)

# Present accuracy as a percentage (1 decimal) and time with 2 decimals
summary['accuracy'] = summary['accuracy'].mul(100).round(1)
summary['avg_time'] = summary['avg_time'].round(2)

print("\n📈 Overall Performance Summary:")
print(summary)

# In Colab/Jupyter the rich table view is shown as well; outside IPython the
# import fails and the plain-text print above is all you get.
try:
    from IPython.display import display
    display(summary)
except ImportError:
    pass

# 🎉 End of experiment. Review the printed logs and summary to draw conclusions!



📈 Overall Performance Summary:
                accuracy  avg_time
model                             
mistral:7b          40.0      7.64
phi3:3.8b           60.0      5.15
tinyllama:1.1b       9.1      2.09


Unnamed: 0_level_0,accuracy,avg_time
model,Unnamed: 1_level_1,Unnamed: 2_level_1
mistral:7b,40.0,7.64
phi3:3.8b,60.0,5.15
tinyllama:1.1b,9.1,2.09
