In [None]:
import re
from sympy.parsing.latex import parse_latex
from sympy import Basic
import sympy
from pprint import pprint
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("zwhe99/DeepMath-103K")

In [3]:
train_ds = ds['train']

# Preliminary tasks

## Briefly describe the data

In [None]:
# TODO

## Parsing

### Notes
- [] Questions have different latex sytanx (`[`, `(`, `$`)
- [] How do we tokenize the latex formula?
  - [] `hybrid_tokenize` with word lenght tokens
  - [] BERT
  - [] https://github.com/google/sentencepiece

In [4]:
NUM_SAMPLES = 10

In [5]:
# Retrieve all the questions
train_ds_qs = train_ds['question'][:NUM_SAMPLES]

### Initial investigation on sympy

In [6]:

def extract_latex_bracket_content(text):
    patterns = [
        r'\\\[(.*?)\\\]',  # \[ ... \]
        r'\\\((.*?)\\\)',  # \( ... \)
        r'\$(.*?)\$',       # $ ... $
    ]
    # Combine patterns into one
    combined_pattern = '|'.join(patterns)
    #matches = [m for m in re.findall(combined_pattern, text, flags=re.DOTALL)]
    #formulas = [next(filter(None, tup)) for tup in matches]
    matches = [m for m in re.findall(combined_pattern, text, flags=re.DOTALL)]
    formulas = []
    for tup in matches:
        non_empty = list(filter(None, tup))
        if non_empty:
            formulas.append(non_empty[0])
    # Use a counter to number the placeholders
    def replacer(match, counter=[1]):
        placeholder = "{" + str(counter[0]-1) + "}"
        counter[0] += 1
        return placeholder
    new_text = re.sub(combined_pattern, replacer, text)
    return new_text, formulas

parsed_indices = []
parsed_text = []
parsed_formulas = []
for i in range(len(train_ds_qs)):
    d = train_ds_qs[i]
    #print("original input:", d)
    text, formulas = extract_latex_bracket_content(d)
    #print("parsed text: ", text, "\nequations: ", formulas)
    formulas_sympy = []
    num_parsed = 0
    for expr in formulas:
        try:
            parsed = parse_latex(expr)
            formulas_sympy.append(parsed)
            num_parsed += 1
        except Exception as e:
            pass
            #print("Could not parse:", e)
            #print(parsed)
    if num_parsed == len(formulas):
        parsed_indices.append(i)
        parsed_text.append(text)
        parsed_formulas.append(formulas_sympy)
print("correctly parsed", len(parsed_indices), "inputs over", len(train_ds_qs))

correctly parsed 7 inputs over 10


### Hybrid tokenizer

In [7]:
def print_sympy_tree(expr, indent=0):
    print('  ' * indent + str(expr.func))
    for arg in expr.args:
        print_sympy_tree(arg, indent + 1)

# --- Parse LaTeX into sympy then walk the expression tree ---
def flatten_sympy_expr(expr: Basic):
    tokens = []

    def walk(node):
        if isinstance(node, Basic):
            args_len = len(node.args)
            args = node.args
            # If only 1 we build the output as operator + walk(operand)
            if args_len == 1:
                if isinstance(node, (sympy.Symbol, sympy.Integer, sympy.Rational, sympy.Float)):
                    tokens.append(str(node))
                else:
                    tokens.append(node.func.__name__)
                walk(args[0])
            # If only 2 we build the output with inorder visit walk(operand0) + operator + walk(operand1)
            elif args_len == 2:
                walk(args[0])
                if isinstance(node, (sympy.Symbol, sympy.Integer, sympy.Rational, sympy.Float)):
                    tokens.append(str(node))
                else:
                    tokens.append(node.func.__name__)
                walk(args[1])
            # If no operands are found we just add self
            elif args_len == 0:
                if isinstance(node, (sympy.Symbol, sympy.Integer, sympy.Rational, sympy.Float)):
                    tokens.append(str(node))
                else:
                    tokens.append(node.func.__name__)
            # If there are multiple operands (> 2) we add them all after the operator
            else:
                if isinstance(node, (sympy.Limit, sympy.Integral)):
                    args = reversed(args)
                if isinstance(node, (sympy.Symbol, sympy.Integer, sympy.Rational, sympy.Float)):
                    tokens.append(str(node))
                else:
                    tokens.append(node.func.__name__)
                for arg in args:
                    walk(arg)
        else:
            raise RuntimeError("not sympy Basic object")
    
    walk(expr)
    return tokens

# --- Main tokenization function ---
def hybrid_tokenize(text):
    math_pattern = r'(\$\$.*?\$\$|\$.*?\$|\\\[.*?\\\]|\\\(.*?\\\))'
    parts = re.split(math_pattern, text, flags=re.DOTALL)

    final_tokens = []
    is_erorr = False
    for part in parts:
        if re.match(math_pattern, part, flags=re.DOTALL):
            # Clean math delimiters
            clean = re.sub(r'^(\$+|\\\[|\\\(|\\])|(\$+|\\\]|\\\))$', '', part.strip())
            try:
                parsed = parse_latex(clean)
                math_tokens = flatten_sympy_expr(parsed)
                final_tokens.extend(math_tokens)
            except Exception as e:
                is_erorr = True
                final_tokens.extend(clean.split(' '))
        else:
            final_tokens.extend(re.findall(r'\b\w+\b', part))
    return is_erorr, final_tokens



In [8]:

parsed_inputs = []
not_parsed_inputs = []
for d in train_ds_qs:
    error, tokens = hybrid_tokenize(d)
    if not error:
        parsed_inputs.append(tokens)
    else:
        not_parsed_inputs.append(tokens)


print("correctly parsed", len(parsed_inputs), "inputs over", len(train_ds_qs))
print("not parsed", len(not_parsed_inputs), "inputs over", len(train_ds_qs))


correctly parsed 7 inputs over 10
not parsed 3 inputs over 10


In [9]:
not_parsed_inputs[:1]

[['Find',
  'the',
  'length',
  'of',
  'the',
  'polar',
  'curve',
  'given',
  'by',
  'r',
  'Equality',
  '1',
  'Add',
  'cos',
  '2',
  'Mul',
  'theta',
  'Pow',
  '1/2',
  'for',
  '',
  '0',
  '\\leq',
  '\\theta',
  '\\leq',
  '\\frac{\\pi\\sqrt{2}}{4}',
  '']]

### Learn tokenization with BERT

In [None]:
# TODO: investigate finetuning on pretrained on custom corpus

In [10]:
# TODO: investigate different pre trained
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [11]:
bert_token_ids = []
bert_tokens = []
for d in train_ds_qs:
	#tokens = tokenizer.tokenize(d)
    bert_tokens.append(tokenizer.tokenize(d))
    bert_token_ids.append(tokenizer.encode(d))

print(bert_tokens[:1])
print(bert_token_ids[:1])

[['Eva', '##lu', '##ate', 'the', 'limit', ':', '\\', '[', '\\', 'l', '##im', '_', '{', 'x', '\\', 'to', '\\', 'in', '##fty', '}', '\\', 'sq', '##rt', '{', 'x', '}', '\\', 'left', '(', '\\', 'sq', '##rt', '[', '3', ']', '{', 'x', '+', '1', '}', '-', '\\', 'sq', '##rt', '[', '3', ']', '{', 'x', '-', '1', '}', '\\', 'right', ')', '\\', ']']]
[[101, 9734, 7535, 2193, 1103, 5310, 131, 165, 164, 165, 181, 4060, 168, 196, 193, 165, 1106, 165, 1107, 27944, 198, 165, 4816, 3740, 196, 193, 198, 165, 1286, 113, 165, 4816, 3740, 164, 124, 166, 196, 193, 116, 122, 198, 118, 165, 4816, 3740, 164, 124, 166, 196, 193, 118, 122, 198, 165, 1268, 114, 165, 166, 102]]


## Perform cluster analysis on questions field

## Perform cluster analysis on questions field

In [None]:
# TODO

## Perform document index on different fields

In [None]:
# pip install -q python-terrier==0.11.0 #now there is another version but we don't use it

In [4]:
import os
os.environ["JAVA_HOME"] = "/opt/homebrew/Cellar/openjdk@11/11.0.21/libexec/openjdk.jdk/Contents/Home"
os.environ["JVM_PATH"] = "/opt/homebrew/Cellar/openjdk@11/11.0.26/libexec/openjdk.jdk/Contents/Home/lib/server/libjvm.dylib"

In [5]:
column_names = train_ds.column_names
print(column_names)

# 2. Initialize PyTerrier (only once)
import pyterrier as pt

# google colab
#if not pt.started():
#  pt.init()

# vs code in mac
if not pt.java.started():
    pt.init()

['question', 'final_answer', 'difficulty', 'topic', 'r1_solution_1', 'r1_solution_2', 'r1_solution_3']


Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()


### Finding "quadratic form" in 'question'

In [6]:
# 3. Build a corpus iterator over the 'question' field
def get_question_corpus(ds):
    for i, example in enumerate(ds):
        yield {
            'docno': str(i),                          # unique document ID
            'question': example['question']           # only the question text
        }

# 4. Index into a new folder, specifying that we only want the 'question' field
pt_index_path = './terrier_deepmath_questions'
indexer = pt.index.IterDictIndexer(pt_index_path, overwrite=True, meta_reverse=[])
index_ref = indexer.index(
    get_question_corpus(train_ds),
    fields=('question',),
    meta=('docno', 'question')  # store docno + question in the metadata
)

# 5. Load the index and run a TF-IDF retrieval over the 'question' field
index = pt.IndexFactory.of(index_ref)
tfidf = pt.BatchRetrieve(index, wmodel="TF_IDF")
results = tfidf.search("quadratic form")
print(results.head(6))

  tfidf = pt.BatchRetrieve(index, wmodel="TF_IDF")


  qid  docid  docno  rank     score           query
0   1  29503  29503     0  9.515177  quadratic form
1   1  95512  95512     1  9.180302  quadratic form
2   1  96657  96657     2  9.049716  quadratic form
3   1  97982  97982     3  9.049716  quadratic form
4   1  10764  10764     4  8.987538  quadratic form
5   1   7207   7207     5  8.978068  quadratic form


### Perform a 'question' search in only documents with x difficulty

In [None]:
# Imposto difficulty = 0.0
filtered_ds = [ex for ex in train_ds if float(ex['difficulty']) == 0.0]

# Build a corpus iterator over the 'question' field
def get_question_corpus(ds):
    for i, example in enumerate(ds):
        yield {
            'docno': str(i),                          # unique document ID
            'question': example['question']           # only the question text
        }

# Index into a new folder, specifying that we only want the 'question' field
pt_index_path = './terrier_deepmath_questions'
indexer = pt.index.IterDictIndexer(pt_index_path, overwrite=True, meta_reverse=[])
index_ref = indexer.index(
    get_question_corpus(filtered_ds),
    fields=('question',),
    meta=('docno', 'question')
)

# Load the index and run a TF-IDF retrieval over the 'question' field
index = pt.IndexFactory.of(index_ref)
tfidf = pt.BatchRetrieve(index, wmodel="TF_IDF")
query = "quadratic form"
query1 = "quadratic form optimization minimize" #puoi mettere piu parole e funziona
results = tfidf.search(query)
print(results.head(6))

### Finding 'quadratic form' in more fields (NON RIUSCITO)

## Generate embeddings and analyze them

In [None]:
# TODO

# Training

## M1C: Base model (topic classification)

## M1R: Base model (difficulty regression)

## M1S: Base model (short answer)

## M2: Introduce reasoning

## Comparisons