In [1]:
from latex2sympy2 import latex2sympy
from pylatexenc.latex2text import LatexNodes2Text
import sympy as sp
import pandas as pd
import re
import ast
import warnings
import csv
#warnings.filterwarnings('ignore')

In [2]:
def parse_assigment_with_dots(expression):
    equivalents = {
        r"N == 9 + 99 + 999 + 9999 + \cdots + \underbrace{99\ldots 99}_\text{321 digits}": r"N = \sum_{k=1}^{321} 9 \times 10^{k-1}",
        r"(10-1)+(10^2-1)+(10^3-1)+\cdots+(10^{321}-1)": r"\sum_{k=1}^{321} (10^k - 1)",
        r"(10+10^2+10^3+\cdots+10^{321})-321": r"\sum_{k=1}^{321} 10^k - 321",
        r"(10+10^2+10^3+\cdots+10^{321})": r"\sum_{k=1}^{321} 10^k",
        r"1111\cdots.10": r"\frac{10^{322} - 1}{9}",
        r"(10+100+\cdots 10^{320}+10^{321})": r"\sum_{k=0}^{321} 10^k",
        r"11\cdots0": r"\frac{10^{322} - 10}{9}",
        r"11\cdots10-n": r"\frac{10^{322} - 1}{9} - n",
        r"1, 2, 3,\ldots, 19, 20": r"1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20",
        r"\dfrac{1}{2} |(a_1b_2 + a_2b_3 + \cdots + a_nb_1) - (b_1a_2 + b_2a_3 + \cdots + b_na_1)|": r"(\sum_{i=1}^{n} a_ib_{i+1} - \sum_{i=1}^{n} b_ia_{i+1})*(1/2)",
        r"(b_1a_2 + b_2a_3 + \cdots + b_na_1)": r"\sum_{i=1}^{n} b_ia_{i+1}",
        r"z_1,z_2,\dots,z_{673}": r"\sum_{i=1}^{673} z_i",
        r"(x-z_1)^3(x-z_2)^3 \cdots (x-z_{673})^3": r"\prod_{i=1}^{673} (x - z_i)^3",
        r"(z_1z_2+z_1z_3+ \dots + z_1z_{672}+z_1z_{673})+(z_2z_3+z_2z_4+ \dots +z_2z_{673}) + (z_3z_4+z_3z_5+ \dots +z_3z_{673}) + \dots +z_{672}z_{673}": r"\sum_{i=1}^{672} \left( \sum_{j=i+1}^{673} z_iz_j \right)",
        r"P==(x-z_1)(x-z_1)(x-z_1)(x-z_2)(x-z_2)(x-z_2) \dots (x-z_{673})(x-z_{673})(x-z_{673})": r"P = \prod_{i=1}^{673} (x - z_i)^3",
        r"z_1,z_1,z_1,z_2,z_2,z_2, \dots , z_{673},z_{673},z_{673}": r"\sum_{i=1}^{673} \left( z_i + z_i + z_i \right)",
        r"(-1)^1 \cdot \dfrac{20}{1}==-20==z_1+z_1+z_1+z_2+z_2+z_2+ \dots + z_{673}+z_{673}+z_{673}==3(z_1+z_2+z_3+ \dots +z_{673})": r"(-1)^1 \frac{20}{1} == -20 == \sum_{i=1}^{673} 3z_i == 3 \sum_{i=1}^{673} z_i",
        r"z_1+z_2+z_3+ \dots +z_{673}==- \dfrac{20}{3}": r"\sum_{i=1}^{673} z_i = -\frac{20}{3}",
        r"z_1^2+z_1^2+z_1^2+z_1z_2+z_1z_2+z_1z_2+ \dots ==  \\ 3(z_1^2+z_2^2+ \dots + z_{673}^2) + 9(z_1z_2+z_1z_3+z_1z_4+ \dots + z_{672}z_{673}) ==  3(z_1^2+z_2^2+ \dots + z_{673}^2) + 9S": r"\sum_{i=1}^{673} z_i^2 = \frac{400}{9} - 2S",
        r"(z_1+z_2+z_3+ \dots +z_{673})^2== (-20/3)^2==\dfrac{400}{9} \\ ==(z_1^2+z_2^2+ \dots + z_{673}^2)+2(z_1z_2+z_1z_3+z_1z_4+ \dots + z_{672}z_{673})==(z_1^2+z_2^2+ \dots + z_{673}^2)+2S": r"\left( \sum_{i=1}^{673} z_i \right)^2 = \frac{400}{9} - 2S",
        r"z_1+z_2+z_3+ \dots +z_{673}": r"\sum_{i=1}^{673} z_i",
        r"z_1^2+z_2^2+ \dots + z_{673}^2== \dfrac{400}{9} -2S": r"\sum_{i=1}^{673} z_i^2 = \frac{400}{9} - 2S",
        r"f(x)==(x-z_{1})(x-z_{1})\cdots (x-z_{673})": r"f(x) == \prod_{i=1}^{673} (x - z_i)",
        r"(x-z_1)^3(x-z_2)^3(x-z_3)^2\cdots (x-z_{673})^3": r"\prod_{i=1}^{673} (x - z_i)^3",
        r"3(x_1+x_2+\cdots+x_{673})": r"3 \sum_{i=1}^{673} x_i",
        r"3(z_1^2+z_2^2+z_3^2+\dots+z_{673}^2)": r"3 \sum_{i=1}^{673} z_i^2",
        r"-20 == 3(z_1+z_2+z_3+z_4 \dots+z_{673})": r"-20 = 3 \sum_{i=1}^{673} z_i",
        r"19 == 3(z_1^2+z_2^2+z_3^2+\dots+z_{673}^2) + 9S": r"19 = 3 \sum_{i=1}^{673} z_i^2 + 9S",
        r"P_2 == z_1^2+z_2^2+z_3^2+\dots+z_{673}^2": r"P_2 = \sum_{i=1}^{673} z_i^2",
        r"z_1^2+z_2^2+ \dots + z_{673}^2": r"\sum_{i=1}^{673} z_i^2",
        r'(B == 2 \ldots 20), (J == 1 \ldots 19)':r'And(2 <= B, B <= 20, 1 <= J, J <= 19)',
        r'(\{1,2\},\{2,3\},\{3,4\}\dots,\{19,20\})':r'(1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), (12, 13), (13, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 19), (19, 20)',
        r'1,2,3,\ldots': r'Sum(n, (n, 1, oo))',
        r'\frac{S}{20}==\frac{2000+80(3+7+11+\cdots+23)}{20}==100+4(3+7+11+\cdots+23)=={472}':r'\frac{S}{20}==\frac{2000+80\left(\sum_{k=0}^{5} (3 + 4k)\right)}{20}==100+4\left(\sum_{k=0}^{5} (3 + 4k)\right)=={472}',
        r'\frac{S}{20}==\frac{2000+80(3+7+11+\cdots+23)}{20}==100+4(3+7+11+\cdots+23)=={472} ':r'\frac{S}{20}==\frac{2000+80\left(\sum_{k=0}^{5} (3 + 4k)\right)}{20}==100+4\left(\sum_{k=0}^{5} (3 + 4k)\right)=={472}',
        r'2^0\theta \equiv 2^3\theta \equiv 2^6\theta \equiv \cdots \mod{180^{\circ}}.' : r'2^0 * \theta \mod{180} == 2^3 * \theta \mod{180} == 2^6 * \theta \mod{180}',
        r'0.11011011011\cdots_2 == \frac{6}{7}_{10}':r'\sum_{k=0}^{\infty} \left(\frac{1}{2}\right)^{2k+1} + \sum_{k=0}^{\infty} \left(\frac{1}{2}\right)^{2k+2} == \frac{6}{7}',
        r'90 - (u + \frac{u}{8} + \frac{u}{64} + \cdots)':r'90 - \frac{8u}{7}',
        r'(a_1, a_2 \ldots a_n)': r'Sum(a_i, (i, 1, n))',
        r'\phantom{\cdots\cdots\cdots\cdots\cdots\cdots.}15 \cdot 35 == 15l^2 \implies l == \sqrt{35} \implies AC == 8 \sqrt{35}': r'15 \cdot 35 = 15l^2 && 15l^2 = 15 \cdot 35 && l^2 = 35 && l = \sqrt{35} && AC = 8 \sqrt{35}',
        r'\phantom{\cdots\cdots\cdots\cdots\cdots\cdots.}AB \cdot AC == 112 \sqrt{350} == 112 \cdot 5 \sqrt{14} == 560 \sqrt{14}': r'AB \cdot AC == 112 \sqrt{350} == 112 \cdot 5 \sqrt{14} == 560 \sqrt{14}',
        r'\sum_{k==0}^{11}\binom{11}{k}\binom{12}{k+1}==\binom{11}{0}\binom{12}{1}+\binom{11}{1}\binom{12}{2}+\cdots + \binom{11}{11}\binom{12}{12}':r'\sum_{k=0}^{11} \binom{11}{k} \binom{12}{k+1}',
        r'\sum_{k==0}^{11}\binom{11}{k}\binom{12}{k+1}==\binom{12}{12}+\binom{12}{1}\left(\binom{11}{0}+\binom{11}{1}\right)+\cdots':r'\sum_{k=0}^{11} \binom{11}{k} \binom{12}{k+1}',
        r'5\left(1 + \frac{e^{\frac{i\pi}{3}}}{2} + \left(\frac{e^{\frac{i\pi}{3}}}{2}\right)^2 + \cdots\right)':r'5 \left( \frac{1}{1 - \frac{e^{\frac{i\pi}{3}}}{2}} \right)',
        r'1-1/8+1/64-1/512\cdots==(7/8)(1+1/64+1/4096\cdots)==(7/8)(64/63)==8/9':r'\frac{8}{9} == \left( \frac{7}{8} \right) \left( \frac{64}{63} \right) == \frac{8}{9}',
        r'\pmb v_{1}+\pmb v_{2}+\pmb v_3+\cdots':r'\sum_{i=1}^{\infty} v_i',
        r'a\big(\!\cos0^\circ+r\cos60^\circ + r^{2}\cos120^\circ+r^{3}\cos180^\circ+r^{4}\cos240^\circ+r^{5}\cos300^\circ+r^{6}\cos360^\circ+\cdots\big).':r'a \sum_{n=0}^{\infty} r^n \cos \left( \frac{n\pi}{3} \right)',
        r'aS(1+r^{6}+r^{12}+r^{18}+\cdots)==\frac{aS}{1-r^{6}},':r'aS == \left( \frac{1}{1 - r^6} \right) == \frac{aS}{1 - r^6}',
        r'5-\frac{5}{8}+\frac{5}{64}-\cdots==\frac{40}{9}':r'\frac{40}{9} == \frac{40}{9}',
        r'\begin{tabular}{|c|c|c|r|} \hline a  & b                  & c & triples \\ \hline -8 & \{9,10\}           & -6a - 2b - 20    &             2              \\ -7 & \{6, 7, 8, 9, 10\} &   -6a - 2b - 20  &         5                  \\ -6 & \{-10, \ldots, 10\} &  \{-10, \ldots, 10\}   &                  441        \\ -5 & \{0, \ldots, 10\}      &  -6a - 2b - 20   &    11                       \\ -4 & \{-3, \ldots, 7\}        &  -6a - 2b - 20   &   11                       \\ -3 & \{-6, \ldots, 4\}        &  -6a - 2b - 20   &   11                       \\ -2 & \{-9, \ldots, 1\}         &  -6a - 2b - 20   &   11                        \\ -1 & \{-10, \ldots, -2\}   &  -6a - 2b - 20   &           9                \\ 0  & \{-10, \ldots, -5\}        &   -6a - 2b - 20  &      6                     \\ 1  & \{-10, -9, -8\}        &  -6a - 2b - 20   &      3                     \\ \hline Total & & & 510\\ \hline \end{tabular}':r'c(a, b) == -6a - 2b - 20',
        r'\{-10, \ldots, 10\}':r'\{-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10\}',
        r'\{0, \ldots, 10\}':r'\{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10\}',
        r'\{-3, \ldots, 7\}':r'\{-3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7\}',
        r'\{-6, \ldots, 4\}':r'\{-6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4\}',
        r'\{-9, \ldots, 1\}':r'\{-9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1\}',
        r'\{-10, \ldots, -2\}':r'\{-10, -9, -8, -7, -6, -5, -4, -3, -2\}',
        r'\{-10, \ldots, -5\}':r'\{-10, -9, -8, -7, -6, -5\}',
        r'149^{a \cdot 3^b} - 2^{a \cdot 3^b} ==  (149^{3^b} - 2^{3^b})(149^{3^b(a - 1)} + 149^{3^b(a - 2)}\cdot2^{3^b}+\cdots2^{3^b(a - 1)})':r'149^{a \cdot 3^b} - 2^{a \cdot 3^b} == (149^{3^b} - 2^{3^b}) \sum_{k=0}^{a-1} 149^{3^b(a - 1 - k)} \cdot 2^{3^bk}',
        r'v_7(149^{7^{k+1}} - 2^{7^{k+1}}) == v_7(149^{7^k} - 2^{7^k}) + v_7(149^{6\cdot7^k} + 2^{7^k}\cdot149^{5\cdot7^k} + \cdots + 2^{5\cdot7^k}\cdot149^{7^k} + 2^{6\cdot7^k})':r'v_7(149^{7^{k+1}} - 2^{7^{k+1}}) == v_7(149^{7^k} - 2^{7^k}) + v_7 \left( \sum_{i=0}^{6} 2^{i \cdot 7^k} \cdot 149^{(6 - i) \cdot 7^k} \right)',
        r'S(k) == 149^{6\cdot7^k} + 2^{7^k}\cdot149^{5\cdot7^k} + \cdots + 2^{5\cdot7^k}\cdot149^{7^k} + 2^{6\cdot7^k} \equiv 7 \cdot 2^{6\cdot7^k}\mod{7} \equiv 7 \cdot 2^{6\cdot7^k}\mod{49}':r'S(k) = \sum_{i=0}^{6} 2^{i \cdot 7^k} \cdot 149^{(6 - i) \cdot 7^k} \equiv 7 \cdot 2^{6 \cdot 7^k} \mod 7 \equiv 7 \cdot 2^{6 \cdot 7^k} \mod 49',
        r'v_5(149^{4\cdot5^{k+1}} - 2^{4\cdot5^{k+1}}) == (k+1) + v_5(149^{4\cdot4\cdot5^k}+2^{4\cdot5^k}\cdot149^{3\cdot4\cdot5^k}+\cdots+2^{3\cdot4\cdot5^k}\cdot149^{4\cdot5^k}+2^{4\cdot4\cdot5^k})':r'v_5(149^{4 \cdot 5^{k+1}} - 2^{4 \cdot 5^{k+1}}) = (k+1) + v_5 \left( \sum_{i=0}^{4} 2^{i \cdot 4 \cdot 5^k} \cdot 149^{(4 - i) \cdot 4 \cdot 5^k} \right)',
        r"S'(k) ==  149^{4\cdot4\cdot5^k}+2^{4\cdot5^k}\cdot149^{3\cdot4\cdot5^k}+\cdots+2^{3\cdot4\cdot5^k}\cdot149^{4\cdot5^k}+2^{4\cdot4\cdot5^k}":r"S'(k) == \sum_{i=0}^{4} 2^{i \cdot 4 \cdot 5^k} \cdot 149^{(4 - i) \cdot 4 \cdot 5^k}",
        r"S''(k) == 1 + 2^{4\cdot5^k} + \cdots + 2^{16\cdot5^k}": r"S''(k) == \sum_{i=0}^{4} 2^{4i \cdot 5^k}",
        r"S''(k) == 1 + 2^{4\cdot5^k} + \cdots + 2^{16\cdot5^k} ": r"S''(k) == \sum_{i=0}^{4} 2^{4i \cdot 5^k}",
        r'2^0\theta \equiv 2^3\theta \equiv 2^6\theta \equiv \cdots \mod{180*\frac{\pi}{180}}':r'\sum_{k=0}^{\infty} 2^{3k} \theta \equiv \theta \mod{180*\frac{\pi}{180}}',
        r'aS(1+r^{6}+r^{12}+r^{18}+\cdots)==\frac{aS}{1-r^{6}}': r'aS \sum_{n=0}^{\infty} r^{6n} == \frac{aS}{1 - r^6}'
        }
    
    if expression in equivalents:
        interpreted_expression = equivalents[expression]
        return interpreted_expression
    
    return expression

In [3]:
def sympy_helper(exp):
    equivalents = {
        r'(c^2 == a^2 + b^2 - 2abcos C)': r'c^2 == a^2 + b^2 - 2ab \cos C',
        r'\angleKLN == \angleKMN == 90^∘': r'\angleKLN == \angleKMN == 90^{\angle}',
        r'm\angleKPL == 90^∘': r'\angleKPL == 90^\angle',
        r'\angleKPL ≅\angleKLN': r'\angleKPL ==\angle',
        r'\anglePKL ≅\angleLKN':r'\anglePKL ==\angleLKN',
        r'PKL ∼LKN':r'\triangle PKL == \triangle LKN',
        r'KMN ∼KPO': r'\triangle KMN == \triangle KPO',
        r'KOL ∼KHP':r'\triangle KOL == \triangle KHP',
        r'd=={090': r'd=={090}',
        r'(a,b)==(20,190).':r'Eq((a, b), (20, 190))',
        r'xy(x, y)^2[x, y]^2==10^630':r'Eq(x * y * (x + y)**2 * x**2 * y**2, 10**630)',
        r'(a, b)*[a, b]==ab':r'Eq(a**2 + b**2, a*b)',
        r'd^3α==10^60, d^3α^2β^3==10^510 αβ^3 == 10^510==2^510 *5^510': [r'Eq(d**3*alpha, 10**60), Eq(d**3*alpha**2*beta**3, 10**510),Eq(alpha*beta**3, 10**510)'],
        r'(a,b,c,d)==(20,20,190,190))))))))':r'Eq((a, b, c, d), (20, 20, 190, 190))',
        r'sin^2x*cos^2x≠=5/6':r'\sin^2 x \cdot \cos^2 x \neq \frac{5}{6}',
        r'a == 1 ±2/3/2':r'[Eq(a,(1 + 2/3) / 2), Eq(a,(1 - 2/3) / 2)]',
        r'p==q^4±1':r'[Eq(p,(q**4 + 1)), Eq(p,(q**4 - 1))]',
        r'| ∑_1 ≤j <k ≤673 z_jz_k |':[r'Abs(Sum(z[j]*z[k], (j, 1, 672), (k, j + 1, 673)))'],
        r'x==∑_1≤j<k≤673 z_jz_k': [r'Eq(x, Sum((z[j] * z[k]), (j, 1, 672), (k, j+1, 673)))'],
        r'3∑_i==1^673z_i==-20∑_i==1^673z_i==-20/3.': r'Eq(3*Sum(z_i, (i, 1, 673)), -20), Eq(Sum(z_i, (i, 1, 673)), -1*20/3)',
        r'1≤j<k≤673': r'1\le j < k\le 673',
        r'| ∑_1 ≤j <k ≤673 z_jz_k |  == S': [r'Eq(S,Abs(Sum(z[j]*z[k], (j, 1, 672), (k, j + 1, 673))))'],
        r'== 3*673+':r'3 * 673',
        r'\ge 2' : r'v_2(n) \ge 2',
        r'> 2019': r'2^3*5^4 > 2019'
    }

    if exp in equivalents:
        interpreted_expression = equivalents[exp]
        return interpreted_expression
    
    return exp

In [4]:
import re

def clean_exp(exp):
    if exp is None:
        print('IS NONE', exp)
        return exp

    exp = re.sub(r'\\frac\s*(\d)(\d)', lambda match: r'\frac{{{0}}}{{{1}}}'.format(match.group(1), match.group(2)), exp)
    exp = re.sub(r'\\frac\s*\{([^{}]*\{[^{}]*\}[^{}]*|[^{}]+)\}(\d+)', lambda match: r'\frac{{{0}}}{{{1}}}'.format(match.group(1), match.group(2)), exp)
    #exp = re.sub(r'([a-zA-Z])(=\\pm|\\pm)(\\sqrt\{[^}]+\})', lambda match: f'{match.group(1)}= {match.group(1)} + {match.group(3)}, {match.group(1)}= {match.group(1)} - {match.group(3)}', exp)
    #exp = re.sub(r'\\pm\s*\\sqrt\{(\d+)\}', r'{\\sqrt{\\1}, -\\sqrt{\\1}}', exp)
    exp = re.sub(r'\\implies', lambda match: ',' if re.search(r'\\pm', exp) else '\\implies', exp)
    exp = re.sub(
        r'(\w+|\\frac\{[^}]+\}\{[^}]+\}|\\text\{[^}]+\})\s*=\s*\\pm\s*(\\sqrt\{[^}]+\}|\d+)|\\pm\s*(\\sqrt\{[^}]+\}|\d+)',
        lambda match: (
            f'{match.group(1)} = +{match.group(2)}, {match.group(1)} = -{match.group(2)}' 
            if match.group(1) and match.group(2) 
            else f'+{match.group(3)}, -{match.group(3)}'
        ),
        exp
    )
    exp = re.sub(
        r'(\w+)\s*=\s*(\w+|\\[a-zA-Z]+\{[^}]+\})\s*\\cap\s*(\w+|\\[a-zA-Z]+\{[^}]+\})',
        lambda match: f'{match.group(1)} = \\text{{Intersection}}({match.group(2)}, {match.group(3)})',
        exp
    )
    exp = re.sub(
        r'(\w+|\\[a-zA-Z]+\{[^}]+\})\s*\\cap\s*(\w+|\\[a-zA-Z]+\{[^}]+\})\s*=\s*(\w+)',
        lambda match: f'{match.group(3)} = \\text{{Intersection}}({match.group(1)}, {match.group(2)})',
        exp
    )
    
    exp = re.sub(r'\\sqrt\s*(\d+|x)', r'\\sqrt{\1}', exp)
    exp = re.sub(r'\\sim', r'\\text{sim}', exp)
    exp = re.sub(r'\\in\s*(i(?:\\mathbb\{R\}|\\mathbb\s*R)|\{[^}]+\})',
                 lambda match: 'c * I' if match.group(1) in ['i\\mathbb{R}', 'i \\mathbb R'] else match.group(1),
                 exp)
    if exp.count('|') == 1:
        exp = re.sub(
                r'([^$|]+)\s*\|\s*([^$|]+)', lambda match: f'\\frac{{{match.group(2)}}}{{{match.group(1)}}}', exp)
    exp = re.sub(r'E','e', exp)
    exp = re.sub(r'\\parallel', r',\\text{parallel},', exp)
    exp = re.sub(r'\\stackrel', r',\\text{stackrel},', exp)
    exp = exp.replace('=', '==') 
    exp = exp.replace('pmod', 'mod')
    exp = exp.replace('bmod', 'mod')
    exp = re.sub(r'\\boxed', '', exp)
    exp = re.sub(r'\\cong', '==', exp)
    exp = re.sub(r'\^\\circ', r'*\\frac{\\pi}{180}', exp)
    exp = re.sub(r'\^{\\circ}', r'*\\frac{\\pi}{180}', exp)
    exp = re.sub('–', '-', exp)
    exp = re.sub(r'\\perp', '', exp) # Check this out

    if exp.startswith('$') and exp.endswith('$'):
        exp = exp[1:-1]
        if exp.startswith(r'\implies'):
            exp = exp[8:]
        if exp.endswith(r'\implies'):
            exp= exp[:-8]
        exp = re.sub(r'\\implies', '==', exp)
        if exp.endswith('=='):
            exp = re.sub('==','',exp)
        return exp

    if exp.startswith(r'\[') and exp.endswith(r'\]'):
        exp = exp[2:-2]
        if exp.startswith(r'\implies'):
            exp = exp[8:]
        if exp.endswith(r'\implies'):
            exp= exp[:-8] 
        exp = re.sub(r'\\implies', '==', exp)
        if exp.endswith('=='):
            exp = re.sub('==','',exp)
        return exp

    return exp

In [5]:
errorsdictionary =['(an argument of 90 degrees puts a complex number on the imaginary axis)','|',r'\]',
                    '(No integer greater than $1$ can have fewer than $2$ divisors.)',
                    '(Note that $32k$ and $64k$ can both be expressed in the form $16k$.)',
                    '(note that there are $6^4 = 1296$ outcomes)',
                    '(either 2, 3, or 6)',
                    "(you will have 1 non-dud and that's never going to be square)",
                    r'(We can have any two non-duds twice. For example, 2,2,5,5. There are $\binom{4}{2} = 6$ ways of choosing which two non-duds to use and $\binom{4}{2} = 6$ ways of choosing how to arrange those 4 numbers. That gives us 6*6=36 combinations. We can also have 2,2,2,2 or 3,3,3,3 or 5,5,5,5 or 6,6,6,6. This gives us a total of 40 combinations)',
                    '(in the diagram, 2, 1, 1, 4)',
                    '($12$ options)',
                    '($6$ permutations)',
                    '(at $n=0$)',
                    '(and therefore $A_3A_4$ is vertical)',
                    r'(as $K=\frac{1}{2}bh$ and the blue line represents $h$ while $b$ is fixed)',
                    '(since we can continually add $5$ to each residue)',
                    '(by repeatedly adding $5$ to either $n$ or $n+1$)',
                    r'(since $n$ and $n+1$ only contribute $1$ more residue $\bmod 5$)',
                    '(since $92$ is unobtainable)',
                    r'(Also note that in the $3 \pmod{5}$ case, the residue $2 \pmod{5}$ has will not be produced until $3(n+1)$ while the $1\pmod5$ case has already been produced, so the highest possible value that cannot be produced would not be a number equivalent to $1 \pmod5$)'
                    '(95 can always be formed)',
                    '(case 1)',
                    '(case 2)',
                    '(with $13$ factored out)',
                    '(not equal to $f(2) = f(4)$)',
                    '(necessitated by $149^n \equiv 2^n \mod 5$ in order to set up LTE)',
                    '(They happen to both be $-1$ and $2$ respectively, so you only need to compute once)',
                    '(just $1$$2$$4$ using mods-not too bad)',
                    r'(where $n \ge 1$)',
                    r'(We know that $\triangle ADE$ is a $6-8-10$, since $\triangle DEB$ is an $8-15-17$.)',
                    '(the only way to have a square product is rolling a 2,3 and 6. There are 3! ways of doing that and a total of $4^3$ ways to roll 3 non-duds)',
                    '(e.g. $k=21$ below)'
]

In [7]:
def parse_expression(expr):
    if expr in errorsdictionary:
         return 0
    cleaned_expr = clean_exp(expr)
    try:
        if isinstance(sp.sympify(cleaned_expr), bool):
            try:
                return latex2sympy(cleaned_expr)
            except Exception as e:
                print('BOOL ERROR', str(e))
        else:
            return sp.sympify(cleaned_expr)
    except Exception as e:
        try:
            parsed_expr = parse_assigment_with_dots(cleaned_expr)
            #print('TEST',parsed_expr)
            if parsed_expr.startswith('And'):
                        parsed_expr_and = parsed_expr.replace('==', '=')
                        return(sp.sympify(parsed_expr_and))
            return latex2sympy(parsed_expr)
        except Exception as e:
            #print('FIRST FULL TRY',str(e),parsed_expr, expr)
            try:
                if re.search(r'\\cdots|\\ldots|\\dots', cleaned_expr):
                    text_expr = LatexNodes2Text().latex_to_text(parsed_expr)
                    text_expr = sympy_helper(text_expr)
                    return sp.sympify(str(text_expr))
                text_expr = LatexNodes2Text().latex_to_text(cleaned_expr)
                text_expr = text_expr.replace('·', '*')
                text_expr = text_expr.replace('∠', '\\angle')
                text_expr = sympy_helper(text_expr)
                return sp.sympify(str(text_expr))
            except Exception as e:
                try:
                    return latex2sympy(text_expr)
                except Exception as e:
                    #print('LAST TRY',str(e),text_expr, expr)
                    return str(e)

In [8]:
def get_variables(parsed_expr):
    try:
        if isinstance(parsed_expr, bool) or parsed_expr is False:
            return []
        if isinstance(parsed_expr, tuple):
            parsed_expr = list(parsed_expr)
        if isinstance(parsed_expr, list):
            variables_list = []
            for item in parsed_expr:
                if isinstance(item, bool) or item is False:
                    continue
                variables_list.extend(item.atoms(sp.Symbol))
            return list(set(variables_list))  # Deleting duplicates
        variables_list = list(parsed_expr.atoms(sp.Symbol))
        return variables_list
    except Exception as e:
        return str(e)

In [9]:
def process_dataframe(dfexpre):
    parsed_results = []
    variables_results = []
    context_variables = set()

    for index, row in dfexpre.iterrows():
        row_results = []
        row_var_results = []
        for cell in row:
            cell_results, cell_var_results = process_cell(cell)
            row_results.append(cell_results)
            row_var_results.append(cell_var_results)
            for _, var_colect in cell_var_results:
                context_variables.update(var_colect)
        parsed_results.append(row_results)
        variables_results.append(row_var_results)

    # Flat the results
    flat_parsed_results = [item for sublist in parsed_results for item in sublist]
    flat_parsed_results = [item for sublist in flat_parsed_results for item in sublist]

    # Create DataFrame with the flatted results
    flat_parsed_df = pd.DataFrame(flat_parsed_results, columns=['Expression', 'Value', 'Parsed', 'Variables'])

    # Parsed counter
    flat_parsed_df['Parsed_Success'] = flat_parsed_df['Parsed'].apply(lambda x: not isinstance(x, str))
    parsed_success_count = flat_parsed_df['Parsed_Success'].sum()
    parsed_failure_count = len(flat_parsed_df) - parsed_success_count
    
    print(f"Successfully parsed expressions: {parsed_success_count}")
    print(f"Expressions that could not be parsed: {parsed_failure_count}")

    return parsed_results, variables_results, context_variables, flat_parsed_df

In [10]:
def process_cell(cell):
    cell_results = []
    cell_var_results = []
    for key, value in cell.items():
        parsed = parse_expression(value)
        var_colect = get_variables(parsed)
        cell_results.append((key, value, parsed, var_colect))
        cell_var_results.append((key, var_colect))
    return cell_results, cell_var_results

In [11]:
# load Complete_Expressions.csv

complete_expressions = pd.read_csv('Data/Complete_Expressions.csv', header=None)
dfexpre=pd.DataFrame(complete_expressions)
dfexpre = dfexpre.map(lambda x: ast.literal_eval(x))

In [12]:
parsed_results, variables_results, context_variables, flat_parsed_df= process_dataframe(dfexpre)

BOOL ERROR I expected something else here
F==(0,10)
~~~~~^
BOOL ERROR I expected something else here
(a,b)==(20,190)
~~^
BOOL ERROR I expected something else here
(x,y)==(2^a5^b,2^c5^d)
~~^
BOOL ERROR I expected something else here
(a,b,c,d)==(20,20,190,190)
~~^
BOOL ERROR I expected something else here
(c,d,e) == (1,71,1)
~~^
BOOL ERROR I expected something else here
(a,b) == (3,4)
~~^




BOOL ERROR I expected something else here
(n,m) == (121,286)
~~^
BOOL ERROR I expected something else here
B==(0,0)
~~~~~^
BOOL ERROR I expected something else here
B==(0,0)
~~~~~^
BOOL ERROR I expected something else here
C==(5,0)
~~~~~^
BOOL ERROR I expected something else here
D==(2,0)
~~~~~^
BOOL ERROR I expected something else here
A==(1,0,0)
~~~~~^
BOOL ERROR argument of type 'Symbol' is not iterable
BOOL ERROR argument of type 'Symbol' is not iterable
BOOL ERROR argument of type 'Symbol' is not iterable
BOOL ERROR argument of type 'Symbol' is not iterable
BOOL ERROR argument of type 'Symbol' is not iterable
BOOL ERROR argument of type 'Symbol' is not iterable
BOOL ERROR argument of type 'Symbol' is not iterable
BOOL ERROR argument of type 'Symbol' is not iterable
BOOL ERROR argument of type 'Symbol' is not iterable
BOOL ERROR argument of type 'Symbol' is not iterable
BOOL ERROR argument of type 'Symbol' is not iterable
BOOL ERROR argument of type 'Symbol' is not iterable
BOOL ER

In [12]:
flat_parsed_df['Parsed'].apply(type).value_counts()

Parsed
<class 'sympy.core.relational.Equality'>             1444
<class 'sympy.core.symbol.Symbol'>                   1079
<class 'sympy.core.numbers.Integer'>                  581
<class 'sympy.core.mul.Mul'>                          355
<class 'tuple'>                                       190
<class 'str'>                                         181
<class 'sympy.core.add.Add'>                          150
<class 'list'>                                        118
<class 'set'>                                         104
<class 'sympy.core.numbers.One'>                       50
<class 'bool'>                                         38
<class 'sympy.core.power.Pow'>                         37
<class 'sympy.core.numbers.Zero'>                      33
<class 'int'>                                          30
<class 'NoneType'>                                     28
<class 'function'>                                     23
<class 'sympy.core.relational.StrictLessThan'>         21
<class 

In [13]:
parsed_resultsdf = pd.DataFrame(parsed_results)

In [14]:
tuples_df = flat_parsed_df[flat_parsed_df['Parsed'].apply(lambda x: isinstance(x, tuple))]
setdf = flat_parsed_df[flat_parsed_df['Parsed'].apply(lambda x: isinstance(x, set))]
strdf = flat_parsed_df[flat_parsed_df['Parsed'].apply(lambda x: isinstance(x, str))]
listdf = flat_parsed_df[flat_parsed_df['Parsed'].apply(lambda x: isinstance(x, list))]
booldf = flat_parsed_df[flat_parsed_df['Parsed'].apply(lambda x: isinstance(x, bool))]
symboldf = flat_parsed_df[flat_parsed_df['Parsed'].apply(lambda x: isinstance(x, sp.core.symbol.Symbol))]

In [15]:
tuples_df.head()

Unnamed: 0,Expression,Value,Parsed,Variables,Parsed_Success
31,expression2,"($9$, $108$, $1107$, $11106$)","(9, 108, 1107, 11106)",[],True
81,expression3,"$(\{1,2\},\{2,3\},\{3,4\}\dots,\{19,20\})$","((1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7...",'tuple' object has no attribute 'atoms',True
143,expression1,"$A=(4,12), B=(16,3), C=(15,0), D=(5,0), E=(0,5)$","(False, False, False, False, False)",[],True
147,expression5,"$(a_1, b_1)$","(a_1, b_1)","[b_1, a_1]",True
148,expression6,"$(a_2, b_2)$","(a_2, b_2)","[b_2, a_2]",True


In [16]:
booldf.head()

Unnamed: 0,Expression,Value,Parsed,Variables,Parsed_Success
82,expression4,$1-\frac{19}{190}=\frac9{10}$,True,[],True
84,expression6,$\frac12\cdot\frac9{10}=\frac{9}{20}$,True,[],True
442,expression4,"$\log_{10}{x}+\log_{10}{(x, y)^2}=60$",False,[],True
679,expression67,$8+9+16+25+121+361 = \boxed{\textbf{540}}$,True,[],True
815,expression18,$m+n = 343+9 = \fbox{352}$,False,[],True


In [17]:
setdf.head()

Unnamed: 0,Expression,Value,Parsed,Variables,Parsed_Success
26,expression4,$\boxed{342}$,{342},'set' object has no attribute 'atoms',True
35,expression6,$\boxed{342}$,{342},'set' object has no attribute 'atoms',True
39,expression3,{321},{321},'set' object has no attribute 'atoms',True
40,expression4,{321},{321},'set' object has no attribute 'atoms',True
41,expression5,{321},{321},'set' object has no attribute 'atoms',True


In [18]:
strdf['Parsed']

1210    I don't understand this\nf(f(z))-f(z)/f(z)-z==...
1620    I expected something else here\n-1==(A,B;X,Y),...
1671    I don't understand this\n{1/2 (11 +√(61), -√(6...
1682    I don't understand this\n{1/2 (11 +√(61), -√(6...
1684    I don't understand this\nPY, PX ∈{1/2 (11 +√(6...
                              ...                        
4640    I don't understand this\nsinOAH ==MH/AH == 2/3...
4642    I don't understand this\nR == AP/2cosOAP == 7/...
4643            argument of type 'Symbol' is not iterable
4644    I don't understand this\n BC/2 == √(R^2 - OM'^...
4646    I don't understand this\n[ABC]==BC/2 *(AH + HD...
Name: Parsed, Length: 181, dtype: object

In [19]:
count = strdf['Value'].str.count(r'\\implies').sum()

print(f"El patrón '\\implies' aparece {count} veces en la columna 'Value'.")

El patrón '\implies' aparece 8 veces en la columna 'Value'.


In [20]:
filtered_df = strdf[strdf['Value'].str.contains(r'\\implies', regex=True)]
for i in filtered_df['Value']:
    print(i)

$\phantom{\cdots\cdots\cdots\cdots\cdots\cdots.}15 \cdot 35 = 15l^2 \implies l = \sqrt{35} \implies AC = 8 \sqrt{35}$
\[2R \cos \alpha = AB = 7 \implies R = \frac {\frac{7}{2} } {\frac{7}{2}\sqrt \frac{13}{160}} = \sqrt {\frac{160}{13}} \]
$\implies$
$\frac{\frac{5\sqrt{14}}{7}\times\frac{3\sqrt{2}}{2}}{2}={\frac{15\sqrt{7}}{14}}\implies{m+n+p=\boxed{036}}$
$OA = O'H, OA || O'H \implies M$
\[\sin OAH =\frac{MH}{AH} = \frac{2}{3}  \implies  \cos OAH = \frac{\sqrt{5}}{3}\]
\[[ABC]=\frac{BC}{2} \cdot (AH + HD) = 3\cdot \sqrt{55}  \implies 3+55 = \boldsymbol{\boxed{058}}\]


In [21]:
strdf.head() # These are the expressions that failed to be parsed and returned an error. They are for the TODO list 

Unnamed: 0,Expression,Value,Parsed,Variables,Parsed_Success
1210,expression0,\[\frac{f(f(z))-f(z)}{f(z)-z}=-\frac{f(f(z))-f...,I don't understand this\nf(f(z))-f(z)/f(z)-z==...,'str' object has no attribute 'atoms',False
1620,expression9,"\[-1=(A,B;X,Y)\stackrel{A}{=}(R,P;X,Y)\]","I expected something else here\n-1==(A,B;X,Y),...",'str' object has no attribute 'atoms',False
1671,expression7,\{\frac 12 (11 \pm \sqrt{61})\right\},"I don't understand this\n{1/2 (11 +√(61), -√(6...",'str' object has no attribute 'atoms',False
1682,expression18,\{\frac 12 (11 \pm \sqrt{61})\right\},"I don't understand this\n{1/2 (11 +√(61), -√(6...",'str' object has no attribute 'atoms',False
1684,expression20,"\[PY, PX \in \left\{\frac 12 (11 \pm \sqrt{61}...","I don't understand this\nPY, PX ∈{1/2 (11 +√(6...",'str' object has no attribute 'atoms',False


In [22]:
listdf.head()

Unnamed: 0,Expression,Value,Parsed,Variables,Parsed_Success
45,expression1,"$1, 2, 3,\ldots, 19, 20$","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",[],True
47,expression3,"$1, 2, 3,\ldots, 19, 20$","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",[],True
79,expression1,"$\{J,B\}$","[J, B]","[B, J]",True
190,expression2,"$\triangle PAF,\triangle QCB$","[A*F*P*\triangle, B*C*Q*\triangle]","[P, B, A, C, Q, F, \triangle]",True
193,expression5,"$a,\,b$","[a, b]","[b, a]",True


In [23]:
symboldf.head()

Unnamed: 0,Expression,Value,Parsed,Variables,Parsed_Success
32,expression3,$n$,n,[n],True
44,expression0,$J$,J,[J],True
46,expression2,$B$,B,[B],True
48,expression4,$J$,J,[J],True
52,expression8,$m$,m,[m],True


In [24]:
unique_elements = flat_parsed_df['Parsed'].apply(type).value_counts() == 1
unique_elements_df = flat_parsed_df[flat_parsed_df['Parsed'].apply(lambda x: type(x) in unique_elements[unique_elements].index)]

In [25]:
unique_elements_df.head()

Unnamed: 0,Expression,Value,Parsed,Variables,Parsed_Success
75,expression6,"$(B = 2 \ldots 20), (J = 1 \ldots 19)$",(1 <= J) & (2 <= B) & (B <= 20) & (J <= 19),"[B, J]",True
89,expression3,$\dbinom{n-k+1}{k}$,"binomial(-k + n + 1, k)","[n, k]",True
608,expression0,$\tau(n)$,\tau(n),[n],True
1425,expression19,$cos$,cos,[],True
2433,expression5,$d(n)$,d(n),[n],True


## For the next section, we must tokenize the parsed expressions.

As the Sympy format cannot be saved directly as CSV or JSON without transformation, due to the limitations of these formats in handling complex symbolic representations. Saving without proper conversion would likely result in loss of symbolic structure and functionality, rendering the expressions unrecognizable or plain text.



In [26]:
#parsed_resultsdf.applymap(lambda x: repr(x))
#parsed_resultsdf.to_csv('Data/parsed_results.csv', index=None)

In [27]:

reference_text = pd.read_csv('Data/refefenceText.csv', header=None)

# Convert the reference_text list directly to a DataFrame
dfreference=pd.DataFrame(reference_text)




In [28]:
def extract_parsed_expressions(parsed_resultsdf):
    parsed_list = []
    variables_set = set()
    for i in range(len(parsed_resultsdf)):
        parsed_list_row = []
        for j in range(len(parsed_resultsdf.columns)):
            row = parsed_resultsdf.iloc[i, j]
            parsed_dict = {}
            for exp in row:
                if isinstance(exp, tuple) and len(exp) >= 3:
                    expr_key = exp[0]
                    parsed_expr = exp[2]
                    variables = exp[3]
                    parsed_dict[expr_key] = parsed_expr
                    variables_set.update(variables)
                else:
                    print(len(exp), exp)
            parsed_list_row.append(parsed_dict)
        parsed_list.append(parsed_list_row)
    
    return parsed_list, list(variables_set)

In [29]:
parsed_list_with_dict , variables= extract_parsed_expressions(parsed_resultsdf) 

In [30]:
# Define SymPy symbols for each variable in the list
symbols = sp.symbols(' '.join(str(var) for var in variables))
expression_types = {
   }

In [31]:

def get_expression_type(expr_type):
    if expr_type not in expression_types:
        description = expr_type.__name__.split('.')[-1]
        expression_types[expr_type] = description
    return expression_types[expr_type]

def tokenize_expression(expr):
    expr_type = type(expr)
    description = get_expression_type(expr_type)
    
    if isinstance(expr, sp.Equality):
        return [description] + tokenize_expression(expr.lhs) + tokenize_expression(expr.rhs)
    elif isinstance(expr, sp.Sum):
        return [description] + tokenize_expression(expr.args[0]) + ['over'] + tokenize_expression(expr.args[1])
    elif isinstance(expr, sp.Mul):
        return [description] + [tokenize_expression(arg) for arg in expr.args]
    elif isinstance(expr, sp.Pow):
        return [description] + tokenize_expression(expr.base) + tokenize_expression(expr.exp)
    elif isinstance(expr, sp.Add):
        return [description] + tokenize_expression(expr.args[0]) + tokenize_expression(expr.args[1])
    elif isinstance(expr, sp.Symbol):
        return [expr.name]
    elif isinstance(expr, sp.Integer):
        return [str(expr.numerator)]
    elif isinstance(expr, sp.Rational):
        return [str(expr)]
    else:
        return [str(expr)]

In [32]:
tokenized_expressions = []
for sublist in parsed_list_with_dict:
    tokenized_expressions_row = []
    for dictionary in sublist:
        tokenized_expressions_cell = {}
        for key, value in dictionary.items():
            tokens = tokenize_expression(value)
            tokenized_expressions_cell[key] = tokens
        tokenized_expressions_row.append(tokenized_expressions_cell)
    tokenized_expressions.append(tokenized_expressions_row)

In [33]:
tokenized_expressions[0][0]

{'expression0': ['Sum',
  'Mul',
  ['9'],
  ['Pow', '10', 'Add', 'k', '-1'],
  'over',
  '(k, 1, 321)'],
 'expression1': ['<function N at 0x0000027A4DE17880>']}

#### EXPORT `tokenized_expressions` for next Section

In [34]:
 # Save File for the next section
with open('Data/tokenized_expressions.csv', 'w', newline='') as file:
   writer = csv.writer(file)
   writer.writerows(tokenized_expressions)