In [32]:
from latex2sympy2 import latex2sympy
from pylatexenc.latex2text import LatexNodes2Text
import sympy as sp
import pandas as pd
import re
import ast
import warnings
import csv
warnings.filterwarnings('ignore')

In [33]:
def parse_assigment_with_dots(expression):
    equivalents = {
        r"N = 9 + 99 + 999 + 9999 + \cdots + \underbrace{99\ldots 99}_\text{321 digits}.": r"N = \sum_{k=1}^{321} 9 \times 10^{k-1}",
        r"(10-1)+(10^2-1)+(10^3-1)+\cdots+(10^{321}-1)": r"\sum_{k=1}^{321} (10^k - 1)",
        r"(10+10^2+10^3+\cdots+10^{321})-321": r"\sum_{k=1}^{321} 10^k - 321",
        r"(10+10^2+10^3+\cdots+10^{321})": r"\sum_{k=1}^{321} 10^k",
        r"1111\cdots.10": r"\frac{10^{322} - 1}{9}",
        r"(10+100+\cdots 10^{320}+10^{321})": r"\sum_{k=0}^{321} 10^k",
        r"11\cdots0": r"\frac{10^{322} - 10}{9}",
        r"11\cdots10-n": r"\frac{10^{322} - 1}{9} - n",
        r"1, 2, 3,\ldots, 19, 20": r"1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20",
        r"\dfrac{1}{2} |(a_1b_2 + a_2b_3 + \cdots + a_nb_1) - (b_1a_2 + b_2a_3 + \cdots + b_na_1)|": r"(\sum_{i=1}^{n} a_ib_{i+1} - \sum_{i=1}^{n} b_ia_{i+1})*(1/2)",
        r"(b_1a_2 + b_2a_3 + \cdots + b_na_1)": r"\sum_{i=1}^{n} b_ia_{i+1}",
        r"z_1,z_2,\dots,z_{673}": r"\sum_{i=1}^{673} z_i",
        r"(x-z_1)^3(x-z_2)^3 \cdots (x-z_{673})^3": r"\prod_{i=1}^{673} (x - z_i)^3",
        r"(z_1z_2+z_1z_3+ \dots + z_1z_{672}+z_1z_{673})+(z_2z_3+z_2z_4+ \dots +z_2z_{673}) + (z_3z_4+z_3z_5+ \dots +z_3z_{673}) + \dots +z_{672}z_{673}.": r"\sum_{i=1}^{672} \left( \sum_{j=i+1}^{673} z_iz_j \right)",
        r"P=(x-z_1)(x-z_1)(x-z_1)(x-z_2)(x-z_2)(x-z_2) \dots (x-z_{673})(x-z_{673})(x-z_{673})": r"P = \prod_{i=1}^{673} (x - z_i)^3",
        r"z_1,z_1,z_1,z_2,z_2,z_2, \dots , z_{673},z_{673},z_{673}": r"\sum_{i=1}^{673} \left( z_i + z_i + z_i \right)",
        r"(-1)^1 \cdot \dfrac{20}{1}=-20=z_1+z_1+z_1+z_2+z_2+z_2+ \dots + z_{673}+z_{673}+z_{673}=3(z_1+z_2+z_3+ \dots +z_{673})": r"(-1)^1 \frac{20}{1} = -20 = \sum_{i=1}^{673} 3z_i = 3 \sum_{i=1}^{673} z_i",
        r"z_1+z_2+z_3+ \dots +z_{673}=- \dfrac{20}{3}.": r"\sum_{i=1}^{673} z_i = -\frac{20}{3}",
        r"z_1^2+z_1^2+z_1^2+z_1z_2+z_1z_2+z_1z_2+ \dots =  \\ 3(z_1^2+z_2^2+ \dots + z_{673}^2) + 9(z_1z_2+z_1z_3+z_1z_4+ \dots + z_{672}z_{673}) =  3(z_1^2+z_2^2+ \dots + z_{673}^2) + 9S.": r"\sum_{i=1}^{673} z_i^2 = \frac{400}{9} - 2S",
        r"(z_1+z_2+z_3+ \dots +z_{673})^2= (-20/3)^2=\dfrac{400}{9} \\ =(z_1^2+z_2^2+ \dots + z_{673}^2)+2(z_1z_2+z_1z_3+z_1z_4+ \dots + z_{672}z_{673})=(z_1^2+z_2^2+ \dots + z_{673}^2)+2S.": r"\left( \sum_{i=1}^{673} z_i \right)^2 = \frac{400}{9} - 2S",
        r"z_1+z_2+z_3+ \dots +z_{673}": r"\sum_{i=1}^{673} z_i",
        r"z_1^2+z_2^2+ \dots + z_{673}^2= \dfrac{400}{9} -2S": r"\sum_{i=1}^{673} z_i^2 = \frac{400}{9} - 2S",
        r"f(x)=(x-z_{1})(x-z_{1})\cdots (x-z_{673})": r"f(x) = \prod_{i=1}^{673} (x - z_i)",
        r"(x-z_1)^3(x-z_2)^3(x-z_3)^2\cdots (x-z_{673})^3": r"\prod_{i=1}^{673} (x - z_i)^3",
        r"3(x_1+x_2+\cdots+x_{673})": r"3 \sum_{i=1}^{673} x_i",
        r"3(z_1^2+z_2^2+z_3^2+\dots+z_{673}^2)": r"3 \sum_{i=1}^{673} z_i^2",
        r"-20 = 3(z_1+z_2+z_3+z_4 \dots+z_{673})": r"-20 = 3 \sum_{i=1}^{673} z_i",
        r"19 = 3(z_1^2+z_2^2+z_3^2+\dots+z_{673}^2) + 9S": r"19 = 3 \sum_{i=1}^{673} z_i^2 + 9S",
        r"P_2 = z_1^2+z_2^2+z_3^2+\dots+z_{673}^2": r"P_2 = \sum_{i=1}^{673} z_i^2",
        r"z_1^2+z_2^2+ \dots + z_{673}^2": r"\sum_{i=1}^{673} z_i^2"
    }
    
    if expression in equivalents:
        interpreted_expression = equivalents[expression]
        return interpreted_expression
    
    return expression

In [34]:
def sympy_helper(exp):
    equivalents = {
        r'(c^2 == a^2 + b^2 - 2abcos C)': r'c^2 == a^2 + b^2 - 2ab \cos C',
        r'\angleKLN == \angleKMN == 90^∘': r'\angleKLN == \angleKMN == 90^{\angle}',
        r'm\angleKPL == 90^∘': r'\angleKPL == 90^\angle',
        r'\angleKPL ≅\angleKLN': r'\angleKPL ==\angle',
        r'\anglePKL ≅\angleLKN':r'\anglePKL ==\angleLKN',
        r'PKL ∼LKN':r'\triangle PKL == \triangle LKN',
        r'KMN ∼KPO': r'\triangle KMN == \triangle KPO',
        r'KOL ∼KHP':r'\triangle KOL == \triangle KHP',
        r'd=={090': r'd=={090}',
        r'(a,b)==(20,190).':r'Eq((a, b), (20, 190))',
        r'xy(x, y)^2[x, y]^2==10^630':r'Eq(x * y * (x + y)**2 * x**2 * y**2, 10**630)',
        r'(a, b)*[a, b]==ab':r'Eq(a**2 + b**2, a*b)',
        r'd^3α==10^60, d^3α^2β^3==10^510 αβ^3 == 10^510==2^510 *5^510': [r'Eq(d**3*alpha, 10**60), Eq(d**3*alpha**2*beta**3, 10**510),Eq(alpha*beta**3, 10**510)'],
        r'(a,b,c,d)==(20,20,190,190))))))))':r'Eq((a, b, c, d), (20, 20, 190, 190))',
        r'sin^2x*cos^2x≠=5/6':r'\sin^2 x \cdot \cos^2 x \neq \frac{5}{6}',
        r'a == 1 ±2/3/2':r'[Eq(a,(1 + 2/3) / 2), Eq(a,(1 - 2/3) / 2)]',
        r'p==q^4±1':r'[Eq(p,(q**4 + 1)), Eq(p,(q**4 - 1))]',
        r'| ∑_1 ≤j <k ≤673 z_jz_k |':[r'Abs(Sum(z[j]*z[k], (j, 1, 672), (k, j + 1, 673)))'],
        r'x==∑_1≤j<k≤673 z_jz_k': [r'Eq(x, Sum((z[j] * z[k]), (j, 1, 672), (k, j+1, 673)))'],
        r'3∑_i==1^673z_i==-20∑_i==1^673z_i==-20/3.': r'Eq(3*Sum(z_i, (i, 1, 673)), -20), Eq(Sum(z_i, (i, 1, 673)), -1*20/3)',
        r'1≤j<k≤673': r'1\le j < k\le 673',
        r'| ∑_1 ≤j <k ≤673 z_jz_k |  == S': [r'Eq(S,Abs(Sum(z[j]*z[k], (j, 1, 672), (k, j + 1, 673))))'],
        r'== 3*673+':r'3 * 673'  
    }

    if exp in equivalents:
        interpreted_expression = equivalents[exp]
        return interpreted_expression
    
    return exp

In [35]:
def clean_exp(exp):
    exp = exp.replace('=', '==') 
    exp = exp.replace('pmod', 'mod')
    exp = exp.replace('bmod', 'mod')
    if exp.startswith('$'):
        if re.search(r'\\boxed', exp):
            exp = re.sub(r'\\boxed', '', exp)
        return exp[1:-1]
    if exp.startswith(r'\['):
        if re.search(r'\\boxed', exp):
            exp = re.sub(r'\\boxed', '', exp)
        return exp[2:-2]
    if exp is None:
        print('IS NONE', exp)
    return exp

In [36]:
def generate_context(expr):
    symbols = sp.symbols(expr.replace(',', ' ').replace('=', ' ').split())
    context = {str(symbol): symbol for symbol in symbols}
    return context

In [37]:
def parse_expression(expr, context):
    cleaned_expr = clean_exp(expr)
    try:
        return latex2sympy(cleaned_expr)
    except Exception as e:
        try:
            return sp.sympify(cleaned_expr)
        except Exception as e:
            if re.search(r'\\cdots|\\ldots|\\dots', cleaned_expr):
                cleaned_expr = cleaned_expr.replace('==', '=')
                parsed_expr = parse_assigment_with_dots(cleaned_expr)
                parsed_expr = parsed_expr.replace('=', '==')
                try:
                    return latex2sympy(parsed_expr)
                except Exception as e:
                    print('CDOTS error: ', parsed_expr, e)
            else:
                try:
                    text_expr = LatexNodes2Text().latex_to_text(cleaned_expr)
                    text_expr = text_expr.replace('·', '*')
                    text_expr = text_expr.replace('∠', '\\angle')
                    return latex2sympy(text_expr)
                except Exception as e:
                    try:
                        text_expr = sympy_helper(text_expr)
                        return sp.sympify(text_expr)
                    except Exception as e:
                        try:
                            return latex2sympy(text_expr)
                        except Exception as e:
                            return str(e)
                

In [38]:
def get_variables(parsed_expr):
    try:
        if isinstance(parsed_expr, bool) or parsed_expr is False:
            return []
        if isinstance(parsed_expr, tuple):
            parsed_expr = list(parsed_expr)
        if isinstance(parsed_expr, list):
            variables_list = []
            for item in parsed_expr:
                if isinstance(item, bool) or item is False:
                    continue
                variables_list.extend(item.atoms(sp.Symbol))
            return list(set(variables_list))  # Deleting duplicates
        variables_list = list(parsed_expr.atoms(sp.Symbol))
        return variables_list
    except Exception as e:
        return str(e)

In [39]:
def process_cell(cell):
    cell_results = []
    cell_var_results = []
    for key, value in cell.items():
        context = generate_context(value)
        parsed = parse_expression(value, context)
        var_colect = get_variables(parsed)
        cell_results.append((key, value, parsed, var_colect))
        cell_var_results.append((key, var_colect))
    return cell_results, cell_var_results

In [40]:

def process_dataframe(dfexpre):
    parsed_results = []
    variables_results = []
    context_variables = set()

    for index, row in dfexpre.iterrows():
        row_results = []
        row_var_results = []
        for cell in row:
            cell_results, cell_var_results = process_cell(cell)
            row_results.append(cell_results)
            row_var_results.append(cell_var_results)
            for _, var_colect in cell_var_results:
                context_variables.update(var_colect)
        parsed_results.append(row_results)
        variables_results.append(row_var_results)

    # Flat the results
    flat_parsed_results = [item for sublist in parsed_results for item in sublist]
    flat_parsed_results = [item for sublist in flat_parsed_results for item in sublist]

    # Create DataFrame with the flatted results
    flat_parsed_df = pd.DataFrame(flat_parsed_results, columns=['Expression', 'Value', 'Parsed', 'Variables'])

    # Parsed counter
    flat_parsed_df['Parsed_Success'] = flat_parsed_df['Parsed'].apply(lambda x: not isinstance(x, str))
    parsed_success_count = flat_parsed_df['Parsed_Success'].sum()
    parsed_failure_count = len(flat_parsed_df) - parsed_success_count
    
    print(f"Successfully parsed expressions: {parsed_success_count}")
    print(f"Expressions that could not be parsed: {parsed_failure_count}")

    return parsed_results, variables_results, context_variables, flat_parsed_df

In [41]:
complet_expressions = pd.read_csv('Data/Complete_Expressions.csv', header=None)
dfexpre=pd.DataFrame(complet_expressions)
dfexpre = dfexpre.map(lambda x: ast.literal_eval(x))

In [42]:
parsed_results, variables_results, context_variables, flat_parsed_df= process_dataframe(dfexpre)

Successfully parsed expressions: 897
Expressions that could not be parsed: 2


In [43]:
flat_parsed_df['Parsed'].apply(type).value_counts()

Parsed
<class 'sympy.core.relational.Equality'>             310
<class 'sympy.core.numbers.Integer'>                 140
<class 'sympy.core.symbol.Symbol'>                   133
<class 'sympy.core.mul.Mul'>                         103
<class 'tuple'>                                       49
<class 'list'>                                        39
<class 'sympy.core.add.Add'>                          34
<class 'sympy.core.power.Pow'>                        24
<class 'sympy.core.numbers.One'>                      19
<class 'sympy.concrete.summations.Sum'>                7
<class 'sympy.core.relational.StrictLessThan'>         6
<class 'set'>                                          5
<class 'bool'>                                         5
<class 'function'>                                     3
<class 'sympy.core.relational.LessThan'>               3
<class 'sympy.core.numbers.Zero'>                      2
<class 'str'>                                          2
<class 'sympy.core.relat

In [44]:
parsed_resultsdf = pd.DataFrame(parsed_results)

In [45]:
tuples_df = flat_parsed_df[flat_parsed_df['Parsed'].apply(lambda x: isinstance(x, tuple))]
setdf = flat_parsed_df[flat_parsed_df['Parsed'].apply(lambda x: isinstance(x, set))]
strdf = flat_parsed_df[flat_parsed_df['Parsed'].apply(lambda x: isinstance(x, str))]
listdf = flat_parsed_df[flat_parsed_df['Parsed'].apply(lambda x: isinstance(x, list))]
symboldf = flat_parsed_df[flat_parsed_df['Parsed'].apply(lambda x: isinstance(x, sp.core.symbol.Symbol))]

In [46]:
tuples_df.head()

Unnamed: 0,Expression,Value,Parsed,Variables,Parsed_Success
31,expression2,"($9$, $108$, $1107$, $11106$)","(9, 108, 1107, 11106)",[],True
154,expression1,"$A=(4,12), B=(16,3), C=(15,0), D=(5,0), E=(0,5)$","(False, False, False, False, False)",[],True
158,expression5,"$(a_1, b_1)$","(a_1, b_1)","[a_1, b_1]",True
159,expression6,"$(a_2, b_2)$","(a_2, b_2)","[b_2, a_2]",True
204,expression5,"$a,\,b,$","(a, b)","[b, a]",True


In [47]:
setdf.head()

Unnamed: 0,Expression,Value,Parsed,Variables,Parsed_Success
643,expression20,"$\boxed{(8,9)}$","{(8, 9)}",'set' object has no attribute 'atoms',True
644,expression21,"$\boxed{(9,10)}$","{(9, 10)}",'set' object has no attribute 'atoms',True
650,expression27,"$\boxed{(16,17)}$","{(16, 17)}",'set' object has no attribute 'atoms',True
656,expression33,"$\boxed{(25,26)}$","{(25, 26)}",'set' object has no attribute 'atoms',True
850,expression9,{i=1},{False},'set' object has no attribute 'atoms',True


In [48]:
strdf.head() # These are the only 2 expressions not parsed

Unnamed: 0,Expression,Value,Parsed,Variables,Parsed_Success
162,expression9,|,I expected something else here\n|\n~^,'str' object has no attribute 'atoms',False
628,expression5,(No integer greater than $1$ can have fewer th...,I expected something else here\n(No integer gr...,'str' object has no attribute 'atoms',False


In [49]:
listdf.head()

Unnamed: 0,Expression,Value,Parsed,Variables,Parsed_Success
45,expression1,"$1, 2, 3,\ldots, 19, 20$","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",[],True
47,expression3,"$1, 2, 3,\ldots, 19, 20$","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",[],True
201,expression2,"$\triangle PAF,\triangle QCB$","[A*F*P*\triangle, B*C*Q*\triangle]","[\triangle, P, F, A, C, Q, B]",True
207,expression8,"$\triangle PAF,\triangle QCB$","[A*F*P*\triangle, B*C*Q*\triangle]","[\triangle, P, F, A, C, Q, B]",True
213,expression14,"$\triangle PAF,\triangle QCB$","[A*F*P*\triangle, B*C*Q*\triangle]","[\triangle, P, F, A, C, Q, B]",True


In [50]:
symboldf.head()

Unnamed: 0,Expression,Value,Parsed,Variables,Parsed_Success
1,expression1,$N$,N,[N],True
32,expression3,$n$,n,[n],True
42,expression6,$N$,N,[N],True
44,expression0,$J$,J,[J],True
46,expression2,$B$,B,[B],True


In [51]:
unique_elements = flat_parsed_df['Parsed'].apply(type).value_counts() == 1
unique_elements_df = flat_parsed_df[flat_parsed_df['Parsed'].apply(lambda x: type(x) in unique_elements[unique_elements].index)]

In [52]:
unique_elements_df.head()

Unnamed: 0,Expression,Value,Parsed,Variables,Parsed_Success
100,expression3,$\dbinom{n-k+1}{k}$,"binomial(-k + n + 1, k)","[n, k]",True
134,expression10,$E$,E,[],True
187,expression10,$12.5$,25/2,[],True
450,expression1,"$\gcd(a, b)$",1,[],True
554,expression18,$\sin^2x\cdot\cos^2x\not=\frac{5}{6}$,"Ne(sin(x*cos(x)**2)**2, 5/6)",[x],True


## For the next section, we must tokenize the parsed expressions.

As the Sympy format cannot be saved directly as CSV or JSON without transformation, due to the limitations of these formats in handling complex symbolic representations. Saving without proper conversion would likely result in loss of symbolic structure and functionality, rendering the expressions unrecognizable or plain text.



In [53]:
#parsed_resultsdf.applymap(lambda x: repr(x))
#parsed_resultsdf.to_csv('Data/parsed_results.csv', index=None)

In [54]:
reference_text = pd.read_csv('Data/refefenceText.csv', header=None)
dfreference=pd.DataFrame(reference_text)


In [55]:
def extract_parsed_expressions(parsed_resultsdf):
    parsed_list = []
    variables_set = set()
    for i in range(len(parsed_resultsdf)):
        parsed_list_row = []
        for j in range(len(parsed_resultsdf.columns)):
            row = parsed_resultsdf.iloc[i, j]
            parsed_dict = {}
            for exp in row:
                if isinstance(exp, tuple) and len(exp) >= 3:
                    expr_key = exp[0]
                    parsed_expr = exp[2]
                    variables = exp[3]
                    parsed_dict[expr_key] = parsed_expr
                    variables_set.update(variables)
                else:
                    print(len(exp), exp)
            parsed_list_row.append(parsed_dict)
        parsed_list.append(parsed_list_row)
    
    return parsed_list, list(variables_set)

In [56]:
parsed_list_with_dict , variables= extract_parsed_expressions(parsed_resultsdf) 

In [57]:
# Define SymPy symbols for each variable in the list
symbols = sp.symbols(' '.join(str(var) for var in variables))
expression_types = {
   }

In [58]:

def get_expression_type(expr_type):
    if expr_type not in expression_types:
        description = expr_type.__name__.split('.')[-1]
        expression_types[expr_type] = description
    return expression_types[expr_type]

def tokenize_expression(expr):
    expr_type = type(expr)
    description = get_expression_type(expr_type)
    
    if isinstance(expr, sp.Equality):
        return [description] + tokenize_expression(expr.lhs) + tokenize_expression(expr.rhs)
    elif isinstance(expr, sp.Sum):
        return [description] + tokenize_expression(expr.args[0]) + ['over'] + tokenize_expression(expr.args[1])
    elif isinstance(expr, sp.Mul):
        return [description] + [tokenize_expression(arg) for arg in expr.args]
    elif isinstance(expr, sp.Pow):
        return [description] + tokenize_expression(expr.base) + tokenize_expression(expr.exp)
    elif isinstance(expr, sp.Add):
        return [description] + tokenize_expression(expr.args[0]) + tokenize_expression(expr.args[1])
    elif isinstance(expr, sp.Symbol):
        return [expr.name]
    elif isinstance(expr, sp.Integer):
        return [str(expr.numerator)]
    elif isinstance(expr, sp.Rational):
        return [str(expr)]
    else:
        return [str(expr)]

In [59]:
tokenized_expressions = []
for sublist in parsed_list_with_dict:
    tokenized_expressions_row = []
    for dictionary in sublist:
        tokenized_expressions_cell = {}
        for key, value in dictionary.items():
            tokens = tokenize_expression(value)
            tokenized_expressions_cell[key] = tokens
        tokenized_expressions_row.append(tokenized_expressions_cell)
    tokenized_expressions.append(tokenized_expressions_row)

In [60]:
tokenized_expressions[0][0]

{'expression0': ['Equality',
  'N',
  'Sum',
  'Mul',
  ['9'],
  ['Pow', '10', 'Add', 'k', '-1'],
  'over',
  '(k, 1, 321)'],
 'expression1': ['N']}

In [61]:
# Save File for the next section

with open('Data/tokenized_expressions.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(tokenized_expressions)