In [1]:
import pandas as pd
import sympy as sp
import numpy as np
import ast
import spacy
import gensim.downloader as api
from transformers import BertTokenizer, BertModel
import torch
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", message="A parameter name that contains `beta` will be renamed internally to `bias`.")
warnings.filterwarnings("ignore", message="A parameter name that contains `gamma` will be renamed internally to `weight`.")

In [2]:
dfreference= pd.read_csv('Data/refefenceText.csv', header=None)
tokenized_expressionsdf = pd.read_csv('Data/tokenized_expressions.csv', header=None)
dfreference = dfreference.map(lambda x: ast.literal_eval(x))
tokenized_expressionsdf = tokenized_expressionsdf.map(lambda x: ast.literal_eval(x))

In [3]:
def replace_expressions(text, tokenized_expressionsdf):
    def process_value(value):
        if isinstance(value, list):
            # Recursive processing if process_value recibe list type
            processed_list = [process_value(item) for item in value]
            return [item for sublist in processed_list for item in sublist]
        else:
            return [str(value)]

    for key, value in tokenized_expressionsdf.items():
        processed_value = process_value(value)
        text = text.replace(key, ' '.join(processed_value))
    return text

In [4]:
completdf = dfreference.copy()

num_rows, num_cols = dfreference.shape
tokenized_num_rows, tokenized_num_cols = tokenized_expressionsdf.shape

for i in range(num_cols):
    if i >= tokenized_num_cols:
        break
    for j in range(num_rows):
        if j >= tokenized_num_rows:
            break
        row = dfreference.iloc[j, i]
        new_row = []
        for word in row:
            if word.startswith('expression'):
                # Replace the expression with his tokenized version
                parsed = replace_expressions(word, tokenized_expressionsdf.iloc[j, i])
                new_row.append(parsed)
            else:
                new_row.append(word)
        # Update the row in df
        completdf.at[j, dfreference.columns[i]] = new_row


In [5]:
completdf.head()

Unnamed: 0,0,1,2,3,4,5
0,"[Consider, the, integer, Sum Mul 9 Pow 10 Add ...","[Let, s, express, the, number, in, terms, of, ...","[A, similar, and, simpler, way, to, consider, ...","[We, can, see, that, Equality 9 9, Equality Ad...","[Observe, how, adding, results, in, the, last,...","[Write, N, 9, 99, Add Pow 10 321 -1, {321}, {3..."
1,"[Jenn, randomly, chooses, a, number, J, from, ...","[By, symmetry, the, desired, probability, is, ...","[Ne(B - J, 0), because, Ne(B, J), so, the, pro...","[There, are, Equality 190 190, equally, likely...","[This, problem, is, essentially, asking, how, ...","[Create, a, grid, using, graph, paper, with, 2..."
2,"[In, Mul \triangle P Q R, Equality Mul P R 15,...","[We, know, the, area, of, the, hexagon, ABCDeF...","[Let, R, be, the, origin, Noticing, that, the,...","[Note, that, Mul \triangle P Q R, has, area, 1...","[Knowing, that, Mul \triangle P Q R, has, area...","[Triangle, PQR, is, a, right, triangle, with, ..."
3,"[A, soccer, team, has, 22, available, players,...","[There, are, -3, substitutions, The, number, o...","[There, is, 1, way, of, making, no, substituti...","[We, can, perform, casework, Call, the, substi...","[no, more, solutions, explained]","[no, more, solutions, explained]"
4,"[A, moving, particle, starts, at, the, point, ...","[One, could, recursively, compute, the, probab...","[Obviously, the, only, way, to, reach, (0, 0),...","[Since, the, particle, stops, at, one, of, the...","[All, paths, that, first, hit, the, axes, at, ...","[no, more, solutions, explained]"


In [6]:
def convert_string_to_sentence(lista):
    return ' '.join(lista)  # join the words in a complete sentence
completStringdf = completdf.copy()  

for col in completStringdf.columns:
    if completStringdf[col].dtype == 'object':  # Check the col has texts
        completStringdf[col] = completStringdf[col].apply(convert_string_to_sentence)
completStringdf.head()


Unnamed: 0,0,1,2,3,4,5
0,Consider the integer Sum Mul 9 Pow 10 Add k -1...,Let s express the number in terms of Pow 10 n ...,A similar and simpler way to consider the init...,We can see that Equality 9 9 Equality Add 9 99...,Observe how adding results in the last term bu...,Write N 9 99 Add Pow 10 321 -1 {321} {321} {32...
1,"Jenn randomly chooses a number J from [1, 2, 3...",By symmetry the desired probability is equal t...,"Ne(B - J, 0) because Ne(B, J) so the probabili...",There are Equality 190 190 equally likely pair...,This problem is essentially asking how many wa...,Create a grid using graph paper with 20 column...
2,In Mul \triangle P Q R Equality Mul P R 15 Equ...,We know the area of the hexagon ABCDeF to be A...,Let R be the origin Noticing that the triangle...,Note that Mul \triangle P Q R has area 150 and...,Knowing that Mul \triangle P Q R has area 150 ...,Triangle PQR is a right triangle with are Equa...
3,A soccer team has 22 available players A fixed...,There are -3 substitutions The number of ways ...,There is 1 way of making no substitutions to t...,We can perform casework Call the substitution ...,no more solutions explained,no more solutions explained
4,"A moving particle starts at the point (4, 4) a...",One could recursively compute the probabilitie...,"Obviously the only way to reach (0, 0) is to g...",Since the particle stops at one of the axes we...,All paths that first hit the axes at the origi...,no more solutions explained


In [7]:
# In order to avoid issues with BERT's handling of `beta` and `gamma`, 
# we are making replacements for these words throughout the DataFrame.
substitutions = {
    'beta': 'betaa',
    'gamma': 'gammaa2'
}

completStringdf.replace(substitutions, regex=True, inplace=True)

In [8]:
# Load tokenizer model and embeddings models
nlp = spacy.load("en_core_web_sm")
glove_model = api.load("glove-wiki-gigaword-300")
word2vec_model = api.load("word2vec-google-news-300")
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')



In [9]:

def tokenize_lemmatize_and_embed(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_punct and not token.is_space]
    
    # GloVe in float32
    glove_embeddings = [np.array(glove_model[token], dtype=np.float32) if token in glove_model else np.zeros(100, dtype=np.float32) for token in tokens]
    
    # Word2Vec in float32
    word2vec_embeddings = [np.array(word2vec_model[token], dtype=np.float32) if token in word2vec_model else np.zeros(300, dtype=np.float32) for token in tokens]
    
    # BERT ein float32
    bert_inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        bert_outputs = bert_model(**bert_inputs)
    bert_embeddings = bert_outputs.last_hidden_state.mean(dim=1).squeeze().numpy().astype(np.float32)
    
    return tokens, glove_embeddings, word2vec_embeddings, bert_embeddings

In [10]:
tokenizerdf = pd.DataFrame()
for col in completStringdf.columns:
    if completStringdf[col].dtype == 'object':
        tokenizerdf[f'{col}_Tokens'], tokenizerdf[f'{col}_GloVe'], tokenizerdf[f'{col}_Word2Vec'], tokenizerdf[f'{col}_BERT'] = zip(*completStringdf[col].apply(lambda x: tokenize_lemmatize_and_embed(' '.join(x)) if isinstance(x, list) else tokenize_lemmatize_and_embed(x)))

In [11]:
tokenizerdf.tail()

Unnamed: 0,0_Tokens,0_GloVe,0_Word2Vec,0_BERT,1_Tokens,1_GloVe,1_Word2Vec,1_BERT,2_Tokens,2_GloVe,...,3_Word2Vec,3_BERT,4_Tokens,4_GloVe,4_Word2Vec,4_BERT,5_Tokens,5_GloVe,5_Word2Vec,5_BERT
40,"[for, integer, a, b, c, and, d, let, Equality,...","[[-0.24132, 0.12063, 0.1919, -0.26692, 0.06107...","[[-0.011779785, -0.04736328, 0.044677734, 0.06...","[-0.052034423, 0.13999115, 0.5393157, -0.14268...","[there, can, be, two, different, case, for, th...","[[-0.47618, 0.069478, -0.013086, 0.1742, -0.31...","[[0.09423828, -0.022827148, 0.052246094, 0.026...","[-0.2292367, 0.046207458, 0.48947033, -0.16208...","[define, Equality, h(x, Add, Pow, x, 2, Mul, c...","[[-0.038996, 0.19765, -0.22989, 0.1622, -0.450...",...,"[[-0.011779785, -0.04736328, 0.044677734, 0.06...","[-0.2660388, -0.056148946, 0.7367518, -0.30964...","[no, more, solution, explain]","[[-0.16843, -0.037651, -0.17304, -0.069757, -0...","[[0.08251953, -0.15136719, 0.06591797, 0.02001...","[0.12488315, -0.06317487, 0.054326605, -0.0229...","[no, more, solution, explain]","[[-0.16843, -0.037651, -0.17304, -0.069757, -0...","[[0.08251953, -0.15136719, 0.06591797, 0.02001...","[0.12488315, -0.06317487, 0.054326605, -0.0229..."
41,"[let, n, be, the, least, positive, integer, fo...","[[0.31646, -0.0018851, -0.36329, -0.11607, -0....","[[0.20800781, 0.039794922, 0.25, 0.24414062, -...","[-0.07803878, 0.011846034, 0.31168038, -0.3635...","[as, usual, denote, v_p(n, the, high, power, o...","[[-0.056826, 0.23863, 0.44515, -0.014863, 0.17...","[[-0.03149414, 0.064453125, -0.060546875, 0.01...","[-0.19947278, 0.047746897, 0.5569893, -0.17363...","[note, that, for, all, n, Add, Pow, 149, n, Mu...","[[-0.12192, -0.0082955, -0.23558, -0.16759, -0...",...,"[[-0.03149414, 0.064453125, -0.060546875, 0.01...","[-0.31946492, 0.043768875, 0.32381696, -0.2626...","[analyze, each, prime, power, separately, star...","[[-0.2966, 0.4133, -0.21151, -0.067363, -0.288...","[[-0.22753906, -0.10449219, 0.14550781, -0.084...","[-0.43419778, -0.09012956, 0.4206545, -0.14571...","[no, more, solution, explain]","[[-0.16843, -0.037651, -0.17304, -0.069757, -0...","[[0.08251953, -0.15136719, 0.06591797, 0.02001...","[0.12488315, -0.06317487, 0.054326605, -0.0229..."
42,"[point, D, lie, on, side, \overline{bc, of, Mu...","[[0.17795, 0.29535, 0.14876, -0.11697, -0.2045...","[[0.122558594, -0.0065307617, 0.0625, 0.182617...","[-0.2102158, 0.06806501, 0.70676243, -0.159374...","[point, be, define, as, show, it, be, pretty, ...","[[0.17795, 0.29535, 0.14876, -0.11697, -0.2045...","[[0.122558594, -0.0065307617, 0.0625, 0.182617...","[-0.4187026, 0.041861027, 0.38689014, -0.29092...","[let, m_a, m_b, m_c, be, the, midpoint, of, ar...","[[0.31646, -0.0018851, -0.36329, -0.11607, -0....",...,"[[0.20800781, 0.039794922, 0.25, 0.24414062, -...","[-0.37565812, -0.041621458, 0.4639818, -0.2217...","[let, None, and, BC, be, the, line, Equality, ...","[[0.31646, -0.0018851, -0.36329, -0.11607, -0....","[[0.20800781, 0.039794922, 0.25, 0.24414062, -...","[-0.12449332, 0.25853702, 0.34152272, -0.19017...","[as, usual, we, will, use, homogenize, barycen...","[[-0.056826, 0.23863, 0.44515, -0.014863, 0.17...","[[-0.03149414, 0.064453125, -0.060546875, 0.01...","[-0.35006186, 0.12291278, 0.5717463, -0.199251..."
43,"[let, P(x, be, a, quadratic, polynomial, with,...","[[0.31646, -0.0018851, -0.36329, -0.11607, -0....","[[0.20800781, 0.039794922, 0.25, 0.24414062, -...","[-0.050336286, -0.063132085, 0.66800505, -0.28...","[either, Equality, p(3, p(4, or, not, we, firs...","[[-0.098851, 0.41044, -0.13943, -0.046376, 0.0...","[[0.057617188, -0.16894531, 0.053466797, 0.232...","[-0.18731806, -0.031583793, 0.62543523, -0.226...","[let, the, root, of, p(x, be, m, and, n, then,...","[[0.31646, -0.0018851, -0.36329, -0.11607, -0....",...,"[[-0.03930664, -0.18164062, 0.037597656, -0.04...","[-0.25054744, -0.05319507, 0.79869515, -0.2383...","[let, Equality, p(x, Add, Pow, Add, x, Mul, -1...","[[0.31646, -0.0018851, -0.36329, -0.11607, -0....","[[0.20800781, 0.039794922, 0.25, 0.24414062, -...","[-0.1555044, 0.05342049, 0.5134533, -0.1371459...","[note, that, because, False, p(3, and, P(4, be...","[[-0.12192, -0.0082955, -0.23558, -0.16759, -0...","[[-0.087402344, 0.25390625, -0.21777344, -0.13...","[-0.31960288, -0.009811534, 0.80121994, -0.202..."
44,"[let, ABC, be, an, acute, triangle, with, circ...","[[0.31646, -0.0018851, -0.36329, -0.11607, -0....","[[0.20800781, 0.039794922, 0.25, 0.24414062, -...","[-0.2447324, -0.11287031, 0.36203042, -0.33244...","[the, following, be, a, power, of, a, point, s...","[[0.04656, 0.21318, -0.0074364, -0.45854, -0.0...","[[0.080078125, 0.10498047, 0.049804688, 0.0534...","[-0.22187835, 0.10671925, 0.6704156, -0.261709...","[we, first, observe, that, argument, of, type,...","[[-0.050835, 0.24826, -0.19384, 0.083809, -0.1...",...,"[[-0.26171875, 0.051513672, -0.12695312, 0.318...","[-0.40233818, -0.079661466, 0.3308136, -0.3020...","[let, <, class, sympy.series.order, Order, >, ...","[[0.31646, -0.0018851, -0.36329, -0.11607, -0....","[[0.20800781, 0.039794922, 0.25, 0.24414062, -...","[-0.11800052, 0.061917618, 0.48991987, -0.2563...","[no, more, solution, explain]","[[-0.16843, -0.037651, -0.17304, -0.069757, -0...","[[0.08251953, -0.15136719, 0.06591797, 0.02001...","[0.12488315, -0.06317487, 0.054326605, -0.0229..."


In [12]:
#tokenizerdf.to_csv('Data/tokenizer.csv', index= False)

# Note: CSV format does not perform well with vectors. Therefore, we will check for file types that support float32 data 
# and convert them to JSON, which preserves vectors better, especially for models like GloVe, Word2Vec, and BERT.

In [13]:
for col in tokenizerdf.columns:
    if col.endswith('_GloVe') or col.endswith('_Word2Vec') or col.endswith('_BERT'):
        print(f"Columna: {col}, Tipo de datos: {type(tokenizerdf[col].iloc[0])}, dtype: {tokenizerdf[col].dtype}")
        for i, embedding in enumerate(tokenizerdf[col]):
            if isinstance(embedding, list):
                print(f"  Element {i}: Data type: {type(embedding[0])}, dtype: {np.array(embedding).dtype}")
            elif isinstance(embedding, np.ndarray):
                print(f"  Element {i}: Data type: {type(embedding)}, dtype: {embedding.dtype}")

Columna: 0_GloVe, Tipo de datos: <class 'list'>, dtype: object
  Element 0: Data type: <class 'numpy.ndarray'>, dtype: object
  Element 1: Data type: <class 'numpy.ndarray'>, dtype: object
  Element 2: Data type: <class 'numpy.ndarray'>, dtype: object
  Element 3: Data type: <class 'numpy.ndarray'>, dtype: float32
  Element 4: Data type: <class 'numpy.ndarray'>, dtype: object
  Element 5: Data type: <class 'numpy.ndarray'>, dtype: object
  Element 6: Data type: <class 'numpy.ndarray'>, dtype: object
  Element 7: Data type: <class 'numpy.ndarray'>, dtype: object
  Element 8: Data type: <class 'numpy.ndarray'>, dtype: object
  Element 9: Data type: <class 'numpy.ndarray'>, dtype: object
  Element 10: Data type: <class 'numpy.ndarray'>, dtype: object
  Element 11: Data type: <class 'numpy.ndarray'>, dtype: object
  Element 12: Data type: <class 'numpy.ndarray'>, dtype: object
  Element 13: Data type: <class 'numpy.ndarray'>, dtype: object
  Element 14: Data type: <class 'numpy.ndarray'>, 

In [14]:
# Save DataFrame in JSON format fot the next Section
tokenizerdf.to_json('Data/jsonFile_withTokens.json', orient='records', lines=True, default_handler=lambda x: x.tolist() if isinstance(x, np.ndarray) else x)