## Goal

Chunk every problem and solution text, embed each chunk, and check whether semantically similar chunks are ranked close together.

In [None]:
import pandas as pd

# Load the benchmark problem set from a CSV file into a DataFrame.
# NOTE(review): the path is absolute and machine-specific (Windows user
# directory), so this cell only runs as-is on that machine.
problemset_df = pd.read_csv("C:\\Users\\mokrota\\Documents\\GitHub\\math_problem_recommender\\math_problem_recommender\\benchmark\\benchmarkv3\\df.csv")
# Bare expression so the notebook renders the frame.
problemset_df

## Testing on a small sample (first rows only)

In [None]:
# Keep only the first 10 rows so the pipeline can be tried out quickly.
small_df = problemset_df.head(10)

In [None]:
from similarity import BERTCLSMeanPooler, EmbSummarizer
from transformers import AutoModel, AutoTokenizer

# Identifier of the pretrained checkpoint; the model and its tokenizer are
# both loaded from the same name so they stay in sync.
model_name = "math-similarity/Bert-MLM_arXiv-MP-class_zbMath"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def chunk_maping(inputs):
    """Return the character span covered by each tokenized chunk.

    Args:
        inputs: Mapping with an ``'offset_mapping'`` entry that holds, for
            every chunk, a sequence of ``(start, end)`` character offsets,
            one pair per token. Each chunk may be a tensor/array (anything
            exposing ``tolist()``) or a plain nested sequence.

    Returns:
        A list with one ``(start, end)`` tuple per chunk: the start offset
        of the first token and the end offset of the last token whose
        offsets are not ``(0, 0)``. A chunk that contains only ``(0, 0)``
        pairs yields ``(None, None)``.
    """
    chunks = []
    for chunk in inputs['offset_mapping']:
        # Convert the whole chunk once instead of calling ``tolist()`` on
        # every token (and instead of flipping the tensor for the end scan).
        offsets = chunk.tolist() if hasattr(chunk, "tolist") else chunk
        # (0, 0) pairs are skipped — presumably special/padding tokens that
        # map to no characters; confirm against the tokenizer in use.
        real = [(first, last) for first, last in offsets if (first, last) != (0, 0)]
        if real:
            chunks.append((real[0][0], real[-1][1]))
        else:
            chunks.append((None, None))
    return chunks


In [None]:
import pickle
def chunk_util(row, col_name, summarizer: EmbSummarizer):
    """Tokenize and embed one row's text and describe the resulting chunks.

    Args:
        row: Row object (e.g. a pandas Series) indexable by ``col_name``.
        col_name: Name of the column holding the text to embed.
        summarizer: Object providing ``tokenize`` and ``embed``; the result
            of ``embed`` must expose ``last_hidden_state``.

    Returns:
        dict with three entries:
        ``"embeddings_pickle"`` — list of pickled embeddings, one per entry
        along the first axis of ``last_hidden_state``;
        ``"chunk_maping"`` — the spans returned by ``chunk_maping(inputs)``;
        ``"embeddings"`` — the un-pickled ``last_hidden_state`` itself.
    """
    text = row[col_name]
    inputs = summarizer.tokenize(text)
    emb = summarizer.embed(inputs).last_hidden_state
    # Each slice along the first axis is a view into emb's storage, and
    # pickling a view serializes the *whole* storage. Cloning first keeps
    # every pickle the size of a single chunk (assumes emb is a torch.Tensor).
    emb_serialized = [pickle.dumps(chunk_emb.clone()) for chunk_emb in emb]
    chunks_map = chunk_maping(inputs)
    return {
        "embeddings_pickle": emb_serialized,
        "chunk_maping": chunks_map,
        "embeddings": emb
    }

In [None]:
# Build the summarizer around the loaded model/tokenizer. The two keyword
# options ask for offset mappings and a max_length of 128 — presumably they
# are forwarded to the tokenizer; confirm in the `similarity` module.
summarizer = BERTCLSMeanPooler(
    model,
    tokenizer,
    return_offsets_mapping=True,
    max_length=128,
)


def _embed_problem_row(row):
    """Run chunk_util on the "Problem" column of a single row."""
    return chunk_util(row, "Problem", summarizer)


# One dict per row, expanded into one DataFrame column per dict key.
new_column = small_df.apply(_embed_problem_row, axis=1, result_type='expand')

In [None]:
# Put the chunk/embedding columns next to the original sample columns.
test_df = pd.concat([small_df, new_column], axis=1)
# Bare expression so the notebook renders the frame.
test_df

In [None]:
# Round-trip the sample frame through a pickle file so the next cell can
# check that the stored embeddings survive serialization.
with open("test_df_with_emb.pkl", "wb") as f:
    pickle.dump(test_df, f)

# Read the same file straight back into a fresh object.
with open("test_df_with_emb.pkl", "rb") as f:
    df_loaded = pickle.load(f)

In [None]:
import torch
# Compare the directly stored embeddings with the ones rebuilt from their
# per-chunk pickles, chunk by chunk.
emb_pickle_list = df_loaded['embeddings_pickle'].tolist()
emb = df_loaded['embeddings'].tolist()
# One inner list per row, one unpickled object per chunk of that row.
emb_list = [
    [pickle.loads(blob) for blob in row_blobs]
    for row_blobs in emb_pickle_list
]
any_wrong = False
for e1, e2 in zip(emb, emb_list):
    for c1, c2 in zip(e1, e2):
        if torch.allclose(c1, c2, atol=1e-10):
            continue
        # Report the whole row pair the mismatching chunk belongs to.
        print(f"Not equal:\n\n{e1}\n\nand\n\n{e2}\n\n")
        any_wrong = True

if not any_wrong:
    print("All equal!")

In [None]:
def chunk_util(row, col_name, summarizer: EmbSummarizer):
    """Tokenize and embed one row's text, returning only serialized chunks.

    This redefinition replaces the earlier ``chunk_util`` in the notebook
    and no longer returns the raw ``"embeddings"`` entry.

    Args:
        row: Row object (e.g. a pandas Series) indexable by ``col_name``.
        col_name: Name of the column holding the text to embed.
        summarizer: Object providing ``tokenize`` and ``embed``; the result
            of ``embed`` must expose ``last_hidden_state``.

    Returns:
        dict with two entries of equal length:
        ``"embeddings_pickle"`` — list of pickled embeddings, one per entry
        along the first axis of ``last_hidden_state``;
        ``"chunk_maping"`` — the spans returned by ``chunk_maping(inputs)``.
    """
    text = row[col_name]
    inputs = summarizer.tokenize(text)
    emb = summarizer.embed(inputs).last_hidden_state
    # Each slice along the first axis is a view into emb's storage, and
    # pickling a view serializes the *whole* storage. Cloning first keeps
    # every pickle the size of a single chunk (assumes emb is a torch.Tensor).
    emb_serialized = [pickle.dumps(chunk_emb.clone()) for chunk_emb in emb]
    chunks_map = chunk_maping(inputs)
    return {
        "embeddings_pickle": emb_serialized,
        "chunk_maping": chunks_map
    }

In [None]:
def chunk_explode_df(df: pd.DataFrame, col_name, summarizer):
    """Embed ``col_name`` for every row and return one output row per chunk.

    Args:
        df: Source frame; its ``id`` column (if present) is renamed to
            ``parent_id`` in the result.
        col_name: Column whose text is passed to ``chunk_util``.
        summarizer: Summarizer object handed through to ``chunk_util``.

    Returns:
        A new DataFrame with the original columns plus ``embeddings_pickle``
        and ``chunk_maping``, exploded so each chunk has its own row, with a
        fresh 0..n-1 index.
    """
    def _embed_row(row):
        return chunk_util(row, col_name, summarizer)

    # chunk_util returns a dict per row; 'expand' turns its keys into columns.
    chunk_cols = df.apply(_embed_row, axis=1, result_type='expand')
    combined = pd.concat([df, chunk_cols], axis=1).rename({"id": "parent_id"}, axis=1)
    # Both list columns are exploded together so pickles stay paired with spans.
    return combined.explode(['embeddings_pickle', 'chunk_maping'], ignore_index=True)

In [None]:
# Build per-chunk frames for the problem and solution texts of the sample.
p_emb_df = chunk_explode_df(small_df, "Problem", summarizer)
sol_emb_df = chunk_explode_df(small_df, "Solution", summarizer)

In [None]:
# Display the per-chunk problem frame.
p_emb_df

In [None]:
# Display the per-chunk solution frame.
sol_emb_df

## Running on the whole dataset

In [None]:
# Same chunk-and-explode step as above, now over every row of the problem set.
# This overwrites the sample-sized p_emb_df / sol_emb_df.
p_emb_df = chunk_explode_df(problemset_df, "Problem", summarizer)
sol_emb_df = chunk_explode_df(problemset_df, "Solution", summarizer)

In [None]:
# Narrow the problem frame to the listed columns (drops everything else,
# including the Solution text).
p_emb_df = p_emb_df[["parent_id", "Problem", "Problem Book No", "TopicMetadata", "embeddings_pickle", "chunk_maping"]]
# Display the narrowed frame.
p_emb_df

In [None]:
# Narrow the solution frame to the listed columns (drops everything else,
# including the Problem text).
sol_emb_df = sol_emb_df[["parent_id", "Solution", "Problem Book No", "TopicMetadata", "embeddings_pickle", "chunk_maping"]]
# Display the narrowed frame.
sol_emb_df

In [None]:
# Persist the per-chunk problem frame (relative path: written to the
# notebook's current working directory).
with open("problem_embeddings.pkl", "wb") as f:
    pickle.dump(p_emb_df, f)

In [None]:
# Persist the per-chunk solution frame (relative path: written to the
# notebook's current working directory).
with open("solution_embeddings.pkl", "wb") as f:
    pickle.dump(sol_emb_df, f)

In [None]:
# Reload the solution chunk frame from disk; this rebinds df_loaded.
# NOTE: pickle.load can execute arbitrary code — only load files you created.
with open("solution_embeddings.pkl", "rb") as f:
    df_loaded = pickle.load(f)

In [None]:
# Unpickle the stored embedding of every row (one row per chunk) into a list.
embeddings = [pickle.loads(e) for e in df_loaded['embeddings_pickle'].tolist()]

In [None]:
# Display the list of unpickled chunk embeddings.
embeddings

In [None]:
# Keep only element 0 of each chunk embedding — presumably the vector of the
# first ([CLS]) token of last_hidden_state; confirm against the tokenizer.
cls_embeddings = [e[0] for e in embeddings]

In [None]:
# Stack the per-chunk vectors into a single tensor with a new leading axis.
cls_embeddings = torch.stack(cls_embeddings)

In [None]:
# Display the shape of the stacked tensor.
cls_embeddings.shape

In [None]:
# Free-text query that the stored chunks will be ranked against.
anchor = "Find problems that use divisibility to limit number of options."
# Embed the query with the same summarizer; how the vector is pooled is
# defined in the `similarity` module (not visible here).
anchor_emb = summarizer.summarize(anchor)

In [None]:
# Display the shape of the query embedding.
anchor_emb.shape

In [None]:
from similarity import CosineSimScorer

# Rank every chunk vector against the query embedding — presumably by cosine
# similarity with the best match first; confirm in the `similarity` module.
ranker = CosineSimScorer()
ranks = ranker.rank(anchor_emb, cls_embeddings)

In [None]:
# Take the first 10 entries of the ranking and move them to a NumPy array
# on the CPU so they can be used to index the DataFrame.
to_check = ranks[0:10].cpu().numpy()

In [None]:
# Pick the rows for the selected chunks. The rank values are treated as row
# positions (they index the stacked embeddings, which follow the frame's row
# order — TODO confirm in `similarity`), so select positionally with .iloc
# rather than by index label.
sub_df = df_loaded.iloc[to_check]
chunks_ids = sub_df['chunk_maping'].tolist()
solutions = sub_df['Solution'].tolist()
# Slice each solution text down to the character span of its matched chunk.
chunks = [sol[start:end] for sol, (start, end) in zip(solutions, chunks_ids)]

In [None]:
# Display the rows selected for the selected chunks.
sub_df

In [None]:
# Print each selected solution in full, followed by the text of its matched
# chunk, with a dashed separator between entries.
for c, s in zip(chunks, solutions):
    print("Solution:", s)
    print()
    print("Chunk:", c)
    print('-' * 100)