In [39]:
# !python -m spacy download en_core_web_md

In [40]:
import fitz
from tqdm.auto import tqdm
import pandas as pd
from spacy.lang.en import English
import spacy
from sentence_transformers import SentenceTransformer
import re
from transformers import AutoModel, AutoTokenizer

In [41]:
pdf_path = "msme-features.pdf" 
doc = fitz.open(pdf_path)
print(doc)

Document('msme-features.pdf')


In [42]:
# nlp = spacy.load("en_core_web_md")
# for page_index in range(len(doc)):
#     page = doc[page_index]
#     tables = page.get_tables()
#     for table in tables:
#         for row in table:
#             doc = nlp(" ".join(row))
#             for token in doc:
#                 print(token.text, token.pos_, token.dep_)

In [43]:
def open_and_read_pdf(pdf: str) -> list[dict]:
    doc = fitz.open(pdf)
    pages_and_text = []
    nlp = spacy.load("en_core_web_md")
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()  # get plain text encoded as UTF-8
        doc = nlp(text)  # Process text with spaCy

        word_count = len([token for token in doc if not token.is_space and not token.is_punct])
        sentence_count = len(list(doc.sents)) 
        token_count = len(list(doc))
        
        pages_and_text.append({
            "page_number": page_number, 
            "page_char_count": len(text),
            "page_word_count": word_count,
            "page_sentence_count": sentence_count,
            "page_token_count": token_count, 
            "text": text
        })
    return pages_and_text
pages = open_and_read_pdf(pdf = pdf_path)
pages[0:2]

0it [00:00, ?it/s]

[{'page_number': 0,
  'page_char_count': 2722,
  'page_word_count': 458,
  'page_sentence_count': 21,
  'page_token_count': 539,
  'text': 'TallyHelp shows Search results that are relevant to your location, which is India.\nClick here to explore the articles for all countries.\nHome \x00 TallyPrime \x00 Accounting \x00 MSME in TallyPrime\nKeyboard Shortcuts\nIn this topic we will learn about the challenges faced by the MSME businesses due to the lack of\nvisibility of their MSME status. We will also look into\xa0the provisions made by the Government to\nsupport such businesses and how the use of TallyPrime will improve the visibility in their business\ntransactions and accelerate the movement of funds.\nMicro and small businesses registered under the MSMED Act,\xa02006, get benefits such as loans at a\nlower interest, payment guarantees from buyers, and so on. The payment guarantee provision\nprotects these businesses from delays in payment from their buyers. As per this act, any buyer

In [44]:
nlp = English()
nlp.add_pipe("sentencizer")
for item in tqdm(pages):
    item["sentences"] = list(nlp(item["text"]).sents)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    item["page_sentence_count_spacy"] = len(item["sentences"])
print(item)

  0%|          | 0/11 [00:00<?, ?it/s]

{'page_number': 10, 'page_char_count': 1586, 'page_word_count': 302, 'page_sentence_count': 18, 'page_token_count': 372, 'text': 'Show PAN/IT No.: Enable this option to display the PAN/IT number of the MSME party.\nShow UDYAM Reg No.: Enable this option to display the UDYAM number of the party, this is enabled\nby default.\nShow Due on: Enable this option to display the Due on of each bill. This is set to Yes by default.\nBased on Credit Days provided in: Under this you can select the option to arrive at Due On date, The\noptions provided are:\nBill-wise Details (as per invoice)– Due on date is arrived based on credit days specified in\nbill-wise details of the invoice.\nMSME Credit Period (from Company features, as per agreement) – Due on date is arrived\nbased on msme credit days set for the party.\nBill-wise or MSME Credit Period (whichever is earlier) – Due on date is arrived based on\nearliest credit days set in bill-wise and in msme credit days.\nShow Cleared On: Enable this opti

In [45]:
num_sentence_chunk_size = 3
def split_list(input_list: list, slice_size: int) -> list[list[str]]:
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

for item in tqdm(pages):
    item["sentence_chunks"] = split_list(input_list=item["sentences"], slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/11 [00:00<?, ?it/s]

In [46]:
# Sample an example from the group (note: many samples have only 1 chunk as they have <=10 sentences total)
# random.sample(pages, k=1)

In [47]:
pages_and_chunks = []
for item in tqdm(pages):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 
        
        pages_and_chunks.append(chunk_dict)
len(pages_and_chunks)

  0%|          | 0/11 [00:00<?, ?it/s]

56

In [48]:
# random.sample(pages_and_chunks, k=1)

In [49]:
df = pd.DataFrame(pages_and_chunks)
# df.describe().round(2)

In [50]:
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].sample(2).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 25.5 | Text: Show PAN/IT No.:Enable this option to display the PAN/IT number of the MSME party.
Show UDYAM Reg No.:
Chunk token count: 0.0 | Text: 


In [51]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
# pages_and_chunks_over_min_token_len[:2]

In [52]:
embedding_model = SentenceTransformer(model_name_or_path="Alibaba-NLP/gte-base-en-v1.5", trust_remote_code = True, device="cpu")

In [53]:
# model_path = 'Alibaba-NLP/gte-large-en-v1.5'
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# embedding_model = AutoModel.from_pretrained(model_path, trust_remote_code=True)

### CPU Based embedding creation

In [54]:
%%time
embedding_model.to("cpu")

# Embed each chunk one by one
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/36 [00:00<?, ?it/s]

CPU times: total: 19.5 s
Wall time: 7.85 s


### For GPU Run Below 

In [55]:
# %%time

# # Send the model to the GPU
# embedding_model.to("cuda") # requires a GPU installed, for reference on my local machine, I'm using a NVIDIA RTX 4090

# # Create embeddings one by one on the GPU
# for item in tqdm(pages_and_chunks_over_min_token_len):
#     item["embedding"] = embedding_model.encode(item["sentence_chunk"])

In [56]:
# Turn text chunks into a single list
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

### Batch embeddings for shorter times on CPU embeddings

In [57]:
# %%time

# Embed all texts in batches
# text_chunk_embeddings = embedding_model.encode(text_chunks,
#                                                batch_size=32, # Change batch size according to speed obtained
#                                                convert_to_tensor=True) # optional to return embeddings as tensor instead of array

# text_chunk_embeddings

In [58]:
# Save embeddings to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "../text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [59]:
# Import saved file and view
text_chunks_and_embedding_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embedding_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,0,TallyHelp shows Search results that are releva...,337,52,84.25,[-7.43505716e-01 -4.66394484e-01 6.55767620e-...
1,0,We will also look into the provisions made by ...,470,68,117.5,[-2.14783937e-01 -5.26265800e-01 1.60889161e+...
2,0,"As per this act, any buyer of\ngoods or servic...",360,64,90.0,[ 3.87885153e-01 -3.77041906e-01 -2.23355308e-...
3,0,"Additionally, the Ministry of Corporate Affair...",580,88,145.0,[-1.19639993e-01 -2.50907034e-01 9.96267200e-...
4,0,"As a result, the auditors do not detect such o...",415,66,103.75,[-5.76742351e-01 -1.85536414e-01 1.31263995e+...
