In [2]:
import fitz
from tqdm.auto import tqdm
import pandas as pd
import re
from nltk import word_tokenize
import spacy
import torch
from transformers import GPT2Tokenizer, GPT2Model

  from .autonotebook import tqdm as notebook_tqdm


# preprocessing

In [3]:
# Function to clean up text
def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").replace("\t", " ").strip()
    return cleaned_text

# Function to open and read PDF, returning a list of pages with cleaned text
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)  # Open the PDF document
    pages_and_texts = ""
    
    for page_number, page in tqdm(enumerate(doc), total=len(doc), desc="Processing pages"):
        text = page.get_text()  # Get plain text from the page
        text = text_formatter(text)  # Clean up the text
        pages_and_texts+=text
    
    return pages_and_texts

In [4]:
# Assuming you have a PDF file path
pdf_path = "C:/Users/netha/OneDrive/Desktop/srachllm/finalLLM/data/Baseline_Report.pdf"

# Run the function and get the page data
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)

Processing pages: 100%|██████████| 138/138 [00:00<00:00, 141.07it/s]


In [5]:
#pages_and_texts

'i ICAR - Central Research Institute for Dryland Agriculture Santoshnagar, Hyderabad - 500 059, Telangana, India www.icar-crida.res.in A. Amarender Reddy M. Osman V. K. Singh Baseline Survey of SC-Sub Plan  Villages for  Building Local Capabilities A Problem-Driven Iterative Adaptation (PDIA) Approachii Citation:  Reddy Amarender A., Osman, M and Singh, V. K. 2021. Baseline Survey of SC-Sub Plan Villages for  Building Local Capabilities - A Problem-Driven Iterative Adaptation (PDIA) Approach. ICAR-Central  Research Institute for Dryland Agriculture (CRIDA), Santoshnagar, Hyderabad, Telangana, India. 126 p.  ISBN : 978-93-80883-61-8 © ICAR-CRIDA 2021 Number of copies: 100 Editorial assistance: Shailesh Borkar, SRF. Published by The Director ICAR-Central Research Institute for Dryland Agriculture (CRIDA) Santoshnagar, Hyderabad-500 059, Telangana, India Ph: 040-24530177 Fax: 040-24531802 Printed at: Balaji Scan Pvt. Ltd., 11-2-1145, Beside Matas Temple, Nampally,  Hyderabad-500 001, Tela

In [6]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions
nlp = English()

# Add a sentencizer pipeline
nlp.add_pipe("sentencizer")
sentences=str(nlp(pages_and_texts))
sentences

'i ICAR - Central Research Institute for Dryland Agriculture Santoshnagar, Hyderabad - 500 059, Telangana, India www.icar-crida.res.in A. Amarender Reddy M. Osman V. K. Singh Baseline Survey of SC-Sub Plan  Villages for  Building Local Capabilities A Problem-Driven Iterative Adaptation (PDIA) Approachii Citation:  Reddy Amarender A., Osman, M and Singh, V. K. 2021. Baseline Survey of SC-Sub Plan Villages for  Building Local Capabilities - A Problem-Driven Iterative Adaptation (PDIA) Approach. ICAR-Central  Research Institute for Dryland Agriculture (CRIDA), Santoshnagar, Hyderabad, Telangana, India. 126 p.  ISBN : 978-93-80883-61-8 © ICAR-CRIDA 2021 Number of copies: 100 Editorial assistance: Shailesh Borkar, SRF. Published by The Director ICAR-Central Research Institute for Dryland Agriculture (CRIDA) Santoshnagar, Hyderabad-500 059, Telangana, India Ph: 040-24530177 Fax: 040-24531802 Printed at: Balaji Scan Pvt. Ltd., 11-2-1145, Beside Matas Temple, Nampally,  Hyderabad-500 001, Tela

# chunking

In [29]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 777

def split_string(input_string: str, chunk_size: int) -> list[str]:
    return [input_string[i:i + chunk_size] for i in range(0, len(input_string), chunk_size)]

sentence_chunks= split_string(input_string=sentences,
                                         chunk_size=num_sentence_chunk_size)

#sentence_chunks

In [30]:
sentence_chunks_final=[]
for page_number, chunk in tqdm(enumerate(sentence_chunks), total=len(sentence_chunks), desc="Processing chunks"):
    # Clean up the chunk
    joined_sentence_chunk = chunk.replace("  ", " ").strip()
    joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)  # ".A" -> ". A" for any full-stop/capital letter combo
    sentence_chunks_final.append(joined_sentence_chunk)
#sentence_chunks_final

Processing chunks: 100%|██████████| 316/316 [00:00<00:00, 35202.25it/s]


# embedding

In [31]:
#Requires !pip install sentence-transformers
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cpu")#sentence_chunks_final

# Embed all texts in batches
text_chunk_embeddings = embedding_model.encode(sentence_chunks_final,
                                               batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
                                               convert_to_tensor=True) # optional to return embeddings as tensor instead of array

text_chunk_embeddings



tensor([[ 0.0039, -0.0285, -0.0262,  ...,  0.0486, -0.0451,  0.0162],
        [-0.0426, -0.0088, -0.0117,  ...,  0.0523, -0.0166,  0.0080],
        [-0.0235, -0.0032,  0.0011,  ..., -0.0121,  0.0475,  0.0214],
        ...,
        [-0.0139, -0.0193, -0.0114,  ...,  0.0095, -0.0099,  0.0029],
        [-0.0117,  0.0029,  0.0032,  ..., -0.0144, -0.0767,  0.0159],
        [ 0.0054,  0.0008,  0.0004,  ..., -0.0009, -0.0585,  0.0008]])

In [32]:
text_chunk_embeddings.shape

torch.Size([316, 768])

In [33]:

import time
from sentence_transformers import util

def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.
    """

    # Embed the query
    query_embedding = model.encode(query, 
                                   convert_to_tensor=True) 

    # Get dot product scores on embeddings
    start_time = time.perf_counter()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = time.perf_counter()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores, 
                                 k=n_resources_to_return)

    return scores, indices

In [34]:
query = "what are the acheivements done by ICAR-CRIDA?"

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=text_chunk_embeddings)
scores, indices

[INFO] Time taken to get scores on 316 embeddings: 0.00037 seconds.


(tensor([0.4541, 0.4049, 0.3434, 0.3418, 0.3388]),
 tensor([  5, 290,  35, 295, 294]))

In [35]:
# Define helper function to print wrapped text 
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [42]:
import torch

results_list = []
def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[str],
                                 n_resources_to_return: int = 5):
    # Retrieve relevant resources
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)
    
    print(f"Query: {query}\n")
    print("Results:")
    
    for index in indices:
        result = pages_and_chunks[index]
        results_list.append(result)
    
    return results_list

In [43]:
query = "who is the director of ICAR-CRIDA website?"

# Print out the texts of the top scores
print_top_results_and_scores(query=query,
                             embeddings=text_chunk_embeddings,pages_and_chunks=sentence_chunks_final)

[INFO] Time taken to get scores on 316 embeddings: 0.00044 seconds.
Query: who is the director of ICAR-CRIDA website?

Results:


['beneficiaries to be considered for assistance in a financial year will be finalized by the SC communities themselves which will be facilitated by the CRIDA team.110 The skill developing training programmes will be conducted with the support of other line departments like NIRD, ICAR Institutes, etc., and formulated in such a way that after the completion of training, the placement of trained candidates either in waged employment or in self-employment is ensured. Follow up of the beneficiaries after the planned activities will be conducted to ascertain whether they have acquired necessary assets and utilizing the assets for income generating activities. ICAR-CRIDA team will do all efforts for ensuring full utilization of SC-Sub Plan funds and effective implementa',
 's faced by the SC households in these three villages to come out of poverty and plan, prioritize and implement household-oriented schemes for overall socio-economic development. ICAR- CRIDA will provide resources and techn

In [52]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Function to combine query and relevant texts into a single input string
def prepare_input(query, relevant_texts):
    context = ' '.join(relevant_texts)
    input_text = f"{query}\n\nRelevant Information:\n{context}"
    return input_text

# Function to generate a response
def generate_response(input_text, tokenizer, model):
    inputs = tokenizer.encode(input_text, return_tensors='pt')
    with torch.no_grad():
        outputs = model.generate(inputs, max_length=500, num_return_sequences=1)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

def main(query, relevant_texts):
    # Prepare the input text
    input_text = prepare_input(query, relevant_texts)

    # Initialize the tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    # Generate the response
    response = generate_response(input_text, tokenizer, model)
    return response

# Example usage
query = ""
relevant_texts = results_list
result = main(query, relevant_texts[:3])
result

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'horticulture\n\nRelevant Information:\nbeneficiaries to be considered for assistance in a financial year will be finalized by the SC communities themselves which will be facilitated by the CRIDA team.110 The skill developing training programmes will be conducted with the support of other line departments like NIRD, ICAR Institutes, etc., and formulated in such a way that after the completion of training, the placement of trained candidates either in waged employment or in self-employment is ensured. Follow up of the beneficiaries after the planned activities will be conducted to ascertain whether they have acquired necessary assets and utilizing the assets for income generating activities. ICAR-CRIDA team will do all efforts for ensuring full utilization of SC-Sub Plan funds and effective implementa s faced by the SC households in these three villages to come out of poverty and plan, prioritize and implement household-oriented schemes for overall socio-economic development. ICAR- CRID

In [57]:
print(result)

horticulture

Relevant Information:
beneficiaries to be considered for assistance in a financial year will be finalized by the SC communities themselves which will be facilitated by the CRIDA team.110 The skill developing training programmes will be conducted with the support of other line departments like NIRD, ICAR Institutes, etc., and formulated in such a way that after the completion of training, the placement of trained candidates either in waged employment or in self-employment is ensured. Follow up of the beneficiaries after the planned activities will be conducted to ascertain whether they have acquired necessary assets and utilizing the assets for income generating activities. ICAR-CRIDA team will do all efforts for ensuring full utilization of SC-Sub Plan funds and effective implementa s faced by the SC households in these three villages to come out of poverty and plan, prioritize and implement household-oriented schemes for overall socio-economic development. ICAR- CRIDA wi