In [4]:
import fitz
from tqdm.auto import tqdm
import pandas as pd
import re
from nltk import word_tokenize
import spacy
import torch
from transformers import GPT2Tokenizer, GPT2Model

  from .autonotebook import tqdm as notebook_tqdm


# preprocessing

In [8]:
# Function to clean up text
def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").replace("\t", " ").strip()
    return cleaned_text

# Function to open and read PDF, returning a list of pages with cleaned text
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)  # Open the PDF document
    pages_and_texts = ""
    
    for page_number, page in tqdm(enumerate(doc), total=len(doc), desc="Processing pages"):
        text = page.get_text()  # Get plain text from the page
        text = text_formatter(text)  # Clean up the text
        pages_and_texts+=text
    
    return pages_and_texts

In [9]:
# Assuming you have a PDF file path
pdf_path = "C:/Users/netha/OneDrive/Desktop/srachllm/chat/Baseline_Report.pdf"

# Run the function and get the page data
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)

Processing pages: 100%|██████████| 138/138 [00:00<00:00, 376.63it/s]


In [10]:
#pages_and_texts

In [11]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions
nlp = English()

# Add a sentencizer pipeline
nlp.add_pipe("sentencizer")
sentences=str(nlp(pages_and_texts))
#sentences

# chunking

In [12]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 777

def split_string(input_string: str, chunk_size: int) -> list[str]:
    return [input_string[i:i + chunk_size] for i in range(0, len(input_string), chunk_size)]

sentence_chunks= split_string(input_string=sentences,
                                         chunk_size=num_sentence_chunk_size)

#sentence_chunks

In [13]:
sentence_chunks_final=[]
for page_number, chunk in tqdm(enumerate(sentence_chunks), total=len(sentence_chunks), desc="Processing chunks"):
    # Clean up the chunk
    joined_sentence_chunk = chunk.replace("  ", " ").strip()
    joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)  # ".A" -> ". A" for any full-stop/capital letter combo
    sentence_chunks_final.append(joined_sentence_chunk)
#sentence_chunks_final

Processing chunks: 100%|██████████| 316/316 [00:00<00:00, 158521.72it/s]


# embedding

In [15]:
#Requires !pip install sentence-transformers
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cpu")#sentence_chunks_final

# Embed all texts in batches
text_chunk_embeddings = embedding_model.encode(sentence_chunks_final,
                                               batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
                                               convert_to_tensor=True) # optional to return embeddings as tensor instead of array

#text_chunk_embeddings

In [16]:
text_chunk_embeddings.shape

torch.Size([316, 768])

In [17]:

import time
from sentence_transformers import util

def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.
    """

    # Embed the query
    query_embedding = model.encode(query, 
                                   convert_to_tensor=True) 

    # Get dot product scores on embeddings
    start_time = time.perf_counter()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = time.perf_counter()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores, 
                                 k=n_resources_to_return)

    return scores, indices

In [18]:
query = "what are the acheivements done by ICAR-CRIDA?"

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=text_chunk_embeddings)
scores, indices

[INFO] Time taken to get scores on 316 embeddings: 0.00377 seconds.


(tensor([0.4541, 0.4049, 0.3434, 0.3418, 0.3388]),
 tensor([  5, 290,  35, 295, 294]))

In [19]:
# Define helper function to print wrapped text 
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [20]:
import torch

results_list = []
def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[str],
                                 n_resources_to_return: int = 5):
    # Retrieve relevant resources
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)
    
    print(f"Query: {query}\n")
    print("Results:")
    
    for index in indices:
        result = pages_and_chunks[index]
        results_list.append(result)
    
    return results_list

In [21]:
query = "tell about horticulture"

# Print out the texts of the top scores
print_top_results_and_scores(query=query,
                             embeddings=text_chunk_embeddings,pages_and_chunks=sentence_chunks_final)

[INFO] Time taken to get scores on 316 embeddings: 0.00020 seconds.
Query: tell about horticulture

Results:


['ences on all sides should be trained with cucurbitaceous vegetables (Bottle gourd, Bitter gourd and Snake gourd) ► Some vegetables are direct sown-(Amaranthus, Bottle gourd, Bitter gourd and Snake gourd) ► Some vegetables are nursery transplanted (Tomato, Brinjal, Chillies and Onion) ► Divide the area into equal sized plots for raising annual vegetable crops ► As intensive and continuous cropping is done in a kitchen garden ► Fertility and texture of soil may be maintained by applying adequate quantities of organic manures frequently ► Ridges and furrows are formed in each plots ► Season of planting: June-July and September-October ► Bee-hive may be provided for ensuring adequate pollination of crops besides obtaining honey ► However, in order to harvest good crop,',
 've study in agriculturally intensive regions of Uttar Pradesh, India. Environment, Development and Sustainability, 23(4), 5822-5845. Reddy, A. (2014). Profitability and labour use in cropping systems. Indian J. Dryland

In [23]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Function to combine query and relevant texts into a single input string
def prepare_input(query, relevant_texts):
    context = ' '.join(relevant_texts)
    input_text = f"{query}\n\nRelevant Information:\n{context}"
    return input_text

# Function to generate a response
def generate_response(input_text, tokenizer, model):
    inputs = tokenizer.encode(input_text, return_tensors='pt')
    with torch.no_grad():
        outputs = model.generate(inputs, max_length=500, num_return_sequences=1)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

def main(query, relevant_texts):
    # Prepare the input text
    input_text = prepare_input(query, relevant_texts)

    # Initialize the tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    # Generate the response
    response = generate_response(input_text, tokenizer, model)
    return response

# Example usage
query = ""
relevant_texts = results_list
result = main(query, relevant_texts[:2])
result

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'\n\nRelevant Information:\nences on all sides should be trained with cucurbitaceous vegetables (Bottle gourd, Bitter gourd and Snake gourd) ► Some vegetables are direct sown-(Amaranthus, Bottle gourd, Bitter gourd and Snake gourd) ► Some vegetables are nursery transplanted (Tomato, Brinjal, Chillies and Onion) ► Divide the area into equal sized plots for raising annual vegetable crops ► As intensive and continuous cropping is done in a kitchen garden ► Fertility and texture of soil may be maintained by applying adequate quantities of organic manures frequently ► Ridges and furrows are formed in each plots ► Season of planting: June-July and September-October ► Bee-hive may be provided for ensuring adequate pollination of crops besides obtaining honey ► However, in order to harvest good crop, ve study in agriculturally intensive regions of Uttar Pradesh, India. Environment, Development and Sustainability, 23(4), 5822-5845. Reddy, A. (2014). Profitability and labour use in cropping syst

In [24]:
print(result)



Relevant Information:
ences on all sides should be trained with cucurbitaceous vegetables (Bottle gourd, Bitter gourd and Snake gourd) ► Some vegetables are direct sown-(Amaranthus, Bottle gourd, Bitter gourd and Snake gourd) ► Some vegetables are nursery transplanted (Tomato, Brinjal, Chillies and Onion) ► Divide the area into equal sized plots for raising annual vegetable crops ► As intensive and continuous cropping is done in a kitchen garden ► Fertility and texture of soil may be maintained by applying adequate quantities of organic manures frequently ► Ridges and furrows are formed in each plots ► Season of planting: June-July and September-October ► Bee-hive may be provided for ensuring adequate pollination of crops besides obtaining honey ► However, in order to harvest good crop, ve study in agriculturally intensive regions of Uttar Pradesh, India. Environment, Development and Sustainability, 23(4), 5822-5845. Reddy, A. (2014). Profitability and labour use in cropping systems.