In [None]:
#######Step 1 [Installed and Upgraded Necessary Packages]###########
#Action 1 package-management system Upgrade
#Action 2 Installed sentence_transformers [A Python framework for state-of-the-art sentence, text and image embeddings.]
#Action 3 Installed pinecone [A Python library designed for similarity search and vector indexing tasks]
#Action 4 Installed openai [OpenAI Python library provides convenient access to the OpenAI REST API]
#Action 5 Installed docx2txt [A Python library used for converting  Microsoft Word (.docx) files to text]
!pip3 install --upgrade pip
!pip3 install sentence-transformers
!pip3 install pinecone-client
!pip3 install openai --upgrade
!pip3 install python-docx
!pip3 install docx2txt


In [44]:
import os
#store OPENAI_API_KEY in the OPENAI_API_KEY environment variable
os.environ['OPENAI_API_KEY'] = "YOUR_OPENAI_API_KEY"

In [45]:
#This is for embedding. In here, one LM model from huggingface used.
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')


text ='Abc'
model.encode(text).tolist() #exmple how to do encoding.

[-0.007829328067600727,
 0.007352911401540041,
 -0.030942974612116814,
 0.04150359332561493,
 -0.07909709960222244,
 0.10331298410892487,
 0.04914173483848572,
 0.0030658261384814978,
 0.07064641267061234,
 -0.012973358854651451,
 -0.06321518868207932,
 -0.038936812430620193,
 -0.002514953725039959,
 -0.014077841304242611,
 -0.0417715348303318,
 -0.022063976153731346,
 -0.025870082899928093,
 -0.08123704046010971,
 -0.08500199764966965,
 -0.024818837642669678,
 -0.019687073305249214,
 0.0208884347230196,
 -0.032474175095558167,
 0.03827577084302902,
 -0.016411352902650833,
 0.039388056844472885,
 0.012336735613644123,
 0.03142797201871872,
 -0.05450306460261345,
 -0.12199369817972183,
 0.016606569290161133,
 0.06999655812978745,
 0.1426408290863037,
 -0.020055463537573814,
 -0.008527873083949089,
 -0.045705851167440414,
 -0.04244673624634743,
 -0.03091576136648655,
 0.05930527672171593,
 0.15464414656162262,
 -0.024057865142822266,
 -0.0003202113148290664,
 0.06959783285856247,
 0.0536

In [46]:
import docx2txt
def extract_text_from_docx(docx_file):
    text = docx2txt.process(docx_file)
    return text

# Example usage
docx_file_path = 'DataLaw.docx'
word_text = extract_text_from_docx(docx_file_path)
# print(word_text)

In [47]:
#Function to split long documents in to smaller parts
def split_text_into_chunks(plain_text, max_chars=2000):
    text_chunks = []
    current_chunk = ""
    for line in plain_text.split("\n"):
        if len(current_chunk) + len(line) + 1 <= max_chars:
            current_chunk += line + " "
        else:
            text_chunks.append(current_chunk.strip())
            current_chunk = line + " "
    if current_chunk:
        text_chunks.append(current_chunk.strip())
    return text_chunks

corpusData = split_text_into_chunks(word_text)
#print(corpusData)

In [48]:
from pinecone import Pinecone, ServerlessSpec
pc = Pinecone(api_key='956d00d5-0cb6-4ba9-8a8f-8d2be26baa3e')

#Function to check if index name exist from list of indexes
def index_name_exists(indexes, name):
    for index in indexes:
        if index["name"] == name:
            return True
    return False

#######[Setup the serverless index[vector embeddings are stored in indexes.]########
index_name = "codingdstask-20240418-index" #my index name
existing_indexes = pc.list_indexes() #List of Existing Indexes [print(pc.list_indexes())]
index_exists =  index_name_exists(existing_indexes, index_name) # Check if index name exists 

if not index_exists:
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        )
    )
    print("Creating New Index")
index = pc.Index(index_name)


In [49]:
def addData(corpusData):
    id  = index.describe_index_stats()['total_vector_count']
    for i in range(len(corpusData)):
        chunk=corpusData[i]
        chunkInfo=(str(id+i),
                model.encode(chunk).tolist(), #We are using the model to encode the original chunk of text.
                {'context': chunk}) #In metadata we are storing the original text here as context. 
        index.upsert(vectors=[chunkInfo])

addData(corpusData)

In [50]:
#Get my Index  Description & Stat
print(pc.describe_index(index_name))
print(index.describe_index_stats())

{'dimension': 384,
 'host': 'codingdstask-20240418-index-57y5906.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'codingdstask-20240418-index',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}
{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 257}},
 'total_vector_count': 257}


In [51]:
#This function is responsible for matching the input string with alread existing data on vector database.
def find_match(query,k):
    # Perform semantic search using Pinecone index
    vector_query = model.encode(query).tolist() #converts the query into a vector representation
    # print(query_em)
    result = index.query(vector=vector_query, top_k=10, includeMetadata=True)
    return [result['matches'][i]['metadata']['context'] for i in range(k)]

In [52]:
def create_prompt(contexts, query):
    #Todo: Should be generated with the context/contexts we find by doing semantaic search
    # If no relevant contexts are found, return None
    if not contexts:
        return None
    # Generate the prompt based on the found contexts and the user's query
    prompt = f"Contexts: {' | '.join(contexts)}\nUser Query: {query}\n"
    return prompt

In [70]:
from openai import OpenAI
client = OpenAI()
# openai.api_key = os.getenv('OPENAI_API_KEY')
# org-m10pfqoDuVjSp0tfoZSVQzh9
def generate_answer(prompt):
  try:
      #Todo: Pass the generated prompt and pass it to gpt-3 to get answers.
      # Set parameters for completion
      response = client.completions.create(
            model="gpt-3.5-turbo-instruct",  # You can use other engines like "text-ada-002" as well
            prompt=prompt,
            stream=False,
            max_tokens=140  # Adjust the max_tokens as needed
      )
      answer = response.choices[0].text.strip()  # Get the generated answer
      return answer
  except Exception as e:
        # Handle other types of errors
        return  ({"error": f"An error occurred: {e}"})



In [68]:
def user_query(query):
  #Todo: Make all the things together.
  contexts = find_match(query=query, k=10)
  prompt = create_prompt(contexts, query)  
  print(prompt)
  answer = generate_answer(prompt)
  return answer


In [71]:
user_query("Agricultural Land Act suspensive deadline ?")

Contexts: -         use farming methods appropriate for the land and location in order to prevent soil compaction, erosion and pollution and to ensure the permanent fertility of land.  For more efficient implementation of indent one of the preceding paragraph, notwithstanding the provisions of the Law of Property Code, the following shall be sufficient for the determination of agricultural land user and land use practices:  -         the consent of co-owners whose undivided interest represents more than half of the surface area of the land, if agricultural land is co-owned,  -         the consent of more than half of the owners if agricultural land is co-owned.  If the agricultural inspection authority establishes that an owner, lessee or other user of agricultural land fails to act in accordance with indent one of paragraph one of this Article, it shall impose the implementation of appropriate measures by a decision. An owner, lessee or other user of agricultural land must carry out s

"I'm not sure what specific deadline you are referring to. Generally, the Agricultural Land Act does not mention a specific deadline, but rather imposes the obligation on owners, lessees, and users of agricultural land to use farming methods appropriate for the land and location in order to prevent soil compaction, erosion, and pollution and to ensure the permanent fertility of land. Failure to comply with this requirement may result in the imposition of appropriate measures by the agricultural inspection authority. These measures must be implemented in a timely manner, within the time limits set by the decision, which cannot exceed a period of one year."

In [56]:
# Summary about my alternative approach.
'''
1)Semantic Search with Pre-trained Models: Instead of manually vectorizing and storing chunks of data in a database, 
utilize a pre-trained language model (such as BERT or RoBERTa) for semantic similarity computation.

2)Real-time Contextual Prompt Generation: When a user query is received, dynamically generate a prompt that incorporates 
relevant contexts from the entire document corpus. 
'''

'\n1)Semantic Search with Pre-trained Models: Instead of manually vectorizing and storing chunks of data in a database, \nutilize a pre-trained language model (such as BERT or RoBERTa) for semantic similarity computation.\n\n2)Real-time Contextual Prompt Generation: When a user query is received, dynamically generate a prompt that incorporates \nrelevant contexts from the entire document corpus. \n'