## data integrationa and preprocessing

In [6]:
import pandas as pd
import re
import json

# Load CSV file
df = pd.read_csv("questionsv3.csv")

# Safe cleaning function
def clean_text(text):
    if pd.isnull(text):
        return ""
    text = str(text).lower().strip()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-z0-9@:\.\,\-/ ]', '', text)
    return text

# Formatting functions
def format_question(q):
    if not q:
        return "No question provided"
    q = q.replace("asking about", "What is")
    q = q.replace("query regarding", "What is")
    return q.strip().capitalize() + "?"

def format_answer(a):
    if not a:
        return "No answer provided"
    a = a.replace("suggested him to", "You can")
    a = a.replace("recommended to", "You should")
    a = a.replace("advise to", "You should")
    a = a.replace("advice him to", "You should")
    a = a.replace("ask to", "Please")
    return a.strip().capitalize()

# Apply cleaning and formatting
df["clean_question"] = df["questions"].apply(clean_text).apply(format_question)
df["clean_answer"] = df["answers"].apply(clean_text).apply(format_answer)

# Export cleaned data
df.to_csv("cleaned_kcc.csv", index=False)
print("✅ Cleaned data saved to `cleaned_kcc.csv`")


✅ Cleaned data saved to `cleaned_kcc.csv`


In [7]:
# Export
df.to_csv("cleaned_kcc.csv", index=False)
qa_data = [{"query": q, "response": a} for q, a in zip(df["clean_question"], df["clean_answer"])]
json.dump(qa_data, open("kcc_qa_pairs.json", "w"), indent=2)


In [10]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [27]:
# Define chunk parameters
chunk_size = 10000

chunk_overlap = 200

# chunking

In [28]:
documents= CSVLoader(file_path="cleaned_kcc.csv", encoding="utf-8").load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
texts = text_splitter.split_documents(documents)

In [29]:
texts[4].page_content  # Display the first 500 characters of the first chunk

'questions: query regarding nutrient management in gimger.\nanswers: advise to apply urea:ssp:mop @ 12:30:15 kg/bigha.\nclean_question: What is nutrient management in gimger.?\nclean_answer: You should apply urea:ssp:mop @ 12:30:15 kg/bigha.'

In [30]:
documents

[Document(metadata={'source': 'cleaned_kcc.csv', 'row': 0}, page_content='questions: asking about the control measure of aphid in cabbage\nanswers: suggested him to spray malathion @2ml/lit of water\nclean_question: What is the control measure of aphid in cabbage?\nclean_answer: You can spray malathion @2ml/lit of water'),
 Document(metadata={'source': 'cleaned_kcc.csv', 'row': 1}, page_content='questions: query regarding layer farm\nanswers: ask to inform in local govt dispensary or dept of poultry science khanapara ghy\nclean_question: What is layer farm?\nclean_answer: Please inform in local govt dispensary or dept of poultry science khanapara ghy'),
 Document(metadata={'source': 'cleaned_kcc.csv', 'row': 2}, page_content='questions: asking about the fertilizer dose for chilli\nanswers: recommended to apply dried cowdung,urea 30 kg/bigha,ssp 50 kg/bigha,mop 13 kg/bigha\nclean_question: What is the fertilizer dose for chilli?\nclean_answer: You should apply dried cowdung,urea 30 kg/b

In [31]:
length = len(texts)
print(f"Number of chunks: {length}")

Number of chunks: 3271


## embeddings

In [None]:
# from langchain_huggingface import HuggingFaceEmbeddings
# from tqdm import tqdm

# embeddings_model_name = 'BAAI/bge-large-en'
# embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

In [None]:
# from sentence_transformers import SentenceTransformer
# model_name="all-MiniLM-L6-v2"
# embeding_model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder="./cache")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [59]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [60]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="kcc_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

In [61]:
from uuid import uuid4
uuids = [str(uuid4()) for _ in range(len(texts))]


vector_store.add_documents(documents=texts, ids=uuids)

['c6fc29e4-a132-481e-b64a-47aa5b16eaec',
 'f49ffc4c-10ff-439f-92fe-99db23c6b210',
 '46e46071-5d29-459b-b8bb-749b1aac3742',
 'e274094b-1705-42fd-b03e-ed56777f58de',
 '6afb99b7-43a9-4fcc-acf5-4351eb92b679',
 '91119045-64ab-4686-b299-309c01906674',
 'dd5b6423-79d7-4148-be8b-0dd58f680bd2',
 'a225c7fc-6e0d-4b06-a7ae-b1536bcd0920',
 '3ddd12b2-407e-4142-a29e-1f76c8026349',
 '2a393086-c977-4114-9509-56742ab59f8c',
 '44f1d7fc-57bb-4542-80c1-eadb7465f668',
 '97af866e-2556-4d89-85c2-c5d01c766382',
 '9400cf7b-f2bb-4cc8-950f-4f5626e63d59',
 '3d7b8c23-2721-4557-bad4-c2858685b463',
 'dad56481-dbe2-45db-b9c9-d4d656246b2e',
 'c1897b8f-9637-49d6-ad3d-25ad7bed557a',
 '13935601-745b-4ffd-bb14-7385303b2db0',
 '1f652847-f5f3-4698-a574-13955d1473f0',
 '16d409d3-f00c-454e-b9e5-04e7fad061ea',
 '807dc317-8c00-47f3-ab3b-eb29890c7062',
 'c77ab836-15bf-4ab8-a3ad-44555719bc3f',
 'a5191b8c-d207-439d-9f2f-f20a229e25cf',
 'd8ad1c13-526b-406e-9786-e55532f4b304',
 'bed7fac6-fda7-4753-bb13-b2bb6191d716',
 '13fbb139-edf2-

In [64]:
results = vector_store.similarity_search(
    "What pest-control methods are recommended for paddy in Tamil Nadu?",
    k=3,
    
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* questions: asking about control of pest in paddy
answers: spray tricel 20 ec @2ml/l water.
clean_question: What is control of pest in paddy?
clean_answer: Spray tricel 20 ec @2ml/l water. [{'row': 2666, 'source': 'cleaned_kcc.csv'}]
* questions: asking about the control of pest in paddy.
answers: spray tricel @2ml/l water. 3 spray at 3 days interval.
clean_question: What is the control of pest in paddy.?
clean_answer: Spray tricel @2ml/l water. 3 spray at 3 days interval. [{'row': 3112, 'source': 'cleaned_kcc.csv'}]
* questions: asking about control of stemm borer in paddy
answers: spray chloropyriphos (tricel 20 or classic 20 )@ 2ml/l water, 3 spray at 5 days interval.
clean_question: What is control of stemm borer in paddy?
clean_answer: Spray chloropyriphos tricel 20 or classic 20 @ 2ml/l water, 3 spray at 5 days interval. [{'row': 2291, 'source': 'cleaned_kcc.csv'}]


## model

In [1]:
%pip install -qU langchain-ollama

Note: you may need to restart the kernel to use updated packages.


In [None]:
from langchain_ollama import ChatOllama

llm = ChatOllama(
    model="gemma3:1b",
    temperature=0,
    
)

In [None]:

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# template = """Answer the question based only on the following context:

# {context}

# Question: {question}
# """

from langchain.prompts import ChatPromptTemplate

from langchain.prompts import ChatPromptTemplate

from langchain.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant that answers questions based only on the provided context. "
            "Be clear, detailed, and explain your reasoning in simple terms. Quote relevant parts of the context if helpful."
        ),
        (
            "human",
            "Answer the question based only on the following context:\n\n{context}\n\nQuestion: {question}"
        ),
    ]
)



#prompt = ChatPromptTemplate.from_template(template)

retriever = vector_store.as_retriever(search_kwargs={"k": 3})

def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])

chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)



In [68]:
response = chain.invoke("what pest-control methods are recommended for paddy in Tamil Nadu?")
print(response)

Spray chloropyriphos (tricel 20 or classic 20 )@ 2ml/l water, 3 spray at 5 days interval.


In [74]:
response = chain.invoke("what pest-control methods are recommended for paddy in Tamil Nadu?")
print(response)

answers: spray tricel @2ml/l water. 3 spray at 3 days interval.


In [75]:

response = chain.invoke("How to manage drought stress in groundnut cultivation?")
print(response)

answers: You can use monopower 2 kg per bigha of land along with 1.3 tonne dry cowdung per bigha of land. before sowing treat the seed with trichoderma viridi 10 gram per kg of seed.


In [78]:

response = chain.invoke("What issues do sugarcane farmers in Maharashtra commonly face?")
print(response)

answers: The context does not provide information about issues faced by sugarcane farmers in Maharashtra.
