In [2]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
%pip install pinecone

In [None]:
%pip install datasets

In [None]:
%pip install python-dotenv

In [5]:
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
from DLAIUtils import Utils
import DLAIUtils

import os
import time
import torch

In [6]:
from tqdm.auto import tqdm

In [35]:
my_dataset = load_dataset('csv', data_files="haryana 2024.csv", split='train[:200000]')

In [41]:
import re
def clean_text(record):
    record['QueryText'] = re.sub(r'\s+', ' ', record['QueryText'])  
    record['QueryText'] = re.sub(r'[^a-zA-Z0-9\s]', '', record['QueryText'])  
    return record

cleaned_dataset = my_dataset.map(clean_text)

Map:   0%|          | 0/200000 [00:00<?, ? examples/s]

In [42]:
questions = []
for record in cleaned_dataset['QueryText']:
  if record.strip():
    questions.append(record)
question = list(set(questions))
print(f'Number of questions: {len(question)}')

Number of questions: 18927


In [1]:
from sentence_transformers import SentenceTransformer

In [9]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device != 'cuda':
    print('Sorry no cuda.')
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

Sorry no cuda.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:

PINECONE_API_KEY ="your key"

In [None]:

pinecone = Pinecone(api_key=PINECONE_API_KEY)


In [None]:
pinecone = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME="my-kissan-cleansed"


if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
    pinecone.delete_index(INDEX_NAME)
print(INDEX_NAME)
pinecone.create_index(name=INDEX_NAME,
    dimension=model.get_sentence_embedding_dimension(),
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-east-1'))

index = pinecone.Index(INDEX_NAME)
print(index)

In [45]:
batch_size=200
vector_limit=18927

questions = question[:vector_limit]

import json

for i in tqdm(range(0, len(questions), batch_size)):
    
    i_end = min(i+batch_size, len(questions))
    
    ids = [str(x) for x in range(i, i_end)]
    
    metadatas = [{'text': text} for text in questions[i:i_end]]
    
    xc = model.encode(questions[i:i_end])
    
    records = zip(ids, xc, metadatas)
    
    index.upsert(vectors=records)

  0%|          | 0/95 [00:00<?, ?it/s]

In [18]:
def run_query(query):
  embedding = model.encode(query).tolist()
  results = index.query(top_k=10, vector=embedding, include_metadata=True, include_values=False)
  for result in results['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['text']}")

In [47]:
run_query('what to sow in this season?')

0.54: Late Sowing 
0.45: When is the right time for sowing lobia 
0.45: When is the right time for sowing lobia
0.42: FARMER ASKED ME ABOUT CROP SOWING IN THIS SEASON 
0.42: Give information about which vegetables we can sowing at this time  
0.4: farmer asked about the weather for sowing sorghum 
0.39: Give information about which vegetables we can sowing in February Month
0.39: Give information about which vegetables we can sow at this time
0.39: Give information about which vegetables we can sow at this time 
0.39:  Give information about which vegetables we can sow at this time 


In [48]:
run_query('what is rabi crop')

0.8: Information regarding to MSP Rabi crop 
0.79: Information regarding new MSP of rabi crop
0.76: Information regarding to MSP different Rabi crop 
0.74: Information regarding in msp rate of rabi crop 
0.72: Information regarding rabi crops  wheat 20242025 new msp  
0.71: Information regarding of radish crop 
0.7: information about radish crop 
0.69: farmer asked about the crops to be sowing in Rabi season 
0.65: Farmer asked about radish crop
0.65: Information regarding to improve growth of radish crop  


In [49]:
query = 'how perform farming'
run_query(query)

0.68: information regarding Natural farming 
0.67:  Information about natural farming 
0.67: Information about natural farming
0.67: Information about natural farming 
0.64:  nformation regarding organic farming
0.63: INFORMATION REGARDING GOAT FARMING
0.63: Information regarding Goat farming
0.63: Information regarding goat farming
0.62: Information regarding agriculture implement Farm Machinery 
0.61: Information regarding to organic farming 


In [50]:
query = 'how you doing'
run_query(query)

0.41: regarding 
0.32: nipping 
0.26: pmfby 
0.25: wEATHER
0.23: Wrong Number
0.23: ASKED ABOUT TO TRAINING AND PRUINING INFORMATON 
0.23: information regarding 
0.22: Information regarding to 
0.22: ASKED ABOUT TO WEST DECOMPOSER  
0.22: Late Sowing 
