## RAG TEST FOR ROMAN NEPALI

In [1]:
import os

from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Directory holding the locally downloaded all-MiniLM-L6-v2 sentence-transformers model.
# Set EMBEDDING_MODEL_PATH to run the notebook on another machine; the default keeps
# the original location. A raw string must use single backslashes: doubling them
# inside r"..." puts literal "\\" pairs into the path.
embedding_model = os.environ.get(
    "EMBEDDING_MODEL_PATH",
    r"D:\Generative AI\local LLM\local embedding\sentence-transformers-local\all-MiniLM-L6-v2",
)

In [15]:
# Load the sentence-transformers model from the local directory in `embedding_model`.
embeddingsLocal = SentenceTransformer(embedding_model)

In [16]:
# Display the model's repr to inspect its module pipeline.
embeddingsLocal

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [17]:
from numpy import dot
from numpy.linalg import norm

# A Roman Nepali question and its English counterpart.
text1 = "store ko naam ke ho?"
text2 = "what is the store name?"

# Embed each sentence separately with the local model.
embedding1 = embeddingsLocal.encode(text1)
embedding2 = embeddingsLocal.encode(text2)

# Cosine similarity: dot product divided by the product of the two vector norms.
norm_product = norm(embedding1) * norm(embedding2)
cosine_similarity = dot(embedding1, embedding2) / norm_product
cosine_similarity

0.44182497

In [18]:
# Encode a single sentence so the embedding's shape can be inspected in the next cell.
text_embeddings = embeddingsLocal.encode(text1)

In [19]:
# Shape of the single-sentence embedding (the recorded run shows (384,)).
text_embeddings.shape

(384,)

In [12]:
# Read the Pinecone API key from the environment instead of hard-coding it:
# a key written into a notebook is exposed to everyone who can read the file.
# SECURITY: the key that used to be written in this cell must be treated as
# leaked; revoke/rotate it in the Pinecone console.
# Set PINECONE_API_KEY before starting the kernel. The value is None when unset.
# NOTE(review): the client is expected to reject a missing key; verify.
pinecone_api = os.environ.get("PINECONE_API_KEY")

In [13]:
# Pinecone gRPC client and related names used by this notebook
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import time

# Client authenticated with the key held in `pinecone_api`
pc = Pinecone(api_key=pinecone_api)

# Small sample knowledge base: every item carries a unique id, the passage text
# to embed, and a category label.
data = [
    {
        "id": "rec1",
        "text": "Product-name Redmi Note 9 pro Price: Rs 10000 RAM: 8GB Storage: 128GB description: Redmi Note 9 pro with gorilla glass",
        "category": "product",
    },
    {
        "id": "rec2",
        "text": "Product-name Samsung Galaxy Price: Rs 130000 RAM: 16GB Storage: 128GB description: Brand new Samsung Galaxy with AI",
        "category": "product",
    },
    {
        "id": "rec3",
        # The line breaks and the 8-space indents are part of the text that
        # gets embedded and stored.
        "text": (
            "Hamro store ko name All Electronics Store ho.\n"
            "        Hamro store Dharan ma xa.\n"
            "        Store ko contact number 9812345678 ho.\n"
            "        Email: allelectronics@gmail.com\n"
            "        "
        ),
        "category": "information",
    },
    {
        "id": "rec4",
        "text": "Delivery option All Nepal ho. Inside Dharan delivery same day free hunxa ra outside Dharan Rs. 120 laagxa. Delivery huna 2-3 din laagna sakxa.",
        "category": "Delivery",
    },
]

# Embed every passage with Pinecone's hosted model so the texts can be indexed.
passage_texts = [item["text"] for item in data]
passage_parameters = {"input_type": "passage", "truncate": "END"}
embeddings = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=passage_texts,
    parameters=passage_parameters,
)

print(embeddings)


EmbeddingsList(
  model='multilingual-e5-large',
  vector_type='dense',
  data=[
    {'vector_type': dense, 'values': [0.013641357421875, -0.025390625, ..., -0.041229248046875, -0.01568603515625]},
    {'vector_type': dense, 'values': [0.03857421875, -0.0311126708984375, ..., -0.0439453125, -0.03155517578125]},
    {'vector_type': dense, 'values': [0.0601806640625, -0.0296630859375, ..., -0.05267333984375, -0.00276947021484375]},
    {'vector_type': dense, 'values': [-0.0081329345703125, -0.0267181396484375, ..., -0.036468505859375, 0.00519561767578125]}
  ],
  usage={'total_tokens': 145}
)


In [14]:
# Handle to the Pinecone index named "majorproject".
# NOTE(review): the index is not created anywhere in this notebook. Presumably it
# already exists with a dimension matching multilingual-e5-large; confirm before a fresh run.
index = pc.Index("majorproject")

In [20]:
# Pair every source item with its embedding and shape the pair as a Pinecone
# record: id, vector values, and metadata carrying the original text and category.
records = [
    {
        "id": item["id"],
        "values": emb["values"],
        "metadata": {
            "source_text": item["text"],
            "category": item["category"],
        },
    }
    for item, emb in zip(data, embeddings)
]

## Upload Index

In [22]:
# Write all prepared records into the "roman-nepali-v1" namespace of the index.
index.upsert(
    vectors=records,
    namespace="roman-nepali-v1"
)

upserted_count: 4

In [25]:
# Print index statistics to check the per-namespace vector count after the upsert.
print(index.describe_index_stats())

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'roman-nepali-v1': {'vector_count': 4}},
 'total_vector_count': 4}


## Search Index

In [26]:
# Roman Nepali test question (roughly: "What is the store's name?").
query = "Store ko name ke ho?"

In [27]:
# Embed the question with the same hosted model used for the passages, but with
# input_type "query" rather than "passage".
query_embedding = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={
        "input_type": "query"
    }
)

In [28]:
# Search the "roman-nepali-v1" namespace for the single closest record; request
# the stored metadata but not the vector values.
results = index.query(
    namespace="roman-nepali-v1",
    vector=query_embedding[0].values,
    top_k=1,
    include_values=False,
    include_metadata=True
)

In [29]:
# Print the raw query response (matches, namespace, usage).
print(results)

{'matches': [{'id': 'rec3',
              'metadata': {'category': 'information',
                           'source_text': 'Hamro store ko name All Electronics '
                                          'Store ho.\n'
                                          '        Hamro store Dharan ma xa.\n'
                                          '        Store ko contact number '
                                          '9812345678 ho.\n'
                                          '        Email: '
                                          'allelectronics@gmail.com\n'
                                          '        '},
              'score': 0.8375271,
              'sparse_values': {'indices': [], 'values': []},
              'values': []}],
 'namespace': 'roman-nepali-v1',
 'usage': {'read_units': 6}}


In [36]:
# Print only the original passage text stored in the top match's metadata.
print(results["matches"][0]["metadata"]["source_text"])

Hamro store ko name All Electronics Store ho.
        Hamro store Dharan ma xa.
        Store ko contact number 9812345678 ho.
        Email: allelectronics@gmail.com
        


In [37]:
def retrieve_document(query: str, top_k: int = 1, namespace: str = "roman-nepali-v1") -> None:
    """Print the stored source text of the passages most similar to ``query``.

    The query is embedded with the hosted ``multilingual-e5-large`` model
    (``input_type="query"``) through the notebook-level ``pc`` client, then
    searched against the notebook-level ``index``.

    Args:
        query: The question to look up.
        top_k: Number of nearest matches to print. Defaults to 1.
        namespace: Pinecone namespace to search. Defaults to "roman-nepali-v1".

    Returns:
        None. The ``source_text`` metadata of each match is printed, or a notice
        when the search returns no matches.
    """
    query_embedding = pc.inference.embed(
        model="multilingual-e5-large",
        inputs=[query],
        parameters={"input_type": "query"},
    )
    results = index.query(
        namespace=namespace,
        vector=query_embedding[0].values,
        top_k=top_k,
        include_values=False,
        include_metadata=True,
    )

    matches = results["matches"]
    if not matches:
        # Indexing matches[0] on an empty result would raise IndexError.
        print(f"No matching document found for query: {query!r}")
        return

    for match in matches:
        print(match["metadata"]["source_text"])

In [38]:
# Roman Nepali, roughly "Where is the store located?"; the recorded run returned the store-information record.
retrieve_document("Store ko location kaha xa?")

Hamro store ko name All Electronics Store ho.
        Hamro store Dharan ma xa.
        Store ko contact number 9812345678 ho.
        Email: allelectronics@gmail.com
        


In [39]:
# Roman Nepali, roughly "How to contact the store?"; the recorded run returned the store-information record.
retrieve_document("store ma kasari samparka garne")

Hamro store ko name All Electronics Store ho.
        Hamro store Dharan ma xa.
        Store ko contact number 9812345678 ho.
        Email: allelectronics@gmail.com
        


In [40]:
# Roman Nepali request for information about the Redmi phone; the recorded run returned the Redmi product record.
retrieve_document("Redmi mobile ko barema jankari pam na")

Product-name Redmi Note 9 pro Price: Rs 10000 RAM: 8GB Storage: 128GB description: Redmi Note 9 pro with gorilla glass


In [41]:
# Roman Nepali request for information about the Samsung phone; the recorded run returned the Samsung product record.
retrieve_document("Samsung mobile ko barema jankari dinu")

Product-name Samsung Galaxy Price: Rs 130000 RAM: 16GB Storage: 128GB description: Brand new Samsung Galaxy with AI


In [42]:
# Roman Nepali, roughly "Is there a delivery option or not?"; the recorded run returned the delivery record.
retrieve_document("delivery option xa ki xaina?")

Delivery option All Nepal ho. Inside Dharan delivery same day free hunxa ra outside Dharan Rs. 120 laagxa. Delivery huna 2-3 din laagna sakxa.
