# **Hybrid Search**
**BM25** is a sophisticated ranking function used in information retrieval. Acting like a highly efficient librarian, it excels at navigating extensive collections of documents. Its effectiveness lies in term frequency: evaluating how often search terms appear in each document, weighted by how rare each term is across the collection and adjusted for document length. **Vector search** extends our search capabilities beyond mere keyword matching. It adds a layer of contextual understanding, interpreting the semantics of search queries to provide results that align with the intended meaning.

**Hybrid Search Approach** - Our hybrid search system synergizes BM25's keyword-focused precision with Vector search's semantic understanding. This duo delivers nuanced, comprehensive search results, perfect for complex and varied datasets.

# Install Libraries

In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
!pip install tqdm



In [None]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
import pandas as pd
file_path = '/content/drive/MyDrive/zepto DS/processed.csv'
data = pd.read_csv(file_path)

In [None]:
data.isnull().sum()

Unnamed: 0,0
uniq_id,0
crawl_timestamp,0
product_url,0
product_name,0
product_category_tree,0
pid,0
retail_price,0
discounted_price,0
image,0
is_FK_Advantage_product,0


# **KEYWORD SEARCH USING BM25**

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from tqdm import tqdm

# Load data
file_path = '/content/drive/MyDrive/zepto DS/processed.csv'
data = pd.read_csv(file_path)

# Preprocessing Function
# Built once when this cell runs: stopwords.words() returns a fresh list on every
# call, so invoking it per token rebuilt that list and turned each membership test
# into a linear scan. A frozenset gives constant-time lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
# A single stemmer instance is reused instead of constructing one per document.
_STEMMER = PorterStemmer()


def preprocess_text(text):
    """Normalize raw text into a list of stemmed, stopword-free tokens for BM25.

    Steps: replace every non-word character with a space, collapse whitespace
    runs, lowercase, tokenize with NLTK's ``word_tokenize``, drop English
    stopwords, and stem the remaining tokens with the Porter stemmer.

    Args:
        text: The string to normalize.

    Returns:
        list[str]: Stemmed tokens in their original order.
    """
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower()  # Convert to lowercase
    return [
        _STEMMER.stem(token)
        for token in word_tokenize(text)
        if token not in _ENGLISH_STOPWORDS
    ]

# Apply preprocessing to the product_name and description with tqdm
tqdm.pandas(desc="Processing texts")
data['processed_text'] = data.progress_apply(
    lambda row: preprocess_text(f"{row['product_name']} {row['description']} {row['brand']} {row['top_level_category']}"),
    axis=1
)

# Indexing
corpus = data['processed_text'].tolist()
bm25 = BM25Okapi(corpus)

# Search Function
def search(query, bm25, data, top_n=10):
    """Return the ``top_n`` rows of ``data`` with the highest BM25 score for ``query``.

    The query goes through ``preprocess_text`` so it is tokenized the same way
    as the indexed corpus. Rows are returned best match first.
    """
    query_tokens = preprocess_text(query)
    # argsort is ascending, so reverse it to get the best-scoring rows first.
    ranking = np.argsort(bm25.get_scores(query_tokens))[::-1]
    return data.iloc[ranking[:top_n]]

Processing texts: 100%|██████████| 19995/19995 [06:18<00:00, 52.82it/s] 


#**SEARCH USING BM25**

In [None]:
query = "Fabric Double Sofa Bed"
results = search(query, bm25, data)

# Display the top 10 results
print(results[['product_name', 'description', 'discounted_price']])

                                    product_name  \
7            FabHomeDecor Fabric Double Sofa Bed   
16           FabHomeDecor Fabric Double Sofa Bed   
1            FabHomeDecor Fabric Double Sofa Bed   
19           FabHomeDecor Fabric Double Sofa Bed   
11494              ARRA Solid Wood 2 Seater Sofa   
11346              ARRA Solid Wood 3 Seater Sofa   
11336              ARRA Solid Wood 4 Seater Sofa   
11662              ARRA Solid Wood 4 Seater Sofa   
14839     Fashion Centre Double Bed Mosquito Net   
18115  Unnati Floral Double Top Sheet Multicolor   

                                             description  discounted_price  
7      FabHomeDecor Fabric Double Sofa Bed (Finish Co...           22646.0  
16     FabHomeDecor Fabric Double Sofa Bed (Finish Co...           22646.0  
1      FabHomeDecor Fabric Double Sofa Bed (Finish Co...           22646.0  
19     FabHomeDecor Fabric Double Sofa Bed (Finish Co...           22646.0  
11494  ARRA Solid Wood 2 Seater Sofa (Fini

# **BM25 + SEMANTIC SEARCH**

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
import numpy as np
import re
from tqdm import tqdm

# Load data
file_path = '/content/drive/MyDrive/zepto DS/processed.csv'
data = pd.read_csv(file_path)

# Preprocessing Function
def preprocess_text(text):
    """Return ``text`` with non-word characters blanked, whitespace collapsed, and lowercased."""
    without_symbols = re.sub(r'\W', ' ', text)  # every non-word character becomes a space
    single_spaced = re.sub(r'\s+', ' ', without_symbols)  # squeeze runs of whitespace
    return single_spaced.lower()

# Combine text fields and preprocess
data['combined_text'] = data.apply(lambda row: preprocess_text(f"{row['product_name']} {row['description']} {row['brand']} {row['top_level_category']}"), axis=1)

# Load Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for the combined text
tqdm.pandas(desc="Generating embeddings")
#data['embedding'] = data['combined_text'].progress_apply(lambda x: model.encode(x))
embeddings_df = pd.read_csv('/content/drive/MyDrive/zepto DS/embeddings.csv')

# Convert the DataFrame to a list of embeddings
# Assuming each row in the CSV represents an embedding vector
embeddings = embeddings_df.values.tolist()

# Convert the list of lists into a NumPy array
embeddings_array = np.array(embeddings)
# Initialize BM25
corpus = data['combined_text'].tolist()
bm25 = BM25Okapi([text.split() for text in corpus])  # BM25 requires tokenized texts

# Search Function
def _min_max_scale(values):
    """Rescale a score vector to [0, 1]; a constant or empty vector maps to zeros."""
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        return values
    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        return np.zeros_like(values)
    return (values - lowest) / spread


def search(query, bm25, model, data, top_n=10):
    """Rank products by an equal-weight blend of BM25 and embedding similarity.

    Both signals are min-max scaled to [0, 1] before blending. BM25 scores are
    unbounded while cosine similarity lies in [-1, 1], so adding the raw values
    lets BM25 decide the ranking almost entirely.

    Reads the module-level ``embeddings_array``; its rows must line up with the
    rows of ``data`` and with the corpus ``bm25`` was built from.

    Args:
        query: Free-text search query.
        bm25: Index exposing ``get_scores(tokens)`` over the corpus.
        model: Encoder exposing ``encode(text)``.
        data: Product DataFrame, one row per indexed document.
        top_n: Number of rows to return.

    Returns:
        The ``top_n`` rows of ``data``, best match first.

    Raises:
        ValueError: If the score vectors and ``data`` disagree on row count.
    """
    query = preprocess_text(query)

    # Keyword signal
    bm25_scores = np.asarray(bm25.get_scores(query.split()), dtype=float)

    # Semantic signal (the query is encoded once)
    query_vector = model.encode(query)
    vector_scores = cosine_similarity([query_vector], embeddings_array)[0]

    if not (len(bm25_scores) == len(vector_scores) == len(data)):
        raise ValueError(
            "BM25 corpus, embeddings and data must describe the same rows "
            f"(got {len(bm25_scores)}, {len(vector_scores)}, {len(data)})"
        )

    # Normalize, then combine; adjust the weights as needed
    combined_scores = 0.5 * _min_max_scale(bm25_scores) + 0.5 * _min_max_scale(vector_scores)

    # Get indices of top_n results
    top_n_indices = combined_scores.argsort()[::-1][:top_n]

    return data.iloc[top_n_indices]


#SEARCH USING THE HYBRID

In [None]:

query = "alisha womens footwear"
results = search(query, bm25, model, data)

# Display the top 10 results
print(results[['product_name', 'description', 'discounted_price']])

                             product_name  \
21    Alisha Solid Women's Cycling Shorts   
3     Alisha Solid Women's Cycling Shorts   
13    Alisha Solid Women's Cycling Shorts   
15    Alisha Solid Women's Cycling Shorts   
0     Alisha Solid Women's Cycling Shorts   
9     Alisha Solid Women's Cycling Shorts   
6     Alisha Solid Women's Cycling Shorts   
3417      Roha Collections Womens Loafers   
3372       Urban Monkey Womens Pu Loafers   
4454         Womens Trendz Alloy Necklace   

                                            description  discounted_price  
21    Alisha Solid Women's Cycling Shorts - Buy Blac...       1973.401767  
3     Key Features of Alisha Solid Women's Cycling S...        267.000000  
13    Key Features of Alisha Solid Women's Cycling S...        379.000000  
15    Key Features of Alisha Solid Women's Cycling S...        379.000000  
0     Key Features of Alisha Solid Women's Cycling S...        379.000000  
9     Key Features of Alisha Solid Women's Cycli

#**BM25 + SEMANTIC SEARCH + HEURISTIC**

In [None]:
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer

# Load data
file_path = '/content/drive/MyDrive/zepto DS/processed.csv'
data = pd.read_csv(file_path)

# Initialize Sentence Transformer
model = SentenceTransformer('all-MiniLM-L6-v2')

# Preprocessing Function
def preprocess_text(text):
    """Blank out non-word characters, collapse whitespace runs, and lowercase the result."""
    # Inner call strips special characters; outer call squeezes the resulting spaces.
    return re.sub(r'\s+', ' ', re.sub(r'\W', ' ', text)).lower()

# Combine text fields and preprocess
data['combined_text'] = data.apply(lambda row: preprocess_text(f"{row['product_name']} {row['description']} {row['brand']} {row['top_level_category']}"), axis=1)

# Compute embeddings
#data['embedding'] = data['combined_text'].progress_apply(lambda x: model.encode(x))
embeddings_df = pd.read_csv('/content/drive/MyDrive/zepto DS/embeddings.csv')

# Convert the DataFrame to a list of embeddings
# Assuming each row in the CSV represents an embedding vector
embeddings = embeddings_df.values.tolist()

# Convert the list of lists into a NumPy array
embeddings_array = np.array(embeddings)
# Initialize BM25
tokenized_corpus = [text.split() for text in data['combined_text']]
bm25 = BM25Okapi(tokenized_corpus)

# Function to compute relevance scores
# Holds the fitted TF-IDF model for the most recently scored DataFrame so the
# corpus is vectorized once instead of on every query.
_TFIDF_CACHE = {}


def _min_max_scale(values):
    """Rescale a score vector to [0, 1]; a constant or empty vector maps to zeros."""
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        return values
    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        return np.zeros_like(values)
    return (values - lowest) / spread


def _fitted_tfidf(data):
    """Return (vectorizer, document-term matrix) for ``data['combined_text']``.

    The fit is redone only when a different DataFrame object (or row count) is
    passed. Editing ``combined_text`` in place without changing the row count
    is not detected; re-run this cell to reset the cache in that case.
    """
    if _TFIDF_CACHE.get('data') is not data or _TFIDF_CACHE.get('rows') != len(data):
        vectorizer = TfidfVectorizer()
        matrix = vectorizer.fit_transform(data['combined_text'])
        _TFIDF_CACHE.update(data=data, rows=len(data), vectorizer=vectorizer, matrix=matrix)
    return _TFIDF_CACHE['vectorizer'], _TFIDF_CACHE['matrix']


def compute_relevance_scores(query, query_embedding, data, weights=None):
    """Blend BM25, TF-IDF and embedding similarity into one score per document.

    Each signal is min-max scaled to [0, 1] before weighting. Raw BM25 scores
    are unbounded while the other two stay within [-1, 1], so without scaling
    the weights do not reflect each signal's real contribution.

    Reads the module-level ``bm25`` and ``embeddings_array``; both must be
    built from the same rows, in the same order, as ``data``.

    Args:
        query: Raw query string; it is preprocessed here.
        query_embedding: 1-D embedding of the query.
        data: DataFrame with a ``combined_text`` column.
        weights: Optional mapping with keys ``'bm25'``, ``'tfidf'`` and
            ``'cosine'``. Defaults to 0.35 / 0.3 / 0.35.

    Returns:
        numpy.ndarray: One combined score per row of ``data``.
    """
    if weights is None:
        weights = {'bm25': 0.35, 'tfidf': 0.3, 'cosine': 0.35}

    # Preprocess and tokenize query
    query_processed = preprocess_text(query)
    query_tokens = query_processed.split()

    # Compute BM25 scores
    bm25_scores = bm25.get_scores(query_tokens)

    # Compute TF-IDF scores; the query gets the same normalization as the corpus
    tfidf_vectorizer, X_tfidf = _fitted_tfidf(data)
    query_tfidf = tfidf_vectorizer.transform([query_processed])
    tfidf_scores = X_tfidf.dot(query_tfidf.T).toarray().flatten()

    # Compute cosine similarity scores
    cosine_scores = cosine_similarity(
        np.asarray(query_embedding).reshape(1, -1), embeddings_array
    ).flatten()

    # Combine scores on a common scale
    combined_scores = (weights['bm25'] * _min_max_scale(bm25_scores) +
                       weights['tfidf'] * _min_max_scale(tfidf_scores) +
                       weights['cosine'] * _min_max_scale(cosine_scores))

    return combined_scores
# Search function
def search(query, data, top_n=10):
    """Return the ``top_n`` rows of ``data`` ranked by combined relevance, best first.

    The query is embedded with the module-level ``model`` and scored by
    ``compute_relevance_scores``.
    """
    cleaned_query = preprocess_text(query)
    relevance = compute_relevance_scores(query, model.encode(cleaned_query), data)

    # argsort is ascending; reverse it so the strongest matches lead.
    best_first = np.argsort(relevance)[::-1]
    return data.iloc[best_first[:top_n]]



# **SEARCH USING HEURISTIC**

In [None]:
query = "alisha womens footwear"
results = search(query, data)

# Display the top 10 results
print(results[['product_name', 'description', 'discounted_price']])


                             product_name  \
21    Alisha Solid Women's Cycling Shorts   
3     Alisha Solid Women's Cycling Shorts   
13    Alisha Solid Women's Cycling Shorts   
15    Alisha Solid Women's Cycling Shorts   
0     Alisha Solid Women's Cycling Shorts   
9     Alisha Solid Women's Cycling Shorts   
6     Alisha Solid Women's Cycling Shorts   
3417      Roha Collections Womens Loafers   
3372       Urban Monkey Womens Pu Loafers   
4454         Womens Trendz Alloy Necklace   

                                            description  discounted_price  
21    Alisha Solid Women's Cycling Shorts - Buy Blac...       1973.401767  
3     Key Features of Alisha Solid Women's Cycling S...        267.000000  
13    Key Features of Alisha Solid Women's Cycling S...        379.000000  
15    Key Features of Alisha Solid Women's Cycling S...        379.000000  
0     Key Features of Alisha Solid Women's Cycling S...        379.000000  
9     Key Features of Alisha Solid Women's Cycli

In [None]:
#embeddings_df = pd.DataFrame(data['embedding'].tolist())

# Save DataFrame to a CSV file
#embeddings_df.to_csv('embeddings.csv', index=False)

#**EVALUATION METRICS / RELEVANCE SCORES**

#PRECISION@K

In [None]:
import pandas as pd
import numpy as np
import re
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

# Load data
file_path = '/content/drive/MyDrive/zepto DS/processed.csv'
data = pd.read_csv(file_path)

# Initialize Sentence Transformer
model = SentenceTransformer('all-MiniLM-L6-v2')

# Preprocessing Function
def preprocess_text(text):
    """Normalize text: special characters become spaces, whitespace is squeezed, case is lowered."""
    cleaned = text
    # Apply the two substitutions in order: strip symbols first, then collapse spacing.
    for pattern in (r'\W', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned.lower()

# Combine text fields and preprocess
data['combined_text'] = data.apply(lambda row: preprocess_text(f"{row['product_name']} {row['description']} {row['brand']} {row['top_level_category']}"), axis=1)

# Compute embeddings
#data['embedding'] = data['combined_text'].apply(lambda x: model.encode(x))
embeddings_df = pd.read_csv('/content/drive/MyDrive/zepto DS/embeddings.csv')

# Convert the DataFrame to a list of embeddings
# Assuming each row in the CSV represents an embedding vector
embeddings = embeddings_df.values.tolist()

# Convert the list of lists into a NumPy array
embeddings_array = np.array(embeddings)
# Initialize BM25
tokenized_corpus = [text.split() for text in data['combined_text']]
bm25 = BM25Okapi(tokenized_corpus)

# Function to compute relevance scores
# Holds the fitted TF-IDF model for the most recently scored DataFrame so the
# corpus is vectorized once instead of on every query.
_TFIDF_CACHE = {}


def _min_max_scale(values):
    """Rescale a score vector to [0, 1]; a constant or empty vector maps to zeros."""
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        return values
    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        return np.zeros_like(values)
    return (values - lowest) / spread


def _fitted_tfidf(data):
    """Return (vectorizer, document-term matrix) for ``data['combined_text']``.

    The fit is redone only when a different DataFrame object (or row count) is
    passed. Editing ``combined_text`` in place without changing the row count
    is not detected; re-run this cell to reset the cache in that case.
    """
    if _TFIDF_CACHE.get('data') is not data or _TFIDF_CACHE.get('rows') != len(data):
        vectorizer = TfidfVectorizer()
        matrix = vectorizer.fit_transform(data['combined_text'])
        _TFIDF_CACHE.update(data=data, rows=len(data), vectorizer=vectorizer, matrix=matrix)
    return _TFIDF_CACHE['vectorizer'], _TFIDF_CACHE['matrix']


def compute_relevance_scores(query, query_embedding, data, weights=None):
    """Blend BM25, TF-IDF and embedding similarity into one score per document.

    Each signal is min-max scaled to [0, 1] before weighting. Raw BM25 scores
    are unbounded while the other two stay within [-1, 1], so without scaling
    the weights do not reflect each signal's real contribution.

    Reads the module-level ``bm25`` and ``embeddings_array``; both must be
    built from the same rows, in the same order, as ``data``.

    Args:
        query: Raw query string; it is preprocessed here.
        query_embedding: 1-D embedding of the query.
        data: DataFrame with a ``combined_text`` column.
        weights: Optional mapping with keys ``'bm25'``, ``'tfidf'`` and
            ``'cosine'``. Defaults to 0.4 / 0.3 / 0.3.

    Returns:
        numpy.ndarray: One combined score per row of ``data``.
    """
    if weights is None:
        weights = {'bm25': 0.4, 'tfidf': 0.3, 'cosine': 0.3}

    # Preprocess and tokenize query
    query_processed = preprocess_text(query)
    query_tokens = query_processed.split()

    # Compute BM25 scores
    bm25_scores = bm25.get_scores(query_tokens)

    # Compute TF-IDF scores; the query gets the same normalization as the corpus
    tfidf_vectorizer, X_tfidf = _fitted_tfidf(data)
    query_tfidf = tfidf_vectorizer.transform([query_processed])
    tfidf_scores = X_tfidf.dot(query_tfidf.T).toarray().flatten()

    # Compute cosine similarity scores
    cosine_scores = cosine_similarity(
        np.asarray(query_embedding).reshape(1, -1), embeddings_array
    ).flatten()

    # Combine scores on a common scale
    combined_scores = (weights['bm25'] * _min_max_scale(bm25_scores) +
                       weights['tfidf'] * _min_max_scale(tfidf_scores) +
                       weights['cosine'] * _min_max_scale(cosine_scores))

    return combined_scores

# Search function
def search(query, data, model, top_n=10):
    """Rank ``data`` against ``query`` and return the ``top_n`` best rows, best first.

    ``model`` embeds the preprocessed query; ``compute_relevance_scores``
    produces one score per row.
    """
    embedded_query = model.encode(preprocess_text(query))
    relevance = compute_relevance_scores(query, embedded_query, data)

    # Descending order of relevance, truncated to the requested count.
    descending = np.argsort(relevance)[::-1]
    winners = descending[:top_n]
    return data.iloc[winners]

# Function to compute Precision at K
def precision_at_k(retrieved_docs, query, k):
    """Fraction of the first ``k`` retrieved documents that contain the query phrase.

    A document counts as relevant when the normalized query appears as a
    substring of its ``combined_text``. The query is normalized like the corpus
    (non-word characters to spaces, whitespace collapsed, lowercased) and
    trimmed, because matching the raw query against normalized text can never
    succeed for queries with capitals or punctuation.

    Args:
        retrieved_docs: Sequence of dict-like records with a ``combined_text`` key.
        query: Raw query string.
        k: Cut-off rank; also the denominator, even if fewer documents are given.

    Returns:
        float: Precision in [0, 1]. Returns 0.0 when ``k`` is not positive or
        the query is empty after normalization.
    """
    if k <= 0:
        return 0.0

    normalized_query = re.sub(r'\s+', ' ', re.sub(r'\W', ' ', query)).lower().strip()
    if not normalized_query:
        # An empty string is a substring of everything, which would score 1.0.
        return 0.0

    relevant_count = sum(
        1 for doc in retrieved_docs[:k] if normalized_query in doc['combined_text']
    )
    return relevant_count / k

# Function to compute query-document similarity
def query_document_similarity(query, documents, model):
    """Cosine similarity between ``query`` and each entry of ``documents``.

    Every text is passed through ``preprocess_text`` and embedded with
    ``model.encode``. Returns a flat array with one similarity per document,
    in input order.
    """
    def embed(raw_text):
        return model.encode(preprocess_text(raw_text))

    query_vector = embed(query)
    document_vectors = list(map(embed, documents))
    return cosine_similarity([query_vector], document_vectors).flatten()

# Evaluate search approaches
def evaluate_search(query, data, model, top_n=10):
    """Run a search and print BM25 score, cosine similarity and precision@k for the hits.

    Relies on the sibling ``search``, ``preprocess_text`` and ``precision_at_k``
    functions and on the module-level ``bm25`` index.

    Args:
        query: Raw query string.
        data: Product DataFrame with ``product_name`` and ``combined_text``;
            its index must be unique so hits can be mapped back to row positions.
        model: Encoder exposing ``encode``.
        top_n: Number of hits to retrieve and report.

    Returns:
        None. Results are printed.
    """
    # Retrieve search results
    results = search(query, data, model, top_n=top_n)

    # Preprocess once and reuse for both the embedding and the BM25 tokens
    query_text = preprocess_text(query)

    # Compute similarity scores (documents are encoded in a single batch)
    query_embedding = model.encode(query_text)
    doc_embeddings = model.encode([preprocess_text(doc) for doc in results['combined_text']])
    cosine_similarities = cosine_similarity([query_embedding], doc_embeddings).flatten()

    # Compute BM25 scores
    bm25_scores = bm25.get_scores(query_text.split())

    # BM25 scores are positional, so translate the hits' index labels into row
    # positions instead of assuming the labels are 0..n-1.
    row_positions = data.index.get_indexer(results.index)

    # Precision at K
    precision_at_k_value = precision_at_k(results.to_dict('records'), query, top_n)

    # Display results
    print(f"Top {top_n} results for query '{query}':")
    for i, (position, product_name) in enumerate(zip(row_positions, results['product_name'])):
        print(f"{i+1}. {product_name} - BM25 Score: {bm25_scores[position]}, Cosine Similarity: {cosine_similarities[i]}")

    print(f'Precision at {top_n}: {precision_at_k_value}')

query = "furniture for pets"
evaluate_search(query, data, model, top_n=10)


Top 10 results for query 'furniture for pets':
1. Jerry's Jppb11552 M Pet Bed - BM25 Score: 16.908848995561268, Cosine Similarity: 0.5272480845451355
2. Jerry's Jppb11592 M Pet Bed - BM25 Score: 15.718117997867587, Cosine Similarity: 0.5517610311508179
3. Jerry's Jppb11584 XL Pet Bed - BM25 Score: 15.669032587171094, Cosine Similarity: 0.5451546907424927
4. BM WOOD FURNITURE Hexagon Wall Shelves MDF Wall Shelf - BM25 Score: 10.876524516584913, Cosine Similarity: 0.388668417930603
5. Surbhi Bunny  - 60 cm - BM25 Score: 10.690180299819243, Cosine Similarity: 0.31556057929992676
6. Hunter Electric Insect Killer - BM25 Score: 10.323929778650841, Cosine Similarity: 0.18109910190105438
7. Petshop7 PS7DB0065 M Pet Bed - BM25 Score: 8.86622805636226, Cosine Similarity: 0.624779224395752
8. Petshop7 PS7BED000429 M Pet Bed - BM25 Score: 8.86622805636226, Cosine Similarity: 0.6099498271942139
9. Petshop7 PS7DB0066 S Pet Bed - BM25 Score: 8.86622805636226, Cosine Similarity: 0.6082963347434998
10.

In [None]:
# Mount Google Drive so files under /content/drive are readable.
# NOTE(review): earlier cells in this notebook already read CSVs from
# /content/drive; on a fresh runtime this mount has to run before them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **NDCG SCORES**

In [None]:
import pandas as pd
import numpy as np
import re
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ndcg_score

# Load data
file_path = '/content/drive/MyDrive/zepto DS/processed.csv'
data = pd.read_csv(file_path)

# Initialize Sentence Transformer
model = SentenceTransformer('all-MiniLM-L6-v2')

# Preprocessing Function
def preprocess_text(text):
    """Lowercased copy of ``text`` with symbols replaced by spaces and whitespace collapsed."""
    symbols_removed = re.sub(r'\W', ' ', text)
    spacing_fixed = re.sub(r'\s+', ' ', symbols_removed)
    lowered = spacing_fixed.lower()
    return lowered

# Combine text fields and preprocess
data['combined_text'] = data.apply(lambda row: preprocess_text(f"{row['product_name']} {row['description']} {row['brand']} {row['top_level_category']}"), axis=1)

# Load precomputed embeddings
embeddings_df = pd.read_csv('/content/drive/MyDrive/zepto DS/embeddings.csv')
embeddings_array = np.array(embeddings_df.values.tolist())

# Initialize BM25
tokenized_corpus = [text.split() for text in data['combined_text']]
bm25 = BM25Okapi(tokenized_corpus)

# Function to compute relevance scores
# Holds the fitted TF-IDF model for the most recently scored DataFrame so the
# corpus is vectorized once instead of on every query.
_TFIDF_CACHE = {}


def _min_max_scale(values):
    """Rescale a score vector to [0, 1]; a constant or empty vector maps to zeros."""
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        return values
    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        return np.zeros_like(values)
    return (values - lowest) / spread


def _fitted_tfidf(data):
    """Return (vectorizer, document-term matrix) for ``data['combined_text']``.

    The fit is redone only when a different DataFrame object (or row count) is
    passed. Editing ``combined_text`` in place without changing the row count
    is not detected; re-run this cell to reset the cache in that case.
    """
    if _TFIDF_CACHE.get('data') is not data or _TFIDF_CACHE.get('rows') != len(data):
        vectorizer = TfidfVectorizer()
        matrix = vectorizer.fit_transform(data['combined_text'])
        _TFIDF_CACHE.update(data=data, rows=len(data), vectorizer=vectorizer, matrix=matrix)
    return _TFIDF_CACHE['vectorizer'], _TFIDF_CACHE['matrix']


def compute_relevance_scores(query, query_embedding, data, weights=None):
    """Blend BM25, TF-IDF and embedding similarity into one score per document.

    Each signal is min-max scaled to [0, 1] before weighting. Raw BM25 scores
    are unbounded while the other two stay within [-1, 1], so without scaling
    the weights do not reflect each signal's real contribution.

    Reads the module-level ``bm25`` and ``embeddings_array``; both must be
    built from the same rows, in the same order, as ``data``.

    Args:
        query: Raw query string; it is preprocessed here.
        query_embedding: 1-D embedding of the query.
        data: DataFrame with a ``combined_text`` column.
        weights: Optional mapping with keys ``'bm25'``, ``'tfidf'`` and
            ``'cosine'``. Defaults to 0.4 / 0.3 / 0.3.

    Returns:
        numpy.ndarray: One combined score per row of ``data``.
    """
    if weights is None:
        weights = {'bm25': 0.4, 'tfidf': 0.3, 'cosine': 0.3}

    # Preprocess and tokenize query
    query_processed = preprocess_text(query)
    query_tokens = query_processed.split()

    # Compute BM25 scores
    bm25_scores = bm25.get_scores(query_tokens)

    # Compute TF-IDF scores; the query gets the same normalization as the corpus
    tfidf_vectorizer, X_tfidf = _fitted_tfidf(data)
    query_tfidf = tfidf_vectorizer.transform([query_processed])
    tfidf_scores = X_tfidf.dot(query_tfidf.T).toarray().flatten()

    # Compute cosine similarity scores
    cosine_scores = cosine_similarity(
        np.asarray(query_embedding).reshape(1, -1), embeddings_array
    ).flatten()

    # Combine scores on a common scale
    combined_scores = (weights['bm25'] * _min_max_scale(bm25_scores) +
                       weights['tfidf'] * _min_max_scale(tfidf_scores) +
                       weights['cosine'] * _min_max_scale(cosine_scores))

    return combined_scores

# Function to compute and print NDCG for each retrieved document
def ndcg_per_document(query, data, model, top_n=10, true_relevance=None):
    """Print the top-ranked documents for ``query`` and the NDCG@``top_n`` of the ranking.

    Without ``true_relevance`` the "ideal" ranking is the model's own scores
    sorted in descending order. The retrieved scores are already in that order,
    so the reported NDCG compares the ranking with itself and only demonstrates
    the call. Pass graded ground-truth labels to measure ranking quality.

    Args:
        query: Raw query string.
        data: Product DataFrame with a ``product_name`` column.
        model: Encoder exposing ``encode``.
        top_n: Number of documents to list and the NDCG cut-off.
        true_relevance: Optional array-like with one non-negative relevance
            grade per row of ``data``. When given, NDCG@``top_n`` is computed
            over the whole corpus against these labels.

    Returns:
        None. Results are printed.

    Raises:
        ValueError: If ``true_relevance`` does not have one entry per score.
    """
    # Retrieve search results
    query_embedding = model.encode(preprocess_text(query))
    scores = np.asarray(compute_relevance_scores(query, query_embedding, data), dtype=float)
    top_n_indices = np.argsort(scores)[::-1][:top_n]

    # Model scores of the retrieved documents, in ranked order
    relevance_scores = scores[top_n_indices]

    if true_relevance is None:
        # Demonstration mode: the model's own ordering is treated as the ideal one
        ideal_relevance = np.sort(relevance_scores)[::-1]
        ndcg = ndcg_score([ideal_relevance], [relevance_scores], k=top_n)
    else:
        labels = np.asarray(true_relevance, dtype=float)
        if labels.shape != scores.shape:
            raise ValueError(
                f"true_relevance needs one grade per document: expected {scores.shape}, got {labels.shape}"
            )
        ndcg = ndcg_score([labels], [scores], k=top_n)

    # Print results
    for i, index in enumerate(top_n_indices):
        print(f"Document {i+1}: {data.iloc[index]['product_name']} | Score: {relevance_scores[i]:.4f}")

    print(f"\nOverall NDCG for top {top_n} results: {ndcg:.4f}")


query = "alisha footwear womens"
ndcg_per_document(query, data, model, top_n=30)


Document 1: Alisha Solid Women's Cycling Shorts | Score: 6.6812
Document 2: Alisha Solid Women's Cycling Shorts | Score: 6.0987
Document 3: Alisha Solid Women's Cycling Shorts | Score: 6.0895
Document 4: Alisha Solid Women's Cycling Shorts | Score: 6.0869
Document 5: Alisha Solid Women's Cycling Shorts | Score: 6.0837
Document 6: Alisha Solid Women's Cycling Shorts | Score: 6.0719
Document 7: Alisha Solid Women's Cycling Shorts | Score: 6.0669
Document 8: Roha Collections Womens Loafers | Score: 6.0222
Document 9: Urban Monkey Womens Pu Loafers | Score: 5.8927
Document 10: Womens Trendz Alloy Necklace | Score: 4.7952
Document 11: Womens Trendz Alloy Necklace | Score: 4.7934
Document 12: Womens Trendz Alloy Necklace | Score: 4.7934
Document 13: Womens Trendz Vertical Ball Thushi Alloy Necklace | Score: 4.6562
Document 14: Digni Boots | Score: 4.6020
Document 15: Womens Trendz Kolhapuri Saaj Thushi Crystal Yellow Gold Plated Alloy Necklace | Score: 4.5025
Document 16: Womens Trendz Kolha

In [None]:
pip install streamlit

Collecting streamlit
  Downloading streamlit-1.37.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting tenacity<9,>=8.1.0 (from streamlit)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl.metadata (37 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading streamlit-1.37.1-py2.py3-none-any.whl (8.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m25.9 MB

# **STREAMLIT**

In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Locations of the product catalogue and its precomputed sentence embeddings
file_path = '/content/drive/MyDrive/zepto DS/processed.csv'
embeddings_path = '/content/drive/MyDrive/zepto DS/embeddings.csv'


# Preprocessing Function
def preprocess_text(text):
    """Replace non-word characters with spaces, collapse whitespace, and lowercase."""
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower()  # Convert to lowercase
    return text


# Streamlit re-executes this script on every widget interaction. Without
# caching, each query would re-read both CSVs, reload the sentence-transformer
# and rebuild the BM25 index before any search could run.
@st.cache_resource
def _load_search_resources(csv_path, embeddings_csv_path):
    """Load the catalogue, encoder, embeddings and BM25 index once per server process.

    Returns:
        tuple: ``(data, model, embeddings_array, bm25)``. The objects are shared
        across reruns, so callers must not mutate them.

    Raises:
        ValueError: If the embeddings file does not have one row per product.
    """
    # Load data
    data = pd.read_csv(csv_path)

    # Combine text fields and preprocess
    data['combined_text'] = data.apply(lambda row: preprocess_text(f"{row['product_name']} {row['description']} {row['brand']} {row['top_level_category']}"), axis=1)

    # Initialize Sentence Transformer
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Load precomputed embeddings (one row per product, one column per dimension)
    embeddings_array = pd.read_csv(embeddings_csv_path).to_numpy()
    if len(embeddings_array) != len(data):
        raise ValueError(
            f"embeddings have {len(embeddings_array)} rows but the catalogue has {len(data)}"
        )

    # Initialize BM25
    tokenized_corpus = [text.split() for text in data['combined_text']]
    bm25 = BM25Okapi(tokenized_corpus)

    return data, model, embeddings_array, bm25


data, model, embeddings_array, bm25 = _load_search_resources(file_path, embeddings_path)

# Function to compute relevance scores
def _min_max_scale(values):
    """Rescale a score vector to [0, 1]; a constant or empty vector maps to zeros."""
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        return values
    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        return np.zeros_like(values)
    return (values - lowest) / spread


# Streamlit reruns the script for every query; caching keeps the corpus-wide
# TF-IDF fit from being repeated each time. The leading underscore on `_data`
# tells Streamlit not to hash the DataFrame, so the fit is reused until the
# cache is cleared or the server restarts.
@st.cache_resource(show_spinner=False)
def _fitted_tfidf(_data):
    """Fit TF-IDF on ``_data['combined_text']`` and return (vectorizer, document-term matrix)."""
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(_data['combined_text'])
    return vectorizer, matrix


def compute_relevance_scores(query, query_embedding, data, weights=None):
    """Blend BM25, TF-IDF and embedding similarity into one score per document.

    Each signal is min-max scaled to [0, 1] before weighting. Raw BM25 scores
    are unbounded while the other two stay within [-1, 1], so without scaling
    the weights do not reflect each signal's real contribution.

    Reads the module-level ``bm25`` and ``embeddings_array``; both must be
    built from the same rows, in the same order, as ``data``.

    Args:
        query: Raw query string; it is preprocessed here.
        query_embedding: 1-D embedding of the query.
        data: DataFrame with a ``combined_text`` column.
        weights: Optional mapping with keys ``'bm25'``, ``'tfidf'`` and
            ``'cosine'``. Defaults to 0.35 / 0.3 / 0.35.

    Returns:
        numpy.ndarray: One combined score per row of ``data``.
    """
    if weights is None:
        weights = {'bm25': 0.35, 'tfidf': 0.3, 'cosine': 0.35}

    query_processed = preprocess_text(query)
    query_tokens = query_processed.split()
    bm25_scores = bm25.get_scores(query_tokens)

    # The query gets the same normalization as the corpus the vectorizer was fit on
    tfidf_vectorizer, X_tfidf = _fitted_tfidf(data)
    query_tfidf = tfidf_vectorizer.transform([query_processed])
    tfidf_scores = X_tfidf.dot(query_tfidf.T).toarray().flatten()

    cosine_scores = cosine_similarity(
        np.asarray(query_embedding).reshape(1, -1), embeddings_array
    ).flatten()

    combined_scores = (weights['bm25'] * _min_max_scale(bm25_scores) +
                       weights['tfidf'] * _min_max_scale(tfidf_scores) +
                       weights['cosine'] * _min_max_scale(cosine_scores))
    return combined_scores

# Search function
def search(query, data, top_n=10):
    """Return the ``top_n`` best-matching rows of ``data`` for ``query``, best first.

    Uses the module-level ``model`` to embed the preprocessed query and
    ``compute_relevance_scores`` to score every row.
    """
    query_vector = model.encode(preprocess_text(query))
    relevance = compute_relevance_scores(query, query_vector, data)
    # Reverse the ascending argsort so the highest scores come first.
    ranked_positions = np.argsort(relevance)[::-1]
    return data.iloc[ranked_positions[:top_n]]

# Streamlit app
st.title("Product Search")

query = st.text_input("Enter search query:")
if query:
    results = search(query, data)
    st.write("Top 10 Results:")
    st.dataframe(results[['product_name', 'description', 'discounted_price']])


Writing app.py


In [None]:
!wget -q -O - ipv4.icanhazip.com

34.172.17.171


In [None]:
!streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8502[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8502[0m
[34m  External URL: [0m[1mhttp://34.172.17.171:8502[0m
[0m
your url is: https://sixty-papers-lie.loca.lt
[34m  Stopping...[0m
^C


In [None]:
!npm install localtunnel

[K[?25h
added 22 packages, and audited 23 packages in 2s

3 packages are looking for funding
  run `npm fund` for details

2 [33m[1mmoderate[22m[39m severity vulnerabilities

To address all issues, run:
  npm audit fix

Run `npm audit` for details.


In [None]:
!streamlit run /content/app.py &>/content/logs.txt &

In [None]:
!npx localtunnel --port 8501

your url is: https://cold-doodles-dream.loca.lt
