In [1]:
!pip install sentence-transformers faiss-cpu scikit-learn gradio PyPDF2 --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import gradio as gr
import numpy as np
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import faiss

In [3]:
# Sentence-embedding model used for the semantic half of the hybrid search.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Shared search state: written by build_index() and read by hybrid_search().
documents = []
embeddings = faiss_index = None
vectorizer = tfidf_matrix = None

print("✅ Model loaded successfully!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Model loaded successfully!


In [4]:
def read_uploaded_files(files):
    """Extract plain text from uploaded ``.txt`` and ``.pdf`` files.

    Args:
        files: Iterable of uploads. Each item may be a filesystem path
            (``str`` / ``os.PathLike``) or an object exposing the path of the
            uploaded copy as ``.name``. ``None`` or an empty iterable is
            accepted and yields an empty result.

    Returns:
        list[str]: One text per readable document, in upload order.
        Unsupported file types and files with no extractable text are skipped,
        so the result may be shorter than ``files``.
    """
    all_texts = []
    for file in files or []:
        # Always resolve the upload to a path and open it ourselves: depending
        # on how the upload widget is configured, the item is either a path
        # string or a wrapper object, and only the path is available in both.
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
        else:
            path = getattr(file, "name", None)
        if not path:
            continue

        ext = os.path.splitext(path)[1].lower()

        if ext == ".txt":
            with open(path, "rb") as handle:
                # Replace undecodable bytes instead of failing the whole batch.
                text = handle.read().decode("utf-8", errors="replace")
        elif ext == ".pdf":
            reader = PdfReader(path)
            # extract_text() may yield nothing for image-only pages.
            text = "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)
        else:
            # Skip unsupported types rather than indexing an error message
            # as if it were document content.
            continue

        if text.strip():
            all_texts.append(text)

    return all_texts


In [5]:
def _reset_search_state():
    """Clear every piece of module-level search state together."""
    global documents, embeddings, faiss_index, vectorizer, tfidf_matrix
    documents, embeddings, faiss_index, vectorizer, tfidf_matrix = [], None, None, None, None


def build_index(files):
    """Index the uploaded documents for hybrid (semantic + keyword) search.

    Reads the uploads with ``read_uploaded_files``, fits a TF-IDF matrix,
    embeds the texts with the module-level ``model`` and stores them in a
    FAISS flat L2 index. The module-level state (``documents``,
    ``embeddings``, ``faiss_index``, ``vectorizer``, ``tfidf_matrix``) is
    replaced only after every structure was built successfully, so the five
    globals always describe the same set of documents.

    Args:
        files: Uploaded files; ``None`` or an empty list (e.g. after the
            upload widget is cleared) is treated as "no documents".

    Returns:
        str: Status message for the UI.
    """
    global documents, embeddings, faiss_index, vectorizer, tfidf_matrix

    new_documents = read_uploaded_files(files) if files else []
    if not new_documents:
        _reset_search_state()
        return "⚠️ No valid documents uploaded!"

    # Fit TF-IDF first: it is cheap and is the step that rejects unusable text.
    new_vectorizer = TfidfVectorizer()
    try:
        new_tfidf_matrix = new_vectorizer.fit_transform(new_documents)
    except ValueError:
        # Raised when no terms survive tokenisation (empty vocabulary).
        _reset_search_state()
        return "⚠️ Uploaded documents contain no indexable text."

    # Generate embeddings and the FAISS index in locals; if anything raises
    # here the previously built index stays intact.
    new_embeddings = model.encode(new_documents, convert_to_numpy=True, show_progress_bar=True)
    new_index = faiss.IndexFlatL2(new_embeddings.shape[1])
    new_index.add(new_embeddings)

    # Publish all structures in one step.
    documents = new_documents
    embeddings = new_embeddings
    faiss_index = new_index
    vectorizer = new_vectorizer
    tfidf_matrix = new_tfidf_matrix

    return f"✅ Successfully indexed {len(documents)} document(s)!"

In [6]:
def hybrid_search(query, top_k=3, alpha=0.5):
    """Rank indexed documents by a weighted mix of semantic and keyword similarity.

    Both components are cosine similarities (embedding vectors for the
    semantic part, TF-IDF vectors for the keyword part), so ``alpha`` blends
    two scores that live on the same scale.

    Args:
        query: Search text.
        top_k: Maximum number of results to return.
        alpha: Weight of the semantic score; the keyword score is weighted
            by ``1 - alpha``.

    Returns:
        list[tuple[str, float]] | str: ``(snippet, score)`` pairs sorted best
        first, where ``snippet`` is the first 300 characters of the document
        (followed by ``"..."`` only when the document was truncated). A plain
        warning string is returned instead when the query is blank or no
        index has been built yet.
    """
    if not query or not query.strip():
        return "⚠️ Please enter a search query."

    # Require the complete index, not just a non-empty document list.
    if not documents or embeddings is None or vectorizer is None or tfidf_matrix is None:
        return "⚠️ Please upload and index documents first."

    # Semantic similarity: cosine between the query and every document embedding.
    query_emb = model.encode([query], convert_to_numpy=True)
    sem_scores = cosine_similarity(query_emb, embeddings).flatten()

    # Keyword similarity: cosine between TF-IDF vectors.
    tfidf_query = vectorizer.transform([query])
    keyword_scores = cosine_similarity(tfidf_query, tfidf_matrix).flatten()

    # Combine scores (vectorised over all documents).
    hybrid_scores = alpha * sem_scores + (1 - alpha) * keyword_scores

    # sorted() is stable, so tied documents keep their upload order.
    limit = max(0, int(top_k))
    ranked = sorted(range(len(documents)), key=lambda i: hybrid_scores[i], reverse=True)[:limit]

    results = []
    for i in ranked:
        text = documents[i]
        snippet = text[:300] + "..." if len(text) > 300 else text
        results.append((snippet, float(hybrid_scores[i])))
    return results

In [7]:
def upload_and_build(files):
    """Gradio callback: index the uploaded files and return the status text."""
    status_message = build_index(files)
    return status_message

def query_search(query, alpha):
    """Gradio callback: run the hybrid search and format the hits as text.

    Args:
        query: Search text entered by the user.
        alpha: Semantic/keyword weight forwarded to ``hybrid_search``.

    Returns:
        str: Either the warning string produced by ``hybrid_search`` or one
        formatted entry per hit, separated by blank lines.
    """
    outcome = hybrid_search(query, alpha=alpha)

    # hybrid_search reports problems as a plain string; pass it through as is.
    if isinstance(outcome, str):
        return outcome

    formatted_hits = []
    for snippet, relevance in outcome:
        formatted_hits.append(f"🔹 {snippet}\n(score: {relevance:.4f})")
    return "\n\n".join(formatted_hits)

# Two-tab Gradio app: one tab to upload and index documents, one to query them.
with gr.Blocks() as demo:
    gr.Markdown(
        "## 🔍 Hybrid Search System (with File Upload)\n"
        "Upload text or PDF files to perform smart search combining semantic and keyword relevance."
    )

    with gr.Tab("📂 Upload & Index"):
        upload_btn = gr.File(
            label="Upload your documents",
            file_count="multiple",
            file_types=[".txt", ".pdf"],
        )
        build_output = gr.Textbox(label="Status")
        # Rebuild the index whenever the set of uploaded files changes.
        upload_btn.change(fn=upload_and_build, inputs=upload_btn, outputs=build_output)

    with gr.Tab("🔎 Search"):
        query = gr.Textbox(label="Enter your search query")
        alpha = gr.Slider(
            minimum=0,
            maximum=1,
            value=0.5,
            step=0.1,
            label="Hybrid Weight (0=Keyword | 1=Semantic)",
        )
        search_output = gr.Textbox(label="Top Results", lines=8)
        search_btn = gr.Button("Search")
        search_btn.click(fn=query_search, inputs=[query, alpha], outputs=search_output)

# share=True additionally exposes the app through a temporary public URL.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8fccd7e9794dee9069.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


