In [1]:
!pip install scikit-learn



In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sample dataset of documents
documents = [
    "Information retrieval is the process of obtaining information system resources.",
    "Natural language processing enables computers to understand human language.",
    "TF-IDF stands for Term Frequency-Inverse Document Frequency.",
    "Search engines use various algorithms to retrieve relevant documents.",
    "Cosine similarity is used to find similarity between text vectors."
]

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Transform documents into TF-IDF vectors
tfidf_matrix = vectorizer.fit_transform(documents)

# Function to process query and retrieve top documents
def search_engine(query, top_n=3):
    query_vec = vectorizer.transform([query])
    similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_indices = similarity_scores.argsort()[::-1][:top_n]

    print(f"\nTop {top_n} results for query: \"{query}\"")
    for i in top_indices:
        print(f"\nDocument {i+1}:")
        print(f"Score: {similarity_scores[i]:.4f}")
        print(f"Text: {documents[i]}")

# Loop to take user input
if __name__ == "__main__":
    while True:
        user_query = input("\nEnter your query (or type 'exit' to quit): ")
        if user_query.lower() == 'exit':
            print("Exiting the search engine. Goodbye!")
            break
        search_engine(user_query)


Enter your query (or type 'exit' to quit): similarity

Top 3 results for query: "similarity"

Document 5:
Score: 0.6003
Text: Cosine similarity is used to find similarity between text vectors.

Document 4:
Score: 0.0000
Text: Search engines use various algorithms to retrieve relevant documents.

Document 3:
Score: 0.0000
Text: TF-IDF stands for Term Frequency-Inverse Document Frequency.

Enter your query (or type 'exit' to quit): retrieval

Top 3 results for query: "retrieval"

Document 1:
Score: 0.2930
Text: Information retrieval is the process of obtaining information system resources.

Document 5:
Score: 0.0000
Text: Cosine similarity is used to find similarity between text vectors.

Document 4:
Score: 0.0000
Text: Search engines use various algorithms to retrieve relevant documents.

Enter your query (or type 'exit' to quit): exit
Exiting the search engine. Goodbye!
