In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sample document corpus
documents = [
    "Natural language processing enables computers to understand human language.",
    "Machine learning is a method of data analysis.",
    "Artificial intelligence and machine learning are closely related.",
    "Information retrieval is used in search engines.",
    "TF-IDF is a statistical measure to evaluate how important a word is in a document."
]

# Create TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)

# Function to process query and retrieve most relevant document(s)
def search_engine(query, top_n=3):
    # Vectorize the query
    query_vec = vectorizer.transform([query])

    # Compute cosine similarity
    similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Get top N results
    top_indices = similarity_scores.argsort()[-top_n:][::-1]

    print("\nTop Relevant Documents:")
    for idx in top_indices:
        print(f"\nDocument {idx + 1} (Score: {similarity_scores[idx]:.4f}):\n{documents[idx]}")

# Example user query
user_query = input("Enter your search query: ")
search_engine(user_query)

Enter your search query: method of data analysis

Top Relevant Documents:

Document 2 (Score: 0.8340):
Machine learning is a method of data analysis.

Document 5 (Score: 0.0000):
TF-IDF is a statistical measure to evaluate how important a word is in a document.

Document 4 (Score: 0.0000):
Information retrieval is used in search engines.
