<a href="https://colab.research.google.com/github/cloudpendyala/pendyala_ai/blob/main/TFIDF_CosineSimilarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create a corpus
def create_corpus():
    """Return the three example sentences used as the demo corpus.

    A new list object is built on every call.
    """
    return [
        "The quick brown fox jumps over the lazy dog.",
        "Never jump over the lazy dog quickly.",
        "A quick brown dog outpaces a quick fox.",
    ]

# Compute TF-IDF for the corpus
def compute_tfidf(corpus):
    """Fit a default ``TfidfVectorizer`` on *corpus* and vectorize it.

    Args:
        corpus: The raw text documents handed to ``fit_transform``.

    Returns:
        A ``(matrix, vectorizer)`` tuple: the result of ``fit_transform``
        on *corpus*, followed by the vectorizer that produced it.
    """
    tfidf = TfidfVectorizer()
    doc_term_matrix = tfidf.fit_transform(corpus)
    return doc_term_matrix, tfidf

# Calculate Cosine Similarity
def calculate_cosine_similarity(X):
    """Return ``cosine_similarity(X)``: the similarity of *X*'s rows to each other.

    Args:
        X: Feature matrix, e.g. the TF-IDF matrix from ``compute_tfidf``.

    Returns:
        The matrix produced by scikit-learn's ``cosine_similarity`` when
        called with *X* as its only argument.
    """
    return cosine_similarity(X)

# Perform Similarity Search
def similarity_search(query, corpus, vectorizer):
    """Rank the documents of *corpus* by cosine similarity to *query*.

    Args:
        query: Raw query string.
        corpus: The raw text documents to score; passed to
            ``vectorizer.transform``.
        vectorizer: Object exposing ``transform``; it is used without being
            fitted here, so it must already be fitted by the caller.

    Returns:
        A ``(ranked_indices, similarity_scores, query_vec)`` tuple, where
        ``ranked_indices`` is the reversed ascending argsort of the scores
        (highest score first), ``similarity_scores`` is the flattened score
        array in corpus order, and ``query_vec`` is the transformed query.
    """
    query_vec = vectorizer.transform([query])
    # The corpus is vectorized again on each call using the same vectorizer.
    corpus_matrix = vectorizer.transform(corpus)
    pairwise = cosine_similarity(query_vec, corpus_matrix)
    scores = pairwise.flatten()
    # argsort sorts ascending; reversing puts the best match first.
    order = scores.argsort()[::-1]
    return order, scores, query_vec

# Collect the non-zero TF-IDF weights for the query terms
def display_query_tfidf_weights(query, query_vec, vectorizer):
    """Map each vocabulary term to its weight in *query_vec*, keeping only positives.

    Args:
        query: Not used by this function; kept in the signature for callers.
        query_vec: Vectorized query exposing ``toarray()``.
        vectorizer: Object exposing ``get_feature_names_out()``; the returned
            names are matched to the flattened weights by position.

    Returns:
        A dict of ``{term: weight}`` for every term whose weight is greater
        than zero. Nothing is printed here.
    """
    vocabulary = vectorizer.get_feature_names_out()
    dense_weights = query_vec.toarray().flatten()

    positive_weights = {}
    for position, term in enumerate(vocabulary):
        weight = dense_weights[position]
        if weight > 0:
            positive_weights[term] = weight
    return positive_weights

# Example usage:
# Build the demo corpus, fit TF-IDF on it, and compute the document-to-document
# similarity matrix (similarity_matrix is not referenced again in this cell).
corpus = create_corpus()
X, vectorizer = compute_tfidf(corpus)
similarity_matrix = calculate_cosine_similarity(X)

# Score every corpus document against a free-text query, reusing the
# vectorizer that was fitted on the corpus above.
query = "quick fox"
ranked_indices, similarity_scores, query_vec = similarity_search(query, corpus, vectorizer)
query_weights = display_query_tfidf_weights(query, query_vec, vectorizer)

# Report the query terms that received a non-zero TF-IDF weight.
print("Query TF-IDF Weights:")
for term, weight in query_weights.items():
    print(f"Term: {term}, Weight: {weight}")

# Report the documents in ranked order (highest similarity first);
# similarity_scores is indexed by the document's position in corpus.
print("\nSimilarity Search Results:")
for idx in ranked_indices:
    print(f"Document: {corpus[idx]}, Similarity Score: {similarity_scores[idx]}")


Query TF-IDF Weights:
Term: fox, Weight: 0.7071067811865476
Term: quick, Weight: 0.7071067811865476

Similarity Search Results:
Document: A quick brown dog outpaces a quick fox., Similarity Score: 0.7349057283930712
Document: The quick brown fox jumps over the lazy dog., Similarity Score: 0.4201087427318642
Document: Never jump over the lazy dog quickly., Similarity Score: 0.0
