In [24]:
import math
import random
import csv
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
# Retrieval settings: how many hits to keep per query, and the benchmark queries.
k = 4  # Number of documents to retrieve
queries = [
    "Fruity and balanced wine",
    "Crisp and refreshing white wine",
    "Smooth and medium-bodied red wine",
    "Aromatic and vibrant wine",
    "Dry and elegant wine",
]

# Seed both the NumPy and the standard-library global RNGs with one shared
# value so that reruns of the notebook are repeatable.
random_seed = 4
np.random.seed(random_seed)
random.seed(random_seed)

def load_documents(file_path, encoding="utf-8"):
    """Read a text file and return one document per line.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Given explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        A list of strings, one per line of the file, with leading and
        trailing whitespace removed. Blank lines are kept as empty strings,
        so list position ``i`` always corresponds to line ``i + 1`` of the
        file.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return [line.strip() for line in file]

# Corpus location: a text file read line by line, each line being one document.
# The path is relative, so it is resolved against the current working directory.
file_path = "document.txt"  # Default file
documents = load_documents(file_path=file_path)

In [25]:
# rank documents based on TF-IDF    
def rank_documents(query, documents, k):
    """Return the 1-based numbers of the documents nearest to ``query`` (TF-IDF).

    The corpus and the query are embedded with a ``TfidfVectorizer`` fitted
    on ``documents`` and compared by Euclidean distance, nearest first.

    Args:
        query: Free-text query string.
        documents: Iterable of document strings; the document at position
            ``i`` is reported as document number ``i + 1``.
        k: Maximum number of documents to return.

    Returns:
        A list of plain ``int`` document numbers ordered from nearest to
        farthest. It holds fewer than ``k`` entries when the corpus has
        fewer than ``k`` documents, and is empty for an empty corpus.
    """
    documents = list(documents)
    if not documents:
        # Nothing to rank; a vectorizer cannot be fitted on an empty corpus.
        return []

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    query_vector = vectorizer.transform([query])

    # kneighbors() rejects a neighbour count larger than the number of fitted
    # rows, so cap k at the corpus size instead of letting the call fail.
    n_neighbors = min(k, len(documents))
    knn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
    knn.fit(tfidf_matrix)
    _distances, indices = knn.kneighbors(query_vector)

    # Row 0 belongs to the single query; shift 0-based row indices to 1-based
    # document numbers and convert NumPy integers to built-in ints.
    return [int(idx) + 1 for idx in indices[0]]

# Report, for every query, the document numbers returned by rank_documents
# in rank order, followed by a blank separator line.
print("Ranked Documents:")
for query in queries:
    ranked_documents = rank_documents(query, documents, k)
    report_lines = [f"Query: {query}"]
    report_lines.extend(
        f"Rank {rank}: Document {document_number}"
        for rank, document_number in enumerate(ranked_documents, start=1)
    )
    print("\n".join(report_lines))
    print()

Ranked Documents:
Query: Fruity and balanced wine
Rank 1: Document 2
Rank 2: Document 7
Rank 3: Document 3
Rank 4: Document 5

Query: Crisp and refreshing white wine
Rank 1: Document 3
Rank 2: Document 7
Rank 3: Document 2
Rank 4: Document 5

Query: Smooth and medium-bodied red wine
Rank 1: Document 2
Rank 2: Document 6
Rank 3: Document 7
Rank 4: Document 3

Query: Aromatic and vibrant wine
Rank 1: Document 3
Rank 2: Document 2
Rank 3: Document 5
Rank 4: Document 1

Query: Dry and elegant wine
Rank 1: Document 3
Rank 2: Document 2
Rank 3: Document 5
Rank 4: Document 1



In [26]:
# rank documents based on TF
def rank_documents(query, documents, k):
    """Return the 1-based numbers of the documents nearest to ``query`` (term counts).

    The corpus and the query are embedded with a ``CountVectorizer`` fitted
    on ``documents`` and compared by Euclidean distance, nearest first.

    NOTE(review): the count vectors are not length-normalised, so Euclidean
    distance tends to favour short documents regardless of topical overlap.
    If that is not intended, consider a normalised term-frequency
    representation or a cosine metric.

    Args:
        query: Free-text query string.
        documents: Iterable of document strings; the document at position
            ``i`` is reported as document number ``i + 1``.
        k: Maximum number of documents to return.

    Returns:
        A list of plain ``int`` document numbers ordered from nearest to
        farthest. It holds fewer than ``k`` entries when the corpus has
        fewer than ``k`` documents, and is empty for an empty corpus.
    """
    documents = list(documents)
    if not documents:
        # Nothing to rank; a vectorizer cannot be fitted on an empty corpus.
        return []

    vectorizer = CountVectorizer()
    tf_matrix = vectorizer.fit_transform(documents)
    query_vector = vectorizer.transform([query])

    # kneighbors() rejects a neighbour count larger than the number of fitted
    # rows, so cap k at the corpus size instead of letting the call fail.
    n_neighbors = min(k, len(documents))
    knn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
    knn.fit(tf_matrix)
    _distances, indices = knn.kneighbors(query_vector)

    # Row 0 belongs to the single query; shift 0-based row indices to 1-based
    # document numbers and convert NumPy integers to built-in ints.
    return [int(idx) + 1 for idx in indices[0]]

# Report, for every query, the document numbers returned by rank_documents
# in rank order, followed by a blank separator line.
print("Ranked Documents:")
for query in queries:
    ranked_documents = rank_documents(query, documents, k)
    report_lines = [f"Query: {query}"]
    report_lines.extend(
        f"Rank {rank}: Document {document_number}"
        for rank, document_number in enumerate(ranked_documents, start=1)
    )
    print("\n".join(report_lines))
    print()

Ranked Documents:
Query: Fruity and balanced wine
Rank 1: Document 1
Rank 2: Document 7
Rank 3: Document 4
Rank 4: Document 3

Query: Crisp and refreshing white wine
Rank 1: Document 1
Rank 2: Document 7
Rank 3: Document 3
Rank 4: Document 4

Query: Smooth and medium-bodied red wine
Rank 1: Document 1
Rank 2: Document 7
Rank 3: Document 4
Rank 4: Document 3

Query: Aromatic and vibrant wine
Rank 1: Document 1
Rank 2: Document 4
Rank 3: Document 7
Rank 4: Document 3

Query: Dry and elegant wine
Rank 1: Document 1
Rank 2: Document 4
Rank 3: Document 7
Rank 4: Document 3



In [27]:
# rank documents based on BoW
def rank_documents(query, documents, k, binary=False):
    """Return the 1-based numbers of the documents nearest to ``query`` (bag of words).

    The corpus and the query are embedded with a ``CountVectorizer`` fitted
    on ``documents`` and compared by Euclidean distance, nearest first.

    NOTE(review): with the default ``binary=False`` this is the same model
    as the term-count ranking defined earlier in the notebook, so both
    cells produce identical rankings. Pass ``binary=True`` to rank on word
    presence/absence instead; the default is left unchanged so the recorded
    output stays valid.

    Args:
        query: Free-text query string.
        documents: Iterable of document strings; the document at position
            ``i`` is reported as document number ``i + 1``.
        k: Maximum number of documents to return.
        binary: Forwarded to ``CountVectorizer(binary=...)``. When true,
            every non-zero count becomes 1; when false, counts are kept.

    Returns:
        A list of plain ``int`` document numbers ordered from nearest to
        farthest. It holds fewer than ``k`` entries when the corpus has
        fewer than ``k`` documents, and is empty for an empty corpus.
    """
    documents = list(documents)
    if not documents:
        # Nothing to rank; a vectorizer cannot be fitted on an empty corpus.
        return []

    vectorizer = CountVectorizer(binary=binary)
    bow_matrix = vectorizer.fit_transform(documents)
    query_vector = vectorizer.transform([query])

    # kneighbors() rejects a neighbour count larger than the number of fitted
    # rows, so cap k at the corpus size instead of letting the call fail.
    n_neighbors = min(k, len(documents))
    knn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
    knn.fit(bow_matrix)
    _distances, indices = knn.kneighbors(query_vector)

    # Row 0 belongs to the single query; shift 0-based row indices to 1-based
    # document numbers and convert NumPy integers to built-in ints.
    return [int(idx) + 1 for idx in indices[0]]

# Report, for every query, the document numbers returned by rank_documents
# in rank order, followed by a blank separator line.
print("Ranked Documents:")
for query in queries:
    ranked_documents = rank_documents(query, documents, k)
    report_lines = [f"Query: {query}"]
    report_lines.extend(
        f"Rank {rank}: Document {document_number}"
        for rank, document_number in enumerate(ranked_documents, start=1)
    )
    print("\n".join(report_lines))
    print()

Ranked Documents:
Query: Fruity and balanced wine
Rank 1: Document 1
Rank 2: Document 7
Rank 3: Document 4
Rank 4: Document 3

Query: Crisp and refreshing white wine
Rank 1: Document 1
Rank 2: Document 7
Rank 3: Document 3
Rank 4: Document 4

Query: Smooth and medium-bodied red wine
Rank 1: Document 1
Rank 2: Document 7
Rank 3: Document 4
Rank 4: Document 3

Query: Aromatic and vibrant wine
Rank 1: Document 1
Rank 2: Document 4
Rank 3: Document 7
Rank 4: Document 3

Query: Dry and elegant wine
Rank 1: Document 1
Rank 2: Document 4
Rank 3: Document 7
Rank 4: Document 3

