In [21]:
# https://github.com/lubka1/recommender-system/tree/main
import math
import random
import csv
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
# Set a random seed for reproducibility
random_seed = 2
random.seed(random_seed)
np.random.seed(random_seed)


def load_documents(file_path):
    with open(file_path, 'r') as file:
        documents = [line.strip() for line in file]
    return documents

# Term Frequency
def TF(document):
    term_frequency = {}
    terms = document.split()

    for term in terms:
        if term in term_frequency:
            term_frequency[term] += 1
        else:
            term_frequency[term] = 1

    return term_frequency

# Inverse Document Frequency
def IDF(documents, term):
    document_count = len(documents)
    term_occurrences = sum(1 for document in documents if term in document)
    if term_occurrences > 0:
        idf = math.log(document_count / term_occurrences)
    else:
        idf = 0.0
    return idf

# Term Frequency-Inverse Document Frequency
def TF_IDF(term_frequency, idf):
    tf_idf = {}
    for term, tf in term_frequency.items():
        tf_idf[term] = tf * idf
    return tf_idf

def display_table(documents):
    for document in documents:
        term_frequency = TF(document)
        all_terms = list(term_frequency.keys())
        data = []

        for i, term in enumerate(all_terms):
            tf = term_frequency.get(term, 0)
            idf = IDF(documents, term)
            tf_idf = TF_IDF(term_frequency, idf)
            tf_idf_value = tf_idf.get(term, 0.0)
            data.append([term, tf, idf, tf_idf_value])

        df = pd.DataFrame(data, columns=[ "Term", "TF", "IDF", "TF-IDF"])
        print("Table for Document:")
        print(document)
        display(df)
        print()    
        
def calculate_cosine_similarity(documents):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    return similarity_matrix

def display_similarity_matrix(similarity_matrix):
    n = len(similarity_matrix)
    columns = ["Document " + str(i+1) for i in range(n)]
    index = ["Document " + str(i+1) for i in range(n)]
    df = pd.DataFrame(similarity_matrix, columns=columns, index=index)
    print("Cosine Similarity Matrix:")
    display(df)
    print()
    

# rank documents based on TF-IDF    
def rank_documents(query, documents, k):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    query_vector = vectorizer.transform([query])

    knn = NearestNeighbors(n_neighbors=k, metric='cosine')
    knn.fit(tfidf_matrix)
    distances, indices = knn.kneighbors(query_vector)

    ranked_documents = []
    for idx in indices[0]:
        ranked_documents.append(documents[idx])

    return ranked_documents

"""
# rank documents based on TF
def rank_documents(query, documents, k):
    vectorizer = CountVectorizer()
    tf_matrix = vectorizer.fit_transform(documents)
    query_vector = vectorizer.transform([query]).toarray()

    knn = NearestNeighbors(n_neighbors=k, metric='euclidean')
    knn.fit(tf_matrix)
    distances, indices = knn.kneighbors(query_vector)

    ranked_documents = []
    for idx in indices[0]:
        ranked_documents.append(documents[idx])

    return ranked_documents
"""



if __name__ == '__main__':
    file_path = input("Enter the name of the text file to analyze (leave blank for default): ")
    if not file_path:
        file_path = 'document.txt'  # Default file
    documents = load_documents(file_path)
    
    query = input("Enter your query here: ")
    k = 2  # Number of documents to retrieve
    
    # display best ranked document
    ranked_documents = rank_documents(query, documents, k)

    print("Ranked Documents:")
    for i, document in enumerate(ranked_documents):
        print(f"Rank {i+1}: {document}")
        
    # display all the tables    
    print("__________________________________________")
    print("Tabels with values of TF, IDF TF-IDF and cosine similarity")
    display_table(documents)
    similarity_matrix = calculate_cosine_similarity(documents)
    display_similarity_matrix(similarity_matrix)
    

Enter the name of the text file to analyze (leave blank for default): 
Enter your query here: fruit
Ranked Documents:
Rank 1: Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.
Rank 2: Blackberry and raspberry aromas show a typical Navarran whiff of green herbs and, in this case, horseradish. In the mouth, this is fairly full bodied, with tomatoey acidity. Spicy, herbal flavors complement dark plum fruit, while the finish is fresh but grabby.
__________________________________________
Tabels with values of TF, IDF TF-IDF and cosine similarity
Table for Document:
Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.


Unnamed: 0,Term,TF,IDF,TF-IDF
0,Aromas,1,1.94591,1.94591
1,include,1,1.94591,1.94591
2,tropical,1,1.94591,1.94591
3,"fruit,",1,1.252763,1.252763
4,"broom,",1,1.94591,1.94591
5,brimstone,1,1.94591,1.94591
6,and,2,0.0,0.0
7,dried,2,1.94591,3.89182
8,herb.,1,1.94591,1.94591
9,The,1,0.847298,0.847298



Table for Document:
This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016.


Unnamed: 0,Term,TF,IDF,TF-IDF
0,This,1,1.94591,1.94591
1,is,2,0.154151,0.308301
2,ripe,1,1.252763,1.252763
3,and,2,0.0,0.0
4,"fruity,",1,1.94591,1.94591
5,a,1,0.0,0.0
6,wine,1,0.847298,0.847298
7,that,1,1.252763,1.252763
8,smooth,1,1.94591,1.94591
9,while,1,1.252763,1.252763



Table for Document:
Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless-steel fermented.


Unnamed: 0,Term,TF,IDF,TF-IDF
0,Tart,1,1.94591,1.94591
1,and,2,0.0,0.0
2,"snappy,",1,1.94591,1.94591
3,the,2,0.336472,0.672944
4,flavors,1,1.252763,1.252763
5,of,1,0.154151,0.154151
6,lime,1,1.94591,1.94591
7,flesh,1,1.94591,1.94591
8,rind,1,1.252763,1.252763
9,dominate.,1,1.94591,1.94591



Table for Document:
Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with notes of honey-drizzled guava and mango giving way to a slightly astringent, semidry finish.


Unnamed: 0,Term,TF,IDF,TF-IDF
0,Pineapple,1,1.94591,1.94591
1,"rind,",1,1.94591,1.94591
2,lemon,1,1.94591,1.94591
3,pith,1,1.94591,1.94591
4,and,2,0.0,0.0
5,orange,1,1.94591,1.94591
6,blossom,1,1.94591,1.94591
7,start,1,1.94591,1.94591
8,off,1,1.252763,1.252763
9,the,1,0.336472,0.336472



Table for Document:
Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rustic, earthy, herbal characteristics. Nonetheless, if you think of it as a pleasantly unfussy country wine, it's a good companion to a hearty winter stew.


Unnamed: 0,Term,TF,IDF,TF-IDF
0,Much,1,1.94591,1.94591
1,like,1,1.94591,1.94591
2,the,1,0.336472,0.336472
3,regular,1,1.94591,1.94591
4,bottling,1,1.94591,1.94591
5,from,1,1.252763,1.252763
6,2012,1,1.94591,1.94591
7,this,1,1.252763,1.252763
8,comes,1,1.94591,1.94591
9,across,1,1.94591,1.94591



Table for Document:
Blackberry and raspberry aromas show a typical Navarran whiff of green herbs and, in this case, horseradish. In the mouth, this is fairly full bodied, with tomatoey acidity. Spicy, herbal flavors complement dark plum fruit, while the finish is fresh but grabby.


Unnamed: 0,Term,TF,IDF,TF-IDF
0,Blackberry,1,1.94591,1.94591
1,and,1,0.0,0.0
2,raspberry,1,1.94591,1.94591
3,aromas,1,0.847298,0.847298
4,show,1,1.94591,1.94591
5,a,1,0.0,0.0
6,typical,1,1.94591,1.94591
7,Navarran,1,1.94591,1.94591
8,whiff,1,1.94591,1.94591
9,of,1,0.154151,0.154151



Table for Document:
Here's a bright, informal red that opens with aromas of candied berry, white pepper and savory herb that carry over to the palate. It's balanced with fresh acidity and soft tannins.


Unnamed: 0,Term,TF,IDF,TF-IDF
0,Here's,1,1.94591,1.94591
1,a,1,0.0,0.0
2,"bright,",1,1.94591,1.94591
3,informal,1,1.94591,1.94591
4,red,1,1.252763,1.252763
5,that,2,1.252763,2.505526
6,opens,1,1.94591,1.94591
7,with,2,0.154151,0.308301
8,aromas,1,0.847298,0.847298
9,of,1,0.154151,0.154151



Cosine Similarity Matrix:


Unnamed: 0,Document 1,Document 2,Document 3,Document 4,Document 5,Document 6,Document 7
Document 1,1.0,0.038465,0.073779,0.087749,0.021093,0.090836,0.126953
Document 2,0.038465,1.0,0.071552,0.077408,0.137193,0.154906,0.240099
Document 3,0.073779,0.071552,1.0,0.161499,0.071055,0.169074,0.104117
Document 4,0.087749,0.077408,0.161499,1.0,0.061683,0.140385,0.138174
Document 5,0.021093,0.137193,0.071055,0.061683,1.0,0.085589,0.101084
Document 6,0.090836,0.154906,0.169074,0.140385,0.085589,1.0,0.114961
Document 7,0.126953,0.240099,0.104117,0.138174,0.101084,0.114961,1.0



