In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Read the merged book table and keep only rows that have a description,
# because the description text is the sole input to the vectorizer below.
# NOTE: dropna() keeps the original index labels, so after this step the
# labels in `data.index` are not guaranteed to equal row positions.
data = pd.read_csv(
    '../datasets/clean/filtered_datasets/books_merged.csv'
).dropna(subset=['description'])

# No manual text clean-up is applied in this cell; the raw descriptions are
# handed straight to the vectorizer.

# Bag-of-words term counts per description, with English stop words removed.
count_vectorizer = CountVectorizer(stop_words='english')
count_matrix = count_vectorizer.fit_transform(data['description'])

# Pairwise cosine similarity of every description against every other one
# (a single argument compares the matrix with itself). Row/column i of the
# result corresponds to the i-th row of `data` by position.
cosine_sim = cosine_similarity(count_matrix)

# Function to recommend books based on their similarity
def recommend_books(book_title, cosine_sim=cosine_sim, data=data, top_n=5):
    """Return the titles of the books most similar to ``book_title``.

    Args:
        book_title: Exact value to look up in ``data['Book-Title']``. If
            several rows share the title, the first one is used as the query.
        cosine_sim: Square similarity matrix whose row/column ``i`` refers to
            the ``i``-th row of ``data`` by position.
        data: DataFrame with a ``'Book-Title'`` column, in the same row order
            that was used to build ``cosine_sim``.
        top_n: Maximum number of recommendations to return; values below
            zero are treated as zero.

    Returns:
        A slice of ``data['Book-Title']`` holding up to ``top_n`` titles,
        ordered from most to least similar. The query row itself is never
        included.

    Raises:
        IndexError: If no row of ``data`` has the requested title.
    """
    titles = data['Book-Title']

    # ``cosine_sim`` is addressed by row position, while ``data.index`` holds
    # labels that need not be 0..n-1 (e.g. after dropna() without
    # reset_index()). Resolve the title to a position, not to a label.
    matches = (titles == book_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"no row in data has Book-Title {book_title!r}")
    pos = int(matches[0])

    # sorted() is stable, so equally similar books keep their row order.
    ranked = sorted(
        enumerate(cosine_sim[pos]), key=lambda pair: pair[1], reverse=True
    )

    # Drop the query row explicitly instead of assuming it sorts first: a
    # tie (for instance a duplicated description) can place another row ahead.
    neighbours = [i for i, _ in ranked if i != pos][:max(top_n, 0)]
    return titles.iloc[neighbours]

# Demo: print the recommendations for two sample titles, separated by one
# blank line.
recommended_books = recommend_books('The Testament')
print(recommended_books, end='\n\n')
recommended_books = recommend_books('The Lord of the Rings')
print(recommended_books)

720                            Stephen Hawking's Universe
1702    Tractatus Logico Philosophicus (Routledge Clas...
582                 The Collected Stories of Eudora Welty
1561     Because It Is Bitter, and Because It Is My Heart
1015    George Washington's Rules of Civility and Dece...
Name: Book-Title, dtype: object

915     Bare Bones: Conversations on Terror With Steph...
1627                                       The Value of X
486                         Close Range : Wyoming Stories
1338                            The Virginia Woolf Reader
169     The Gift of the Magi and Other Short Stories (...
Name: Book-Title, dtype: object
