In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Read the two source tables: the base catalogue and the advanced-features extract.
cat = pd.read_csv("../data/Audible_Catlog.csv")
adv = pd.read_csv("../data/Audible_Catlog_Advanced_Features.csv")

# Print the (rows, columns) shape of each table, catalogue first.
for frame in (cat, adv):
    print(frame.shape)

# Preview the first rows of the base catalogue as the cell output.
cat.head()


In [None]:
# Join the base catalogue with the advanced-features table. The inner join
# keeps only (Book Name, Author) pairs that appear in both inputs.
df = pd.merge(
    cat,
    adv,
    on=["Book Name", "Author"],
    how="inner"
)

# Remove rows that are exact duplicates across every column.
df = df.drop_duplicates()

# Description fix: guarantee a string column with no missing values so the
# text can be handed to a vectoriser without NaN handling.
if "Description" in df.columns:
    df["Description"] = df["Description"].fillna("")
else:
    df["Description"] = ""

# Rating fix: coerce unparseable ratings to NaN, then discard those rows.
# NOTE(review): if both CSVs contain a "Rating" column, the merge would emit
# suffixed columns ("Rating_x"/"Rating_y") and this lookup would raise
# KeyError -- confirm the input schemas.
df["Rating"] = pd.to_numeric(df["Rating"], errors="coerce")
# drop_duplicates()/dropna() leave gaps in the index. Reset it so that the
# index label of every row equals its position; matrices later built from df
# (one row per book) are addressed by position and must line up with df.
df = df.dropna(subset=["Rating"]).reset_index(drop=True)

# Reviews fix: make the review count numeric; invalid entries become NaN.
if "Number of Reviews" in df.columns:
    df["Number of Reviews"] = pd.to_numeric(df["Number of Reviews"], errors="coerce")

df.info()
df.head()


In [None]:
# Histogram of the cleaned ratings, drawn on an explicitly created axes.
fig, ax = plt.subplots()
df["Rating"].hist(bins=20, ax=ax)
ax.set_title("Rating Distribution")
ax.set_xlabel("Rating")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Vectorise the descriptions with TF-IDF: English stop words are removed and
# the vocabulary is capped at 5000 features.
tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["Description"])

# Group the description vectors into 20 clusters; random_state is fixed so
# repeated runs use the same seed.
kmeans = KMeans(random_state=42, n_clusters=20)
cluster_labels = kmeans.fit_predict(tfidf_matrix)
df["cluster"] = cluster_labels

# Preview the cluster assigned to the first few books.
df.loc[:, ["Book Name", "Author", "cluster"]].head()


In [None]:
# Pairwise cosine similarity between all description vectors. Row/column k of
# this matrix corresponds to the k-th row of df by POSITION, not by index label.
cosine_sim = cosine_similarity(tfidf_matrix)

# Lookup table: lower-cased title -> row position in df (and in cosine_sim).
# Positions are used instead of df.index labels because df's index may have
# gaps after rows were filtered out, while cosine_sim is strictly positional.
indices = pd.Series(np.arange(len(df)), index=df["Book Name"].str.lower())
# Drop entries without a title and keep only the first position of a repeated
# title, so that a lookup always yields a single integer rather than a Series.
indices = indices[indices.index.notna() & ~indices.index.duplicated(keep="first")]


def recommend_similar(book_name):
    """Return the five books whose descriptions are most similar to a title.

    Parameters
    ----------
    book_name : str
        Title to look up; matching is case-insensitive. When several rows
        share the title, the first one in ``df`` is used as the query.

    Returns
    -------
    pandas.DataFrame
        Up to five rows with the columns "Book Name", "Author" and "Rating",
        ordered from most to least similar. The queried row itself is never
        included. If the title is unknown, "Book not found" is printed and an
        empty frame (``df.head(0)``) is returned.
    """
    book_name = book_name.lower()

    if book_name not in indices:
        print("Book not found")
        return df.head(0)

    idx = indices[book_name]

    # Rank all rows by descending similarity; the stable sort keeps ties in
    # their original row order.
    ranked = np.argsort(-cosine_sim[idx], kind="stable")
    # Exclude the query row explicitly instead of assuming it is ranked first:
    # another book with an identical description also scores 1.0 and may
    # precede it.
    book_indices = ranked[ranked != idx][:5]

    # The ranked values are positions, so select with iloc rather than loc.
    return df.iloc[book_indices][["Book Name", "Author", "Rating"]]

# Test
recommend_similar(df["Book Name"].iloc[0])
