In [10]:
import pandas as pd
import context
import knowledge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

book_data = pd.read_csv('Data/Books_Reduced.csv', dtype=str)
user_data = pd.read_csv('Data/Users_Reduced.csv', dtype=str)
rating_data = pd.read_csv('Data/Ratings_Reduced.csv', dtype=str)

merged_data = pd.read_csv('Data/Merged.csv', dtype=str)

book_data.drop('Image-URL-S', axis=1, inplace=True)
book_data.drop('Image-URL-M', axis=1, inplace=True)
book_data.drop('Image-URL-L', axis=1, inplace=True)

In [11]:
# get user input of User-ID if they have previous ratings
user_id, new_user = context.get_user_id()

if not new_user:
    profile = context.get_profile(user_id)
    print(profile)

In [12]:
# if it is a new user then use the knowledge based system
# else give recommendations of previously read books rated highly
# and combine with books that other rate high in the same
# age range and location.
if not new_user:
    filtered_data = merged_data[(merged_data['Country'] == profile[1].split(', ')[2]) &
                                (merged_data['Age'].between(str(float(profile[2])-5), str(float(profile[2])+5))) &
                                (merged_data['ISBN'].isin(merged_data[merged_data['User-ID'] == profile[0]]['ISBN'].values))]
    filtered_data.loc[:, 'Book-Rating'] = pd.to_numeric(filtered_data['Book-Rating'])
    avg_ratings = filtered_data.groupby('ISBN')['Book-Rating'].mean().reset_index()
    avg_ratings.rename(columns={'Book-Rating': 'Average-Rating'}, inplace=True)

    user_books_with_avg_ratings = pd.merge(filtered_data, avg_ratings, on='ISBN')

    sorted_books = user_books_with_avg_ratings.sort_values(by='Average-Rating', ascending=False)
    books = []
    for book in sorted_books.values:
        if len(books) == 5:
            break
        if book[4] not in books:
            books.append(book[4])
            last_book = book[4]
    print(f'Popular books you have read in your country: {profile[1].split(", ")[2]}')
    print(books, '\n')

    filtered_data = merged_data[(merged_data['Country'] == profile[1].split(', ')[2]) &
                                (merged_data['Age'].between(str(float(profile[2])-5), str(float(profile[2])+5))) &
                                ~(merged_data['ISBN'].isin(merged_data[merged_data['User-ID'] == profile[0]]['ISBN'].values))]
    filtered_data.loc[:, 'Book-Rating'] = pd.to_numeric(filtered_data['Book-Rating'])

    avg_ratings = filtered_data.groupby('ISBN')['Book-Rating'].mean().reset_index()
    avg_ratings.rename(columns={'Book-Rating': 'Average-Rating'}, inplace=True)

    user_books_with_avg_ratings = pd.merge(filtered_data, avg_ratings, on='ISBN')

    sorted_books = user_books_with_avg_ratings.sort_values(by='Average-Rating', ascending=False)
    books = []
    for book in sorted_books.values:
        if len(books) == 5:
            break
        if book[4] not in books:
            books.append(book[4])
            last_book = book[4]
    print(f'Popular books you have not read in your country: {profile[1].split(", ")[2]}')
    print(books)

In [13]:
if new_user:
    knowledge.Question(
        "What is your age?",
        knowledge.age_check,
        "Please enter an integer greater than zero."
    )
    knowledge.Question(
        "Who are your favorite authors? (comma-separated name)",
        knowledge.author_check,
        "Please enter a comma-separated list of author names."
    )
    knowledge.Question(
        "Do you have a preference for books published after a certain year? (Enter the year)",
        knowledge.year_pref_check,
        "Please enter a valid year."
    )
    knowledge.Question(
        "please provide books you currently like",
        knowledge.book_check,
        "Please enter a valid book name."
    )

In [14]:
if new_user:
    question_list = knowledge.Question.question_list
    user_responses = knowledge.ask_user(question_list)
    age, favorite_authors, year_preference = user_responses[0], user_responses[1], user_responses[2]
    favorite_authors = [author.strip() for author in favorite_authors.split(",")]
    book = user_responses[3]

    print(age, favorite_authors, year_preference, book)
    #21 ['harper lee'] 1980 To Kill a Mockingbird

35 ['Harper Lee'] 1980 To Kill a Mockingbird


In [15]:


if new_user:
    #filter year they were published, and title
    filtered_books = book_data[book_data['Year-Of-Publication'].between(year_preference, '2023')]
    filtered_books = filtered_books.reset_index(drop=True)

    #gets user id of people in 5 year age span of user
    user_ids = user_data[user_data['Age'].between(str(int(age)-5), str(int(age)+5))]['User-ID'].values

    #get top 10000 books rated by users in user_ids
    top_rated_books = rating_data[rating_data['User-ID'].isin(user_ids)].groupby('ISBN')['Book-Rating'].count().sort_values(ascending=False).head(10000).index

    #filter books by top rated books
    filtered_books = filtered_books[filtered_books['ISBN'].isin(top_rated_books)]
    filtered_books = filtered_books.reset_index(drop=True)

    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(filtered_books['Book-Title'])
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    def get_recommendations(title, cosine_sim=cosine_sim, weight_factor=2, threshold=0.1):
        try:
            idx = filtered_books[filtered_books['Book-Title'].str.contains(title, case=False, regex=False)].index[0]
            sim_scores = list(enumerate(cosine_sim[idx]))
            adjusted_scores = [(i, score * weight_factor if score >= threshold else score) for i, score in sim_scores]
            adjusted_scores = sorted(adjusted_scores, key=lambda x: x[1], reverse=True)
            adjusted_scores = adjusted_scores[0:501] 
            book_indices = [i[0] for i in adjusted_scores]
            return filtered_books['Book-Title'].iloc[book_indices]
        except IndexError:
            print(f"No match found for: {title}")
            return []

    # Check if any liked books are in the filtered dataset
    liked_books_in_dataset = any(book in filtered_books['Book-Title'].values for book in book)
    # Initialize an empty set for recommendations
    aggregated_recommendations = set()

    # If liked books are in the dataset, use your existing recommendation system
    if liked_books_in_dataset:
            print("Liked Books:", book)
            similar_books = get_recommendations(book)
            aggregated_recommendations.update(similar_books)            
    # Fallback strategy if no liked books are in the dataset
    if not liked_books_in_dataset or not aggregated_recommendations:
        fallback_recommendations = top_rated_books[:500]
        fallback_book_titles = book_data[book_data['ISBN'].isin(fallback_recommendations)]['Book-Title']
        aggregated_recommendations.update(fallback_book_titles)

    title_to_isbn = {row['Book-Title']: row['ISBN'] for index, row in filtered_books.iterrows()}
    recommended_isbns = [title_to_isbn[title] for title in aggregated_recommendations if title in title_to_isbn]

    # Calculate average ratings
    rating_data['Book-Rating'] = pd.to_numeric(rating_data['Book-Rating'])
    average_ratings = rating_data[rating_data['ISBN'].isin(recommended_isbns)].groupby('ISBN')['Book-Rating'].mean()
    tuple_list = [(title, average_ratings[title_to_isbn[title]]) for title in aggregated_recommendations if title in title_to_isbn]
    #sort the tuple list by rating
    tuple_list.sort(key=lambda x: x[1], reverse=True)
    #print the top 10 recommendations
    print("Top Recommendations:")
    idx = 0
    for i in tuple_list:
        if idx == 10:
            break
        print(i[0], i[1])
        idx += 1

Top Recommendations:
The Two Towers (The Lord of the Rings, Part 2) 9.454545454545455
The Drawing of the Three (The Dark Tower, Book 2) 9.4
Siddhartha 9.4
Harry Potter and the Sorcerer's Stone (Book 1) 9.23076923076923
The Prince of Tides 9.2
Without Remorse 9.125
Dune (Remembering Tomorrow) 9.11320754716981
The Stand (The Complete and Uncut Edition) 9.10344827586207
Harry Potter and the Prisoner of Azkaban (Book 3) 9.095238095238095
Harry Potter and the Order of the Phoenix (Book 5) 9.078212290502794
