# Book Recommendation System

# Part IV: Dash

### Importing Libraries

In [4]:
import pandas as pd
import numpy as np
import ast
import itertools

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from io import StringIO

import os

import dash
from dash import dcc, html, Input, Output, State, ALL
import dash_grocery

import json

# Import my functions
from my_functions import users_matrix
from my_functions import recommendations_by_genres

Check this [link](https://community.plotly.com/t/how-to-use-other-peoples-react-components-in-my-dash-app/65627) for an explanation on how to use the rating stars.

### Loading the Data

In [5]:
books = pd.read_csv("data/Books_cleaned.csv").drop('Unnamed: 0', axis = 1)
#ratings = pd.read_csv("data_cleaned/Ratings_cleaned.csv").drop('Unnamed: 0', axis = 1)

ratings_files = [f'data/Ratings_cleaned_part_{i}.csv' for i in range(1,6+1)]
ratings_dfs = [pd.read_csv(file) for file in ratings_files]
ratings = pd.concat(ratings_dfs, ignore_index=True).drop('Unnamed: 0', axis = 1)

books_genres = pd.read_csv("data/Books_genres_cleaned.csv").drop('Unnamed: 0', axis = 1)
books_genres_list = pd.read_csv("data/Books_genres_list_cleaned.csv").drop('Unnamed: 0', axis = 1)

## Dash

In [6]:
# Maximum number of users with coincidences that we use
n_users_upper_limit = 10000 

# Number of neighbours
default_number_neighbours = 50


# Create a dash application
app = dash.Dash(__name__, suppress_callback_exceptions=True)


###############################################################################
#                                                                             #
#                                   lAYOUT                                    #
#                                                                             #
###############################################################################


# Create an app layout
app.layout = html.Div([
    dcc.Store( # Store to maintain app state
        id='app_state', 
        data={'book_selection_ongoing': True,
              'potential_recommendations_ongoing': False,
              'final_recommendations_ongoing': False}
    ),  
    dcc.Store( # Store to store the ratings 
        id='rating_store'
    ),
    dcc.Store(
        id='potential_recommendations_df'
    ), 
    #
    # Book Selection
    #
    html.Div([
        html.H1("Book selection"),
        html.P("If you have previously saved a selection or want to save your current one, you can enter your ID to load it."),
        dcc.Input(id='user_id_input', type='text', placeholder='Enter your user ID.'),
        html.Button("Load Ratings", id="load_ratings_button"),
        html.P('That user ID is not in our system. If it is your first time here, you can use this ID to save your first selection. Otherwise, try again with a valid value.', 
               id='nonexistent_userID', style={'display': 'none'}),
        html.P("Choose as many books as you want from the list and rate them. Select at least one."),
        dcc.Dropdown(
            id='dropdown_book_titles',
            options=[
                {'label': book_title, 'value': book_title} for book_title in books['Title']
            ],
            multi=True, # Allow multiple selection
            placeholder="Select books...",
            style={'display': 'block'} # Default style to display the dropdown
        ),
        html.Button("Save Ratings", id="save_ratings_button"),
        html.Button("Finish selection", id="finish_book_selection_button"),  # Button to finish selection
        html.P(
            "No book selected! Please select at least one book.",
            id='text_no_select', 
            style={'display': 'none'}
        ),
        html.Div(id='selected_books_container'), # Container to show the selected books       
    ], id='book_selection', style={'display': 'block'}),
    #
    # Recommender program
    # 
    html.Div([
        html.H1("Obtaining your recommendations"),
        html.P('Wait while the recommendations are obtained...')
    ], id='potential_recommendations_program', style={'display': 'none'}), 
    #
    # Final recommendations
    # 
    html.Div([
        html.H1("Here are your recommendations!"),
        html.Div([
            html.P('If you want your recommendations to satisfy any genre selection, please, select the genres in the dropdown below.'),
            html.Div([
                html.P('Do you want the recommendations to include all the selected genres or just any of them?', 
                       style={'margin-left': '30px'}),
                html.Button("All", id="include_all_genres", n_clicks=0, style={'margin-left': '30px', 'margin-right': '15px'}),
                html.Button("Any", id="include_any_genres", n_clicks=0),
                dcc.Store(
                    id='genre_button_state', 
                    data={'include_all_genres': False, 
                          'include_any_genres': True,
                          'have_they_changed': False}
                ),
            ], style={'display': 'flex', 'align-items': 'center'}),
            html.Div([
                dcc.Dropdown(
                    id='dropdown_include_genres',
                    multi=True,
                    placeholder="Select genre(s) to include..."
                )
            ])
        ], style={'display': 'block'}),
        html.Div([
            html.P("If you want your recommendations to exclude any genre selection, please, select the genres in the dropdown below."),
            html.Div([
                dcc.Dropdown(
                    id='dropdown_exclude_genres',
                    multi=True,
                    placeholder="Select genre(s) to exclude..."
                )
            ])
        ], style={'display': 'block'}),
        html.P("Note: Both dropdowns only include genres that are present in your recommendations."),
        html.P(
            "No recommendations available with your genre selection. Please, change your choice.", 
            id='text_no_recommendations', 
            style={'display': 'none'}
        ),
        html.Div(id='recommended_books_container')
    ], id='final_recommendations', style={'display': 'none'})
])


###############################################################################
#                                                                             #
#                            UPDATE THE APP STATE                             #
#                                                                             #
###############################################################################


# Callback to show/hide components based on app state
@app.callback(
    [Output('book_selection', 'style'),
     Output('potential_recommendations_program', 'style'),
     Output('final_recommendations', 'style')],
    [Input('app_state', 'data')]
)
def update_components_visibility(app_state):
    book_selection_style = {'display': 'block'} if app_state['book_selection_ongoing'] else {'display': 'none'}
    recommendations_program_style = {'display': 'block'} if app_state['potential_recommendations_ongoing'] else {'display': 'none'}
    final_recommendations_style = {'display': 'block'} if app_state['final_recommendations_ongoing'] else {'display': 'none'}
    
    return book_selection_style, recommendations_program_style, final_recommendations_style


###############################################################################
#                                                                             #
#                               BOOK SELECTION                                #
#                                                                             #
###############################################################################


# Callback to load a previous selection of a user
@app.callback(
    [Output('rating_store', 'data', allow_duplicate=True),
     Output('dropdown_book_titles', 'value', allow_duplicate=True),
     Output('nonexistent_userID', 'style')],
    [Input('load_ratings_button', 'n_clicks')],
    [State('user_id_input', 'value')],
    prevent_initial_call=True
)
def load_ratings(n_clicks, user_id):
    if n_clicks is None:
        raise dash.exceptions.PreventUpdate

    user_file = f'user_files/user_ratings_{user_id}.json'
    if os.path.exists(user_file):
        with open(user_file, 'r') as f:
            rating_store = json.load(f)
        selected_books = list(rating_store.keys())
        return rating_store, selected_books, {'display': 'none'}
    else:
        return {}, [], {'display': 'block', 'fontSize': 15, 'color': 'red'}


# Callback to update app state when finish button is clicked and to hide the "No book selected!" message
@app.callback(
    [Output('app_state', 'data'),
     Output('text_no_select', 'style')],
    [Input('finish_book_selection_button', 'n_clicks'),
     Input('dropdown_book_titles', 'value')],
     State('app_state', 'data')
)
def update_app_state_or_hide_message(n_clicks,  selected_books, app_state):
    ctx = dash.callback_context

    # Determine which input triggered the callback
    if not ctx.triggered:
        raise dash.exceptions.PreventUpdate

    trigger_id = ctx.triggered[0]['prop_id'].split('.')[0]

    if trigger_id == 'finish_book_selection_button':
        # This branch handles the finish_book_selection_button changes
        if n_clicks is not None:
            if not selected_books:
                text_no_select_style = {'display': 'block'}
            else:
                text_no_select_style = {'display': 'none'}
                app_state['book_selection_ongoing'] = False
                app_state['potential_recommendations_ongoing'] = True
        return app_state, text_no_select_style
    else:
        if trigger_id == 'dropdown_book_titles':
            # This branch handles the dropdown selection changes
            return dash.no_update, {'display': 'none'}  


# Callback to display the selected books by the user from the initial dropdown
@app.callback(
    Output('selected_books_container', 'children'),
    [Input('dropdown_book_titles', 'value')],
    [State('rating_store', 'data')] # State is used to access the current state of a component without triggering the callback
)
def display_selected_books(selected_books, rating_store):
    if selected_books:
        books_info = []
        for book_title in selected_books:
            book_row = books[books['Title'] == book_title].iloc[0]
            image_url = book_row['Image_url']
            rating_value = rating_store.get(book_title, 1) if rating_store else 1 # 1 is the default (and minimum) rating
            rating = dash_grocery.Stars(
                id={'type': 'rating', 'index': book_title}, 
                count=5, value=rating_value, color2="gold", size=30, edit=True, half=False
            )
            book_info = html.Div([
                html.Div([
                    html.Button('x', id={'type': 'remove_book_dropdown', 'index': book_title}, n_clicks=0, style={'margin-right': '10px'}),
                    html.Img(src=image_url, style={'width': '50px', 'height': '75px', 'margin-top': '10px', 'margin-right': '20px'}),
                    html.H3(book_title, style={'margin-right': '20px'}),
                    rating
                ], style={'display': 'flex', 'align-items': 'center'}),
            ])
            books_info.append(book_info)
        return books_info
    else:
        return html.Div()


# Callback to handle book removal using the 'x' button
@app.callback(
    Output('dropdown_book_titles', 'value'),
    [Input({'type': 'remove_book_dropdown', 'index': ALL}, 'n_clicks')],
    [State('dropdown_book_titles', 'value')]
)
def remove_selected_book_from_dropdown(n_clicks, selected_books):
    # This allows to access detailed information about what has actuvated a 
    # callback and about the inputs and outputs involved in the function
    ctx = dash.callback_context 

    # ctx.triggered is a list of the inputs that activated the callback
    # Each element is a dictionary with the keys 'prop_id' and 'value'
    if not ctx.triggered: 
        raise dash.exceptions.PreventUpdate

    # Determine which input triggered the callback
    # 'prop_id' indicates what input changed
    trigger_id = ctx.triggered[0]['prop_id'].split('.')[0]
    trigger_id = ast.literal_eval(trigger_id)

    for i, elem in enumerate(selected_books):
        if elem == trigger_id['index'] and n_clicks[i] != 0:
            book_to_remove = elem
            if book_to_remove in selected_books:
                selected_books.remove(book_to_remove)
                return selected_books

    raise dash.exceptions.PreventUpdate


# Callback to update the Store with the values of the ratings
@app.callback(
    Output('rating_store', 'data'),
    [Input({'type': 'rating', 'index': ALL}, 'value')],  # Dynamic input for all the ratings
    [State('dropdown_book_titles', 'value'),  # State for the selected books
     State('rating_store', 'data')]  # Access to the current Store  
)
def update_rating_store(rating_values, selected_books, rating_store):
    # To initialize the store (dictionary) every time the function is called.
    # This guarantees that the books that were removed are dropped from the dictionary
    rating_store = {}

    # If there are no books selected, exit the function
    if selected_books is None:
        return rating_store
    
    # Iterate over the selected books and their corresponding rating values
    for book_title, rating_value in zip(selected_books, rating_values):
        # Update the rating value for each selected book
        rating_store[book_title] = rating_value

    # Save the dictionary of selected books
#    with open('rating_store.json', 'w') as f:
#            json.dump(rating_store, f)
    
    return rating_store


# Callback to save the user selection
@app.callback(
    Output('rating_store', 'data', allow_duplicate=True),
    [Input('save_ratings_button', 'n_clicks')],
    [State('user_id_input', 'value'), 
     State('rating_store', 'data')],
    prevent_initial_call=True
)
def save_ratings(n_clicks, user_id, rating_store):
    if n_clicks is None:
        raise dash.exceptions.PreventUpdate

    user_file = f'user_files/user_ratings_{user_id}.json'
    with open(user_file, 'w') as f:
        json.dump(rating_store, f)
    return rating_store


###############################################################################
#                                                                             #
#                            RECOMMENDATION SYSTEM                            #
#                                                                             #
###############################################################################


@app.callback(
    [Output('potential_recommendations_df', 'data'),
     Output('app_state', 'data', allow_duplicate=True)],
    [Input('app_state', 'data')],
    [State('rating_store', 'data')],
     prevent_initial_call=True
)
def update_intermediate_state(app_state, rating_store):
    if not app_state['book_selection_ongoing'] and app_state['potential_recommendations_ongoing']:

        target_UserID = 19960808 # This value is arbitrary, but not an existing UserID

        # ratings dataframe including the target user ratings
        ratings_new = user_dictionary_to_df(rating_store, target_UserID)

        # Books rated by the target user
        target_books = ratings_new[ratings_new['UserID'] == target_UserID].BookID.values

        # Selected users to get the recommendations
        selected_users, selected_ratings = selected_users_df(ratings_new, target_books, n_users_upper_limit, target_UserID)

        # Creating the matrix with users and books
        ratings_csr_matrix, ratings_matrix = get_users_matrix(selected_ratings)

        # Get the potential recommendations
        potential_recommendations = knn_model(ratings_csr_matrix, ratings_matrix, target_UserID, default_number_neighbours, selected_ratings, target_books)
                                           
        del ratings_matrix
    
        potential_recommendations_json = potential_recommendations.to_json(orient='split')

        # Save the table of potential recommendations
#        potential_recommendations_list = potential_recommendations.to_dict(orient='records')
#        with open('potential_recommendations.json', 'w') as f:
#            json.dump(potential_recommendations_list, f)

        # Update the state to indicate that the process has finished
        app_state['potential_recommendations_ongoing'] = False
        app_state['final_recommendations_ongoing'] = True
        
        return potential_recommendations_json, app_state

    else:
        raise dash.exceptions.PreventUpdate


###############################################################################
#                                                                             #
#                            FINAL RECOMMENDATIONS                            #
#                                                                             #
###############################################################################


# Callback to modify the genres options of the dropdown that the recommended books must satisfy
@app.callback(
    [Output('dropdown_include_genres', 'options'),
     Output('dropdown_include_genres', 'value'),
     Output('dropdown_exclude_genres', 'options'),
     Output('dropdown_exclude_genres', 'value'),
     Output('genre_button_state', 'data', allow_duplicate=True)],
    [Input('app_state', 'data'),
     Input('potential_recommendations_df' , 'data'),
     Input('dropdown_include_genres', 'value'),
     Input('dropdown_exclude_genres', 'value'),
     Input('genre_button_state', 'data')],
     prevent_initial_call=True
)
def get_genres_to_include(app_state, pot_recom_json, selected_included_genres, selected_excluded_genres, button_state):
    # pot_recom_json : all the potential recommendations for the user
    # selected_included_genres : genres currently selected in the included genres dropdown
    # selected_excluded_genres : genres currently selected in the excluded genres dropdown
    # button_state : dictionary with the state of the All and Any buttuns
    
    if pot_recom_json is None or not app_state['final_recommendations_ongoing']:
        raise dash.exceptions.PreventUpdate
    
    pot_recom = pd.read_json(StringIO(pot_recom_json), orient='split')

    # Include the genres lists in the dataframe
    pot_recom = pd.merge(pot_recom, books_genres[['BookID', 'Genres', 'Genre_1', 'Genre_2', 'Genre_3', 'Genre_4', 'Genre_5', 'Genre_6', 'Genre_7']], on='BookID', how='left')

    # Already selected excluded genres
    if selected_excluded_genres is None:
        excluded_genres = []
    else:
        excluded_genres = [genre for genre in selected_excluded_genres]

    # Already selected included genres
    if selected_included_genres is None:
        included_genres = []
    else:
        included_genres = [genre for genre in selected_included_genres]

    # Keep only the books that do not have the excluded genres
    pot_recom = pot_recom[~pot_recom.apply(lambda row: contains_any_genre(row, excluded_genres), axis=1)]

    # If the state of the buttons has just changed, initialize the selected included genres
    if button_state['have_they_changed'] == True:
        included_genres = []
        # Put the have_they_changed state in the genre button state back to False
        button_state['have_they_changed'] = False

    # List with all the lists of genres of the potential recommendations. The array is also converted to a list
    lists_genres = pot_recom[['Genres']].values
    lists_genres = [ast.literal_eval(item[0]) for item in lists_genres]
    
    # The list for the dropdown depends on the genre buttons selection
    if button_state['include_all_genres'] == True:
        # Lists that include the selected genres
        filtered_lists_genres = [lst for lst in lists_genres if all(genre in lst for genre in included_genres)] 
    else:
        # Lists that include the selected genres
        if not included_genres:
            filtered_lists_genres = lists_genres
        else:
            filtered_lists_genres = [lst for lst in lists_genres if any(genre in lst for genre in included_genres)] 

    # One list with all the genres of the previous lists
    possible_genres = list(itertools.chain(*filtered_lists_genres)) 
    # Drop duplicates
    include_list_for_dropdowns = list(set(possible_genres))

    # The list for the excluded genres has to include the excluded genres too for them to remain selected
    for genre in included_genres:
        if not genre in include_list_for_dropdowns:
            include_list_for_dropdowns.append(genre)
    
    # The list for the excluded genres has to include the excluded genres too for them to remain selected
    # Also, it has to exclude the selected genres to be included
    exclude_list_for_dropdowns = include_list_for_dropdowns.copy()
    for genre in excluded_genres:
        exclude_list_for_dropdowns.append(genre)
    for genre in included_genres:
        if genre in exclude_list_for_dropdowns:
            exclude_list_for_dropdowns.remove(genre)

    # Options for the dropdowns
    options_include = [
        {'label': genre, 'value': genre} for genre in include_list_for_dropdowns
    ]

    options_exclude = [
        {'label': genre, 'value': genre} for genre in exclude_list_for_dropdowns
    ]
    
    return options_include, included_genres, options_exclude, excluded_genres, button_state


@app.callback(
    [Output('genre_button_state', 'data'),
     Output('include_all_genres', 'style'),
     Output('include_any_genres', 'style')],
    [Input('include_all_genres', 'n_clicks'), 
     Input('include_any_genres', 'n_clicks')],
    [State('genre_button_state', 'data')]
)
def toggle_genre_button_and_style(button_all_clicks, button_any_clicks, button_state):
    changed_id = [trigger_id['prop_id'] for trigger_id in dash.callback_context.triggered][0]
    
    # Update the state of the buttons
    if 'include_all_genres' in changed_id:
        if button_state['include_all_genres'] == True:
            button_state['have_they_changed'] = False
        else:
            button_state['have_they_changed'] = True
        button_state['include_all_genres'] = True
        button_state['include_any_genres'] = False
    elif 'include_any_genres' in changed_id:
        if button_state['include_any_genres'] == True:
            button_state['have_they_changed'] = False
        else:
            button_state['have_they_changed'] = True
        button_state['include_all_genres'] = False
        button_state['include_any_genres'] = True
    
    # Update the style of the buttons depending on the state
    button_all_style = {'background-color': 'blue', 'color': 'white', 'margin-left': '30px', 'margin-right': '15px'} if button_state['include_all_genres'] else {'margin-left': '30px', 'margin-right': '15px'}
    button_any_style = {'background-color': 'blue', 'color': 'white'} if button_state['include_any_genres'] else {}
    
    return button_state, button_all_style, button_any_style


# Callback to print the recommendations
@app.callback(
    [Output('recommended_books_container', 'children'),
     Output('text_no_recommendations', 'style')],
    [Input('app_state', 'data'),
     Input('potential_recommendations_df' , 'data'),
     Input('dropdown_include_genres', 'value'),
     Input('dropdown_exclude_genres', 'value')],
     State('genre_button_state', 'data')
)
def get_the_final_recommendations(app_state, pot_recom_json, selected_genres, excluded_genres, button_state):
    if pot_recom_json is None or not app_state['final_recommendations_ongoing']:
        raise dash.exceptions.PreventUpdate
        
    # Genres selected for the books to include or exclude them
    included_genres = selected_genres if selected_genres else []
    excluded_genres = excluded_genres if excluded_genres else []

    # Potential book recommendation
    pot_recom = pd.read_json(StringIO(pot_recom_json), orient='split')
    # Include the genres lists in the dataframe
    pot_recom = pd.merge(pot_recom, books_genres[['BookID', 'Genres']], on='BookID', how='left')

    # Filter the potential recommendations by the selected genres
    if button_state['include_all_genres'] == True:
        combine = True
    else:
        combine = False
    recommendations = books_satisfying_genres(pot_recom, included_genres, excluded_genres, combine=combine)
    recommendations = pd.merge(recommendations, books[['BookID', 'Title', 'Image_url']], on='BookID', how='left')

    # Number of recommendations
    n = 10
    recommendations = recommendations.head(n)
    
    # Save the table of potential recommendations
    recommendations_list = recommendations.to_dict(orient='records')
#    with open('recommendations.json', 'w') as f:
#        json.dump(recommendations_list, f)

    # Crear la lista de recomendaciones para mostrar en el contenedor
    recommendations_display = []
    for rec in recommendations_list:
        book_title = rec['Title']
        book_image_url = rec['Image_url']
        recommendations_display.append(
            html.Div([
                html.Img(src=book_image_url, style={'width': '50px', 'height': '75px', 'margin-right': '20px'}),
                html.H4(book_title, style={'margin-right': '20px'})
            ], style={'display': 'flex', 'align-items': 'center', 'margin-bottom': '10px'})
        )

    # Show or hide the 'No recommendations' message
    if len(recommendations_list) == 0:
        text_no_recommendations_style = {'display': 'block', 'fontSize': 20, 'color': 'red'}
    else:
        text_no_recommendations_style = {'display': 'none', 'fontSize': 20, 'color': 'red'}

    return recommendations_display, text_no_recommendations_style


if __name__ == '__main__':
    app.run_server(debug=True)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 330, in update_intermediate_state(
    app_state={'book_selection_ongoing': False, 'final_recommendations_ongoing': False, 'potential_recommendations_ongoing': True},
    rating_store={'A Court of Mist and Fury (A Court of Thorns and Roses, #2)': 4, 'A Court of Thorns and Roses (A Court of Thorns and Roses, #1)': 3, 'A Court of Wings and Ruin (A Court of Thorns and Roses, #3)': 4, 'Beyond the Shadows (Night Angel, #3)': 4, 'Chronicle of a Death Foretold': 4, 'Equal Rites (Discworld, #3; Witches #1)': 3, 'Flatland: A Romance of Many Dimensions': 3, 'Foundation (Foundation #1)': 5, 'Foundation and Empire (Foundation #2)': 5, 'Second Foundation (Foundation #3)': 5, ...}
)
    327 target_UserID = 19960808 # This value is arbitrary, but not an existing UserID
    329 # ratings dataframe including the target user ratings
--> 

TODO:
- [ ] Dejar que el usuario seleccione el número de recomendaciones.
- [ ] Ver cómo puedo tener en cuenta para las recomendaciones la distancia entre vecinos. A más lejos, menos relevante debe ser su aportación. Tal vez estaría bien poner un número de vecinos muy alto y filtrar después en función de las distancias.
- [ ] Permitir buscar también por autor.
- [ ] Tal vez no mostrar en el dropdown de selección de libros los libros seleccionados, ya que se pueden eliminar con el botón en x.
- [x] No recomendar packs de libros?
- [ ] Si está recomendando una segunda, o tercera parte, y el usuario no ha leído la primera, solo recomendar la primera?
- [x] Guardar la selección de libros en un json para poder cargarlo en futuros usos
- [ ] Eliminar 'Empty' de la selección de géneros
- [ ] Ordenar los géneros por orden alfabético en los dropdown

In [12]:
with open('rating_store.json', 'r') as f:
    rating_store_data = json.load(f)

rating_store_data

FileNotFoundError: [Errno 2] No such file or directory: 'rating_store.json'

In [12]:
with open('potential_recommendations.json', 'r') as f:
        recommendation_data_list = json.load(f)

recommendation_data = pd.DataFrame(recommendation_data_list)

potential_recommendations = pd.merge(recommendation_data, books[['BookID', 'Title']], on='BookID', how='left')

potential_recommendations = pd.merge(potential_recommendations, books_genres[['BookID', 'Genres', 'Genre_1', 'Genre_2', 'Genre_3', 'Genre_4', 'Genre_5', 'Genre_6', 'Genre_7']], on='BookID', how='left')

potential_recommendations.head()

Unnamed: 0,index,BookID,Average_Rating,Ratings_Count,Weighted_Rating,Title,Genres,Genre_1,Genre_2,Genre_3,Genre_4,Genre_5,Genre_6,Genre_7
0,512,862,4.896552,29,4.753295,"Words of Radiance (The Stormlight Archive, #2)","['Fantasy', 'Fiction', 'Epic Fantasy', 'High F...",Fantasy,Fiction,Epic Fantasy,High Fantasy,Audiobook,Adult,Magic
1,369,562,4.8125,32,4.69052,"The Way of Kings (The Stormlight Archive, #1)","['Fantasy', 'Fiction', 'Epic Fantasy', 'High F...",Fantasy,Fiction,Epic Fantasy,High Fantasy,Audiobook,Adult,Science Fiction Fantasy
2,717,1374,4.818182,11,4.523915,"A Memory of Light (Wheel of Time, #14)","['Fantasy', 'Fiction', 'Epic Fantasy', 'High F...",Fantasy,Fiction,Epic Fantasy,High Fantasy,Science Fiction Fantasy,Audiobook,Epic
3,1171,2889,4.875,8,4.488227,"Mistborn Trilogy Boxed Set (Mistborn, #1-3)","['Fantasy', 'Fiction', 'Epic Fantasy', 'Scienc...",Fantasy,Fiction,Epic Fantasy,Science Fiction Fantasy,Magic,High Fantasy,Science Fiction
4,224,307,4.6,20,4.452447,"The Wise Man's Fear (The Kingkiller Chronicle,...","['Fantasy', 'Fiction', 'Epic Fantasy', 'High F...",Fantasy,Fiction,Epic Fantasy,High Fantasy,Magic,Science Fiction Fantasy,Adventure


In [11]:
with open('recommendations.json', 'r') as f:
    recommendations_list = json.load(f)

recommendation_df = pd.DataFrame(recommendations_list)

recommendation_df.head()

Unnamed: 0,index,BookID,Average_Rating,Ratings_Count,Weighted_Rating,Genres,Genre_1,Genre_2,Genre_3,Genre_4,Genre_5,Genre_6,Genre_7,Title,Image_url
0,512,862,4.896552,29,4.753295,"['Fantasy', 'Fiction', 'Epic Fantasy', 'High F...",Fantasy,Fiction,Epic Fantasy,High Fantasy,Audiobook,Adult,Magic,"Words of Radiance (The Stormlight Archive, #2)",https://images.gr-assets.com/books/1391535251m...
1,1171,2889,4.875,8,4.488227,"['Fantasy', 'Fiction', 'Epic Fantasy', 'Scienc...",Fantasy,Fiction,Epic Fantasy,Science Fiction Fantasy,Magic,High Fantasy,Science Fiction,"Mistborn Trilogy Boxed Set (Mistborn, #1-3)",https://images.gr-assets.com/books/1257442247m...
2,224,307,4.6,20,4.452447,"['Fantasy', 'Fiction', 'Epic Fantasy', 'High F...",Fantasy,Fiction,Epic Fantasy,High Fantasy,Magic,Science Fiction Fantasy,Adventure,"The Wise Man's Fear (The Kingkiller Chronicle,...",https://images.gr-assets.com/books/1452624392m...
3,153,192,4.545455,22,4.417643,"['Fantasy', 'Fiction', 'Epic Fantasy', 'High F...",Fantasy,Fiction,Epic Fantasy,High Fantasy,Magic,Science Fiction Fantasy,Adult,The Name of the Wind (The Kingkiller Chronicle...,https://images.gr-assets.com/books/1472068073m...
4,861,1760,5.0,3,4.265532,"['Fantasy', 'Young Adult', 'Fiction', 'Middle ...",Fantasy,Young Adult,Fiction,Middle Grade,Adventure,Magic,Childrens,"Keys to the Demon Prison (Fablehaven, #5)",https://images.gr-assets.com/books/1298081448m...


In [17]:
books[books['Title'].str.contains('1-', case=False)][['Original_Title','Title']]

Unnamed: 0,Original_Title,Title
188,The Lord of the Rings,"The Lord of the Rings (The Lord of the Rings, ..."
218,The Chronicles of Narnia,The Chronicles of Narnia (Chronicles of Narnia...
421,Complete Harry Potter Boxed Set,"Harry Potter Boxset (Harry Potter, #1-7)"
506,The Hunger Games Box Set,The Hunger Games Trilogy Boxset (The Hunger Ga...
739,The Little House Collection,"The Little House Collection (Little House, #1-9)"
...,...,...
9307,The Dark Is Rising Sequence (The Dark Is Risin...,The Dark Is Rising Sequence (The Dark Is Risi...
9326,"Artemis Fowl Boxed Set (Artemis Fowl, #1-5)","Artemis Fowl Boxed Set, Bks 1-5 (Artemis Fowl,..."
9342,"The Wheel of Time: Boxed Set (Wheel of Time, ...","The Wheel of Time: Boxed Set (Wheel of Time, ..."
9460,The Complete Wreck (A Series of Unfortunate Ev...,A Series of Unfortunate Events Box: The Comple...


In [26]:
max_rows_original = pd.get_option('display.max_rows')
max_cols_original = pd.get_option('display.max_columns')
max_colwidth = pd.get_option('display.max_colwidth')

pd.set_option('display.max_rows', None)  # Show all the rows
pd.set_option('display.max_columns', None)  # Show all the columns
pd.set_option('display.max_colwidth', None) # Show the whole width of the columns

print(books[books['Title'].str.contains('#1-', case=False)]['Title'])

# Restore the original configuration
pd.set_option('display.max_rows', max_rows_original)
pd.set_option('display.max_columns', max_cols_original)
pd.set_option('display.max_colwidth', max_colwidth)

188                                                                                                    The Lord of the Rings (The Lord of the Rings, #1-3)
218                                                                                                  The Chronicles of Narnia (Chronicles of Narnia, #1-7)
421                                                                                                               Harry Potter Boxset (Harry Potter, #1-7)
506                                                                                               The Hunger Games Trilogy Boxset (The Hunger Games, #1-3)
739                                                                                                       The Little House Collection (Little House, #1-9)
842                                                                                                              Fifty Shades Trilogy (Fifty Shades, #1-3)
936                                                                   

In [37]:
ratings[ratings['BookID'].isin(bookID_collections)]

Unnamed: 0,UserID,BookID,Rating
10,2,3753,5
98,4,219,4
273,4,1380,5
310,24,4149,3
465,34,958,5
...,...,...,...
5975500,29454,2880,4
5976121,10622,992,5
5976297,48801,189,5
5976311,48801,4776,3


In [256]:
# Books which are collections of books
collection_books = books[books['Title'].str.contains('#1-', case=False)]
print(len(collection_books)) # Output: 90
# Collection books including a parenthesis
print(len(collection_books[collection_books['Title'].str.contains('\(', case=False)])) # Output: 88

90
88


Some book collections do not have a parenthesis to specify the saga and the volumes they include.

In [257]:
# Collection books not including a parenthesis
collection_books[~collection_books['Title'].str.contains('\(', case=False)]

Unnamed: 0,BookID,Goodreads_BookID,Best_BookID,WorkID,Books_Count,ISBN,Authors,Year,Original_Title,Title,Average_Rating,Ratings_Count,Work_Ratings_Count,Work_Text_Reviews_Count,Ratings_1,Ratings_2,Ratings_3,Ratings_4,Ratings_5,Image_url
6428,6429,48811,48811,47752,4,448445867,Carolyn Keene,2006.0,"Nancy Drew Complete Series Set, Books 1-64",Nancy Drew: #1-64,4.19,16743,16810,360,273,541,2897,5116,7983,https://images-na.ssl-images-amazon.com/images...
6990,6991,7278837,7278837,9914053,6,1615579184,Jeff Kinney,2009.0,Diary of a Wimpy Kid: #1-4,Diary of a Wimpy Kid: #1-4,4.45,10752,10973,434,249,296,1053,2022,7353,https://images-na.ssl-images-amazon.com/images...


In [333]:
# Change these book names to include the parenthesis
books.loc[6428, 'Title'] = 'Nancy Drew: #1-64' # This one will be left unchanged. The dataset includes just the volumes 1 and 2, and then the complete collection of 64 books. Then, I do not want to include this book in the modifications
books.loc[6990, 'Title'] = 'Diary of a Wimpy Kid: #1-4 (Diary of a Wimpy Kid, #1-4)'

# Also, the 'The Walking Dead' books have inconvenient titles
books.loc[501, 'Title'] = 'The Walking Dead: Days Gone Bye (The Walking Dead, #1)'
books.loc[1450, 'Title'] = 'The Walking Dead: Compendium 1 (The Walking Dead, #1-8)'
books.loc[3543, 'Title'] = 'The Walking Dead: Book One (The Walking Dead, #1-2)'
books.loc[3808, 'Title'] = 'The Walking Dead: Miles Behind Us (The Walking Dead, #2)'
books.loc[4324, 'Title'] = 'The Walking Dead: Safety Behind Bars (The Walking Dead, #3)'
books.loc[4431, 'Title'] = 'The Walking Dead: Made to Suffer (The Walking Dead, #8)'
books.loc[5203, 'Title'] = 'The Walking Dead: The Hearts Desire (The Walking Dead, #4)'
books.loc[5463, 'Title'] = 'The Walking Dead: The Best Defense (The Walking Dead, #5)'
books.loc[5772, 'Title'] = 'The Walking Dead: This Sorrowful Life (The Walking Dead, #6)'
books.loc[5975, 'Title'] = 'The Walking Dead: Life Among Them (The Walking Dead, #12)'
books.loc[6378, 'Title'] = 'Rise of the Governor (The Walking Dead: Novels, #1)'
books.loc[7009, 'Title'] = 'The Walking Dead: The Calm Before (The Walking Dead, #7)'
books.loc[7780, 'Title'] = 'The Walking Dead: Here We Remain (The Walking Dead, #9)'
books.loc[7812, 'Title'] = 'The Walking Dead: Fear the Hunters (The Walking Dead, #11)'
books.loc[7864, 'Title'] = 'The Walking Dead: Compendium 2 (The Walking Dead, #9-16)'
books.loc[7930, 'Title'] = 'The Walking Dead: What We Become (The Walking Dead, #10)'
books.loc[8076, 'Title'] = 'The Walking Dead: No Way Out (The Walking Dead, #14)'
books.loc[8077, 'Title'] = 'The Walking Dead: Book Two (The Walking Dead, #3-4)'
books.loc[8111, 'Title'] = 'The Walking Dead: Book Three (The Walking Dead, #5-6)'
books.loc[9740, 'Title'] = 'The Walking Dead: Too Far Gone (The Walking Dead, #13)'

# Other problematic names:
books.loc[3062, 'Title'] = 'The Dark Is Rising (The Dark Is Rising, #2)'
books.loc[3635, 'Title'] = 'Eragon, Eldest & Brisingr (The Inheritance Cycle, #1-3)'
books.loc[4602, 'Title'] = 'Eragon & Eldest (The Inheritance Cycle, #1-2)'
books.loc[5184, 'Title'] = 'Sand (The Sand Chronicles, #1)'
books.loc[5866, 'Title'] = 'The Sword of Shannara Trilogy (The Original Shannara Trilogy, #1-3)'
books.loc[6217, 'Title'] = 'From the Two Rivers: The Eye of the World, Part 1 (Wheel of time, #1)'
books.loc[6406, 'Title'] = 'Dragonlance Chronicles (Dragonlance: Chronicles #1-3)'
books.loc[7055, 'Title'] = 'The Icewind Dale Trilogy Collectors Edition (The Icewind Dale Trilogy, #1-3)'
books.loc[7616, 'Title'] = "Dragon's Oath (House of Night: Novellas, #1)"
books.loc[7696, 'Title'] = 'The Captive Part II / The Power (The Secret Circle, #3)'
books.loc[9324, 'Title'] = "Lenobia's Vow (House of Night: Novellas, #2)"
books.loc[2807, 'Title'] = 'The Crystal Shard (Legend of Drizzt, #4)'
books.loc[3392, 'Title'] = "The Halfling's Gem (Legend of Drizzt, #6)"
books.loc[3499, 'Title'] = 'Streams of Silver (Legend of Drizzt, #5)'

# Collection books without the Nancy Drew collection
collection_books = books[books['Title'].str.contains('#1-', case=False)]
collection_books = collection_books[collection_books['Title'].str.contains('\(', case=False)]

In [396]:
import re

# To store the name of the saga and the volumes the collection has
collection_books_info = []
for index, book in collection_books.iterrows():  
    # Name of the saga
    saga = re.findall(r'\((.*?),', book['Title'])
    if saga == []:
        saga = re.findall(r'\((.*?) #', book['Title'])
    # Numbers of the volumes in the collection 
    match = re.search(r'#(\d+)-(\d+)', book['Title'])
    first_volume = int(match.group(1))
    last_volume = int(match.group(2))
    volumes = [i for i in range(first_volume, last_volume + 1)]
    
    collection_books_info.append([index, saga[0], volumes, books.loc[index, 'BookID']])

In [397]:
for i in range(0, len(collection_books_info)):
    print(collection_books_info[i])
    saga = collection_books_info[i][1]
    volumes = collection_books_info[i][2]

    aux = books[(books['Title'].str.contains('\(' + saga + ' ')) | (books['Title'].str.contains('\(' + saga + ','))]
#    if len(aux) == 0:
#        aux = books[books['Title'].str.contains(saga)]
    for volume in volumes:
        print(aux[aux['Title'].str.contains(f'#{volume}\)')]['Title'].values)

    print('')

[188, 'The Lord of the Rings', [1, 2, 3], 189]
['The Fellowship of the Ring (The Lord of the Rings, #1)']
['The Two Towers (The Lord of the Rings, #2)']
['The Return of the King (The Lord of the Rings, #3)']

[218, 'Chronicles of Narnia', [1, 2, 3, 4, 5, 6, 7], 219]
['The Lion, the Witch, and the Wardrobe (Chronicles of Narnia, #1)']
['Prince Caspian (Chronicles of Narnia, #2)']
['The Voyage of the Dawn Treader (Chronicles of Narnia, #3)']
['The Silver Chair (Chronicles of Narnia, #4)']
['The Horse and His Boy (Chronicles of Narnia, #5)']
["The Magician's Nephew (Chronicles of Narnia, #6)"]
['The Last Battle (Chronicles of Narnia, #7)']

[421, 'Harry Potter', [1, 2, 3, 4, 5, 6, 7], 422]
["Harry Potter and the Sorcerer's Stone (Harry Potter, #1)"]
['Harry Potter and the Chamber of Secrets (Harry Potter, #2)']
['Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)']
['Harry Potter and the Goblet of Fire (Harry Potter, #4)']
['Harry Potter and the Order of the Phoenix (Harry Potter

In [398]:
# Collections for which only one book, some or none are present in the dataset
indices_not_to_include = [993, 1366, 2181, 2271, 2870, 3472, 3988, 4148, 4858, 5246, 5414, 5464, 5723, 6295, 6386, 6908, 7055, 7262, 8164, 8529, 9307, 9858]

In [399]:
# Drop the books with the previous indices
for i in range(len(collection_books_info)-1, -1, -1):
    if collection_books_info[i][0] in indices_not_to_include:
        collection_books_info.remove(collection_books_info[i])

In [453]:
# BookID of the collection books
bookID_collections = [elem[3] for elem in collection_books_info]

# Ratings of the collection books
book_collection_ratings = ratings[ratings['BookID'].isin(bookID_collections)].reset_index(drop=True)

# Create a diccionary to fastly access  the data of the collections
collection_books_dict = {elem[3]: elem for elem in collection_books_info}

# Create a set to store the BookIDs of the collection books
collection_book_ids = set(collection_books_dict.keys())

# Filter the titles of the books containing the sagas names
collection_book_titles = set()
for saga_info in collection_books_info:
    saga = saga_info[1]
    volumes = saga_info[2]
    for volume in volumes:
        pattern = f'\({saga}.*?#{volume}\\)'
        collection_book_titles.update(books[books['Title'].str.contains(pattern, case=False, regex=True)]['BookID'])

# Filter the titles of the books containing the sagas and the volume
collection_books_filtered = books[books['BookID'].isin(collection_book_titles)]

# Create an empty dataframe to store the new ratings
new_ratings = pd.DataFrame(columns=['UserID', 'BookID', 'Rating']) 

# Iterate over book_collection_ratings rowa
for index, row in book_collection_ratings.iterrows():
    user_id = row['UserID']
    book_id = row['BookID']
    rating = row['Rating']

    user_ratings = ratings[(ratings['UserID'] == user_id) & (ratings['BookID'].isin(collection_book_titles))]

    if book_id in collection_books_dict:
        saga_info = collection_books_dict[book_id]
        saga = saga_info[1]
        volumes = saga_info[2]

        aux = collection_books_filtered[(collection_books_filtered['Title'].str.contains('\(' + saga + ' ')) | (collection_books_filtered['Title'].str.contains('\(' + saga + ','))]
        
        for volume in volumes:
            volume_books = aux[aux['Title'].str.contains(f'#{volume}\\)', regex=True)]
            
            if not volume_books.empty:
                book_id_sub = volume_books['BookID'].values[0]
    
                if user_ratings[user_ratings['BookID'] == book_id_sub].empty:
                    new_row = pd.DataFrame([{'UserID': user_id, 'BookID':book_id_sub, 'Rating':rating}])
                    new_ratings = pd.concat([new_ratings, new_row], ignore_index=True)


KeyboardInterrupt: 

In [451]:
collection_books_dict[book_id]

[739, 'Little House', [1, 2, 3, 4, 5, 6, 7, 8, 9], 740]

In [448]:
# BookID of the collection books
bookID_collections = [elem[3] for elem in collection_books_info]

# Ratings of the collection books
book_collection_ratings = ratings[ratings['BookID'].isin(bookID_collections)].reset_index(drop=True)

#
def search_elem_list_of_lists(lists, elem, sublist_idx):
    return next((idx for idx, sublist in enumerate(lists) if sublist[sublist_idx] == elem), None)


# Iterate over book_collection_ratings rowa
new_ratings = pd.DataFrame(columns=['UserID', 'BookID', 'Rating']) # Empty dataframe
for index, row in book_collection_ratings.iterrows():
    user_id = row['UserID']
    book_id = row['BookID']
    rating = row['Rating']

    user_ratings = ratings[ratings['UserID'] == user_id]

    idx = search_elem_list_of_lists(collection_books_info, book_id, 3)

    saga = collection_books_info[idx][1]
    volumes = collection_books_info[idx][2]

    aux = books[(books['Title'].str.contains('\(' + saga + ' ')) | (books['Title'].str.contains('\(' + saga + ','))]
    for volume in volumes:
        volume_books = aux[aux['Title'].str.contains(f'#{volume}\)', regex=True)]

        if not volume_books.empty:
            book_id_sub = volume_books['BookID'].values[0]

            if user_ratings[user_ratings['BookID'] == book_id_sub].empty:
                new_row = pd.DataFrame([{'UserID': user_id, 'BookID':book_id_sub, 'Rating':rating}])
                new_ratings = pd.concat([new_ratings, new_row], ignore_index=True)


KeyboardInterrupt: 

In [None]:
new_ratings

In [443]:
book_collection_ratings.loc[1768]

UserID    4351
BookID    6531
Rating       4
Name: 1768, dtype: int64

In [444]:
books[books['BookID'] == 6531]

Unnamed: 0,BookID,Goodreads_BookID,Best_BookID,WorkID,Books_Count,ISBN,Authors,Year,Original_Title,Title,Average_Rating,Ratings_Count,Work_Ratings_Count,Work_Text_Reviews_Count,Ratings_1,Ratings_2,Ratings_3,Ratings_4,Ratings_5,Image_url
6530,6531,3099780,3099780,3130939,11,61670855,L.J. Smith,2008.0,The Initiation / The Captive Part I,The Initiation / The Captive Part I (The Secre...,3.96,19563,21026,1024,440,1402,4746,6465,7973,https://images.gr-assets.com/books/1341591862m...


In [445]:
search_elem_list_of_lists(collection_books_info, 6531, 3)

49

In [446]:
collection_books_info[49]

[6530, 'The Secret Circle', [1, 2], 6531]

In [447]:
books[(books['Title'].str.contains('\(' + 'The Secret Circle' + ' ')) | (books['Title'].str.contains('\(' + 'The Secret Circle' + ','))]

Unnamed: 0,BookID,Goodreads_BookID,Best_BookID,WorkID,Books_Count,ISBN,Authors,Year,Original_Title,Title,Average_Rating,Ratings_Count,Work_Ratings_Count,Work_Text_Reviews_Count,Ratings_1,Ratings_2,Ratings_3,Ratings_4,Ratings_5,Image_url
6530,6531,3099780,3099780,3130939,11,61670855,L.J. Smith,2008.0,The Initiation / The Captive Part I,The Initiation / The Captive Part I (The Secre...,3.96,19563,21026,1024,440,1402,4746,6465,7973,https://images.gr-assets.com/books/1341591862m...
7696,7697,3665811,3665811,3709042,13,61671355,L.J. Smith,2009.0,The Secret Circle: The Captive Part II and The...,The Captive Part II / The Power (The Secret Ci...,4.11,16231,17148,553,218,815,3446,5116,7553,https://images.gr-assets.com/books/1341592078m...


In [None]:

for i in range(0, len(collection_books_info)):
    print(collection_books_info[i])
    saga = collection_books_info[i][1]
    volumes = collection_books_info[i][2]

    aux = books[(books['Title'].str.contains('\(' + saga + ' ')) | (books['Title'].str.contains('\(' + saga + ','))]
#    if len(aux) == 0:
#        aux = books[books['Title'].str.contains(saga)]
    for volume in volumes:
        print(aux[aux['Title'].str.contains(f'#{volume}\)')]['Title'].values)

    print('')

In [336]:
books[books['Title'].str.contains('Beacon 23')]

Unnamed: 0,BookID,Goodreads_BookID,Best_BookID,WorkID,Books_Count,ISBN,Authors,Year,Original_Title,Title,Average_Rating,Ratings_Count,Work_Ratings_Count,Work_Text_Reviews_Count,Ratings_1,Ratings_2,Ratings_3,Ratings_4,Ratings_5,Image_url
9858,9859,26771521,26771521,46093269,10,1516865871,Hugh Howey,2015.0,Beacon 23: The Complete Novel,"Beacon 23: The Complete Novel (Beacon 23, #1-5)",3.9,5140,9909,877,84,491,2423,4289,2622,https://images.gr-assets.com/books/1444683708m...


In [332]:
books.loc[3062, 'Title'] = 'The Dark Is Rising (The Dark Is Rising, #2)'
books.loc[9307, 'Title']

'The Dark Is Rising Sequence  (The Dark Is Rising #1-5)'