In [1]:

"""
Recommender System
Requirements:
A dataset of books from Goodreads known for use in recommendation engines.
Columns Description:
bookID contains the unique ID for each book/series
title contains the titles of the books
authors contains the author of the particular book
average_rating the average rating of the books, as decided by the users
ISBN the ISBN (10) number; it carries information about a book, such as its edition and publisher
ISBN 13 the new format for ISBN, implemented in 2007 (13 digits)
language_code denotes the language for the books
Num_pages contains the number of pages for the book
Ratings_count contains the number of ratings given for the book
text_reviews_count the count of reviews left by users
Expected Output
By the end of this mini project, you are supposed to deliver within your code:
Multiple recommendations based on the implementation of two different recommendation engines:
Popularity based recommender.
Content based recommender.

"""


def popularityRecommender(df, top_n=5, minimum_vote_count=None):
    """Rank books by an IMDB-style weighted rating and return the best ones.

    Every book is scored as ``v / (v + m) * R + m / (v + m) * C`` where
    ``v`` is its ``ratings_count``, ``R`` its ``average_rating``, ``C`` the
    mean ``average_rating`` over ``df`` and ``m`` the minimum vote count.
    Books with few ratings are therefore pulled towards the overall mean.

    Args:
        df: DataFrame with numeric ``ratings_count`` and ``average_rating``
            columns.
        top_n: Number of rows to return.
        minimum_vote_count: The ``m`` term. When ``None`` it defaults to
            75% of the largest ``ratings_count`` in ``df``.

    Returns:
        A new DataFrame holding the ``top_n`` highest scoring rows of ``df``
        (sorted descending) with an added ``weighted_rating`` column. ``df``
        itself is left unmodified.
    """
    votes = df['ratings_count']
    if minimum_vote_count is None:
        minimum_vote_count = 0.75 * votes.max()

    mean_rating = df['average_rating'].mean()
    denominator = votes + minimum_vote_count
    weighted_rating = ((votes / denominator) * df['average_rating']
                       + (minimum_vote_count / denominator) * mean_rating)

    # assign() works on a copy, so the caller's frame does not silently gain
    # a column as a side effect of asking for recommendations.
    scored = df.assign(weighted_rating=weighted_rating)
    return scored.sort_values(by='weighted_rating', ascending=False).head(top_n)


def ContentBasedRecommender(authors, indices, distance_matrix, books, top_n=10):
    """Return the authors of the books most similar to a given author's book.

    Args:
        authors: Exact value of the ``authors`` column to look up.
        indices: Series mapping ``authors`` strings to row positions. If an
            author occurs more than once, the first occurrence is used.
        distance_matrix: Square pairwise cosine *similarity* matrix (despite
            the name): larger values mean more similar, 1.0 means the two
            TF-IDF vectors point in the same direction.
        books: DataFrame with an ``authors`` column whose row order matches
            ``distance_matrix``.
        top_n: Number of recommendations to return.

    Returns:
        Series with the ``authors`` values of the ``top_n`` most similar
        rows, most similar first, never including the looked-up row itself.

    Raises:
        KeyError: If ``authors`` is not present in ``indices``.
    """
    match = indices[authors]
    if hasattr(match, 'iloc'):
        # Duplicate keys make the lookup return a Series of positions;
        # anchor the search on the first book of that author.
        match = match.iloc[0]
    book_pos = int(match)

    # Drop the query row explicitly instead of assuming it sorts first:
    # other rows can tie with it at similarity 1.0.
    candidates = [
        (pos, score)
        for pos, score in enumerate(distance_matrix[book_pos])
        if pos != book_pos
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    best_positions = [pos for pos, _ in candidates[:top_n]]
    return books['authors'].iloc[best_positions]


def main():
    """Load ``books.csv`` and print popularity and content based recommendations.

    Reads the Goodreads books dataset from the working directory, prints a
    few exploratory views, then runs ``popularityRecommender`` (weighted
    rating) and ``ContentBasedRecommender`` (TF-IDF over the ``authors``
    column compared with cosine similarity).
    """
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # STEP 1 & 2: Download and read the dataset.
    # Malformed rows are reported as warnings and skipped.
    df = pd.read_csv('books.csv', engine='python', on_bad_lines='warn')
    print("DATASET", df)
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be passed to print().
    print("DATASET INFO")
    df.info()

    # STEP 3: Popularity-based recommender.
    # Naive baselines first: the five best books by each raw column.
    print("\nTOP 5 BY AVERAGE RATINGS")
    top5Recommendations = df.sort_values(by='average_rating',
                                         ascending=False).head(5)
    print("", top5Recommendations)

    print("\nTOP 5 BY RATINGS COUNT")
    top5Recommendations = df.sort_values(by='ratings_count',
                                         ascending=False).head(5)
    print("", top5Recommendations)

    print("\nCOLUMNS")
    print(df.columns)

    # Weighted rank combining both columns, similar to the IMDB formula.
    print("\nPOPULARITY RECOMMENDER - TOP 5")
    top5 = popularityRecommender(df)
    print(top5[['title', 'ratings_count', 'average_rating', 'weighted_rating']])

    # STEP 4: Content-based recommender (item-to-item comparison).
    # Only the text describing each item is used - here the author names -
    # not how users interacted with the books.
    #
    # TF-IDF turns each book's author string into a sparse vector with one
    # dimension per vocabulary term. Term frequency measures how often a
    # term occurs in one document; inverse document frequency down-weights
    # terms that occur in many documents, so rare terms get higher weights.
    # English stop words are ignored.
    vectorizer = TfidfVectorizer(stop_words='english')
    print("Sample author description", df['authors'][0])
    # Missing authors become empty documents so the vectorizer accepts them.
    df['authors'] = df['authors'].fillna('')
    tfidf_matrix = vectorizer.fit_transform(df['authors'])
    print("tfidf_matrix", tfidf_matrix)

    rows, cols = tfidf_matrix.shape
    print(f"TF-IDF Matrix Shape: {rows} Rows (Documents) x {cols} Columns (Terms or words or author description or their names)")
    print(f"Total Documents/Books: {rows}")
    print(f"Total Vocabulary Size of authors data: {cols}")

    # Each row lists (row, column) -> weight for the terms present in that
    # book's author string; together the weights represent the document.
    print("\nvector representing the importance of the words in the document/author description.")
    print("TF-IDF Matrix", tfidf_matrix[0])

    # Inspect the first row in more detail.
    first_doc_vector = tfidf_matrix[0]
    print("--- Analysis of Document 1 ---")
    print(f"Vector Type: {type(first_doc_vector)}")
    print(f"Shape of this row: {first_doc_vector.shape} (1 row across all vocabulary columns)")
    print(f"Non-zero scores in this doc: {first_doc_vector.nnz if hasattr(first_doc_vector, 'nnz') else 'N/A'}")
    print(f"Raw Vector Data:\n{first_doc_vector}")

    # Pairwise cosine similarity between every pair of book vectors.
    # A value of 1 means identical direction (most similar), 0 means no
    # shared terms. The variable keeps its historical "distance" name, but
    # higher is better.
    # NOTE: this is a dense rows x rows matrix and can need a lot of memory.
    distance_matrix = cosine_similarity(tfidf_matrix)
    print("\nDistance Matrix object type")
    print("distance_matrix_type", type(distance_matrix))

    # Map each author string to the row of its first book. Series
    # drop_duplicates() would compare the (already unique) row numbers and
    # remove nothing, so duplicates are filtered on the index instead.
    author_rows = pd.Series(df.index, index=df['authors'])
    indices = author_rows[~author_rows.index.duplicated(keep='first')]
    print("\nDistance Matrix Size")
    print(distance_matrix.size)
    print("\nDistance Matrix Shape")
    print(distance_matrix.shape)

    # Ten most similar books to the chosen author's book, excluding itself.
    recommendations = ContentBasedRecommender("Stephenie Meyer", indices,
                                              distance_matrix, df)

    print("\nRECOMMENDATIONS")
    print(recommendations)
    
# Run the pipeline only when executed as a script or notebook cell, not when
# this file is imported as a module.
if __name__ == "__main__":
    main()

Skipping line 1571: ',' expected after '"'
Skipping line 4514: ',' expected after '"'
Skipping line 9967: ',' expected after '"'
Skipping line 10870: ',' expected after '"'
Skipping line 3349: Expected 12 fields in line 3349, saw 13
Skipping line 4702: Expected 12 fields in line 4702, saw 13
Skipping line 5877: Expected 12 fields in line 5877, saw 13
Skipping line 8979: Expected 12 fields in line 8979, saw 13


DATASET        bookID                                              title  \
0           1  Harry Potter and the Half-Blood Prince (Harry ...   
1           2  Harry Potter and the Order of the Phoenix (Har...   
2           4  Harry Potter and the Chamber of Secrets (Harry...   
3           5  Harry Potter and the Prisoner of Azkaban (Harr...   
4           8  Harry Potter Boxed Set  Books 1-5 (Harry Potte...   
...       ...                                                ...   
11114   45631   Expelled from Eden: A William T. Vollmann Reader   
11115   45633                        You Bright and Risen Angels   
11116   45634                    The Ice-Shirt (Seven Dreams #1)   
11117   45639                                        Poor People   
11118   45641                        Las aventuras de Tom Sawyer   

                                                 authors  average_rating  \
0                             J.K. Rowling/Mary GrandPré            4.57   
1                      