In [None]:
import warnings
# Silence warnings for the rest of the notebook session.
# The first call gives no category, so it already matches every warning;
# the two category-specific calls that follow are covered by it.
warnings.filterwarnings('ignore') # hides all warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # hides deprecation warnings
warnings.filterwarnings("ignore",category=UserWarning) # hides user warnings

In [None]:
import pandas as pd

# Path of the raw review export, relative to the notebook's working directory.
csv = "br.csv"

# engine='python' is the more feature-complete parser.
# on_bad_lines='skip' drops malformed rows instead of raising; it replaces
# error_bad_lines=False, which was deprecated in pandas 1.3 and removed in 2.0.
data = pd.read_csv(csv, engine='python', on_bad_lines='skip')

# Preview the first five rows.
data.head(5)

In [None]:
# Work on a copy so the frame loaded from disk (`data`) is left untouched.
df = data.copy()

In [None]:
# Keep only the reviewer name, title and rating columns used in the rest of the notebook.
df = df.filter(items=['reviewerName', 'title', 'reviewerRatings'])
# Preview the first five rows of the trimmed frame.
df.head(5)

In [None]:
# For each column, print how many entries are missing (True) versus present (False).
for column in ('reviewerName', 'title', 'reviewerRatings'):
    print(df[column].isnull().value_counts())

In [None]:
# Drop every row that has a missing value in any of the columns.
df = df.dropna(how='any')

In [None]:
# Re-run the per-column missing-value report to confirm nothing is missing any more.
for column in ('reviewerName', 'title', 'reviewerRatings'):
    print(df[column].isnull().value_counts())

In [None]:
# Remove rows whose reviewerName or title contains any non-ASCII character.
# The columns are filtered one after the other, reviewerName first.
for column in ('reviewerName', 'title'):
    df = df[~df[column].str.contains(r'[^\x00-\x7F]')]

In [None]:
# Renumber the rows after the filtering above; drop=True discards the old
# index instead of keeping it as a column.
df = df.reset_index(drop=True)
df

In [None]:
# Count how many rows (ratings) each book title has, using Counter from the
# standard-library `collections` module.
from collections import Counter
Counter(df['title'])

In [None]:
# For each reviewer, list the distinct titles they have rated.
df.groupby("reviewerName")["title"].unique()

In [None]:
# Print each distinct reviewer who rated the title 'Anne of Avonlea'.
rated_title = df['title'] == 'Anne of Avonlea'
same_names = df.loc[rated_title, "reviewerName"].unique()
for name in same_names:
    print(name)

In [None]:
# OPTIONAL
# Index the ratings by (reviewerName, title) and sort by that index, so each
# reviewer's titles and ratings appear together.
df1 = df.set_index(['reviewerName', 'title']).sort_index()
df1.head(5)

In [None]:
# Convert the dataframe to a nested dictionary: {reviewerName: {title: reviewerRatings}}.
# Selecting several columns from a groupby needs a list (double brackets);
# the bare form ['title', 'reviewerRatings'] is a tuple of keys, which was
# deprecated in pandas 1.0 and raises an error from pandas 2.0 onwards.
d = (df.groupby('reviewerName')[['title', 'reviewerRatings']]
     .apply(lambda x: dict(x.values))
     .to_dict())
# Each group's rows are (title, rating) pairs, so dict(...) maps title -> rating
# (a repeated title keeps its last rating); the final to_dict() converts the
# per-reviewer result into a plain dictionary.
d

# another way to perform the above
# d = df.groupby('reviewerName').apply(lambda x: x.set_index('title')['reviewerRatings'].to_dict()).to_dict()

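If a cell with very large output (such as the dictionary above) fails with `IOPub data rate exceeded`, see [this Stack Overflow answer](https://stackoverflow.com/questions/43288550/iopub-data-rate-exceeded-in-jupyter-notebook-when-viewing-image).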

In [None]:
def sim_distance(prefs,person1,person2):
    """Return a distance-based similarity score for person1 and person2.

    prefs maps each person to a dict of {item: rating}. Only items rated by
    both people are compared. The score is 1 / (1 + sum of squared rating
    differences), so identical ratings give 1.0 and larger differences
    approach 0. Returns 0 when the two people share no items.
    """
    # items rated by both people, in person1's iteration order
    shared = [item for item in prefs[person1] if item in prefs[person2]]

    # nothing in common -> no similarity
    if not shared:
        return 0

    # squared rating difference for every shared item
    squared_diffs = (pow(prefs[person1][item] - prefs[person2][item], 2)
                     for item in shared)

    return 1/(1+sum(squared_diffs))

# Example: distance-based similarity between two reviewers in `d`.
sim_distance(d, 'Charles G', 'Maureen')

In [None]:
from math import sqrt

def sim_pearson(prefs,person1,person2):
    """Return the Pearson correlation coefficient for person1 and person2.

    prefs maps each person to a dict of {item: rating}. Only items rated by
    both people are used.

    Returns 0 when the two people share no items, or when either person's
    shared ratings have no spread (the correlation is undefined in that case).
    Otherwise returns num / den as computed below.
    """
    # items rated by both people
    si = [item for item in prefs[person1] if item in prefs[person2]]

    # number of shared items
    n = len(si)

    # if they have no ratings in common, return 0
    if n == 0:
        return 0

    ratings1 = [prefs[person1][it] for it in si]
    ratings2 = [prefs[person2][it] for it in si]

    # add up all the preferences
    sum1 = sum(ratings1)
    sum2 = sum(ratings2)

    # sum up the squares
    sum1Sq = sum(pow(r, 2) for r in ratings1)
    sum2Sq = sum(pow(r, 2) for r in ratings2)

    # sum up the products
    pSum = sum(r1 * r2 for r1, r2 in zip(ratings1, ratings2))

    # calculate Pearson score
    num = pSum - (sum1 * sum2 / n)
    spread1 = sum1Sq - pow(sum1, 2)/n
    spread2 = sum2Sq - pow(sum2, 2)/n

    # Mathematically both spreads are >= 0, but floating-point rounding can
    # push a (near-)constant set of ratings slightly below zero. Treat that
    # like zero spread instead of letting sqrt() raise or return nonsense.
    if spread1 <= 0 or spread2 <= 0:
        return 0

    den = sqrt(spread1 * spread2)
    if den == 0: # the product of two tiny spreads can underflow to zero
        return 0

    return num/den

# Example: Pearson correlation between the same two reviewers in `d`.
sim_pearson(d, 'Charles G', 'Maureen')

In [None]:
# Rank everyone else in prefs by how similar they are to one person.

def top_matches(prefs, person, n=10, similarity = sim_distance):
    """Return the n entries of prefs most similar to person.

    prefs      -- dict of {person: {item: rating}}
    person     -- key in prefs to compare everyone else against
    n          -- maximum number of matches to return
    similarity -- function (prefs, a, b) -> score; distance-based by default

    The result is a list of (score, other) tuples, highest score first;
    person itself is never included.
    """
    # score every other entry, then sort ascending on (score, other)
    ranked = sorted(
        (similarity(prefs, person, other), other)
        for other in prefs
        if other != person
    )

    # flip so the highest score comes first and keep the first n
    ranked.reverse()
    return ranked[:n]

# Ten reviewers most similar to 'Maureen' using the default distance-based score.
top_matches(d, 'Maureen', n=10)

In [None]:
# Ten reviewers most similar to 'Maureen', this time scored with the Pearson correlation.
top_matches(d, 'Maureen', n=10, similarity = sim_pearson)

In [None]:
# Gets recommendations for a person by using a weighted average of all other users
# Using the distance-based score by default
def get_recommendations(prefs, person, similarity = sim_distance):
    """Return items for person ranked by similarity-weighted average rating.

    prefs      -- dict of {person: {item: rating}}
    person     -- key in prefs to produce recommendations for
    similarity -- function (prefs, a, b) -> score

    Every other entry whose similarity to person is above zero contributes
    its ratings, weighted by that similarity. Only items that person has not
    rated (or has rated 0) are scored. The result is a list of
    (predicted rating, item) tuples, highest first.
    """
    weighted_totals = {}  # item -> sum of (similarity * rating)
    similarity_sums = {}  # item -> sum of similarities

    for other in prefs:
        # don't compare the person to themselves
        if other == person:
            continue

        sim = similarity(prefs, person, other)

        # ignore scores of zero or lower
        if sim <= 0:
            continue

        for item in prefs[other]:
            # skip items the person has already given a non-zero rating
            already_rated = item in prefs[person] and prefs[person][item] != 0
            if already_rated:
                continue

            weighted_totals[item] = weighted_totals.get(item, 0) + prefs[other][item] * sim
            similarity_sums[item] = similarity_sums.get(item, 0) + sim

    # normalise each weighted total by the total similarity behind it,
    # sort ascending on (score, item), then flip to highest-first
    rankings = sorted(
        (total/similarity_sums[item], item)
        for item, total in weighted_totals.items()
    )
    rankings.reverse()
    return rankings

# Recommendations for 'Maureen' using the default distance-based score.
get_recommendations(d, 'Maureen')

In [None]:
# Recommendations for 'Maureen' as a weighted average over all other users,
# using the Pearson correlation score.
get_recommendations(d, 'Maureen', similarity = sim_pearson)

In [None]:
# inverts the data set into an item-centric matrix

def transform_prefs(prefs):
    """Swap the two levels of a nested ratings dictionary.

    Takes {person: {item: rating}} and returns {item: {person: rating}}.
    """
    flipped = {}
    for person, ratings in prefs.items():
        for item in ratings:
            # create the inner dict the first time an item is seen,
            # then file this person's rating under it
            flipped.setdefault(item, {})[person] = ratings[item]
    return flipped

# Show `d` flipped into {title: {reviewer: rating}} form.
transform_prefs(d)

In [None]:
# find the most similar books for every book, using top_matches on the
# item-centric (inverted) ratings

def calculate_similar_items(prefs, n=10, similarity=None):
    """Return {item: [(score, other_item), ...]} for every item in prefs.

    prefs      -- dict of {person: {item: rating}}
    n          -- number of most-similar items to keep per item
    similarity -- function (prefs, a, b) -> score passed on to top_matches;
                  defaults to sim_distance (pass sim_pearson to use the
                  Pearson correlation score instead)

    Prints a "done / total" progress line after every 100 items.
    """
    # resolved at call time so the default stays sim_distance
    if similarity is None:
        similarity = sim_distance

    result = {}

    # invert the preference matrix to be item-centric
    itemPrefs = transform_prefs(prefs)
    total = len(itemPrefs)

    for c, item in enumerate(itemPrefs, start=1):
        # status update for large datasets
        if c%100 == 0:
            print("%d / %d" % (c, total))

        # the matrix is inverted, so top_matches now ranks items, not people
        result[item] = top_matches(itemPrefs, item, n=n, similarity=similarity)
    return result

# Precompute the item-to-item similarity table for every title in `d`.
itemsim = calculate_similar_items(d)
itemsim

# To use the Pearson correlation score instead, substitute sim_pearson for
# sim_distance inside calculate_similar_items.

In [None]:
# recommend books from a precomputed item-similarity table,
# e.g. itemsim = calculate_similar_items(d)

def get_recommendedItems(prefs, itemMatch, user):
    """Return items for user ranked by similarity-weighted average rating.

    prefs     -- dict of {person: {item: rating}}
    itemMatch -- dict of {item: [(similarity, other_item), ...]}
    user      -- key in prefs to produce recommendations for

    For every item the user has rated, each similar item the user has not
    rated receives the user's rating weighted by the similarity. The result
    is a list of (predicted rating, item) tuples, highest first. Candidates
    whose similarities sum to zero are left out, because their weighted
    average is undefined.
    """
    userRatings = prefs[user]
    scores = {}    # candidate item -> sum of (similarity * user's rating)
    totalSim = {}  # candidate item -> sum of similarities

    # loop over items rated by this user
    for (item, rating) in userRatings.items():

        # loop over items similar to this one
        for (similarity, item2) in itemMatch[item]:

            # ignore if this user has already rated this item
            if item2 in userRatings:
                continue

            # weighted sum of rating times similarity
            scores[item2] = scores.get(item2, 0) + similarity * rating

            # sum of all the similarities
            totalSim[item2] = totalSim.get(item2, 0) + similarity

    # Divide each total score by its total weighting to get an average.
    # A candidate that only ever appears with similarity 0 (or whose
    # similarities cancel out) has a total weight of 0; skip it rather than
    # divide by zero.
    rankings = [(score/totalSim[item], item)
                for item, score in scores.items()
                if totalSim[item] != 0]

    # return the rankings from highest to lowest
    rankings.sort()
    rankings.reverse()
    return rankings

# Item-based recommendations for 'Maureen' from the precomputed `itemsim` table.
get_recommendedItems(d, itemsim, 'Maureen')