In [12]:
# token edit for github assignment
import csv
from math import sqrt

In [13]:
## Create a dictionary called critics from critics.csv file.
## The keys are the names of the critics, and the values are dictionaries containing
## containing the movie names as the keys and the ratings as the values

critics={}

#Read in csv file, taking single quotes into account

csvReader = csv.reader(open("critics.csv", "r"), quotechar="'", delimiter=',')

#Create a temp dictionary by iterating over csv data, and merging film names with respective scores
#Dictionary format: {name : {film: score}}.  This format chosen over a potentially more "tidy" format (such as {film: {critic: score} ...}) given the following sections and size of the dataset

for row in csvReader:
    
    name = row[0]    
    film = []
    score = []
    
    for index, element in enumerate(row[1:]):
        
        if index % 2 ==0:
            
            film.append(element)
            
        else:
            
            score.append(element)
            
        data = dict(zip(film, score))
            
    critics[name] = data   

In [14]:
# Return a distance-based similarity score for two individuals
# params: critics dictionary, person 1's name, person 2's name 
def sim_distance(prefs,person1,person2):
    # Get the list of shared items (mutually rated)
    si={}
    # start code    
        
    person1Prefs = prefs[person1]
    person2Prefs = prefs[person2]
    
    #Create nested dictionary of shared items in format {film: {person1: person1score, person2: person2score}} by looping through critics dictionary 
    #Get list of shared items, then loop through to combine
        
    person1List = list(person1Prefs.keys())
    
    person2List = list(person2Prefs.keys())
    
    sharedList = list(set(person1List).intersection(person2List))
    
    for film in sharedList:
        
        person1Score = person1Prefs[film]
        
        person2Score = person2Prefs[film]
        
        si[film] = {person1 : float(person1Score), person2 : float(person2Score)}
            
    # end code
    
    # if they have no ratings in common, return 0
    
    if len(si) == 0:
        
        return(0)
    
    # Add up the squares of all the differences of the ratings for the shared items
    
    sum_of_squared_difference = sum(n ** 2 for i in si.values() for n in i.values())
    
    return(1/(1 + sum_of_squared_difference))

In [15]:
print(sim_distance(critics,'Toby','Jack Matthews'))

0.01092896174863388


In [16]:
# Returns the Pearson correlation coefficient for two individuals
# first parameter is the critics dictionary

def sim_pearson(prefs,person1,person2):
    
    # Get the list of mutually rated items
    
    si={}
    
    # start code 
    
    #This is repeated code from above.  Normally I would have done this as its own function, but given the non-linear nature of notebooks,
    #I thought it best to sacrifice concise code for safety, especially as much of the code was defined already and this code may be used stand-alone one day
    
    person1Prefs = prefs[person1]
    person2Prefs = prefs[person2]
            
    person1List = list(person1Prefs.keys())
    
    person2List = list(person2Prefs.keys())
    
    sharedList = list(set(person1List).intersection(person2List))
    
    for film in sharedList:
        
        person1Score = person1Prefs[film]
        
        person2Score = person2Prefs[film]
        
        si[film] = {person1 : float(person1Score), person2 : float(person2Score)}
    
    # end code
    # if they have no ratings in common, return 0
    
    if len(si) == 0:
        
        return(0)
    
    #find the number of common elements
    
    n = len(si)   
    
    #Add up all the ratings for each individual
    
    sum1 = sum(si[i][person1] for i in si)
    sum2 = sum(si[i][person2] for i in si)
    
    # Sum up the squares of the ratings for each person
    
    sum1Sq = sum(si[i][person1] ** 2 for i in si)
    sum2Sq = sum(si[i][person2] ** 2 for i in si) 
    
    # Sum up the products of the ratings for the two individuals 
    # rating person 1 x rating person 2 for all the common ratings and add it up   
    
    pSum = sum(si[i][person1] * si[i][person2] for i in si)
    
    #Calculate Pearson score
    
    numerator = pSum - (sum1*sum2/n)
    denominator = sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
    
    if denominator == 0:
        
        return(0)
    
    r = numerator/denominator
    
    return(r)

In [17]:
print(sim_pearson(critics,'Toby','Jack Matthews'))

0.66284898035987


In [18]:
# Return the best matches for person from the critics dictionary
# Number of results and similarity function are optional parameters

def topMatches(prefs,person,n=5,similarity=sim_pearson):
    
    ## find the similarity scores for the person and every other individual
    ## store values in a tuple for the person and other individual
    
    # Get list of critics to compare against  
    
    scores = []
    
    criticList = []
    
    for critic in prefs:
        
        if critic != person:
            
            criticList.append(critic)
            
    # compare and get scores - compatible with sim_distance and sim_pearson
            
    for compare in criticList:
                
        sim = similarity(prefs,person,compare)
        data = (sim, compare)
        scores.append(data)
        
    # sort the list so that the highest scores appear at the top
        
    scores.sort()
    scores.reverse()
    return(scores[0:n])

In [19]:
print(topMatches(critics,'Toby'))

[(0.9912407071619299, 'Lisa Rose'), (0.9244734516419049, 'Mick LaSalle'), (0.8934051474415647, 'Claudia Puig'), (0.66284898035987, 'Jack Matthews'), (0.38124642583151164, 'Gene Seymour')]


In [20]:
## Get recommendations for a person by using a weighted average
## of every other user's rankings

def getRecommendations(prefs,person,similarity=sim_pearson):
    
    totals = {}    
    simSums = {}
    
    for other in prefs:
        
        #don't compare me to myself
        
        if other == person:
            
            continue
            
        sim = similarity(prefs,person,other)
        
        #ignore scores of zero or lower
        
        if sim <= 0:
            
            continue
            
        for item in prefs[other]:
            
            #only score movies I haven't seen yet  
            
            if item not in prefs[person] or prefs[person][item]==0: 
                
                #Similariy * Score
                totals.setdefault(item,0)                
                totals[item] += float(prefs[other][item]) * float(sim)
                
                #Sum of similarities
                simSums.setdefault(item,0)                
                simSums[item] += sim
            
            #create a normalized list
        rankings = [(total/simSums[item],item) for item,total in totals.items()]
        
        # return the sorted list
        rankings.sort()
        rankings.reverse()
        return(rankings)

In [21]:
print(getRecommendations(critics,'Toby'))

[(3.0, 'The Night Listener'), (3.0, 'Just My Luck'), (2.5, 'Lady in the Water')]


In [22]:
print(getRecommendations(critics,'Jack Matthews'))

[(3.0, 'Just My Luck')]
