In [6]:
# BRCF.3.VerifyData.ipynb 
# Verify that MC.Books.csv and MC.Ratings.csv reproduce the results shown in the article.

In [1]:
import numpy as np
import pandas as pd

In [2]:
def loadDataset(path=""):
	"""Load the book titles and user ratings from the two MC.*.csv files.

	path is a prefix that is concatenated directly with the file names
	"MC.Books.csv" and "MC.Ratings.csv", so a directory must end with its
	separator (e.g. 'data/').

	Returns a dict of the form {user: {book title: rating as float}}.
	Only ratings greater than 0 are stored.

	Prints two counters before returning: first the number of rating rows
	whose rating could not be converted to float, then the number of rows
	whose book id is missing from the books file.
	"""
	def split_fields(raw_line):
		# Naive CSV handling: drop quotes, backslashes and the newline, then
		# split on every comma. A title that itself contains a comma is
		# therefore cut at its first comma.
		cleaned = raw_line.replace('"', "").replace("\\","").replace("\n","")
		return cleaned.split(",")

	#Recover the titles of the books
	books = {}
	# `with` guarantees the file handle is closed, even if a row is malformed.
	with open(path+"MC.Books.csv") as books_file:
		for line in books_file:
			(book_id,title) = split_fields(line)[0:2]
			books[book_id] = title

	#Load the data
	prefs = {}
	countVal = 0
	countKey = 0
	with open(path+"MC.Ratings.csv") as ratings_file:
		for line in ratings_file:
			(user,bookid,rating) = split_fields(line)[0:3]
			try:
				if float(rating) > 0.0:
					# NOTE: the user entry is created before the title lookup,
					# so a user whose only ratings reference unknown books
					# still ends up in prefs with an empty dict.
					prefs.setdefault(user,{})
					prefs[user][books[bookid]] = float(rating)
			except ValueError:
				# rating is not a number
				countVal += 1
			except KeyError:
				# book id not present in the books file
				countKey += 1
	# Single-argument print(...) behaves the same on Python 2 and Python 3.
	print(countVal)
	print(countKey)

	return prefs


from math import sqrt

def sim_distance(prefs, person1, person2):
	"""Return a distance-based similarity score for person1 and person2.

	prefs maps each person to a dict {item: rating}. The score is
	1 / (1 + sum of squared rating differences over the items both people
	rated), so identical ratings give 1.0 and larger differences approach 0.
	Returns 0 when the two people have no rated item in common.
	"""
	#Get the list of shared items
	shared_items = [item for item in prefs[person1] if item in prefs[person2]]

	#if they have no rating in common, return 0
	if not shared_items:
		return 0

	ratings1 = prefs[person1]
	ratings2 = prefs[person2]

	#Add up the squares of all differences
	sum_of_squares = sum(pow(ratings1[item]-ratings2[item],2) for item in shared_items)

	# 1.0 forces true division: with integer ratings, Python 2's `1 / (1 + n)`
	# would truncate every non-perfect match to 0.
	return 1.0 / (1 + sum_of_squares)


def sim_pearson(prefs,p1,p2):
	"""Return the Pearson correlation coefficient for p1 and p2.

	prefs maps each person to a dict {item: rating}. The coefficient is
	computed over the items rated by both p1 and p2.

	Returns 0 when there is no item in common, or when the denominator of
	the coefficient is not positive (e.g. one person gave the same rating
	to every shared item).
	"""
	#Get the mutually rated items
	si = {}
	for item in prefs[p1]:
		if item in prefs[p2]:
			si[item] = 1

	#if there are no ratings in common, return 0
	if not si:
		return 0

	# float(n) keeps every division below a true division, even when the
	# ratings are integers under Python 2.
	n = float(len(si))

	ratings1 = [prefs[p1][it] for it in si]
	ratings2 = [prefs[p2][it] for it in si]

	#Sum of all preferences
	sum1 = sum(ratings1)
	sum2 = sum(ratings2)

	#Sum of the squares
	sum1Sq = sum(pow(value,2) for value in ratings1)
	sum2Sq = sum(pow(value,2) for value in ratings2)

	#Sum of the products
	pSum = sum(a * b for a, b in zip(ratings1, ratings2))

	#Calculate r (Pearson score)
	num = pSum - (sum1 * sum2/n)
	variance_product = (sum1Sq - pow(sum1,2)/n) * (sum2Sq - pow(sum2,2)/n)

	# A zero product means no variation in one of the rating lists. The
	# product may also come out marginally negative through floating-point
	# rounding, in which case sqrt() would raise ValueError; treat both as
	# "no correlation".
	if variance_product <= 0:
		return 0

	return num/sqrt(variance_product)

def topMatches(prefs,person,n=5,similarity=sim_pearson):
	"""Return the n best matches for person from the prefs dictionary.

	The number of results (n) and the similarity function are optional.
	The result is a list of (score, other) tuples, highest score first.
	"""
	candidates = []
	for other in prefs:
		#never compare the person with themselves
		if other == person:
			continue
		candidates.append((similarity(prefs,person,other),other))

	#ascending sort, then flip so the best scores come first
	best_first = sorted(candidates)[::-1]
	return best_first[:n]


def getRecommendations(prefs,person,similarity=sim_pearson):
	"""Get recommendations for person as a similarity-weighted average of
	every other user's ratings.

	Returns a list of (predicted rating, item) tuples, highest first,
	covering the items that person has not rated (or has rated 0).
	"""
	weighted_totals = {}
	similarity_sums = {}

	for other in prefs:
		#skip the person themselves
		if other == person:
			continue

		sim = similarity(prefs,person,other)
		#neighbours with a zero or negative score do not contribute
		if sim <= 0:
			continue

		for item, rating in prefs[other].items():
			#items the person already rated (non-zero) are not recommended
			already_rated = item in prefs[person] and prefs[person][item] != 0
			if already_rated:
				continue
			#accumulate rating * similarity
			weighted_totals[item] = weighted_totals.get(item,0) + rating * sim
			#accumulate the similarities used for this item
			similarity_sums[item] = similarity_sums.get(item,0) + sim

	#Normalize each total by the sum of similarities that produced it
	rankings = []
	for item, total in weighted_totals.items():
		rankings.append((total/similarity_sums[item],item))

	#Highest predicted rating first
	return sorted(rankings)[::-1]


def transformPrefs(prefs):
	"""Flip a {person: {item: rating}} dictionary into {item: {person: rating}}."""
	flipped = {}
	for person, ratings in prefs.items():
		for item, rating in ratings.items():
			#create the per-item dict on first sight, then store the rating under the person
			flipped.setdefault(item,{})[person] = rating
	return flipped


In [3]:
# Load the ratings from the data/ directory. loadDataset prints two counters:
# first the rows with a non-numeric rating, then the rows whose book id is
# not present in the books file.
prefs = loadDataset('data/')

0
49817


In [4]:
len(prefs)
print "prefs['98556'] : ", prefs['98556']
print "prefs['180727'] : ", prefs['180727']
prefs.has_key('276725') # first User_ID in BX-Book-Ratings.csv

prefs['98556'] :  {'Foundation (Foundation Novels (Paperback))': 7.0}
prefs['180727'] :  {'Foundation (Foundation Novels (Paperback))': 3.0, 'Brave New World': 6.0, 'Jeeves in the morning (Perennial library)': 7.0, 'Foundation and Empire (Foundation Novels (Paperback))': 4.0, "Foundation's Edge : The Foundation Novels (Foundation Novels (Paperback))": 3.0, 'Second Foundation (Foundation Novels (Paperback))': 4.0}


False

In [5]:
# clean data produced same sim_distance as shown in the article. 
sim_distance(prefs, '98556', '180727')

0.058823529411764705

In [6]:
# clean data produced same sim_pearson as shown in the article. 
sim_pearson(prefs,'180727', '177432')

0.6622661785325219

In [7]:
# clean data produced same 10 topMatches as shown in the article. 
topMatches(prefs, '98556', 10, sim_distance)

[(1.0, '69721'),
 (1.0, '28667'),
 (1.0, '224646'),
 (1.0, '182212'),
 (1.0, '11676'),
 (0.5, '4157'),
 (0.5, '28729'),
 (0.5, '224650'),
 (0.5, '199616'),
 (0.5, '189139')]

In [8]:
# clean data produced same 3 topMatches as shown in the article. 
topMatches(prefs, '180727', 3)

[(1.0, '189139'), (1.0, '11676'), (0.6622661785325219, '177432')]

In [9]:
# clean data produced same 3 getRecommendations as shown in the article.
getRecommendations(prefs,'180727')[0:3]

[(10.000000000000002, 'The Two Towers (The Lord of the Rings'),
 (10.000000000000002, 'The Return of the King (The Lord of the Rings'),
 (10.000000000000002, 'Hawaii')]

In [10]:
# clean data produced same 4 getRecommendations as shown in the article.
getRecommendations(prefs,'180727', similarity=sim_distance)[0:4] 

[(10.000000000000002, 'Dune'),
 (10.000000000000002, 'Best Friends'),
 (10.000000000000002, 'All Creatures Great and Small'),
 (10.000000000000002, 'A Christmas Carol (Dover Thrift Editions)')]

In [11]:
critics = transformPrefs(prefs)

In [12]:
print len(prefs)
print len(critics)

77805
133597


In [12]:
# clean data produced same topMatches as shown in the article.
topMatches(critics,'Drums of Autumn')

[(1.0, 'Year of Wonders'),
 (1.0, 'Velvet Angel'),
 (1.0, 'Twice Loved'),
 (1.0, 'Trying to Save Piggy Sneed'),
 (1.0, 'The Zebra Wall')]

In [13]:
# clean data produced same getRecommendations as shown in the article.
getRecommendations(critics,'The Weight of Water')[0:5]

[(10.000000000000002, '92048'),
 (10.000000000000002, '211152'),
 (10.000000000000002, '198996'),
 (10.000000000000002, '156467'),
 (10.0, '99298')]

In [None]:
# MC.Books.csv and MC.Ratings.csv produced same results as shown in Caraciolo's article. 