In [6]:
# BRCF.5.DataImpact.ipynb 
# Re-examine our baseline model using two times more ratings from Good.Ratings.csv.

In [2]:
import numpy as np
import pandas as pd

In [3]:
def loadDataset(path=""):
	""" To load the dataSet"
		Parameter: The folder where the data files are stored
		Return: the dictionary with the data
	"""
	#Recover the titles of the books
	books = {}
	for line in open(path+"MC.Books.csv"):
		line = line.replace('"', "")
		line = line.replace("\\","")
		line = line.replace("\n","")                
		(id,title) = line.split(",")[0:2]
		books[id] = title
	
	#Load the data
	prefs = {}
	countRatingsRead = 0
	countValueError = 0
	countKeyError = 0
	countRatingsUsed = 0
	for line in open(path+"Good.Ratings.csv"):
		countRatingsRead +=1        
		line = line.replace('"', "")
		line = line.replace("\\","")
		line = line.replace("\n","")        
		(user,bookid,rating) = line.split(",")[0:3]
		try:
			if float(rating) > 0.0:
				prefs.setdefault(user,{})
				prefs[user][books[bookid]] = float(rating)
				countRatingsUsed += 1                
		except ValueError:
			countValueError += 1
#			print "value error found! " + user + bookid + rating
		except KeyError:
			countKeyError += 1
#			print "key error found! " + user + " " + bookid
	print countRatingsRead
	print countValueError
	print countKeyError
	print countRatingsUsed

	return prefs


from math import sqrt

#Returns a distance-base similarity score for person1 and person2

def sim_distance(prefs, person1, person2):
	#Get the list of shared_items
	si = {}
	for item in prefs[person1]:
		if item in prefs[person2]:
			si[item] = 1

	#if they have no rating in common, return 0
	if len(si) == 0: 
		return 0

	#Add up the squares of all differences
	sum_of_squares = sum([pow(prefs[person1][item]-prefs[person2][item],2) for item in prefs[person1] if item in prefs[person2]])

	return 1 / (1 + sum_of_squares)


#Returns the Pearson correlation coefficient for p1 and p2 
def sim_pearson(prefs,p1,p2):
	#Get the list of mutually rated items
	si = {}
	for item in prefs[p1]:
		if item in prefs[p2]: 
			si[item] = 1

	#if they are no rating in common, return 0
	if len(si) == 0:
		return 0

	#sum calculations
	n = len(si)

	#sum of all preferences
	sum1 = sum([prefs[p1][it] for it in si])
	sum2 = sum([prefs[p2][it] for it in si])

	#Sum of the squares
	sum1Sq = sum([pow(prefs[p1][it],2) for it in si])
	sum2Sq = sum([pow(prefs[p2][it],2) for it in si])

	#Sum of the products
	pSum = sum([prefs[p1][it] * prefs[p2][it] for it in si])

	#Calculate r (Pearson score)
	num = pSum - (sum1 * sum2/n)
	den = sqrt((sum1Sq - pow(sum1,2)/n) * (sum2Sq - pow(sum2,2)/n))
	if den == 0:
		return 0

	r = num/den

	return r

#Returns the best matches for person from the prefs dictionary
#Number of the results and similiraty function are optional params.
def topMatches(prefs,person,n=5,similarity=sim_pearson):
	scores = [(similarity(prefs,person,other),other)
				for other in prefs if other != person]
	scores.sort()
	scores.reverse()
	return scores[0:n]


#Gets recommendations for a person by using a weighted average
#of every other user's rankings

def getRecommendations(prefs,person,similarity=sim_pearson):
	totals = {}
	simSums = {}

	for other in prefs:
		#don't compare me to myself
		if other == person:
			continue
		sim = similarity(prefs,person,other)

		#ignore scores of zero or lower
		if sim <= 0: 
			continue
		for item in prefs[other]:
			#only score books i haven't seen yet
			if item not in prefs[person] or prefs[person][item] == 0:
				#Similarity * score
				totals.setdefault(item,0)
				totals[item] += prefs[other][item] * sim
				#Sum of similarities
				simSums.setdefault(item,0)
				simSums[item] += sim

	#Create the normalized list
	rankings = [(total/simSums[item],item) for item,total in totals.items()]

	#Return the sorted list
	rankings.sort()
	rankings.reverse()
	return rankings


#Function to transform Person, item - > Item, person
def transformPrefs(prefs):
	results = {}
	for person in prefs:
		for item in prefs[person]:
			results.setdefault(item,{})

			#Flip item and person
			results[item][person] = prefs[person][item]
	return results


In [4]:
prefs = loadDataset('data/')

1149780
2
76127
851569


In [5]:
# Few checks.
print len(prefs)
print "prefs['98556'] : ", prefs['98556']
print "prefs['180727'] : ", prefs['180727']
prefs.has_key('276725') # first User_ID in BX-Book-Ratings.csv
#
# baseline results:
# 77805
# prefs['98556'] :  {'Foundation (Foundation Novels (Paperback))': 7.0}
# prefs['180727'] :  {'Foundation (Foundation Novels (Paperback))': 3.0, 'Brave New World': 6.0, '
# Jeeves in the morning (Perennial library)': 7.0, 'Foundation and Empire (Foundation Novels (Paperback))': 
# 4.0, "Foundation's Edge : The Foundation Novels (Foundation Novels (Paperback))": 
# 3.0, 'Second Foundation (Foundation Novels (Paperback))': 4.0}
# False
#
# Comparison:
# 1. dictionary prefs is bigger.
# 2. Although "prefs['98556']" and "prefs['180727']" have same values, but now id "276725" has his/her prefs

99210
prefs['98556'] :  {'Foundation (Foundation Novels (Paperback))': 7.0}
prefs['180727'] :  {'Foundation (Foundation Novels (Paperback))': 3.0, 'Brave New World': 6.0, 'Jeeves in the morning (Perennial library)': 7.0, 'Foundation and Empire (Foundation Novels (Paperback))': 4.0, "Foundation's Edge : The Foundation Novels (Foundation Novels (Paperback))": 3.0, 'Second Foundation (Foundation Novels (Paperback))': 4.0}


True

In [6]:
print "prefs['276725'] : ", prefs['276725']
# 
# in "BX-Book-Ratings.csv", user "276725" has only 1 entry and with zero rating:
# bc7_cwang@nodeD:~/Book.Recommendation.CF$ grep '"276725"' data/BX-Book-Ratings.csv 
# "276725";"034545104X";"0"
#
# According to Amazon, <ISBN-10: 034545104X> is for hardcover <Flesh Tones>.
# Thus, as we gave it the average rating, now CF will incorporate this user.
#

prefs['276725'] :  {'Flesh Tones: A Novel': 6.0}


In [13]:
sim_distance(prefs, '98556', '180727')
#
# baseline result:
# 0.058823529411764705
#
# Comparison:
# Same.

0.058823529411764705

In [10]:
sim_pearson(prefs, '180727', '177432')
#
# baseline result:
# 0.6622661785325219
#
# Comparison:
# Same.

0.6622661785325219

In [14]:
topMatches(prefs, '98556', 10, sim_distance)
#
# baseline result:
# [(1.0, '69721'),
# (1.0, '28667'),
# (1.0, '224646'),
# (1.0, '182212'),
# (1.0, '11676'),
# (0.5, '4157'),
# (0.5, '28729'),
# (0.5, '224650'),
# (0.5, '199616'),
# (0.5, '189139')]
#
# Comparison:
# Different.

[(1.0, '71162'),
 (1.0, '69721'),
 (1.0, '32773'),
 (1.0, '28667'),
 (1.0, '270897'),
 (1.0, '224646'),
 (1.0, '182212'),
 (1.0, '155463'),
 (1.0, '118286'),
 (1.0, '11676')]

In [15]:
topMatches(prefs, '180727', 3)
#
# baseline result:
# [(1.0, '189139'), (1.0, '11676'), (0.6622661785325219, '177432')]
#
# Comparison:
# Different.

[(1.000000000000016, '171818'), (1.0, '86202'), (1.0, '32773')]

In [16]:
getRecommendations(prefs,'180727')[0:3]
#
# baseline result:
# [(10.000000000000002, 'The Two Towers (The Lord of the Rings, Part 2)'),
#  (10.000000000000002, 'The Return of the King (The Lord of the Rings, Part 3)'),
#  (10.000000000000002, 'Hawaii')]
#
# Comparison:
# Different.

[(10.000000000000002, 'The War of the Worlds (Bantam Classics)'),
 (10.0, '\xc2\xa1Corre'),
 (10.0, 'Zen Gardening')]

In [17]:
getRecommendations(prefs,'180727', similarity=sim_distance)[0:4] 
#
# baseline result:
# [(10.000000000000002, 'Dune'),
# (10.000000000000002, 'Best Friends'),
# (10.000000000000002, 'All Creatures Great and Small'),
# (10.000000000000002, 'A Christmas Carol (Dover Thrift Editions)')]
#
# Comparison:
# Different.

[(10.000000000000002, "Year's Best Fantasy and Horror"),
 (10.000000000000002, 'When The Loving Stopped (Harlequin Romance'),
 (10.000000000000002, 'Watership Down (Scribner Classics)'),
 (10.000000000000002, 'Three Weeks with My Brother')]

In [19]:
critics = transformPrefs(prefs)

In [21]:
topMatches(critics,'Drums of Autumn')
#
# baseline result:
# [(1.0, 'Year of Wonders'),
# (1.0, 'Velvet Angel'),
# (1.0, 'Twice Loved'),
# (1.0, 'Trying to Save Piggy Sneed'),
# (1.0, 'The Zebra Wall')]
#
# Comparison:
# Different.

[(1.0000000000000255, 'Heart of a Warrior'),
 (1.000000000000016, 'The Unlikely Spy'),
 (1.000000000000016, "Everything's Eventual : 14 Dark Tales"),
 (1.0000000000000155, 'The Simple Truth'),
 (1.0000000000000133, 'Lady of Hay')]

In [22]:
getRecommendations(critics,'The Weight of Water')[0:5]
#
# baseline result:
# [(10.000000000000002, '92048'),
# (10.000000000000002, '211152'),
# (10.000000000000002, '198996'),
# (10.000000000000002, '156467'),
# (10.0, '99298')]
#
# Comparison:
# Different.

[(10.000000000000002, '99298'),
 (10.000000000000002, '98344'),
 (10.000000000000002, '78631'),
 (10.000000000000002, '43937'),
 (10.000000000000002, '41343')]

In [None]:
# As expected, more ratings changed the recommendations.