In [6]:
# p5.t1.ipynb

In [1]:
import numpy as np
import pandas as pd

In [1]:
def loadDataset(path=""):
	""" To load the dataSet"
		Parameter: The folder where the data files are stored
		Return: the dictionary with the data
	"""
	
	#Recover the titles of the books
	fbooks = open(path+"books.1.csv", 'w')
	books = {}
	for line in open(path+"BX-Books.csv"):
		line = line.replace('"', "")
		line = line.replace("\\","")
		line = line.replace("\n","")        
		(id,title) = line.split(";") [0:2]
		books[id] = title
		fbooks.write(id + ',' + title + '\n')       
	fbooks.close()    
	
	#Load the data
	prefs = {}
	countVal = 0
	countKey = 0
	fratings = open(path+"ratings.1.csv", 'w')
	for line in open(path+"BX-Book-Ratings.csv"):
		line = line.replace('"', "")
		line = line.replace("\\","")
		line = line.replace("\n","")        
		(user,bookid,rating) = line.split(";")
		try:
			if float(rating) > 0.0:
				fratings.write(user + ',' + bookid + ',' + rating + '\n')                                
				prefs.setdefault(user,{})
				prefs[user][books[bookid]] = float(rating)
		except ValueError:
			countVal+=1
#			print "value error found! " + user + bookid + rating
		except KeyError:
			countKey +=1
#			print "key error found! " + user + " " + bookid
	print countVal
	print countKey
	fratings.close()    
	
	return prefs


from math import sqrt

#Returns a distance-base similarity score for person1 and person2

def sim_distance(prefs, person1, person2):
	#Get the list of shared_items
	si = {}
	for item in prefs[person1]:
		if item in prefs[person2]:
			si[item] = 1

	#if they have no rating in common, return 0
	if len(si) == 0: 
		return 0

	#Add up the squares of all differences
	sum_of_squares = sum([pow(prefs[person1][item]-prefs[person2][item],2) for item in prefs[person1] if item in prefs[person2]])

	return 1 / (1 + sum_of_squares)


#Returns the Pearson correlation coefficient for p1 and p2 
def sim_pearson(prefs,p1,p2):
	#Get the list of mutually rated items
	si = {}
	for item in prefs[p1]:
		if item in prefs[p2]: 
			si[item] = 1

	#if they are no rating in common, return 0
	if len(si) == 0:
		return 0

	#sum calculations
	n = len(si)

	#sum of all preferences
	sum1 = sum([prefs[p1][it] for it in si])
	sum2 = sum([prefs[p2][it] for it in si])

	#Sum of the squares
	sum1Sq = sum([pow(prefs[p1][it],2) for it in si])
	sum2Sq = sum([pow(prefs[p2][it],2) for it in si])

	#Sum of the products
	pSum = sum([prefs[p1][it] * prefs[p2][it] for it in si])

	#Calculate r (Pearson score)
	num = pSum - (sum1 * sum2/n)
	den = sqrt((sum1Sq - pow(sum1,2)/n) * (sum2Sq - pow(sum2,2)/n))
	if den == 0:
		return 0

	r = num/den

	return r

#Returns the best matches for person from the prefs dictionary
#Number of the results and similiraty function are optional params.
def topMatches(prefs,person,n=5,similarity=sim_pearson):
	scores = [(similarity(prefs,person,other),other)
				for other in prefs if other != person]
	scores.sort()
	scores.reverse()
	return scores[0:n]


#Gets recommendations for a person by using a weighted average
#of every other user's rankings

def getRecommendations(prefs,person,similarity=sim_pearson):
	totals = {}
	simSums = {}

	for other in prefs:
		#don't compare me to myself
		if other == person:
			continue
		sim = similarity(prefs,person,other)

		#ignore scores of zero or lower
		if sim <= 0: 
			continue
		for item in prefs[other]:
			#only score books i haven't seen yet
			if item not in prefs[person] or prefs[person][item] == 0:
				#Similarity * score
				totals.setdefault(item,0)
				totals[item] += prefs[other][item] * sim
				#Sum of similarities
				simSums.setdefault(item,0)
				simSums[item] += sim

	#Create the normalized list
	rankings = [(total/simSums[item],item) for item,total in totals.items()]

	#Return the sorted list
	rankings.sort()
	rankings.reverse()
	return rankings


#Function to transform Person, item - > Item, person
def transformPrefs(prefs):
	results = {}
	for person in prefs:
		for item in prefs[person]:
			results.setdefault(item,{})

			#Flip item and person
			results[item][person] = prefs[person][item]
	return results


In [3]:
# line count
# BX-Books.csv = 271,380
#  books.1.csv = 271,380
# BX-Book-Ratings.csv = 1,149,781
#       ratings.1.csv =   433,671

In [2]:
# Create a user preferences dictionary 
#
# Only use BX-Books.csv and BX-Book-Ratings.csv
# Only use "ISBN" and "Book-Title" from BX-Books.csv
# Not use rating 0.
# Users who bought book but did not provide rating (i.e. 0) do not appear in the dictionary. 
#
prefs = loadDataset()

1
49818


In [3]:
len(prefs)
print "prefs['98556'] : ", prefs['98556']
print "prefs['180727'] : ", prefs['180727']
prefs.has_key('276725') # first User_ID in BX-Book-Ratings.csv

prefs['98556'] :  {'Foundation (Foundation Novels (Paperback))': 7.0}
prefs['180727'] :  {'Foundation (Foundation Novels (Paperback))': 3.0, 'Brave New World': 6.0, 'Jeeves in the morning (Perennial library)': 7.0, 'Foundation and Empire (Foundation Novels (Paperback))': 4.0, "Foundation's Edge : The Foundation Novels (Foundation Novels (Paperback))": 3.0, 'Second Foundation (Foundation Novels (Paperback))': 4.0}


False

In [None]:
# sim_pearson and sim_pearson can be used to determine how similar the preferences of two users.

In [4]:
#recommendations.sim_distance(critics.critics,'180727', '177432')
sim_distance(prefs,'180727', '177432')

0.008

In [5]:
#recommendations.sim_pearson(critics.critics,'180727', '177432')
sim_pearson(prefs,'180727', '177432')

0.6622661785325219

In [None]:
# topMatches returns a sorted list of n users with similar preferences to a specified user.
# We can choose the n and which distance function to use.
#
# may be we can use this to impute missing age!
#

In [6]:
#recommendations.topMatches(critics.critics,'98556',10,recommendations.sim_distance)
topMatches(prefs,'98556',10,sim_distance)

[(1.0, '69721'),
 (1.0, '28667'),
 (1.0, '224646'),
 (1.0, '182212'),
 (1.0, '11676'),
 (0.5, '4157'),
 (0.5, '28729'),
 (0.5, '224650'),
 (0.5, '199616'),
 (0.5, '189139')]

In [7]:
#recommendations.topMatches(critics.critics,'180727', 3)
topMatches(prefs,'180727', 3)

[(1.0, '189139'), (1.0, '11676'), (0.6622661785325219, '177432')]

In [None]:
# Next getRecommendations.  The article :
# 
# Find someone similar to read recommendations is great, but generally what we really want is to make recommendation 
# of books not users. I could simply look to the user profile and seek for books the user likes and i haven't read 
# yet, but this it's not so clever. This approach could eventually result in a user that haven't done an evaluation 
# on books that i could like. It could also return a user that liked a movie that was badly evaluated (low rates) by 
# all other users returned by the topMatches. To solve those problems, you have to give rates to items using a 
# weighted average that can properly classify the evaluations. The implementation code for this items recommendation 
# is simple and work with both measure distances.
# 
# The code of the function getRecommendations looks at each user except the one passed as parameter. It calculates 
# how similar the users are to the specified user and after looks at each item rated by those users. As result you 
# now have a classified books list and also a estimated rate that i would give for each book in it. This report 
# allows me to choose which book i want to read or not, or if i prefer to do other thing than read it. It's important
# to notice that you can decide not to make recommendations if any result achieves a specified threshold by the user.
#

In [9]:
# recommendations.getRecommendations(critics.critics,'180727')[0:3]
getRecommendations(prefs,'180727')[0:3]

[(10.000000000000002, 'The Two Towers (The Lord of the Rings, Part 2)'),
 (10.000000000000002,
  'The Return of the King (The Lord of the Rings, Part 3)'),
 (10.000000000000002, 'Hawaii')]

In [10]:
# recommendations.getRecommendations(critics.critics,'180727', 
#     similarity=recommendations.sim_distance)[0:4] 
getRecommendations(prefs,'180727', similarity=sim_distance)[0:4] 

[(10.000000000000002, 'Dune'),
 (10.000000000000002, 'Best Friends'),
 (10.000000000000002, 'All Creatures Great and Small'),
 (10.000000000000002, 'A Christmas Carol (Dover Thrift Editions)')]

In [None]:
# Not only we learn how to do recommendations for usres, given a book, we can also find similar books!  
#
# The article :
# 
# Now we know how to find similar users and recommend items to a user, but how about finding similar items ?! 
# You see those recommendations at web stores in the internet, specially when the store hasn't collected many 
# information about your preferences. One of web stores that uses this type of recommendations is the Amazon 
# web store, as you can see it here.
#
# In this case, you can evaluate the similarity, searching for users that liked a particular item and seeing 
# others that appreciated in the same way. To do this, you can use the same functions defined earlier in this 
# article, the only change is to replace the users by items now. So you can find similar items to the specified 
# item.
#
# I provided a function transformPrefs to do that. It rebuilds the new dictionary now with the key value with 
# the book name and as values the pairs (user,rate).
#

In [11]:
# Build a critics dictionary based on prefs dictionary
#
# critics = recommendations.transformPrefs(critics)
critics = transformPrefs(prefs)

In [12]:
print len(prefs)
print len(critics)

77805
135386


In [13]:
# The article:
#
# It's not so obvious that changing users to items it will lead to useful results, but in many cases it will make 
# possible to do interesting comparisons. Imagine a web store that collect buying historic profiles with the purpose 
# of recommend products to people in particular. Revert people to products, you can allow the system now recommend 
# users that could buy specific products. It's very useful when the marketing department of your company want to do 
# a great marketing effort to a big cut-off prices sales. Or it could be also be used to check if links recommended 
# show in a web page are really seen by users that have a great probability of liking them.
#
# recommendations.topMatches(critics,'Drums of Autumn')
print critics.has_key('Drums of Autumn')
if critics.has_key('Drums of Autumn') :
   print critics['Drums of Autumn']
   print topMatches(critics,'Drums of Autumn')

True
{'135458': 10.0, '127171': 10.0, '123430': 10.0, '220635': 8.0, '55492': 9.0, '144707': 10.0, '154658': 7.0, '251844': 8.0, '86552': 8.0, '37712': 10.0, '44852': 6.0, '39616': 8.0, '205843': 7.0, '26598': 10.0, '142256': 10.0, '171055': 10.0, '278160': 8.0, '242106': 10.0, '53174': 9.0, '229768': 10.0, '125701': 9.0, '68555': 10.0, '255655': 9.0, '142062': 9.0, '168245': 10.0, '136010': 7.0, '38590': 10.0, '32773': 9.0, '271558': 5.0, '43481': 9.0, '107951': 9.0, '240225': 10.0, '77749': 10.0, '23902': 7.0, '7082': 5.0, '94843': 9.0, '175636': 8.0, '259035': 10.0, '52853': 9.0, '156150': 8.0, '236757': 8.0, '12702': 10.0, '190448': 8.0, '105058': 9.0, '38081': 8.0, '59812': 7.0, '202738': 8.0, '78783': 10.0}
[(1.0, 'Year of Wonders'), (1.0, 'Velvet Angel'), (1.0, 'Twice Loved'), (1.0, 'Trying to Save Piggy Sneed'), (1.0, 'The Zebra Wall')]


In [14]:
print critics.has_key('Gone with the wind')
if critics.has_key('Gone with the wind') :
   print critics['Gone with the wind']
   print topMatches(critics,'Gone with the wind')

False


In [15]:
# Now given a book, we can advertise it to users who may want it :) sweet! 
#
# The article:
#
# If you want to recommend specific people that have done the evaluation about a book. If you want to send 
# invitations to a book launch event ?!
#
# recommendations.getRecommendations(critics,'The Weight of Water')[0:5]
getRecommendations(critics,'The Weight of Water')[0:5]

[(10.000000000000002, '92048'),
 (10.000000000000002, '211152'),
 (10.000000000000002, '198996'),
 (10.000000000000002, '156467'),
 (10.0, '99298')]

In [None]:
#
# I really like what the article show!
#
# However, it is not a PACKAGE, but just yet another distance calcaluator.  Is this what we want?  Let's talk.
#