In [1]:

# """
# Class 15: Recommendation Engines
# Content based and Collaborative based filtering
# Jaccard Similarity
# Modified KNN Algorithm
# """


In [2]:
########################################
## Collaborative-Based User Filtering ##
########################################

In [3]:
import pandas as pd

In [4]:
#read in brands data
user_brands = pd.read_csv('../data/user_brand.csv')

In [5]:
#look at count of stores
user_brands.Store.value_counts()

Target                        1866
Old Navy                      1200
Home Depot                    1186
Kohl's                        1157
Banana Republic                932
Nordstrom                      904
Gap                            860
Crate & Barrel                 816
Express                        785
KitchenAid                     700
J.Crew                         569
Container Store                564
Steve Madden                   539
Guess                          509
Cuisinart                      506
Nine West                      489
Calvin Klein                   476
Levi's                         472
Converse                       456
New Balance                    438
BCBGMAXAZRIA                   429
Restoration Hardware           410
Lacoste                        405
Kenneth Cole                   366
Kate Spade                     354
Puma                           350
Melissa & Doug                 335
DKNY                           328
Last Call by Neiman 

In [6]:
# Series of user IDs, note the duplicates
user_ids = user_brands.ID

In [7]:
user_ids

0        80002
1        80002
2        80010
3        80010
4        80010
5        80010
6        80010
7        80010
8        80010
9        80010
10       80010
11       80010
12       80011
13       80011
14       80011
15       80011
16       80011
17       80011
18       80011
19       80011
20       80011
21       80011
22       80011
23       80011
24       80011
25       80011
26       80015
27       80015
28       80015
29       80015
         ...  
23774    91924
23775    91927
23776    91927
23777    91931
23778    91931
23779    91931
23780    91931
23781    91943
23782    91943
23783    91943
23784    91944
23785    91944
23786    91944
23787    91944
23788    91944
23789    91944
23790    91944
23791    91944
23792    91944
23793    91944
23794    91946
23795    91946
23796    91946
23797    91946
23798    91955
23799    91957
23800    91957
23801    91957
23802    91957
23803    91957
Name: ID, dtype: int64

In [14]:
# groupby ID to see what each user likes!
# brandsfor = ANSWER HEREEEEEE
brandsfor = user_brands.groupby('ID').Store.value_counts()


In [11]:
brandsfor

ID     Store               
80002  Home Depot              1
       Target                  1
80010  Container Store         1
       Converse                1
       Cuisinart               1
       DKNY                    1
       Express                 1
       Kohl's                  1
       Levi's                  1
       Nordstrom               1
       Old Navy                1
       Puma                    1
80011  BCBGMAXAZRIA            1
       Banana Republic         1
       Calvin Klein            1
       Crate & Barrel          1
       Diesel                  1
       French Connection       1
       Gap                     1
       Guess                   1
       Kenneth Cole            1
       Nine West               1
       Nordstrom               1
       Restoration Hardware    1
       Steve Madden            1
       Target                  1
80015  Banana Republic         1
       Gap                     1
       Home Depot              1
       Target  

In [17]:
# turn the data frame into a dictionary
# where the key is a user ID, and the value is a 
# list of stores that the user "likes"
# ANSWER HEREEEEEE

brandsfor = {str(k): list(v) for k,v in user_brands.groupby("ID")["Store"]}

In [18]:
# try it out. User 83065 likes Kohl's and Targe
brandsfor['83065']

["Kohl's", 'Target']

In [None]:
# User 82983 likes many more!

In [None]:
########################
## Jaccard Similarity ##
########################

In [None]:
#
#The Jaccard Similarity allows us to compare two sets
#If we regard people as merely being a set of brands they prefer
#the Jaccard Similarity allows us to compare people
#
#Example. the jaccard similarty between user 82983 and 83065 is .125
#            because
#             brandsfor['83065'] == ["Kohl's", 'Target']
#             brandsfor['82983'] == ['Hanky Panky', 'Betsey Johnson', 'Converse', 'Steve Madden', 'Old Navy', 'Target', 'Nordstrom']

# the intersection of these two sets is just set("Target")
# the union of the two sets is set(['Target', 'Hanky Panky', 'Betsey Johnson', 'Converse', 'Steve Madden', 'Old Navy', 'Target', 'Nordstrom'])
# so the len(intersection) / len(union) = 1 / 8 == .125

# EXERCISE: what is the Jaccard Similarity 
#           between user 82956 and user 82963?
# 

In [22]:
brandsfor['82956']

['Diesel', 'Old Navy', 'Crate & Barrel', 'Target']

In [23]:
brandsfor['82983']

['Hanky Panky',
 'Betsey Johnson',
 'Converse',
 'Steve Madden',
 'Old Navy',
 'Target',
 'Nordstrom']

In [50]:
# ANSWER HERE
2/10


0

In [51]:
# '''
# EXERCISE: Complete the jaccard method below.
#           It should take in a list of brands, and output the 
#           jaccard similarity between them

# This should work with anything in the set, for example
# jaccard([1,2,3], [2,3,4,5,6])  == .3333333

# HINT: set1 & set2 is the intersection
#       set1 | set2 is the union

# '''

In [69]:
def jaccard(first, second):
    first = set(first)
    second = set(second)
    return float(len(first & second))/len(first | second)

In [53]:
# try it out!
brandsfor['83065'] # brands for user 83065
brandsfor['82983'] # brands for user 82983
jaccard(brandsfor['83065'], brandsfor['82983'])
jaccard(brandsfor['82956'], brandsfor['82963'])

0.125
0.333333333333


In [54]:
#######################
### Our Recommender ###
#######################


In [55]:
# '''
# Our recommender will be a modified KNN collaborative algorithm.
# Input: A given user's brands that they like
# Output: A set (no repeats) of brand recommendations based on
#         similar users preferences

# 1. When a user's brands are given to us, we will calculate the input user's
# jaccard similarity with every person in our brandsfor dictionary

# 2. We will pick the K most similar users and recommend
# the brands that they like that the given user doesn't know about

# EXAMPLE:
# Given User likes ['Target', 'Old Navy', 'Banana Republic', 'H&M']
# Outputs: ['Forever 21', 'Gap', 'Steve Madden']
# '''

In [56]:
given_user = ['Target', 'Old Navy', 'Banana Republic', 'H&M']

In [57]:
#similarty between user 83065 and given user
brandsfor['83065']
jaccard(brandsfor['83065'], given_user) 
# should be 0.2

0.2


In [58]:
# '''
# EXERCISE
#     Find the similarty between given_user and ALL of our users
#     output should be a dictionary where
#     the key is a user id and the value is the jaccard similarity
# {...
#  '83055': 0.25,
#  '83056': 0.0,
#  '83058': 0.1111111111111111,
#  '83060': 0.07894736842105263,
#  '83061': 0.4,
#  '83064': 0.25,
#  '83065': 0.2,
#  ...}
# '''

In [59]:
# ANSWER HEREEEEEE
similarities = {k: jaccard(given_user, v) for k, v in brandsfor.iteritems()}

0.25
0.0
0.333333333333
0.0
0.222222222222
0.125
0.0
0.166666666667
0.2
0.181818181818
0.0
0.111111111111
0.0
0.285714285714
0.2
0.0681818181818
0.0
0.0
0.0666666666667
0.0
0.181818181818
0.222222222222
0.176470588235
0.0769230769231
0.0
0.230769230769
0.166666666667
0.0
0.157894736842
0.230769230769
0.2
0.142857142857
0.2
0.125
0.111111111111
0.142857142857
0.125
0.0714285714286
0.0
0.25
0.111111111111
0.333333333333
0.0
0.222222222222
0.25
0.0
0.25
0.166666666667
0.0952380952381
0.0434782608696
0.111111111111
0.0
0.0
0.0
0.0
0.0
0.25
0.0
0.285714285714
0.25
0.0714285714286
0.125
0.111111111111
0.2
0.117647058824
0.2
0.0
0.25
0.111111111111
0.166666666667
0.333333333333
0.0
0.1875
0.0
0.0
0.0666666666667
0.0
0.333333333333
0.047619047619
0.2
0.0
0.166666666667
0.0
0.333333333333
0.0
0.153846153846
0.0
0.115384615385
0.166666666667
0.166666666667
0.0
0.0
0.0666666666667
0.0
0.1
0.0
0.0
0.0
0.25
0.0
0.125
0.0909090909091
0.142857142857
0.25
0.142857142857
0.142857142857
0.333333333333
0

In [68]:
similarities

{'80050': None,
 '81402': None,
 '84916': None,
 '86360': None,
 '84914': None,
 '84913': None,
 '86367': None,
 '89376': None,
 '86365': None,
 '89378': None,
 '88710': None,
 '86369': None,
 '84919': None,
 '89176': None,
 '89173': None,
 '89170': None,
 '89171': None,
 '82198': None,
 '87969': None,
 '82448': None,
 '82443': None,
 '80151': None,
 '82440': None,
 '82446': None,
 '82197': None,
 '82196': None,
 '85450': None,
 '88625': None,
 '88155': None,
 '88154': None,
 '88153': None,
 '88152': None,
 '90246': None,
 '88628': None,
 '85047': None,
 '88395': None,
 '84862': None,
 '88397': None,
 '82992': None,
 '82995': None,
 '88390': None,
 '82997': None,
 '82996': None,
 '81536': None,
 '81537': None,
 '81243': None,
 '81532': None,
 '89372': None,
 '82229': None,
 '86211': None,
 '87585': None,
 '89373': None,
 '87581': None,
 '87580': None,
 '85458': None,
 '89374': None,
 '89375': None,
 '86986': None,
 '80054': None,
 '86982': None,
 '86980': None,
 '80598': None,
 '81008'

In [60]:
K = 5 #number of similar users to look at


In [61]:
# Now for the top K most similar users, let's aggregate the brands they like.
# I sort by the jaccard similarty so most similar users are first
# I use the sorted method, but because I'm dorting dictionaries
# I specify the "key" as the value of the dictionary
# the key is what the list should sort on
# so the most similar users end up being on top
# ANSWER HEREEEEEE
**need to complete this** most_similar_users = sorted(similarities, key=similarities.get, )

In [64]:
# list of K similar users' IDs
most_similar_users = sorted(similarities, key=similarities.get, reverse=True)[:K]

In [65]:
# let's see what some of the most similar users likes
brandsfor[most_similar_users[0]]

['KitchenAid',
 'Betsey Johnson',
 'Cuisinart',
 'Lacoste',
 'Nine West',
 'Diesel',
 'Banana Republic',
 'Kate Spade',
 'Gap',
 'Ethan Allen',
 'Restoration Hardware',
 'J.Crew',
 "Lands' End",
 'Crate & Barrel',
 'Target',
 'Home Depot',
 'Nordstrom']

In [66]:
brandsfor[most_similar_users[3]]

['Nicole Miller',
 'Kenneth Cole',
 'New Balance',
 'Old Navy',
 'Target',
 'Nordstrom',
 'Shoebuy']

In [70]:
# Aggregate all brands liked by the K most similar users into a single set
brands_to_recommend = set()
for user in most_similar_users:
    # for each user
    brands_to_recommend.update(set(brandsfor[user]))
    # add to the set of brands_to_recommend

In [73]:
brands_to_recommend
# UH OH WE HAVE DUPLICATES. Banana Republic, Old Navy, Target are all repeats.


TypeError: unsupported operand type(s) for -: 'set' and 'list'

In [77]:
# EXERCISE: use a set difference so brands_to_recommend only has
# brands that given_user hasn't seen yet


In [80]:
brands_to_recommend = brands_to_recommend - set(given_user)

TypeError: unsupported operand type(s) for -: 'list' and 'set'

In [78]:
# without duplicates
brands_to_recommend

In [None]:
######################
## One Step Further ##
######################

In [None]:
# We can take this one step further and caculate a "score" of recommendation
# We will define the score as being the number of times
# a brand appears within the first K users
brands_to_recommend = []
for user in most_similar_users:
    pass

In [None]:
# Use a counter to count the number of times a brand 
# appears in brands_to_recommend
# assign it to the variable recommend_with_scores

In [None]:
# Now we see Gap has the highest score!
recommend_with_scores

In [None]:
#################################
#### Collaborative Item based ###
#################################

In [None]:
# '''
# We can also define a similary between items using jaccard similarity.
# We can say that the similarity between two items is the jaccard similarity
# between the sets of people who like the two brands.

# Example: similarity of Gap to Target is:
# '''

In [None]:
# filter users by liking Gap
gap_lovers = set(user_brands['Gap' == user_brands.Store].ID)
old_navy_lovers = set(user_brands['Old Navy' == user_brands.Store].ID)


In [None]:
# similarty between Gap and Old Navy
jaccard(gap_lovers, old_navy_lovers)

In [None]:
guess_lovers = set(user_brands['Guess' == user_brands.Store].ID)
# similarty between Gap andGuess
jaccard(guess_lovers, gap_lovers)

In [None]:
calvin_lovers = set(user_brands['Calvin Klein' == user_brands.Store].ID)
# similarty between Gap and Calvin Klein
jaccard(calvin_lovers, gap_lovers)