In [341]:
from math import sqrt
import numpy as np


import pandas as pd

raw = pd.read_csv('cocktail_user_survey_10.csv')

In [342]:
raw

Unnamed: 0,user_id,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,60,61
0,Mojito,-1,1,1,1,-1,1,1,-1,1,...,-1,1,1,-1,1,1,1,1,1,1
1,Peach Crush,-1,1,1,1,-1,1,-1,-1,1,...,-1,1,1,-1,1,-1,0,1,-1,1
2,Old Fashioned,-1,-1,1,0,-1,1,-1,1,-1,...,-1,-1,-1,-1,-1,-1,0,-1,-1,-1
3,Blue Hawaii,-1,0,0,1,-1,0,1,-1,1,...,1,1,1,1,1,-1,-1,-1,1,1
4,Long Island Tea,-1,-1,1,-1,-1,0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,1
5,Dry Martini,0,-1,1,-1,-1,1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,1
6,Apple Martini,-1,1,0,1,-1,0,-1,-1,1,...,-1,-1,1,-1,-1,-1,0,-1,-1,1
7,Margarita,1,-1,1,-1,-1,1,1,-1,1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,1
8,Manhattan,-1,-1,1,-1,-1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
9,Jin tonic,1,1,1,1,-1,1,1,-1,1,...,-1,1,-1,1,-1,1,1,-1,-1,0


In [343]:
# preprocess raw data to adaptable to algorithm
rate = {}  # raw.columns.size = 62 (user size + 1)

for i in range(raw.columns.size):
    first = 0
    for c in range(len(raw)-1):
        if raw.iloc[c,i] != -1 : break
        first += 1
    if raw.iloc[first,i] != -1 :
        rate[raw.columns[i]] = {raw.user_id[first] : raw.iloc[first,i]}
    
for t in range(raw.columns.size-1):
    for i in range(raw.user_id.size):
        if raw.iloc[i,t+1] == -1 : continue
        rate[raw.columns[t+1]][raw.user_id[i]] = raw.iloc[i,t+1]

In [344]:
raw.user_id.size

22

In [301]:
del rate['user_id']

In [303]:
# Euclidean Distance Score
def sim_distance(prefs, person1, person2):
    si={}
    for item in prefs[person1]:
        if item in prefs[person2]:
            si[item]=1
    
    if len(si)==0: return 0
    sum_of_squares=sum([pow(prefs[person1][item]-prefs[person2][item],2)
                       for item in prefs[person1] if item in prefs[person2]])
    return 1/(1+sum_of_squares)

In [304]:
# Pearson Correlation Score
def sim_pearson(prefs, p1, p2):
    si={}
    for item in prefs[p1]:
        if item in prefs[p2]:si[item]=1
            
    n=len(si)
    
    if n==0: return 0
    # Add up all the preferences
    sum1=sum([prefs[p1][it] for it in si])
    sum2=sum([prefs[p2][it] for it in si])
    
    #Sum up the squares
    sum1Sq=sum([pow(prefs[p1][it],2) for it in si])
    sum2Sq=sum([pow(prefs[p2][it],2) for it in si])
    #Sum up the products
    pSum=sum([prefs[p1][it] * prefs[p2][it] for it in si])
    
    #Calculate Pearson score
    num=pSum-(sum1*sum2/n)
    den=sqrt(sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n)
    if den==0:return 0
    r=num/den
    return r

In [305]:
def transformPrefs(prefs):
    result={}
    for person in prefs:
        for item in prefs[person]:
            result.setdefault(item,{})
            
            result[item][person]=prefs[person][item]
    return result

In [306]:
def topMatches(prefs,person,n=5,similarity=sim_distance):
    scores=[(similarity(prefs,person,other),other)
    for other in prefs if other!=person]
    
    scores.sort()
    scores.reverse()
    return scores[0:n]

In [307]:
def calculateSimilarItems(prefs, n=10):
    result = {}
    itemPrefs=transformPrefs(prefs)
    c=0
    for item in itemPrefs:
        c+=1
        if c%100 == 0: print("%d / %d" %(c, len(itemPrefs)))
        # Find the most similar items to this one
        scores=topMatches(itemPrefs,item,n=n,similarity=sim_pearson)
        result[item]=scores
    return result

In [309]:
itemsim=calculateSimilarItems(rate)

In [310]:
itemsim # item간 유사도

{'Dry Martini': [(0.6017806225656264, 'Whiskey sour'),
  (0.38729833462074165, 'Old Fashioned'),
  (0.38273277230987157, 'June Bug'),
  (0.3410916157056769, 'Mojito'),
  (0.22344134689658168, 'Midori sour'),
  (0.13569561401644367, 'God Father'),
  (0.089055121960406, 'Margarita'),
  (0.04615384615384624, 'Jack Coke'),
  (0.02534717368677977, 'Screw Driver'),
  (0.020856594363009823, 'Jin tonic')],
 'Margarita': [(0.5408326913195983, 'Manhattan'),
  (0.2347816851683432, 'Long Island Tea'),
  (0.22706219204711095, 'God Father'),
  (0.2024809676835146, 'Screw Driver'),
  (0.19086270308410555, 'Barcardi'),
  (0.18697540017072004, 'Kahlua Milk'),
  (0.1721325931647741, 'June Bug'),
  (0.1526241057207505, 'Mojito'),
  (0.11795584024080187, 'Jin tonic'),
  (0.10327955589886445, 'Tequila Sunrise')],
 'Jin tonic': [(0.3779644730092272, 'God Father'),
  (0.296077061122885, 'Whiskey sour'),
  (0.28923056717375895, 'Old Fashioned'),
  (0.15464202373854416, 'Mojito'),
  (0.1526241057207505, 'Midor

In [330]:
# User-based cf
# Gets recommendations for a person by using a weighted average
# of every other user's rankings
def getRecommendations(prefs,person,similarity=sim_pearson):
    totals={}
    simSums={}
    for other in prefs:
        if other==person: continue
        sim=similarity(prefs,person,other)
        
        # ignore scores of zero or lower
        if sim<=0: continue
        for item in prefs[other]:
            
            # only scores cocktails I haven't drink yet
            if item not in prefs[person] or prefs[person][item]==0:
                # Similarity * Score
                totals.setdefault(item,0)
                totals[item]+=prefs[other][item]*sim
                # Sum of similarities
                simSums.setdefault(item,0)
                simSums[item]+=sim
                
    # Create the normalized list
    rankings=[(total/simSums[item],item) for item,total in totals.items()]
    
    # Return the sorted list
    rankings.sort()
    rankings.reverse()
    
    iter = 0
    for i in range(len(rankings)):        
         if rankings[i][0] < 0.5:   # set recommend threshold = 0.5
                del rankings[i]
                iter = i
                break
    for i in range(len(rankings)-iter): del rankings[iter]  

    return rankings


In [331]:
# Item-based cf
def getRecommendedItems(prefs,itemMatch,user):
    userRatings=prefs[user]
    scores={}
    totalSim={}

    # Loop over items rated by this user
    for (item,rating) in userRatings.items():
        # Loop over items similar to this one
        for (similarity, item2) in itemMatch[item]:
            
            # Ignore if this user has already rated this item
            if item2 in userRatings:continue
            # Weighted sum of rating times similarity
            scores.setdefault(item2,0)
            scores[item2]+=similarity*rating
            # Sum of all the similarities
            totalSim.setdefault(item2,0)
            totalSim[item2]+=similarity
        
    # Divide each total score by total weighting to get an average
    rankings=[(score/totalSim[item],item) for item,score in scores.items()]
        
    # Return the rankings from highest to lowest

    rankings.sort()
    rankings.reverse()
    
    iter = 0
    for i in range(len(rankings)):        
         if rankings[i][0] < 0.5: # set recommend threshold = 0.5
                del rankings[i]
                iter = i
                break
    for i in range(len(rankings)-iter): del rankings[iter] 
        
    return rankings

In [268]:
user_id = rate.keys()

In [269]:
user_id

dict_keys(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '12', '13', '14', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '32', '33', '34', '35', '36', '38', '39', '40', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61'])

In [270]:
user_id = list(user_id)

In [338]:
count = 0
for i in range(len(user_id)):
    
    if not getRecommendations(rate,user_id[i]): 
        #print(getRecommendations(rate,user_id[i]))
        count += 1
    print(getRecommendations(rate,user_id[i]))
print(count)

[(1.0, 'Manhattan'), (0.9832447198787395, 'Peach Crush'), (0.9741191318625535, 'God Father'), (0.9626502456854334, 'Mojito'), (0.9509977647689266, 'Blue Hawaii'), (0.9500586862686378, 'Midori sour'), (0.9080320474520095, 'Long Island Tea'), (0.9041817650384035, 'Apple Martini'), (0.8429367741073261, 'Jack Coke'), (0.7967809619720658, 'Old Fashioned'), (0.7756551289265282, 'June Bug'), (0.7595531065492273, 'Vodca & tonic')]
[(0.9347110522899876, 'Dry Martini'), (0.8233319782569668, 'June Bug'), (0.7576416045661137, 'Old Fashioned'), (0.7575770060222626, 'Long Island Tea'), (0.7034785282689512, 'Margarita'), (0.6889203100291605, 'Cosmopolitan'), (0.6366702590196429, 'Manhattan'), (0.6266353565899183, 'God Father')]
[(0.8066859480585975, 'Apple Martini'), (0.562861039785417, 'Cosmopolitan'), (0.5226288906263546, 'Blue Hawaii')]
[(1.0, 'Long Island Tea'), (0.8907132917499231, 'Screw Driver'), (0.8011685610264099, 'Manhattan'), (0.7938239362079752, 'Jägerbomb'), (0.7745698917059697, 'Midori

In [339]:
count = 0
for i in range(len(user_id)):
    if not getRecommendedItems(rate,itemsim,user_id[i]): 
        print(getRecommendedItems(rate,itemsim,user_id[i]))
        count += 1
    print(getRecommendedItems(rate,itemsim,user_id[i]))
print(count)

[(1.0, 'Manhattan'), (0.9173701395186444, 'Peach Crush'), (0.9059605472434641, 'God Father'), (0.8451335257029852, 'Long Island Tea'), (0.789344781895883, 'Jack Coke'), (0.7382547925563351, 'Cosmopolitan'), (0.6923095638525985, 'Apple Martini'), (0.6808871274894827, 'Blue Hawaii'), (0.6558843947721603, 'Vodca & tonic'), (0.6288672208259184, 'Tequila Sunrise'), (0.6244879080882386, 'June Bug'), (0.5973862815959279, 'Midori sour'), (0.5780051227900754, 'Mojito')]
[(1.0, 'Old Fashioned'), (1.0, 'Dry Martini'), (0.7469318790651531, 'Cosmopolitan'), (0.7093410890622021, 'June Bug'), (0.6674671128568457, 'God Father'), (0.642748269956191, 'Vodca & tonic'), (0.6413179553985401, 'Long Island Tea'), (0.6078382175857355, 'Margarita')]
[]
[]
[(0.853083535578912, 'Long Island Tea'), (0.7986736737347491, 'Cosmopolitan'), (0.797798182524427, 'Jägerbomb'), (0.7857085098320715, 'Screw Driver'), (0.7441137761924786, 'Margarita'), (0.6868902597030111, 'Midori sour'), (0.6393748136995997, 'God Father'), 