In [2]:
from math import sqrt
import numpy as np


import pandas as pd

# Load the survey answers from a CSV file expected in the working directory.
# Judging by the preview shown after this cell, each row is one cocktail
# (its name is in the `user_id` column) and every other column holds one
# respondent's answers -- TODO confirm against the data source.
raw = pd.read_csv('cocktail_user_survey.csv')

# Show the loaded frame.
raw

Unnamed: 0,user_id,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,60,61
0,Mojito,-1,1,1,1,-1,1,1,-1,1,...,-1,1,1,-1,1,1,1,1,1,1
1,Peach Crush,-1,1,1,1,-1,1,-1,-1,1,...,-1,1,1,-1,1,-1,0,1,-1,1
2,Old Fashioned,-1,-1,1,0,-1,1,-1,1,-1,...,-1,-1,-1,-1,-1,-1,0,-1,-1,-1
3,Blue Hawaii,-1,0,0,1,-1,0,1,-1,1,...,1,1,1,1,1,-1,-1,-1,1,1
4,Long Island Tea,-1,-1,1,-1,-1,0,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,1,-1,1
5,Dry Martini,0,-1,1,-1,-1,1,-1,-1,1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,1
6,Apple Martini,-1,1,0,1,-1,0,-1,-1,1,...,-1,-1,1,-1,-1,-1,0,-1,-1,1
7,Margarita,1,-1,1,-1,-1,1,1,-1,1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,1
8,Manhattan,-1,-1,1,-1,-1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
9,Jin tonic,1,1,1,1,-1,1,1,-1,1,...,-1,1,-1,1,-1,1,1,-1,-1,0


In [341]:
# Preprocess the raw survey table into the nested dict consumed by the
# similarity / recommendation functions:
#     rate[user][cocktail] = answer
# Answers equal to -1 are left out, so a user whose answers are all -1 gets
# no entry in `rate` at all.
# Note: raw.columns.size = 62 (user size + 1)

# Row labels (cocktail names) live in the `user_id` column.
cocktail_names = raw['user_id'].to_numpy()

rate = {}
for user in raw.columns:
    if user == 'user_id':
        # This column names the rows; it is not a respondent.
        continue
    # Iterating the NumPy array keeps the same scalar types that element-wise
    # .iloc access would return, so downstream arithmetic is unaffected.
    answers = {
        cocktail: answer
        for cocktail, answer in zip(cocktail_names, raw[user].to_numpy())
        if answer != -1
    }
    if answers:
        rate[user] = answers

# Number of rows in the survey table.
raw.user_id.size

# Euclidean Distance Score
def sim_distance(prefs, person1, person2):
    """Distance-based similarity between two entries of ``prefs``.

    Only items rated by both ``person1`` and ``person2`` are compared.

    Args:
        prefs: nested mapping ``prefs[person][item] -> rating``.
        person1: key of the first entry.
        person2: key of the second entry.

    Returns:
        ``0`` when the two share no items, otherwise
        ``1 / (1 + sum of squared rating differences)``.
    """
    mine = prefs[person1]
    theirs = prefs[person2]

    # Items both sides have rated, in the order person1 lists them.
    shared = [item for item in mine if item in theirs]
    if not shared:
        return 0

    sum_of_squares = sum([pow(mine[item] - theirs[item], 2) for item in shared])
    return 1 / (1 + sum_of_squares)

# Pearson Correlation Score
def sim_pearson(prefs, p1, p2):
    """Pearson correlation between the ratings of ``p1`` and ``p2``.

    Only items present in both ``prefs[p1]`` and ``prefs[p2]`` are used.

    Args:
        prefs: nested mapping ``prefs[person][item] -> rating``.
        p1: key of the first entry.
        p2: key of the second entry.

    Returns:
        The correlation coefficient of the shared ratings, or ``0`` when
        there are no shared items or when either side's shared ratings
        have no variance (the coefficient is undefined in that case).
    """
    shared = [item for item in prefs[p1] if item in prefs[p2]]
    n = len(shared)
    if n == 0:
        return 0

    # Add up all the preferences
    sum1 = sum(prefs[p1][it] for it in shared)
    sum2 = sum(prefs[p2][it] for it in shared)

    # Sum up the squares
    sum1Sq = sum(pow(prefs[p1][it], 2) for it in shared)
    sum2Sq = sum(pow(prefs[p2][it], 2) for it in shared)

    # Sum up the products
    pSum = sum(prefs[p1][it] * prefs[p2][it] for it in shared)

    # Calculate Pearson score
    num = pSum - (sum1 * sum2 / n)
    spread1 = sum1Sq - pow(sum1, 2) / n
    spread2 = sum2Sq - pow(sum2, 2) / n
    # The square root has to cover the product of BOTH spread terms.
    # max() protects sqrt from a tiny negative product caused by rounding.
    den = sqrt(max(spread1 * spread2, 0))
    if den == 0:
        return 0
    return num / den

def transformPrefs(prefs):
    """Swap the two nesting levels of ``prefs``.

    Args:
        prefs: nested mapping ``prefs[person][item] -> rating``.

    Returns:
        A new nested dict ``flipped[item][person] -> rating`` holding the
        same ratings keyed the other way round.
    """
    flipped = {}
    for person, ratings in prefs.items():
        for item, value in ratings.items():
            if item not in flipped:
                flipped[item] = {}
            flipped[item][person] = value
    return flipped

def topMatches(prefs,person,n=5,similarity=sim_distance):
    """Return the ``n`` entries of ``prefs`` most similar to ``person``.

    Args:
        prefs: nested mapping ``prefs[key][inner_key] -> rating``.
        person: key to compare every other key against.
        n: maximum number of results.
        similarity: callable ``similarity(prefs, a, b)`` returning a score.

    Returns:
        A list of ``(score, other)`` tuples in descending order, at most
        ``n`` long; ``person`` itself is never included.
    """
    ascending = sorted(
        (similarity(prefs, person, candidate), candidate)
        for candidate in prefs
        if candidate != person
    )
    # Highest score first, then keep the leading n entries.
    return ascending[::-1][:n]

def calculateSimilarItems(prefs, n=10):
    """Build a table of the most similar items for every item.

    ``prefs`` is flipped with ``transformPrefs`` so that items become the
    outer keys, then each item is compared with all others using
    ``sim_pearson``.

    Args:
        prefs: nested mapping ``prefs[person][item] -> rating``.
        n: how many similar items to keep per item.

    Returns:
        Dict mapping each item to the list of ``(score, other_item)``
        tuples returned by ``topMatches``.
    """
    itemPrefs = transformPrefs(prefs)
    total = len(itemPrefs)

    similar = {}
    for position, item in enumerate(itemPrefs, start=1):
        # Progress report on every 100th item.
        if position % 100 == 0:
            print("%d / %d" % (position, total))
        # Find the most similar items to this one
        similar[item] = topMatches(itemPrefs, item, n=n, similarity=sim_pearson)
    return similar

# Precompute the item-to-item similarity table from the user ratings in `rate`.
itemsim=calculateSimilarItems(rate)

itemsim # similarity between items

# User-based cf
# Gets recommendations for a person by using a weighted average
# of every other user's rankings
def getRecommendations(prefs,person,similarity=sim_pearson,threshold=0.5):
    """Recommend items to ``person`` from the ratings of similar users.

    Each candidate item is scored with the similarity-weighted average of
    the ratings given by every other user whose similarity to ``person``
    is strictly positive.

    Args:
        prefs: nested mapping ``prefs[user][item] -> rating``.
        person: key of the user to recommend for.
        similarity: callable ``similarity(prefs, a, b)`` returning a score.
        threshold: minimum weighted-average score an item needs in order
            to be recommended.

    Returns:
        List of ``(score, item)`` tuples in descending order, containing
        only entries with ``score >= threshold``. Empty when nothing
        qualifies.
    """
    totals = {}
    simSums = {}
    for other in prefs:
        if other == person:
            continue
        sim = similarity(prefs, person, other)

        # ignore scores of zero or lower
        if sim <= 0:
            continue

        for item in prefs[other]:
            # only score cocktails the person has not rated, or rated 0
            if item not in prefs[person] or prefs[person][item] == 0:
                # Similarity * Score
                totals[item] = totals.get(item, 0) + prefs[other][item] * sim
                # Sum of similarities
                simSums[item] = simSums.get(item, 0) + sim

    # Create the normalized list (simSums is always > 0 because sim > 0)
    rankings = [(total / simSums[item], item) for item, total in totals.items()]

    # Highest score first
    rankings.sort(reverse=True)

    # Keep only entries that reach the threshold. Filtering (instead of
    # deleting from a cut-off index) also handles the case where every
    # entry qualifies.
    return [entry for entry in rankings if entry[0] >= threshold]


# Item-based cf
def getRecommendedItems(prefs,itemMatch,user,threshold=0.5):
    """Recommend items to ``user`` using a precomputed item-similarity table.

    For every item the user has rated, each similar item the user has not
    rated accumulates ``similarity * rating``; the total is then divided by
    the summed similarities to give a weighted average.

    Args:
        prefs: nested mapping ``prefs[user][item] -> rating``.
        itemMatch: mapping ``item -> [(similarity, other_item), ...]``.
        user: key of the user to recommend for.
        threshold: minimum weighted-average score an item needs in order
            to be recommended.

    Returns:
        List of ``(score, item)`` tuples in descending order, containing
        only entries with ``score >= threshold``. Empty when nothing
        qualifies.

    Raises:
        KeyError: if ``user`` is missing from ``prefs`` or a rated item is
            missing from ``itemMatch``.
    """
    userRatings = prefs[user]
    scores = {}
    totalSim = {}

    # Loop over items rated by this user
    for item, rating in userRatings.items():
        # Loop over items similar to this one
        for similarity, item2 in itemMatch[item]:
            # Ignore if this user has already rated this item
            if item2 in userRatings:
                continue
            # Weighted sum of rating times similarity
            scores[item2] = scores.get(item2, 0) + similarity * rating
            # Sum of all the similarities
            totalSim[item2] = totalSim.get(item2, 0) + similarity

    # Divide each total score by total weighting to get an average.
    # Items whose similarities sum to zero have no defined average and
    # are skipped rather than dividing by zero.
    rankings = [
        (score / totalSim[item], item)
        for item, score in scores.items()
        if totalSim[item] != 0
    ]

    # Highest score first
    rankings.sort(reverse=True)

    # Keep only entries that reach the threshold. Filtering (instead of
    # deleting from a cut-off index) also handles the case where every
    # entry qualifies.
    return [entry for entry in rankings if entry[0] >= threshold]

# Users that ended up with at least one entry in `rate`.
user_id = list(rate.keys())

user_id

# User-based CF: print each user's recommendations and count the users
# who received none.
count = 0
for user in user_id:
    # Compute once per user; the result is reused for the check and the print.
    recommendations = getRecommendations(rate, user)
    if not recommendations:
        count += 1
    print(recommendations)
print(count)

# Item-based CF: same report, using the precomputed item similarity table.
count = 0
for user in user_id:
    recommendations = getRecommendedItems(rate, itemsim, user)
    if not recommendations:
        count += 1
    print(recommendations)
print(count)