In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
from lightfm import LightFM

In [32]:
# Quick Data Prep

In [3]:
# Load the wide ratings table (one row per candy, one column per person)
# and preview its first five rows.
raw = pd.read_csv('data/candy.csv')
raw.head()

Unnamed: 0,candy,michelle,max,zekry,jonathan,mousa,blair,vishal,megan,sara,cody,karina
0,100 Grand,,,,,,,,,,,
1,3 Musketeers,,,,8.0,,,,,,7.0,
2,One dime,,,,,,,,,,,
3,One quarter,,,,,,,,,,,
4,Air Heads,5.0,4.0,,8.0,,5.0,5.0,6.0,5.0,,5.0


In [4]:
# Reshape wide -> long: every (candy, user) cell of `raw` becomes one row,
# then the cells without a rating (NaN) are discarded.
df = raw.melt(id_vars='candy', var_name='user', value_name='rating').dropna()
df.head()

Unnamed: 0,candy,user,rating
4,Air Heads,michelle,5.0
5,Almond Joy,michelle,2.0
6,Baby Ruth,michelle,6.0
8,Candy Corn,michelle,8.0
10,Charleston Chew,michelle,2.0


In [33]:
# The Magic

In [5]:
class InteractionMachine:
    """Turn a long-format ratings table into a sparse user-by-item matrix.

    Parameters
    ----------
    df : mapping from column name to 1-D values (e.g. a pandas DataFrame)
        One row per observed rating.
    ratings, users, items : str
        Column names in ``df`` holding the rating value, the user label and
        the item label, respectively.

    Attributes
    ----------
    user_encoder, item_encoder : LabelEncoder
        Fitted encoders mapping user / item labels to the integer row /
        column positions used by ``interactions``.
    interactions : scipy.sparse.csr_matrix
        Ratings laid out as (number of distinct users) x (number of distinct
        items).

    NOTE(review): csr_matrix built from (data, (row, col)) triplets adds up
    entries that share a coordinate, so a repeated (user, item) pair would
    presumably end up as the sum of its ratings -- confirm the input never
    contains duplicates.
    """

    def __init__(self, df, ratings, users, items):
        # Plain-array copies of the three columns of interest.
        self._ratings, self._users, self._items = (
            np.array(df[column]) for column in (ratings, users, items)
        )
        # One encoder per axis: label -> integer position.
        self.user_encoder, self.item_encoder = LabelEncoder(), LabelEncoder()
        row_idx = self.user_encoder.fit_transform(self._users)
        col_idx = self.item_encoder.fit_transform(self._items)
        # Matrix dimensions are the number of distinct codes on each axis.
        n_rows = np.unique(row_idx).size
        n_cols = np.unique(col_idx).size
        self.interactions = csr_matrix(
            (self._ratings, (row_idx, col_idx)),
            shape=(n_rows, n_cols),
        )

    def get_users(self, encoded=False):
        """Return the distinct users in sorted order.

        With ``encoded=True`` the labels are passed through ``user_encoder``
        and the resulting integer codes are returned instead.
        """
        distinct = np.unique(self._users)
        return self.user_encoder.transform(distinct) if encoded else distinct

    def get_items(self, encoded=False):
        """Return the distinct items in sorted order.

        With ``encoded=True`` the labels are passed through ``item_encoder``
        and the resulting integer codes are returned instead.
        """
        distinct = np.unique(self._items)
        return self.item_encoder.transform(distinct) if encoded else distinct

In [6]:
# Build the user x candy matrix from the long-format frame; keyword
# arguments spell out which column plays which role.
im = InteractionMachine(df, ratings='rating', users='user', items='candy')
im.interactions

<11x71 sparse matrix of type '<class 'numpy.float64'>'
	with 405 stored elements in Compressed Sparse Row format>

In [7]:
# Integer codes of every distinct item, i.e. the item labels passed through
# the item encoder.
im.get_items(encoded=True)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70])

In [8]:
# The distinct user labels themselves (not their integer codes).
im.get_users(encoded=False)

array(['blair', 'cody', 'jonathan', 'karina', 'max', 'megan', 'michelle',
       'mousa', 'sara', 'vishal', 'zekry'], dtype=object)

In [9]:
# Dense, labelled view of the interaction matrix for eyeballing only
# (rows = users, columns = candies; 0.0 means "no rating stored").
# .toarray() yields a plain ndarray; .todense() would return np.matrix,
# which NumPy discourages for new code.
pd.DataFrame(
    im.interactions.toarray(),
    index=im.get_users(),
    columns=im.get_items(),
)

Unnamed: 0,3 Musketeers,Air Heads,Almond Joy,Baby Ruth,Candy Corn,Charleston Chew,Chewey Lemonhead Fruit Mix,Chiclets,Dots,Dum Dums,...,Tootsie Pop,Tootsie Roll Midgies,Tootsie Rolls,Twix,Twizzlers,Warheads,Welch's Fruit Snacks,Werther's Original Caramel,Whoppers,Wine gums
blair,0.0,5.0,3.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,...,5.0,0.0,6.0,9.0,7.0,6.0,3.0,4.0,0.0,7.0
cody,7.0,0.0,0.0,0.0,0.0,6.0,0.0,4.0,0.0,0.0,...,0.0,0.0,5.0,10.0,6.0,6.0,9.0,8.0,0.0,8.0
jonathan,8.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,5.0,0.0,0.0,8.0,5.0,0.0,9.0
karina,0.0,5.0,0.0,0.0,2.0,0.0,0.0,1.0,4.0,0.0,...,6.0,0.0,4.0,5.0,8.0,4.0,6.0,0.0,0.0,0.0
max,0.0,4.0,0.0,0.0,1.0,3.0,0.0,4.0,6.0,0.0,...,0.0,0.0,1.0,7.0,0.0,6.0,6.0,3.0,0.0,7.0
megan,0.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,4.0,6.0,6.0,5.0,9.0,9.0,8.0
michelle,0.0,5.0,2.0,6.0,8.0,2.0,0.0,1.0,0.0,5.0,...,4.0,3.0,4.0,8.0,7.0,4.0,8.0,9.0,7.0,9.0
mousa,0.0,0.0,0.0,0.0,8.0,0.0,8.0,0.0,0.0,0.0,...,0.0,0.0,0.0,9.0,9.0,0.0,0.0,0.0,0.0,0.0
sara,0.0,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7.0,4.0,0.0,2.0,3.0,0.0,0.0
vishal,0.0,5.0,0.0,0.0,4.0,0.0,0.0,3.0,3.0,0.0,...,0.0,0.0,0.0,8.0,7.0,6.0,8.0,4.0,0.0,5.0


In [10]:
# Training is stochastic; pin the seed so that Restart & Run All yields the
# same embeddings and therefore the same recommendations.
RANDOM_SEED = 42
# NOTE(review): the default loss is 'logistic', which LightFM documents for
# +1/-1 interactions, while the matrix here holds 1-10 ratings -- confirm
# that this loss is what is intended.
model = LightFM(random_state=RANDOM_SEED)

In [11]:
# Train on the user x item matrix for 20 epochs.
model.fit(im.interactions, epochs=20)

<lightfm.lightfm.LightFM at 0x1a231384e0>

In [32]:
# Score every candy for a single user.
person = 'max'
# The encoder turns the user's name into that user's integer id; transform()
# returns a one-element array here, unpacked into a scalar.
(user_id,) = im.user_encoder.transform([person])
preds = model.predict(user_id, im.get_items(encoded=True))

In [33]:
# Pair each candy label with its predicted value for `person` and rank from
# highest to lowest. The 'rating' column holds the model's predictions, not
# the ratings the user actually gave.
candy = (
    pd.DataFrame({'candy': im.get_items(), 'rating': preds})
    .sort_values(by='rating', ascending=False)
)
candy

Unnamed: 0,candy,rating
45,Reese's Peanut Butter cup,2.615568
19,Hershey's Milk Chocolate,2.600718
26,M&M's,2.594447
64,Twix,2.588922
23,Kit Kat,2.561655
52,Skittles original,2.559581
37,Nestle Smarties,2.552785
17,Hershey's Kisses,2.535986
68,Werther's Original Caramel,2.525778
67,Welch's Fruit Snacks,2.508765


In [34]:
# Candy names in ranked order (best prediction first).
reco = candy['candy'].tolist()

In [35]:
# Candies the selected user has already rated. Reuse `person` instead of
# repeating the literal name, so that changing the user in the prediction
# cell cannot leave this filter pointing at someone else's history.
tried = df.loc[df['user'] == person, 'candy'].tolist()

In [36]:
# Top five recommendations the user has not rated yet. The loop variable is
# deliberately not called `candy`, which is the name of the ranking
# DataFrame; a set makes each membership test O(1).
tried_set = set(tried)
[name for name in reco if name not in tried_set][:5]

['Snickers',
 'Twizzlers',
 'Laffy Taffy',
 "Hershey's Special Dark",
 "Reese's pieces"]