In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [38]:
raw = pd.read_csv('data/candy.csv')
raw.head(5)

Unnamed: 0,candy,max,zekry,jonathan,mousa,blair,vishal,megan,sara,lucas,cody,minh
0,100 Grand,,,,,,,,,,,
1,3 Musketeers,,,8.0,,,,,,,7.0,
2,One dime,,,,,,,,,,,
3,One quarter,,,,,,,,,,,
4,Air Heads,4.0,,8.0,,5.0,5.0,6.0,5.0,,,


In [39]:
df = pd.melt(raw, id_vars='candy', var_name='user', value_name='rating')
df = df.dropna()
df.head(5)

Unnamed: 0,candy,user,rating
4,Air Heads,max,4.0
8,Candy Corn,max,1.0
10,Charleston Chew,max,3.0
12,Chiclets,max,4.0
13,Dots,max,6.0


In [18]:
user = 'user'
item = 'candy'
rating = 'rating'

user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

eu = user_encoder.fit_transform(df[user])
ei = item_encoder.fit_transform(df[item])

np.unique(eu)

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [23]:
from scipy.sparse import csr_matrix

data = df['rating'].values

i = eu 
j = ei
M = len(np.unique(i))
N = len(np.unique(j))

sparse_matrix = csr_matrix((data, (i, j)), shape=(M, N))
pd.DataFrame(
    sparse_matrix.todense(),
    index = user_encoder.classes_,
    columns = item_encoder.classes_
)

Unnamed: 0,3 Musketeers,Air Heads,Almond Joy,Candy Corn,Charleston Chew,Chewey Lemonhead Fruit Mix,Chiclets,Dots,Fruit Chews,Fun Dip,...,Swedish Fish,Tootsie Pop,Tootsie Rolls,Twix,Twizzlers,Warheads,Welch's Fruit Snacks,Werther's Original Caramel,Whoppers,Wine gums
blair,0.0,5.0,3.0,2.0,0.0,0.0,2.0,0.0,0.0,7.0,...,7.0,5.0,6.0,9.0,7.0,6.0,3.0,4.0,0.0,7.0
cody,7.0,0.0,0.0,0.0,6.0,0.0,4.0,0.0,0.0,0.0,...,6.0,0.0,5.0,10.0,6.0,6.0,9.0,8.0,0.0,8.0
jonathan,8.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,5.0,0.0,0.0,8.0,5.0,0.0,9.0
max,0.0,4.0,0.0,1.0,3.0,0.0,4.0,6.0,0.0,6.0,...,6.0,0.0,1.0,7.0,0.0,6.0,6.0,3.0,0.0,7.0
megan,0.0,6.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,9.0,...,0.0,1.0,1.0,4.0,6.0,6.0,5.0,9.0,9.0,8.0
mousa,0.0,0.0,0.0,8.0,0.0,8.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,9.0,9.0,0.0,0.0,0.0,0.0,0.0
sara,0.0,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7.0,4.0,0.0,2.0,3.0,0.0,0.0
vishal,0.0,5.0,0.0,4.0,0.0,0.0,3.0,3.0,0.0,5.0,...,0.0,0.0,0.0,8.0,7.0,6.0,8.0,4.0,0.0,5.0
zekry,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,2.0,0.0,0.0,6.0,0.0,0.0,0.0,7.0,0.0,9.0


In [24]:
from lightfm import LightFM

model = LightFM(no_components=30)



In [25]:
model.fit(sparse_matrix, epochs=20)

<lightfm.lightfm.LightFM at 0xa15820c18>

In [29]:
person = 'max'
user_id = user_encoder.transform([person])[0]
model.predict(user_id, ei)

array([2.34412646, 2.1962769 , 2.03344178, 2.32646728, 2.03518414,
       2.19051623, 2.26881623, 2.35201478, 2.20023298, 2.45238996,
       2.4962666 , 2.1099534 , 2.45110679, 2.45664668, 2.49625993,
       2.32689738, 2.33569384, 2.33436632, 2.11765838, 2.2774632 ,
       2.33667779, 2.49734116, 1.84394574, 2.39910316, 2.45212889,
       2.45146847, 2.19568872, 2.12868881, 2.2738657 , 2.33413315,
       2.20289588, 1.61981237, 2.49470258, 2.26026535, 2.39521503,
       2.4513762 , 2.39267373, 2.32646728, 2.35201478, 2.45238996,
       2.4962666 , 2.34374452, 2.45110679, 2.45664668, 2.49625993,
       2.32915354, 2.34259701, 2.11765838, 2.49734116, 2.04220748,
       2.45212889, 2.45146847, 2.40753388, 2.20289588, 2.49470258,
       2.4513762 , 2.39267373, 2.05766892, 2.34412646, 2.26881623,
       2.35201478, 2.20023298, 2.45238996, 2.4962666 , 2.45110679,
       2.45664668, 2.49625993, 2.32689738, 2.34259701, 2.23525572,
       2.11765838, 2.49734116, 2.28945684, 2.39910316, 2.45212

In [35]:
import pandas as pd

from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate


# Creation of the dataframe. Column names are irrelevant.
ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                'userID': [9, 32, 2, 45, 'user_foo'],
                'rating': [3, 2, 4, 3, 1]}
df = pd.DataFrame(ratings_dict)

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

# We can now use this dataset as we please, e.g. calling cross_validate
cross_validate(NormalPredictor(), data, cv=2)

{'test_rmse': array([1.42357251, 0.61624716]),
 'test_mae': array([1.32740105, 0.61585966]),
 'fit_time': (0.0007920265197753906, 3.695487976074219e-05),
 'test_time': (0.0007479190826416016, 2.9087066650390625e-05)}

In [42]:
df[['user', 'candy', 'rating']].head()

Unnamed: 0,user,candy,rating
4,max,Air Heads,4.0
8,max,Candy Corn,1.0
10,max,Charleston Chew,3.0
12,max,Chiclets,4.0
13,max,Dots,6.0


In [57]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(0, 10))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['user', 'candy', 'rating']], reader)

trainset = data.build_full_trainset()
model = SVD()
model.fit(trainset)

model.predict(3, 0)

Prediction(uid=3, iid=0, r_ui=None, est=6.180645161290323, details={'was_impossible': False})

In [60]:
testset = trainset.build_testset()

In [61]:
model.test(testset)

[Prediction(uid='max', iid='Air Heads', r_ui=4.0, est=5.019856176874441, details={'was_impossible': False}),
 Prediction(uid='max', iid='Candy Corn', r_ui=1.0, est=3.813655641816787, details={'was_impossible': False}),
 Prediction(uid='max', iid='Charleston Chew', r_ui=3.0, est=4.93420693102974, details={'was_impossible': False}),
 Prediction(uid='max', iid='Chiclets', r_ui=4.0, est=4.050825525719798, details={'was_impossible': False}),
 Prediction(uid='max', iid='Dots', r_ui=6.0, est=5.431627411671421, details={'was_impossible': False}),
 Prediction(uid='max', iid='Fun Dip', r_ui=6.0, est=5.908735358335794, details={'was_impossible': False}),
 Prediction(uid='max', iid='Haribo Gold Bears', r_ui=7.0, est=6.19566248906724, details={'was_impossible': False}),
 Prediction(uid='max', iid='Haribo Happy Cola', r_ui=7.0, est=6.138979401332663, details={'was_impossible': False}),
 Prediction(uid='max', iid='Haribo Sour Bears', r_ui=8.0, est=6.411313909601244, details={'was_impossible': False})

In [36]:
person = 'sara'
user_id = user_encoder.transform([person])[0]
model.predict(np.array([user_id]))

pred = pd.DataFrame({
    'p-rating': model.predict(np.array([user_id])),
    'candy': item_encoder.classes_
})

pred = pred.sort_values('p-rating', ascending=False)
ordered_candy = pred.candy.tolist()

tried = df[df['user'] == person].candy.tolist()

[candy for candy in ordered_candy if candy not in tried][:10]

# megan -> 3 Muskateers
# mousa -> wine gums

['Mike & Ike',
 'Gobstopper',
 'Milky Way',
 'Fun Dip',
 'Haribo Gold Bears',
 'Wine gums',
 'Warheads',
 '3 Musketeers',
 'Milk Duds',
 'Dots']

In [35]:
df.groupby('user')['rating'].mean()

user
blair       6.375000
cody        6.593750
jonathan    5.760000
max         5.783784
megan       6.285714
mousa       8.322581
sara        4.888889
vishal      5.846154
zekry       5.750000
Name: rating, dtype: float64