### Structure

In [8]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

In [9]:
from sklearn.preprocessing import LabelEncoder

class InteractionMachine:
    """Encode raw user/item labels and assemble a sparse interaction matrix.

    Attributes set by ``build``:
        n_users: number of distinct user labels seen.
        n_items: number of distinct item labels seen.
        interactions: ``scipy.sparse.csr_matrix`` of shape
            ``(n_users, n_items)`` holding the ratings.
    """

    def __init__(self):
        # One encoder per axis so integer codes can be mapped back to labels.
        self.user_encoder = LabelEncoder()
        self.item_encoder = LabelEncoder()

    def __repr__(self):
        return 'InteractionMachine()'

    def build(self, users, items, ratings):
        """Fit both encoders and build the user-by-item ratings matrix.

        Args:
            users: sequence of user labels, one per interaction.
            items: sequence of item labels, aligned with ``users``.
            ratings: sequence of values, aligned with ``users``/``items``.

        Returns:
            self, so calls can be chained.

        Note:
            When the same (user, item) pair occurs more than once, the
            csr_matrix constructor sums the corresponding ratings.
        """
        u = self.user_encoder.fit_transform(users)
        i = self.item_encoder.fit_transform(items)
        # The fitted encoders already hold the distinct labels; no need to
        # recompute uniqueness from the encoded codes.
        self.n_users = len(self.user_encoder.classes_)
        self.n_items = len(self.item_encoder.classes_)
        self.interactions = csr_matrix(
            (ratings, (u, i)), shape=(self.n_users, self.n_items)
        )
        return self

### New Users

In [10]:
# Toy binary matrix (5 rows x 6 columns) used to illustrate row-to-row distances;
# presumably one row per user and one column per item.
ex = pd.DataFrame(
    data=[
        (0, 1, 1, 0, 0, 0),
        (0, 1, 1, 1, 0, 0),
        (1, 0, 0, 1, 0, 0),
        (0, 1, 1, 0, 0, 1),
        (0, 0, 0, 1, 1, 1),
    ]
)

In [11]:
from sklearn.metrics.pairwise import euclidean_distances

# Pairwise Euclidean distance between every pair of rows of `ex` (5 x 5 result)
euclidean_distances(ex)

array([[0.        , 1.        , 2.        , 1.        , 2.23606798],
       [1.        , 0.        , 1.73205081, 1.41421356, 2.        ],
       [2.        , 1.73205081, 0.        , 2.23606798, 1.73205081],
       [1.        , 1.41421356, 2.23606798, 0.        , 2.        ],
       [2.23606798, 2.        , 1.73205081, 2.        , 0.        ]])

In [12]:
# Load the reviews and keep only rows whose rating is 4 or higher
df = (
    pd.read_csv("data/candy.csv")
    .loc[lambda reviews: reviews['review'] >= 4]
)

In [13]:
# Random peek at five rows; no random_state is set, so the rows differ per run
df.sample(n=5)

Unnamed: 0,item,user,review
7046,Ferrero Rocher Chocolate,raymondwright,5
10835,3 Musketeers Miniature Bars,jessicajenkins,5
55,Lindt Lindor Milk Chocolate Truffles,mrobinson,5
11086,Hershey's Milk Chocolate Bar with Almonds,shawn01,5
5774,Starburst Favereds Minis Fruit Chews Candy Bag,briandeleon,4


In [14]:
# Collapse to one row per user: that user's items joined into a single
# comma-separated string (the "document" the vectorizer tokenizes later)
df = df.groupby(["user"])["item"].apply(",".join).to_frame()
df.head()

Unnamed: 0_level_0,item
user,Unnamed: 1_level_1
aaron67,"Mike and Ike Sour Licious Zours,Kit Kat Minis ..."
aaron68,"Hubba Bubba Bubbletape Awesome Original,Brooks..."
aaron73,Dove Chocolate Promises Silky Smooth Sea Salt ...
abarker,"Reese's Peanut Butter Bunny,Ghirardelli Gourme..."
abigail04,"Kit Kat Minis Crisp Wafers in Milk Chocolate,P..."


In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# Treat every comma-separated item as one token and cap the vocabulary at 250 items
cv = CountVectorizer(
    max_features=250,
    tokenizer=lambda basket: basket.split(","),
)
# Sparse count matrix: one row per user, one column per retained item
X = cv.fit_transform(df['item'])

In [16]:
from sklearn.neighbors import NearestNeighbors

# Index the user vectors for neighbour lookup; each query returns the 5 closest rows
nn = NearestNeighbors(n_neighbors=5)
nn.fit(X)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [17]:
# Row indices of the 5 nearest neighbours for every user in X (distances discarded).
# In the displayed output each user's first neighbour is the user itself.
neighbors = nn.kneighbors(X, return_distance=False)
neighbors

array([[   0, 2042,  920,  186, 1625],
       [   1,  404, 2424, 1110, 2370],
       [   2, 2221,  677, 1386,  739],
       ...,
       [2525,  620,  782,  661, 1546],
       [2526,  308, 2236,  177,  801],
       [2527, 1252, 1350, 2111, 2346]])

In [18]:
# Neighbour indices for the first user (row 0)
neighbors[0]

array([   0, 2042,  920,  186, 1625])

In [19]:
# Pool every item from the rows of the first user's nearest neighbours
# (the user's own row is among them), then de-duplicate.
candy = [
    name
    for idx in neighbors[0]
    for name in df.iloc[int(idx)].values[0].split(",")
]

list(set(candy))

['Snickers Peanut Butter Squared Bars',
 "Reese's Peanut Butter Heart",
 'Kirkland Milk Chocolate Almonds',
 'Mounds Candy Bar',
 'Kraft Vanilla Caramels Snack Bags',
 'Sour Punch Real Rainbow Straws',
 "Hershey's Natural Unsweetened Cocoa",
 'Airheads Bites Fruit',
 'Kit Kat Minis Crisp Wafers in Milk Chocolate',
 'Jet Puffed Stackermallows Marshmallows',
 'Nestle Butterfinger Bites',
 'Mike and Ike Sour Licious Zours',
 "Reese's Peanut Butter Cup Pumpkins Milk Chocolate"]

### Putting a bow on it

In [20]:
# Same preparation as above in one chain: load, keep ratings >= 4,
# then join each user's items into one comma-separated string.
df = (
    pd.read_csv("data/candy.csv")
    .loc[lambda reviews: reviews['review'] >= 4]
    .groupby(["user"])["item"]
    .apply(",".join)
    .to_frame()
)
df.head()

Unnamed: 0_level_0,item
user,Unnamed: 1_level_1
aaron67,"Mike and Ike Sour Licious Zours,Kit Kat Minis ..."
aaron68,"Hubba Bubba Bubbletape Awesome Original,Brooks..."
aaron73,Dove Chocolate Promises Silky Smooth Sea Salt ...
abarker,"Reese's Peanut Butter Bunny,Ghirardelli Gourme..."
abigail04,"Kit Kat Minis Crisp Wafers in Milk Chocolate,P..."


In [21]:
class NNRecommender:
    """Nearest-neighbour recommender over delimiter-joined item strings.

    Each training sample is one string holding all of a user's items
    (comma-separated with the default tokenizer). Items are count-vectorized,
    and recommendations for a query come from the items of its nearest
    training rows.

    Args:
        n_neighbors: number of neighbours consulted per query.
        max_features: vocabulary cap passed to ``CountVectorizer``.
        tokenizer: callable splitting one sample string into item names.
    """

    def __init__(
        self, n_neighbors=5, max_features=250, tokenizer=lambda x: x.split(",")):
        # Kept so fit()/predict() split samples exactly like the vectorizer does.
        self.tokenizer = tokenizer
        self.cv = CountVectorizer(tokenizer=tokenizer, max_features=max_features)
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)

    def fit(self, X):
        """Vectorize the training samples and index them for neighbour search.

        Args:
            X: iterable of item strings (e.g. a pandas Series or a list).

        Returns:
            self, so calls can be chained.
        """
        # Materialize once: X is consumed by both the tokenizer and the vectorizer.
        docs = list(X)
        self.X = X
        # Pre-split every training sample so predict() never re-tokenizes it.
        self._baskets = [list(self.tokenizer(doc)) for doc in docs]
        self.nn.fit(self.cv.fit_transform(docs))
        return self

    def predict(self, X):
        """Recommend items for each query sample.

        Args:
            X: iterable of item strings in the same format used for ``fit``.

        Returns:
            One list per query containing the items found in its neighbours'
            samples that the query does not already contain. Items are unique
            and appear in the order the neighbours are returned, then in the
            order they occur within each neighbour's sample.
        """
        recommendations = []
        for basket in X:
            # Tokenize the query once, with the configured tokenizer.
            owned = set(self.tokenizer(basket))
            query = self.cv.transform([basket])
            neighbors = self.nn.kneighbors(query, return_distance=False)
            # dict.fromkeys de-duplicates while keeping first-seen order.
            candidates = dict.fromkeys(
                item
                for n in neighbors[0]
                for item in self._baskets[int(n)]
                if item not in owned
            )
            recommendations.append(list(candidates))
        return recommendations

In [22]:
# Hyper-parameters: neighbours consulted per query and vocabulary size cap
n_neighbors = 5
max_features = 250
model = NNRecommender(n_neighbors=n_neighbors, max_features=max_features)
model.fit(df["item"])

<__main__.NNRecommender at 0x1a1f321310>

In [23]:
# Look at one random user's item string to see the expected input format
df.sample(n=1)['item'].values

array(["Ferrero Collection Fine Assorted Confections,Mike and Ike Sour Licious Zours,Bouquet of Fruits Valentine Chocolate Dipped Strawberries,Kit Kat Minis White Chocolate Candy,M&Ms Peanut Chocolate Candy,Twix,Hershey's Kisses Milk Chocolate,Ghirardelli Squares Peppermint Bark"],
      dtype=object)

In [24]:
# Hand-written query: a single comma-separated item string, wrapped in a list for predict()
sweet = ["Airheads Xtremes Sweetly Sour Candy Rainbow Berry,Life Savers Five Flavor Gummies,Twizzlers Pull-N-Peel Candy Cherry"]

In [25]:
# Second hand-written query in the same format as `sweet`
peanut = ["Reese's Peanut Butter Cups Miniatures,M&Ms Peanut Chocolate Candy,Reese's Peanut Butter Big Cup"]

In [26]:
# Peek at item names the fitted model knows about. `im` was never defined in
# this notebook, so the vectorizer's learned vocabulary is used instead
# (CountVectorizer lowercases names by default).
sorted(model.cv.vocabulary_)[:10]

In [None]:
# Recommendations for the `sweet` query (one result list per input string)
model.predict(sweet)

In [None]:
# Recommendations for the `peanut` query (one result list per input string)
model.predict(peanut)

### Appendix

For when your data arrives in wide format (one row per item, one column per user), like this...

In [None]:
# Build a wide example: sample 10 review rows, keep every review by those rows'
# users (so at most 10 distinct users), then pivot to items x users.
# NOTE(review): pivot raises if any (item, user) pair occurs more than once — confirm the data has none.
df = (
    pd.read_csv('data/candy.csv')
    .loc[lambda reviews: reviews['user'].isin(reviews['user'].sample(10))]
    .pivot(index='item', columns='user', values='review')
    .reset_index()
)
df.head(5)

Melt it back into long format (one row per item–user review), like this...

In [None]:
# Wide -> long: one row per (item, user) pair, dropping pairs with no review
df = (
    df.melt(id_vars='item', var_name='user', value_name='review')
    .dropna()
    .reset_index(drop=True)
)

df.head(5)

### Parting Thoughts

![](images/savage.png)

[Source](https://news.ycombinator.com/item?id=20495047)