### The New Person Problem

<img src="images/who_dis.png" width=600>

<img src="images/new_person.png" width=600>

### The Blueprint Solution

In [1]:
import pandas as pd

# Toy like-matrix: each row is one person's 0/1 flags over the six candies
# named in the columns (1 = likes it, 0 = does not).
likes = pd.DataFrame(
    data=[
        [0, 1, 1, 0, 0, 0],
        [0, 1, 1, 1, 0, 0],
        [1, 0, 0, 1, 0, 0],
        [0, 1, 1, 0, 0, 1],
        [0, 0, 0, 1, 1, 1],
    ],
    columns="twix mars reeses skittles snickers lindt".split(),
)

likes

Unnamed: 0,twix,mars,reeses,skittles,snickers,lindt
0,0,1,1,0,0,0
1,0,1,1,1,0,0
2,1,0,0,1,0,0
3,0,1,1,0,0,1
4,0,0,0,1,1,1


In [2]:
from sklearn.metrics.pairwise import euclidean_distances

# Pairwise Euclidean distance between the rows of `likes`: entry (i, j) is the
# distance between row i and row j, so the diagonal is 0 and the table is symmetric.
pd.DataFrame(euclidean_distances(likes))

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,2.0,1.0,2.236068
1,1.0,0.0,1.732051,1.414214,2.0
2,2.0,1.732051,0.0,2.236068,1.732051
3,1.0,1.414214,2.236068,0.0,2.0
4,2.236068,2.0,1.732051,2.0,0.0


In [3]:
# Like-vector for a person who is not in `likes`. Reusing `likes.columns` labels
# each flag with its candy name instead of leaving the columns as 0..5, so the
# row visibly lines up with the like-matrix it is compared against.
new = pd.DataFrame([[0, 0, 1, 0, 0, 1]], columns=likes.columns)

In [4]:
# Distance from every row of `likes` to the single row in `new`; the row with the
# smallest value is the existing entry most similar to the new one.
pd.DataFrame(euclidean_distances(likes, new))

Unnamed: 0,0
0,1.414214
1,1.732051
2,2.0
3,1.0
4,1.732051


### Using real data again...

In [5]:
# Load the raw reviews and keep only the rows whose `review` value is 5 or more.
df = pd.read_csv("data/candy.csv").loc[lambda reviews: reviews["review"] >= 5]

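Prep for CountVectorizer: collapse each user's items into a single comma-separated string, so that every user becomes one "document":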

In [6]:
# One row per user: join that user's items into a single comma-separated string
# and keep the result as a one-column frame indexed by user.
df = df.groupby(["user"])["item"].apply(",".join).to_frame()
df.head()

Unnamed: 0_level_0,item
user,Unnamed: 1_level_1
aaron67,"Kit Kat Minis Crisp Wafers in Milk Chocolate,R..."
aaron68,"Brookside Dark Chocolate Pomegranate Flavor,Re..."
aaron73,Dove Chocolate Promises Silky Smooth Sea Salt ...
abarker,"Reese's Peanut Butter Bunny,Ghirardelli Gourme..."
abigail04,"Kit Kat Minis Crisp Wafers in Milk Chocolate,P..."


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Tokenise on commas only, so each full item name is one token instead of being
# broken up into individual words.
cv = CountVectorizer(tokenizer=lambda x: x.split(","))
# Sparse count matrix: one row per row of `df`, one column per distinct item token.
X = cv.fit_transform(df['item'])

In [8]:
from sklearn.neighbors import NearestNeighbors

# Index the rows of X so the 5 closest rows can be looked up for any query vector.
nn = NearestNeighbors(n_neighbors=5)
nn.fit(X)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [9]:
# Row i lists the positional indices of the 5 rows of X nearest to row i.
# Because the query is the training matrix itself, a row is normally its own
# first neighbour (distance 0).
neighbors = nn.kneighbors(X, return_distance=False)
neighbors

array([[   0,  906, 1764,  853,  373],
       [   1,  399, 2104,  766,  853],
       [   2,  729,  321,  902, 2071],
       ...,
       [2490,  612,  652,  483,  863],
       [2491,  789,  305,  114, 2370],
       [2492,  150,  253,  730, 1231]])

In [10]:
neighbors[0]

array([   0,  906, 1764,  853,  373])

In [11]:
# Pool the items of the rows nearest to row 0. `neighbors[0]` starts with row 0
# itself, so that user's own items are part of the pool as well.
candy = []
for n in neighbors[0]:
    candy.extend(df.iloc[int(n)].values[0].split(","))

# De-duplicate while keeping first-seen order. `list(set(...))` would return the
# same items, but in an order that changes from one kernel session to the next.
list(dict.fromkeys(candy))

["Reese's Peanut Butter Heart",
 'Snickers Peanut Butter Squared Bars',
 'Nestle Baby Ruth Bar',
 "Reese's Peanut Butter Cup Pumpkins Milk Chocolate",
 'Dove Chocolate Silky Smooth Dark Chocolate Singles Bar',
 'Twizzlers Twists Strawberry',
 'Kit Kat Minis Crisp Wafers in Milk Chocolate']

### Wrapping it up and putting a bow on it

In [12]:
# Rebuild the per-user table from the raw file in one pass: keep rows whose
# `review` is 5 or more, then join each user's items into one comma-separated string.
df = (
    pd.read_csv("data/candy.csv")
    .loc[lambda reviews: reviews["review"] >= 5]
    .groupby(["user"])["item"]
    .apply(",".join)
    .to_frame()
)
df.head()

Unnamed: 0_level_0,item
user,Unnamed: 1_level_1
aaron67,"Kit Kat Minis Crisp Wafers in Milk Chocolate,R..."
aaron68,"Brookside Dark Chocolate Pomegranate Flavor,Re..."
aaron73,Dove Chocolate Promises Silky Smooth Sea Salt ...
abarker,"Reese's Peanut Butter Bunny,Ghirardelli Gourme..."
abigail04,"Kit Kat Minis Crisp Wafers in Milk Chocolate,P..."


In [13]:
class NNRecommender:
    """Recommend items by looking at what the nearest training rows contain.

    Every sample is a single string holding one user's items joined by
    ``separator``. ``fit`` turns those strings into item counts with a
    ``CountVectorizer`` that tokenises on ``separator`` and indexes the counts
    with ``NearestNeighbors``. ``predict`` finds the nearest training rows for
    each query string and returns their items, minus the ones the query
    already lists.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of training rows consulted per query.
    separator : str, default=","
        Delimiter between items inside each sample string.
    """

    def __init__(self, n_neighbors=5, separator=","):
        self.separator = separator
        # Split on the separator only, so a full item name stays a single token.
        self.cv = CountVectorizer(tokenizer=lambda x: x.split(separator))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)

    def __repr__(self):
        return f'NNRecommender(n_neighbors={self.nn.n_neighbors}, separator="{self.separator}")'

    def fit(self, X):
        """Vectorise the item strings and build the neighbour index.

        Parameters
        ----------
        X : pandas Series or other positionally indexable sequence of str
            One separator-joined item string per user. The original object is
            kept on ``self.X`` because ``predict`` reads the raw strings back.

        Returns
        -------
        NNRecommender
            ``self``, so calls can be chained.
        """
        self.X = X
        counts = self.cv.fit_transform(X)
        self.nn.fit(counts)
        return self

    def predict(self, X):
        """Return recommended items for each query string.

        Parameters
        ----------
        X : iterable of str
            Query strings in the same separator-joined format used for ``fit``.

        Returns
        -------
        list of list of str
            One list per query, holding the items found in the neighbouring
            training rows that the query does not already contain. Items are
            unique and appear in first-seen order, walking the neighbours in
            the order ``kneighbors`` returns them.
        """
        # Neighbour indices are positional: pandas objects need `.iloc`, while
        # plain sequences (list, tuple, ndarray) are indexed directly.
        rows = getattr(self.X, "iloc", self.X)

        Xp = []
        for Xi in X:
            Xt = self.cv.transform([Xi])
            neighbors = self.nn.kneighbors(Xt, return_distance=False)

            # Split the query once; a set makes the exclusion test O(1).
            already_listed = set(Xi.split(self.separator))

            pooled = []
            for n in neighbors[0]:
                pooled.extend(rows[int(n)].split(self.separator))

            # dict.fromkeys de-duplicates and keeps insertion order, so the
            # result is the same on every run (a set would shuffle it).
            unique_items = dict.fromkeys(pooled)
            Xp.append([item for item in unique_items if item not in already_listed])
        return Xp

In [14]:
# Fit on the per-user item strings. fit() returns the model itself, so its repr
# is what this cell displays.
model = NNRecommender(n_neighbors=5)
model.fit(df["item"])

NNRecommender(n_neighbors=5, separator=",")

In [15]:
# Peek at one user's item string to see the format predict() takes.
# No random_state is passed, so a different user is drawn on every run.
df.sample(1)['item'].values

array(["Dove Chocolate Promises Silky Smooth Almond Dark Chocolate,Hershey's Kisses Milk Chocolates with Almonds,Dove Chocolate Promises Silky Smooth Milk Chocolate,Hershey's Kisses Milk Chocolate"],
      dtype=object)

In [16]:
# Example query in the same comma-joined format as the training strings. It is
# wrapped in a list because predict() iterates over its argument, one query per element.
sweet = ["Airheads Xtremes Sweetly Sour Candy Rainbow Berry,Life Savers Five Flavor Gummies,Twizzlers Pull-N-Peel Candy Cherry"]

In [17]:
# Second example query (a single comma-joined item string inside a list), this
# one made up of peanut and peanut-butter items.
peanut = ["Reese's Peanut Butter Cups Miniatures,M&Ms Peanut Chocolate Candy,Reese's Peanut Butter Big Cup"]

In [18]:
model.predict(sweet)

[["Hershey's Whoppers Malted Milk Balls",
  'Starburst Tropical Fruit Chews Candy',
  "Werther's Original Caramel Hard Candies",
  'Trolli Sour Brite Eggs Candy',
  'Nestle Butterfinger Bites']]

In [19]:
model.predict(peanut)

[["Reese's Outrageous King Size Bar",
  'Snickers Chocolate Bar',
  "Reese's Peanut Butter Egg"]]

In [20]:
import cloudpickle

In [21]:
# cloudpickle instead of the stdlib pickle: the model's CountVectorizer holds a
# lambda tokenizer, and plain pickle cannot serialise lambdas.
with open("model.pkl", "wb") as f:
    cloudpickle.dump(model, f)

In [22]:
del model

In [23]:
# Round-trip check: restore the model from disk. Unpickling can execute code
# stored in the file, so only load a model.pkl that comes from a trusted source.
with open("model.pkl", "rb") as f:
    model = cloudpickle.load(f)

In [24]:
model.predict(peanut)

[["Reese's Outrageous King Size Bar",
  'Snickers Chocolate Bar',
  "Reese's Peanut Butter Egg"]]