In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize

In [2]:
# Read the product catalogue from parquet and preview its first five rows.
df = pd.read_parquet(path='./food_data.parquet', engine='auto')
df.head(5)

Unnamed: 0,shop_id,shop_name,name_product,price_product,rating,rating_average,location,count_review
0,1415675181,Freezy Fresh,Pastel Goreng Kulit Renyah Frozen 5 pcs,38000.0,5,4.9,Bekasi,245
1,1658405839,Korean Jjang,D KING BONIBOL 250 GR ISI 1 PCS BISKUIT SALUT ...,17000.0,5,4.9,Jakarta Barat,213
2,477718102,Chocomory Official Store,Chocomory Choco Pie Isi 20 540 Gr 1 Pc,78000.0,5,4.9,Bogor,1117
3,1495464819,Mayora Official Store,Better Big Pack 120 Gr,8650.0,5,5.0,Jakarta Barat,1736
4,1675639818,Hejo Fresh,Hejo Pempek Dos Vegetarian Pempek Kates Vegan,27900.0,5,4.9,Jakarta Pusat,373


In [3]:
# List the column labels of the loaded frame.
df.columns

Index(['shop_id', 'shop_name', 'name_product', 'price_product', 'rating',
       'rating_average', 'location', 'count_review'],
      dtype='object')

In [4]:
# Bag-of-words encoding of the product names.
# word_tokenize is used as the tokenizer; English stop words are requested.
bow = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words='english',
)
# Fit the vocabulary and encode every product name in one pass.
bank = bow.fit_transform(df['name_product'])

In [5]:
# Row label of the product used as the query in the cells that follow.
idx = 0

In [6]:
# Product name of the query row.
content = df['name_product'].loc[idx]
content

'Pastel Goreng Kulit Renyah Frozen 5 pcs'

In [7]:
# Encode the query name with the vocabulary already fitted on the catalogue.
code = bow.transform(raw_documents=[content])
code

<1x837 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [8]:
#Product Search
from sklearn.metrics.pairwise import cosine_distances

In [9]:
# Cosine distance from the query vector to every encoded product name.
dist = cosine_distances(X=code, Y=bank)
dist

array([[3.33066907e-16, 8.95171516e-01, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 6.58118271e-01, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 7.81782110e-01, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 8.98984746e-01, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 8.66369379e-01, 8.95171516e-01,
        8.45696650e-01, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 7.81782110e-01,
        1.00000000e+00, 1.00000000e+00, 8.98984746e-01, 1.00000000e+00,
        8.66369379e-01, 9.05508882e-01, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.000000

In [10]:
# Top 10 items
# argsort ranks catalogue positions from closest to farthest. The query row is
# removed by its own position instead of assuming it is ranked first: a product
# with an identical name ties with it at distance ~0 and could take that slot,
# which would make the query recommend itself.
ranked = dist.argsort()[0]
rec_idx = ranked[ranked != df.index.get_loc(idx)][:10]
rec_idx

array([  6, 177, 194, 226, 174,  43,  14,  66,  97,  24], dtype=int64)

In [11]:
# rec_idx holds row positions (it comes from argsort), so select positionally;
# label-based .loc only matches while the frame keeps its default 0..n-1 index.
df.iloc[rec_idx]

Unnamed: 0,shop_id,shop_name,name_product,price_product,rating,rating_average,location,count_review
6,1881353462,Korean Jjang,CIMOL LELEH KEJU ISI 20 PCS FROZEN CEMILAN INS...,28000.0,5,4.8,Jakarta Barat,680
177,1566626076,Freezy Fresh,Potato Moza Ball frozen 10 pcs,33000.0,5,4.8,Bekasi,145
194,2091028132,Freezy Fresh,Frozen Combro,69500.0,5,5.0,Bekasi,67
226,589082146,gerai nuget,Pitsa goreng rasa keju isi 6 pizza pigo frozen...,24000.0,5,4.9,Bekasi,151
174,1768870580,SmallFood,15 Pcs Siomay Somay Ikan Frozen Food Cemilan A...,28500.0,5,4.6,Tangerang Selatan,103
43,1377268397,Dimsum 49 Rawamagun,DIMSUM 49 RAWAMANGUN ISI 100 PCS DIMSUM HALAL ...,1999.0,5,5.0,Jakarta Timur,888
14,796812309,Dua Coffee Shop,Risoles Mayo Frozen,30000.0,5,4.8,Jakarta Selatan,306
66,987903935,RICARAJA OFFICIAL,FROZEN FOOD AYAM WOKU Vacuum,55000.0,5,4.6,Jakarta Utara,26
97,812019147,pempekberingin,Pempek Kulit Beringin Vacum Isi 10Pcs,66000.0,5,5.0,Jakarta Barat,311
24,1719573818,MD GROCERY,MILO ENERGY CUBE isi 100 Pcs,65000.0,5,4.9,Bogor,1227


In [12]:
class RecommenderSystem:
    """Content-based recommender over one text column of a product table.

    Each row's text is encoded as a bag-of-words count vector; the
    recommendations for a row are the other rows with the smallest cosine
    distance to it.

    Parameters
    ----------
    df : str, path-like or pandas.DataFrame
        Location of a parquet file to load, or an already-loaded frame
        (stored as-is, not copied).
    content_col : str
        Name of the column holding the text to compare.
    """

    def __init__(self, df, content_col):
        # Accepting a ready-made frame is a backward-compatible extension; any
        # other value is still handed to pd.read_parquet exactly as before.
        self.df = df if isinstance(df, pd.DataFrame) else pd.read_parquet(df)
        self.content_col = content_col
        self.encoder = None  # CountVectorizer, set by fit()
        self.bank = None     # encoded matrix for every row, set by fit()

    def fit(self):
        """Fit the bag-of-words encoder on the content column and cache the encoded rows."""
        self.encoder = CountVectorizer(stop_words='english', tokenizer=word_tokenize)
        self.bank = self.encoder.fit_transform(self.df[self.content_col])

    def transform(self, idx, topr=10):
        """Return the ``topr`` rows most similar to the row labelled ``idx``.

        Parameters
        ----------
        idx : index label
            Label of the query row in ``self.df``.
        topr : int, default 10
            Maximum number of recommendations to return.

        Returns
        -------
        pandas.DataFrame
            Rows of ``self.df`` ordered from smallest to largest cosine
            distance to the query; the query row itself is never included.

        Raises
        ------
        RuntimeError
            If ``fit()`` has not been called yet.
        """
        if self.encoder is None or self.bank is None:
            raise RuntimeError("RecommenderSystem.fit() must be called before transform()")

        # Read from the instance's own frame, not from a notebook-level global.
        content = self.df.loc[idx, self.content_col]
        code = self.encoder.transform([content])
        dist = cosine_distances(code, self.bank)

        # argsort yields row positions. Exclude the query by its position rather
        # than by assuming it sorts first: a row with identical text ties with it.
        ranked = dist.argsort()[0]
        query_pos = self.df.index.get_loc(idx)
        rec_idx = ranked[ranked != query_pos][:topr]

        # Positions -> rows; .iloc stays correct for a non-default index.
        return self.df.iloc[rec_idx]

In [13]:
# Build the recommender from the parquet file, then fit its encoder on the product names.
recsys = RecommenderSystem(df='./food_data.parquet', content_col='name_product')
recsys.fit()

In [14]:
# Row 1 is "D KING BONIBOL 250 GR ..." - show its ten closest products.
recsys.transform(idx=1, topr=10)

Unnamed: 0,shop_id,shop_name,name_product,price_product,rating,rating_average,location,count_review
23,1658426574,Korean Jjang,D KING BONIBOL 1 DUS ISI 4 PCS BISKUIT SALUT C...,67200.0,5,4.8,Jakarta Barat,364
132,2069774457,Rezeki Fresh Market,Kinder Bueno T2 Coklat 43 gr 2 Pcs,14500.0,5,5.0,Jakarta Barat,41
2,477718102,Chocomory Official Store,Chocomory Choco Pie Isi 20 540 Gr 1 Pc,78000.0,5,4.9,Bogor,1117
5,581813548,Chocomory Official Store,Chocomory Cookies Cream Pie Isi 20 540 Gr 1 Pc,78000.0,5,4.9,Bogor,671
262,2096274472,Super Grosir Mama,Biskuit Oatbits Vitafruit 110 gr,6950.0,5,4.9,Jakarta Selatan,38
202,1605352046,Ferry Fish,Piscok Meler Coklat Isi 8,11500.0,5,4.8,Tangerang Selatan,155
38,1520792374,Freezy Fresh,Lotus Biscoff Caramelised 250 gr,39800.0,5,5.0,Bekasi,95
278,830629871,Japfa Best Jakarta,Pempek Seafood Lovers 250 gr,16500.0,5,4.9,Kab. Tangerang,725
180,2047279761,Plaza Korea,MOMOGI STICK 1 BOX ISI 20 PCS CEMILAN EKSTRUDA...,9450.0,5,4.8,Jakarta Barat,202
24,1719573818,MD GROCERY,MILO ENERGY CUBE isi 100 Pcs,65000.0,5,4.9,Bogor,1227
