In [1]:
import json, os
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

from sqlalchemy import create_engine
from flask_sqlalchemy import SQLAlchemy
from sqlalchemy_utils.functions import create_database, database_exists, drop_database

import pickle

## combine all files together

In [2]:
file_paths = []

for file in os.listdir('../data'):
    if 'product' in file:
        file_paths.append('../data/'+file)
file_paths

['../data/product10001_11000.csv',
 '../data/product11001_12000.csv',
 '../data/product12001_13000.csv',
 '../data/product13001_14000.csv',
 '../data/product14001_15000.csv',
 '../data/product1_2000.csv',
 '../data/product2001_3000.csv',
 '../data/product3001_4000.csv',
 '../data/product4001_5000.csv',
 '../data/product5001_6000.csv',
 '../data/product6001_7000.csv',
 '../data/product7001_8000.csv',
 '../data/product8001_9000.csv',
 '../data/product9001_10000.csv']

In [3]:
master_product = {
    'Unnamed: 0': [],
    'asin': [],
    'product_name': [],
    'product_brand': [],
    'product_rating': [],
    'product_rate_count': [],
    'category': [],
    'description': [],
}

master_product = pd.DataFrame(master_product)

In [4]:
for path in file_paths:
    master_product = pd.concat([master_product, pd.read_csv(path)])

master_product.head()

Unnamed: 0.1,Unnamed: 0,asin,product_name,product_brand,product_rating,product_rate_count,category,description
0,0.0,B01B4SHM52,SKIVA Micro USB Cables [4-Pack] USBLink Premiu...,SKIVA,4.4 out of 5 stars,70 ratings,"['Electronics', 'Computers & Accessories', 'Co...",['Make sure this fits\nby entering your model ...
1,1.0,B00XBL7LPM,GUANHE USB Drive Organizer Electronics Accesso...,GUANHE,4.1 out of 5 stars,176 ratings,"['Electronics', 'Computers & Accessories', 'Co...",['Make sure this fits\nby entering your model ...
2,2.0,B01B4J4WCW,SMA-09 Smart Fitness Tracker Watch Smartwatch ...,SMA,2.5 out of 5 stars,53 ratings,"['Electronics', 'Wearable Technology', 'Smartw...","['Activity Tracking Smart Watch', 'Pedometer,S..."
3,3.0,B015R3XJCK,0,0,0,0,0,0
4,4.0,B00JFJL9MA,Generic FBA_LYSB00JFJL9MA-ELECTRNCS GENERIC Re...,Generic,3.9 out of 5 stars,803 ratings,"['Electronics', 'Accessories & Supplies', 'Aud...",['Make sure this fits\nby entering your model ...


In [5]:
master_product.shape

(14987, 8)

# Drop duplicate values, if any

In [6]:
print(master_product.shape)

master_product = master_product.sort_values(
    ['asin'], ascending=False).drop_duplicates(
        subset = ['asin'], keep ='first')

print(master_product.shape)

(14987, 8)
(14987, 8)


In [7]:
master_product.drop(['Unnamed: 0'], axis = 1,inplace = True)

# Convert product rating and product rate count to numbers instead of words

In [8]:
master_product['product_rating']=master_product['product_rating'].str.replace('out of 5 stars', '').astype(float)    

In [9]:
master_product['product_rate_count']=master_product['product_rate_count'].str.replace(' ratings', '') 

In [10]:
master_product['product_rate_count']=master_product['product_rate_count'].str.replace(' rating', '') 

In [11]:
master_product['product_rate_count']=master_product['product_rate_count'].str.replace(',', '').astype(float) 

# Feature engineering

In [12]:
master_product['rating*count']=master_product['product_rate_count']*master_product['product_rating']

In [13]:
master_product.dtypes

asin                   object
product_name           object
product_brand          object
product_rating        float64
product_rate_count    float64
category               object
description            object
rating*count          float64
dtype: object

# Create a dictionary for asin and name for Flask use

In [14]:
asin_name = master_product[['asin','product_name']].copy()

In [15]:
asin_dict = dict(zip(asin_name['asin'], asin_name['product_name']))

In [87]:
pickle.dump(asin_name, open('../flask_app_complete/asin_name.p', 'wb'))

## There are items that alrady discountinued or meet autobot that returns 0 for values other than item number, will need to double check for them, but will filter them out for now

In [16]:
master_product=master_product[master_product['product_rating']!=0]

In [17]:
master_product.head()

Unnamed: 0,asin,product_name,product_brand,product_rating,product_rate_count,category,description,rating*count
639,B01HJDR9DQ,Geilienergy BT-17333 BT-27333 Handset Telephon...,GEILIENERGY,4.4,66.0,"['Electronics', 'Accessories & Supplies', 'Tel...",['Make sure this fits\nby entering your model ...,290.4
371,B01HJA3OUG,LETOUR DC 5V 30A Power Supply 150W AC 110V/220...,LETOUR,4.2,36.0,"['Electronics', 'Accessories & Supplies', 'Bat...",['DC 5V 30A POWER SUPPLY - Input: AC 110/220V ...,151.2
205,B01HIZEW1C,AmazonBasics DSLR Camera and Laptop Backpack B...,AmazonBasics,4.3,66.0,"['Electronics', 'Computers & Accessories', 'La...",['Make sure this fits\nby entering your model ...,283.8
131,B01HIWBNOY,Amtrak Solar's Powerful Attic Exhaust Fan Quie...,Amtrak Solar,4.0,58.0,"['Electronics', 'Computers & Accessories', 'Co...","['No deductibles or added costs. Parts, labor ...",232.0
696,B01HIURQWE,Dmax Armor for LG G Pad X 10.1 Screen Protecto...,Dmax Armor,4.1,86.0,"['Electronics', 'Computers & Accessories', 'Ta...",['Make sure this fits\nby entering your model ...,352.6


In [18]:
master_product.reset_index(inplace = True)

In [19]:
master_product.head(10)

Unnamed: 0,index,asin,product_name,product_brand,product_rating,product_rate_count,category,description,rating*count
0,639,B01HJDR9DQ,Geilienergy BT-17333 BT-27333 Handset Telephon...,GEILIENERGY,4.4,66.0,"['Electronics', 'Accessories & Supplies', 'Tel...",['Make sure this fits\nby entering your model ...,290.4
1,371,B01HJA3OUG,LETOUR DC 5V 30A Power Supply 150W AC 110V/220...,LETOUR,4.2,36.0,"['Electronics', 'Accessories & Supplies', 'Bat...",['DC 5V 30A POWER SUPPLY - Input: AC 110/220V ...,151.2
2,205,B01HIZEW1C,AmazonBasics DSLR Camera and Laptop Backpack B...,AmazonBasics,4.3,66.0,"['Electronics', 'Computers & Accessories', 'La...",['Make sure this fits\nby entering your model ...,283.8
3,131,B01HIWBNOY,Amtrak Solar's Powerful Attic Exhaust Fan Quie...,Amtrak Solar,4.0,58.0,"['Electronics', 'Computers & Accessories', 'Co...","['No deductibles or added costs. Parts, labor ...",232.0
4,696,B01HIURQWE,Dmax Armor for LG G Pad X 10.1 Screen Protecto...,Dmax Armor,4.1,86.0,"['Electronics', 'Computers & Accessories', 'Ta...",['Make sure this fits\nby entering your model ...,352.6
5,245,B01HIS5O7A,Polk Audio BITB-A Boom Bit Black,Polk BOOM,3.1,123.0,"['Electronics', 'Portable Audio & Video', 'Por...",['Make sure this fits\nby entering your model ...,381.3
6,191,B01HIS5N3K,JAM Rhythm WiFi Home Audio Speaker with Amazon...,Jam,4.0,128.0,"['Electronics', 'Home Audio', 'Speakers']","['No deductibles or added costs. Parts, labor ...",512.0
7,1044,B01HIS30OY,Logitech CREATE Backlit Keyboard Case with Sma...,Logitech,3.5,829.0,"['Electronics', 'Computers & Accessories', 'Ta...",['Make sure this fits\nby entering your model ...,2901.5
8,302,B01HIPOCYE,BOVKE Graphing Calculator Carrying Case for Te...,BOVKE,4.6,759.0,"['Electronics', 'Accessories & Supplies', 'Off...",['Make sure this fits\nby entering your model ...,3491.4
9,693,B01HIL1XZY,Jensen SB2000RB Portable AM/FM Radio in Red,Studebaker,3.9,1028.0,"['Electronics', 'Portable Audio & Video', 'Rad...",['Make sure this fits\nby entering your model ...,4009.2


In [20]:
master_product = master_product[~master_product['description'].str.contains("No deductibles or added costs")]

In [21]:
master_product.shape

(10910, 9)

In [22]:
master_product.isnull().mean()

index                 0.0
asin                  0.0
product_name          0.0
product_brand         0.0
product_rating        0.0
product_rate_count    0.0
category              0.0
description           0.0
rating*count          0.0
dtype: float64

In [23]:
master_product.describe()

Unnamed: 0,index,product_rating,product_rate_count,rating*count
count,10910.0,10910.0,10910.0,10910.0
mean,561.199083,4.104464,426.530339,1783.022997
std,371.766336,0.468395,1271.977329,5397.147366
min,0.0,1.0,1.0,1.0
25%,271.0,3.8,44.0,176.4
50%,535.0,4.2,115.0,468.85
75%,800.0,4.4,333.0,1390.1
max,1999.0,5.0,33024.0,148608.0


In [24]:
master_product.to_csv('../data/master_prod.csv')

## Clean up the category column: needs to convert the list of categories to dummies based on each asin

In [24]:
cate_dict = master_product['category'].to_dict()

In [25]:
for i in cate_dict:
    cate_dict[i] = cate_dict[i].replace('[', '')
    cate_dict[i] = cate_dict[i].replace(']', '')
    cate_dict[i] = cate_dict[i].replace('\'', '')
    #cate_dict[i] = cate_dict[i].split(', ')

In [26]:
df_cate= pd.DataFrame.from_dict(cate_dict, orient = 'index')
df_cate.head()

Unnamed: 0,0
0,"Electronics, Accessories & Supplies, Telephone..."
1,"Electronics, Accessories & Supplies, Batteries..."
2,"Electronics, Computers & Accessories, Laptop A..."
4,"Electronics, Computers & Accessories, Tablet A..."
5,"Electronics, Portable Audio & Video, Portable ..."


In [27]:
df_cate.rename(columns={0:'category'}, inplace = True)

In [28]:
df_cate['asin'] = master_product['asin']
df_cate.head(10)

Unnamed: 0,category,asin
0,"Electronics, Accessories & Supplies, Telephone...",B01HJDR9DQ
1,"Electronics, Accessories & Supplies, Batteries...",B01HJA3OUG
2,"Electronics, Computers & Accessories, Laptop A...",B01HIZEW1C
4,"Electronics, Computers & Accessories, Tablet A...",B01HIURQWE
5,"Electronics, Portable Audio & Video, Portable ...",B01HIS5O7A
7,"Electronics, Computers & Accessories, Tablet A...",B01HIS30OY
8,"Electronics, Accessories & Supplies, Office El...",B01HIPOCYE
9,"Electronics, Portable Audio & Video, Radios",B01HIL1XZY
12,"Electronics, Camera & Photo, Lighting & Studio...",B01HI76QG4
14,"Electronics, Headphones, Earbud Headphones",B01HI5YYN8


In [29]:
df_cate.shape

(10910, 2)

In [30]:
clean_cate = df_cate.set_index('asin').category.str.split(',', expand=True).stack()
clean_cate.head(20)

asin         
B01HJDR9DQ  0                 Electronics
            1      Accessories & Supplies
            2       Telephone Accessories
            3                   Batteries
B01HJA3OUG  0                 Electronics
            1      Accessories & Supplies
            2                   Batteries
            3      Chargers & Accessories
            4            Power Converters
B01HIZEW1C  0                 Electronics
            1     Computers & Accessories
            2          Laptop Accessories
            3                        Bags
            4             Cases & Sleeves
            5                   Backpacks
B01HIURQWE  0                 Electronics
            1     Computers & Accessories
            2          Tablet Accessories
            3           Screen Protectors
B01HIS5O7A  0                 Electronics
dtype: object

In [31]:
clean_cate=pd.get_dummies(clean_cate, prefix='g').groupby(level=0).sum()

In [32]:
clean_cate.reset_index(inplace = True)

In [33]:
clean_cate.head()

Unnamed: 0,asin,"g_ ""Kids Backpacks""","g_ ""Kids Electronics""","g_ ""Mens""",g_ 12V,g_ 2 in 1 Laptops,g_ 3D Glasses,g_ 3D Printer Parts & Accessories,g_ AA,g_ AC Adapters,...,g_Home & Kitchen,g_Industrial & Scientific,g_Musical Instruments,g_Office Products,g_Patio,g_Pet Supplies,g_Sports & Outdoors,g_Tools & Home Improvement,g_Toys & Games,g_Video Games
0,B0048ODJ3Y,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,B004IA9XLU,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,B005ODJO32,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,B008ITQIMO,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,B009IESEBQ,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
clean_cate.isnull().mean().sum()

0.0

In [35]:
clean_cate.shape

(10910, 866)

## Cleanup the brand column

In [36]:
product_brand=master_product[['product_brand']].copy()

In [37]:
product_brand.head(10)

Unnamed: 0,product_brand
0,GEILIENERGY
1,LETOUR
2,AmazonBasics
4,Dmax Armor
5,Polk BOOM
7,Logitech
8,BOVKE
9,Studebaker
12,Godox
14,TRILINK


In [38]:
brand=pd.get_dummies(product_brand)

In [39]:
brand['asin'] = master_product['asin']
brand.head(10)

Unnamed: 0,product_brand_1 BY ONE,product_brand_10Gtek,product_brand_1MORE,product_brand_1PLUS,product_brand_1byone,product_brand_1st Choice,product_brand_2win2buy,product_brand_2xhome,product_brand_321OU,product_brand_360 Electrical,...,product_brand_wsdcam,product_brand_xcivi,product_brand_xinyuansu,product_brand_xmartO,product_brand_yIFeNG,product_brand_yinglite,product_brand_yueton,product_brand_zealsound,product_brand_zhi yun,asin
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,B01HJDR9DQ
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,B01HJA3OUG
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,B01HIZEW1C
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,B01HIURQWE
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,B01HIS5O7A
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,B01HIS30OY
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,B01HIPOCYE
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,B01HIL1XZY
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,B01HI76QG4
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,B01HI5YYN8


## Combine brand and category to one dataFrame

In [40]:
brand = brand.sort_values('asin')

In [41]:
clean_cate = clean_cate.sort_values('asin')

In [42]:
brand.shape

(10910, 3571)

In [43]:
brand.isnull().mean().sum()

0.0

In [44]:
clean_cate.shape

(10910, 866)

In [45]:
clean_cate.isnull().mean().sum()

0.0

In [46]:
list(brand['asin']) == list(clean_cate['asin'])

True

In [47]:
bran_cate = pd.merge(brand,clean_cate,on = 'asin')

In [48]:
bran_cate.head(10)

Unnamed: 0,product_brand_1 BY ONE,product_brand_10Gtek,product_brand_1MORE,product_brand_1PLUS,product_brand_1byone,product_brand_1st Choice,product_brand_2win2buy,product_brand_2xhome,product_brand_321OU,product_brand_360 Electrical,...,g_Home & Kitchen,g_Industrial & Scientific,g_Musical Instruments,g_Office Products,g_Patio,g_Pet Supplies,g_Sports & Outdoors,g_Tools & Home Improvement,g_Toys & Games,g_Video Games
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
bran_cate_rating = pd.merge(bran_cate,master_product,on = 'asin')

In [50]:
bran_cate_rating.drop(columns = ['index','product_name','product_brand','category','description'],inplace = True)

In [51]:
bran_cate_rating.head()

Unnamed: 0,product_brand_1 BY ONE,product_brand_10Gtek,product_brand_1MORE,product_brand_1PLUS,product_brand_1byone,product_brand_1st Choice,product_brand_2win2buy,product_brand_2xhome,product_brand_321OU,product_brand_360 Electrical,...,g_Office Products,g_Patio,g_Pet Supplies,g_Sports & Outdoors,g_Tools & Home Improvement,g_Toys & Games,g_Video Games,product_rating,product_rate_count,rating*count
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4.7,1.0,4.7
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4.7,1.0,4.7
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5.0,1.0,5.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4.8,90.0,432.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3.8,130.0,494.0


In [52]:
bran_cate_rating.set_index('asin',inplace = True)

In [53]:
bran_cate_rating.head()

Unnamed: 0_level_0,product_brand_1 BY ONE,product_brand_10Gtek,product_brand_1MORE,product_brand_1PLUS,product_brand_1byone,product_brand_1st Choice,product_brand_2win2buy,product_brand_2xhome,product_brand_321OU,product_brand_360 Electrical,...,g_Office Products,g_Patio,g_Pet Supplies,g_Sports & Outdoors,g_Tools & Home Improvement,g_Toys & Games,g_Video Games,product_rating,product_rate_count,rating*count
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B0048ODJ3Y,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4.7,1.0,4.7
B004IA9XLU,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4.7,1.0,4.7
B005ODJO32,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5.0,1.0,5.0
B008ITQIMO,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4.8,90.0,432.0
B009IESEBQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3.8,130.0,494.0


In [54]:
recommender_b4 = pairwise_distances(bran_cate_rating, metric='cosine')

In [55]:
recommender_b4 = pd.DataFrame(recommender_b4)
recommender_b4.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10900,10901,10902,10903,10904,10905,10906,10907,10908,10909
0,1.110223e-16,0.07971303,0.05465387,0.3141832,0.3169706,0.321734,0.319922,0.318876,0.321379,0.321617,...,0.312216,0.291686,0.321583,0.320775,0.322444,0.31976,0.314131,0.311609,0.303362,0.311544
1,0.07971303,1.110223e-16,0.09172627,0.3138633,0.317247,0.321728,0.319754,0.319462,0.321379,0.321601,...,0.312216,0.294182,0.321583,0.320775,0.322397,0.31976,0.313742,0.31064,0.303362,0.311544
2,0.05465387,0.09172627,2.220446e-16,0.3231005,0.3262962,0.330786,0.329304,0.328904,0.330237,0.330716,...,0.32144,0.29879,0.330778,0.329678,0.331852,0.329484,0.323329,0.320752,0.312659,0.320646
3,0.3141832,0.3138633,0.3231005,1.110223e-16,0.001378758,0.000477,0.002186,0.009506,8.2e-05,0.000644,...,0.000611,0.000753,0.001107,0.000101,0.00272,0.005713,0.000597,0.000318,0.000659,0.00021
4,0.3169706,0.317247,0.3262962,0.001378758,1.110223e-16,0.000316,0.000111,0.003717,0.001173,0.000199,...,0.000214,0.001738,5.2e-05,0.000966,0.000251,0.001519,0.000196,0.000488,0.000592,0.000633


In [56]:
recommender_b4.columns = bran_cate_rating.index

In [57]:
recommender_b4.index =bran_cate_rating.index

In [58]:
recommender_b4.head()

asin,B0048ODJ3Y,B004IA9XLU,B005ODJO32,B008ITQIMO,B009IESEBQ,B009ZIILLI,B009ZRBVDE,B00A926XLE,B00A9AQPUU,B00AI3RRAA,...,B01HI5YYN8,B01HI76QG4,B01HIL1XZY,B01HIPOCYE,B01HIS30OY,B01HIS5O7A,B01HIURQWE,B01HIZEW1C,B01HJA3OUG,B01HJDR9DQ
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B0048ODJ3Y,1.110223e-16,0.07971303,0.05465387,0.3141832,0.3169706,0.321734,0.319922,0.318876,0.321379,0.321617,...,0.312216,0.291686,0.321583,0.320775,0.322444,0.31976,0.314131,0.311609,0.303362,0.311544
B004IA9XLU,0.07971303,1.110223e-16,0.09172627,0.3138633,0.317247,0.321728,0.319754,0.319462,0.321379,0.321601,...,0.312216,0.294182,0.321583,0.320775,0.322397,0.31976,0.313742,0.31064,0.303362,0.311544
B005ODJO32,0.05465387,0.09172627,2.220446e-16,0.3231005,0.3262962,0.330786,0.329304,0.328904,0.330237,0.330716,...,0.32144,0.29879,0.330778,0.329678,0.331852,0.329484,0.323329,0.320752,0.312659,0.320646
B008ITQIMO,0.3141832,0.3138633,0.3231005,1.110223e-16,0.001378758,0.000477,0.002186,0.009506,8.2e-05,0.000644,...,0.000611,0.000753,0.001107,0.000101,0.00272,0.005713,0.000597,0.000318,0.000659,0.00021
B009IESEBQ,0.3169706,0.317247,0.3262962,0.001378758,1.110223e-16,0.000316,0.000111,0.003717,0.001173,0.000199,...,0.000214,0.001738,5.2e-05,0.000966,0.000251,0.001519,0.000196,0.000488,0.000592,0.000633


In [60]:

search = 'B009IESEBQ'

for num in master_product.loc[master_product['asin'].str.contains(search), 'asin']:
    print(num)
    #print('Average rating', pivot.loc[title, :].mean())
    #print('Number of ratings', pivot.T[title].count())
    print('')
    print('10 closest products')
    print(round(recommender_b4[num].sort_values(),6)[1:15])
    print('')
    print('*******************************************************************************************')
    print('')

B009IESEBQ

10 closest products
asin
B00MGF7RB2    0.000010
B0156MFBI2    0.000011
B00VNJLBPK    0.000011
B00PH0E7RK    0.000011
B00YQD94RW    0.000012
B00JSJXH3Q    0.000013
B00QGJ91A4    0.000013
B00KCZSFWI    0.000013
B01D73XM24    0.000013
B0106HXV52    0.000013
B0115ZHH68    0.000014
B00LJX4VUW    0.000014
B012PEJ3AQ    0.000014
B012PELN7M    0.000014
Name: B009IESEBQ, dtype: float64

*******************************************************************************************



In [57]:
pickle.dump(recommender_b4, open('../flask_app_complete/recommender_b4.p', 'wb'))

# Tried KMean for category and brand but does not work well

In [60]:
kmeans_20 = KMeans(n_clusters = 20,random_state = 42,n_jobs = 3).fit(bran_cate)

In [61]:
silhouette_score(bran_cate, kmeans_20.labels_)

0.1375179036836911

In [62]:
kmeans_50 = KMeans(n_clusters = 50,random_state = 42,n_jobs = 3).fit(bran_cate)

In [63]:
silhouette_score(bran_cate, kmeans_50.labels_)

0.16981463985533263

In [64]:
kmeans_100 = KMeans(n_clusters = 100,random_state = 42,n_jobs = 3).fit(bran_cate)

In [65]:
silhouette_score(bran_cate, kmeans_100.labels_)

0.20576311971885797

In [66]:
kmeans_500 = KMeans(n_clusters = 500,random_state = 42,n_jobs = 3).fit(bran_cate)

In [67]:
silhouette_score(bran_cate, kmeans_500.labels_)

0.2577805597554169

In [320]:
bran_cate.head()

Unnamed: 0_level_0,product_brand_1 BY ONE,product_brand_10Gtek,product_brand_1MORE,product_brand_1PLUS,product_brand_1byone,product_brand_1st Choice,product_brand_2win2buy,product_brand_2xhome,product_brand_321OU,product_brand_360 Electrical,...,g_Home & Kitchen,g_Industrial & Scientific,g_Musical Instruments,g_Office Products,g_Patio,g_Pet Supplies,g_Sports & Outdoors,g_Tools & Home Improvement,g_Toys & Games,g_Video Games
brand_asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B0048ODJ3Y,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B004IA9XLU,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B005ODJO32,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B008ITQIMO,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B009IESEBQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
bran_cate.loc[:,'cluster'] = kmeans_500.predict(bran_cate)

In [70]:
new=bran_cate.loc[:,'cluster']

In [71]:
Bran_cate = pd.DataFrame(new)

In [73]:
cluster = pd.get_dummies(columns=["cluster"], data=Bran_cate)
cluster.head()

Unnamed: 0_level_0,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,...,cluster_490,cluster_491,cluster_492,cluster_493,cluster_494,cluster_495,cluster_496,cluster_497,cluster_498,cluster_499
brand_asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B0048ODJ3Y,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B004IA9XLU,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B005ODJO32,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B008ITQIMO,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B009IESEBQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
cluster.shape

(10910, 500)

In [75]:
cluster.reset_index(inplace = True)

In [76]:
master_product.shape

(10910, 8)

In [77]:
cluster.rename(columns={'brand_asin':'asin'},inplace = True)

In [78]:
combined_cluster_rating =pd.merge(cluster, master_product, on='asin')

In [80]:
combined_cluster_rating.drop(columns = ['index','product_name','product_brand','category','description'],inplace = True)

In [82]:
combined_cluster_rating.set_index('asin',inplace = True)


In [84]:
recommender_content = pairwise_distances(combined_cluster_rating, metric='cosine')

array([[1.11022302e-16, 4.15110004e-02, 3.93486392e-02, ...,
        7.34462478e-01, 6.86783131e-01, 7.33041359e-01],
       [4.15110004e-02, 1.11022302e-16, 3.93486392e-02, ...,
        7.34462478e-01, 6.86783131e-01, 7.33041359e-01],
       [3.93486392e-02, 3.93486392e-02, 0.00000000e+00, ...,
        7.45426717e-01, 6.97455298e-01, 7.43997462e-01],
       ...,
       [7.34462478e-01, 7.34462478e-01, 7.45426717e-01, ...,
        0.00000000e+00, 1.79840017e-03, 2.29660788e-04],
       [6.86783131e-01, 6.86783131e-01, 6.97455298e-01, ...,
        1.79840017e-03, 1.11022302e-16, 1.72252273e-03],
       [7.33041359e-01, 7.33041359e-01, 7.43997462e-01, ...,
        2.29660788e-04, 1.72252273e-03, 1.11022302e-16]])

In [85]:
recommender_content=pd.DataFrame(recommender_content)

In [86]:
recommender_content.columns = combined_cluster_rating.index
recommender_content.index = combined_cluster_rating.index
recommender_content.head()

asin,B0048ODJ3Y,B004IA9XLU,B005ODJO32,B008ITQIMO,B009IESEBQ,B009ZIILLI,B009ZRBVDE,B00A926XLE,B00A9AQPUU,B00AI3RRAA,...,B01HI5YYN8,B01HI76QG4,B01HIL1XZY,B01HIPOCYE,B01HIS30OY,B01HIS5O7A,B01HIURQWE,B01HIZEW1C,B01HJA3OUG,B01HJDR9DQ
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B0048ODJ3Y,1.110223e-16,0.041511,0.039349,0.745563,0.768372,0.795521,0.780898,0.763315,0.794382,0.794284,...,0.739843,0.619961,0.792626,0.790458,0.792217,0.772203,0.750905,0.734462,0.686783,0.733041
B004IA9XLU,0.041511,1.110223e-16,0.039349,0.745563,0.768372,0.795521,0.780898,0.763315,0.794382,0.794284,...,0.739843,0.619961,0.792626,0.790458,0.792217,0.772203,0.750905,0.734462,0.686783,0.733041
B005ODJO32,0.03934864,0.03934864,0.0,0.756591,0.779523,0.80681,0.792114,0.774439,0.805665,0.805567,...,0.750838,0.630166,0.803901,0.801722,0.803489,0.783374,0.761962,0.745427,0.697455,0.743997
B008ITQIMO,0.7455634,0.7455634,0.756591,0.0,0.000381,0.00144,0.000764,0.000313,0.001378,0.001373,...,0.000185,0.010135,0.001286,0.001177,0.001265,0.000489,0.000145,0.000245,0.002416,0.000264
B009IESEBQ,0.7683722,0.7683722,0.779523,0.000381,0.0,0.000434,0.000126,0.00012,0.000401,0.000399,...,0.000588,0.013651,0.000353,0.000299,0.000343,7.1e-05,0.000267,0.000786,0.004183,0.000841


In [128]:
search = 'B00KMRVGFO'
#for asin in recommender_content.index.str.contains(search):
print(num)
print('')
print('10 closest products')
print(recommender_content[search].sort_values()[1:11])


B00KMRVGFO

10 closest products
asin
B00JU24Z3W    1.012060e-07
B017VQ3AWU    2.564708e-07
B01A6G0CTQ    3.781160e-07
B01809N39W    4.347386e-07
B00NIGO4NM    6.475875e-07
B00QT7LQ88    9.986262e-07
B01CPZHPZK    1.183353e-06
B01GN00OQA    1.255256e-06
B00YN34ISA    1.370880e-06
B01AFNBC3K    1.683074e-06
Name: B00KMRVGFO, dtype: float64


In [59]:
pickle.dump(asin_name, open('../flask_app_complete/asin_name.p', 'wb'))

In [88]:
pickle.dump(recommender_content, open('../flask_app_complete/recommender_content.p', 'wb'))

# TFIDF for item names

In [81]:
tfidf = TfidfVectorizer(stop_words='english',min_df=2,max_df=.90,ngram_range=(1,2))

In [82]:
tfidf_1000 = TfidfVectorizer(stop_words='english',min_df=2,max_df=.90,ngram_range=(1,2),max_features=1000)

In [83]:
name_matrix = tfidf_1000.fit_transform(master_product['product_name'])

In [84]:
name_matrix_1000 = tfidf_1000.fit_transform(master_product['product_name'])

In [85]:
name_matrix

<10910x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 126158 stored elements in Compressed Sparse Row format>

In [86]:
name_df = pd.DataFrame(name_matrix_1000.toarray(), columns=tfidf_1000.get_feature_names())

name_df.head()

Unnamed: 0,001,10,10 feet,10 inch,10 pack,100,1000,1080p,10ft,11,...,xp,xps,xr,xs,xs max,year,yellow,yi,yoga,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.150838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [91]:
name_df.shape

(10910, 1000)

In [92]:
name_df.head()

Unnamed: 0,001,10,10 feet,10 inch,10 pack,100,1000,1080p,10ft,11,...,xp,xps,xr,xs,xs max,year,yellow,yi,yoga,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.150838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Use Kmean to see if there is better solution

In [93]:
kmean_name_5 = KMeans(n_clusters = 5,random_state = 42,n_jobs = 3).fit(name_matrix)

In [94]:
silhouette_score(name_matrix, kmean_name_5.labels_)

0.016139420430836347

In [153]:
kmean_name = KMeans(n_clusters = 20,random_state = 42,n_jobs = 3).fit(name_matrix)

In [154]:
silhouette_score(name_matrix, kmean_name.labels_)

0.013769255752547652

In [84]:
kmeans_name_200 = KMeans(n_clusters = 200,random_state = 42,n_jobs = 3).fit(name_matrix)

In [85]:
silhouette_score(name_matrix, kmeans_name_200.labels_)

0.031056531123148937

In [95]:
name_df.loc[:,'cluster'] = kmean_name_5.predict(name_df)

In [96]:
name_df[['cluster']]

Unnamed: 0,cluster
0,4
1,4
2,1
3,0
4,4
...,...
10905,1
10906,2
10907,4
10908,4


In [97]:
bran_cate_rating.reset_index(inplace = True)

In [98]:
name_df['asin']=bran_cate_rating['asin']

In [99]:
list(bran_cate['asin']) == list(name_df['asin'])

True

In [101]:
name_cluster_df=name_df[['asin','cluster']]
name_cluster_df.head(10)

Unnamed: 0,asin,cluster
0,B0048ODJ3Y,4
1,B004IA9XLU,4
2,B005ODJO32,1
3,B008ITQIMO,0
4,B009IESEBQ,4
5,B009ZIILLI,0
6,B009ZRBVDE,4
7,B00A926XLE,4
8,B00A9AQPUU,4
9,B00AI3RRAA,3


In [102]:
name_df.set_index('asin',inplace = True)

# As clustering does not improve the result, will use brand, name and top 1000 name features

In [103]:
bcn_df = pd.merge(bran_cate_rating,name_df, on = 'asin')

In [104]:
bcn_df.shape

(10910, 5442)

In [107]:
bcn_df.drop(columns = ['level_0','index','cluster'],inplace = True)

In [108]:
bcn_df.head(10)

Unnamed: 0,asin,product_brand_1 BY ONE,product_brand_10Gtek,product_brand_1MORE,product_brand_1PLUS,product_brand_1byone,product_brand_1st Choice,product_brand_2win2buy,product_brand_2xhome,product_brand_321OU,...,xp,xps,xr,xs,xs max,year,yellow,yi,yoga,zoom
0,B0048ODJ3Y,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,B004IA9XLU,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,B005ODJO32,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,B008ITQIMO,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,B009IESEBQ,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,B009ZIILLI,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,B009ZRBVDE,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,B00A926XLE,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,B00A9AQPUU,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,B00AI3RRAA,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
bcn_df.isnull().sum().sum()

0

In [110]:
bcn_df.set_index('asin',inplace = True)

# Recommender with brand, category and name, with top 1000 features

In [111]:
recommender_bcn = pairwise_distances(bcn_df, metric='cosine')

In [112]:
recommender_bcn=pd.DataFrame(recommender_bcn)

In [113]:
recommender_bcn.columns = bcn_df.index
recommender_bcn.index = bcn_df.index
recommender_bcn.head()

asin,B0048ODJ3Y,B004IA9XLU,B005ODJO32,B008ITQIMO,B009IESEBQ,B009ZIILLI,B009ZRBVDE,B00A926XLE,B00A9AQPUU,B00AI3RRAA,...,B01HI5YYN8,B01HI76QG4,B01HIL1XZY,B01HIPOCYE,B01HIS30OY,B01HIS5O7A,B01HIURQWE,B01HIZEW1C,B01HJA3OUG,B01HJDR9DQ
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B0048ODJ3Y,3.330669e-16,0.097694,0.071902,0.320918,0.323678,0.328393,0.326599,0.325568,0.328041,0.328278,...,0.318972,0.298667,0.328243,0.327443,0.329094,0.326423,0.320867,0.318372,0.310202,0.318296
B004IA9XLU,0.09769441,0.0,0.108298,0.320601,0.323951,0.328387,0.326432,0.326149,0.328041,0.328261,...,0.318972,0.301139,0.328243,0.32744,0.329049,0.326422,0.320482,0.317412,0.310127,0.318307
B005ODJO32,0.07190159,0.108298,0.0,0.328863,0.332007,0.336481,0.335004,0.334621,0.335937,0.336412,...,0.327203,0.304785,0.336472,0.335383,0.337538,0.33516,0.329091,0.326537,0.318522,0.326418
B008ITQIMO,0.3209181,0.320601,0.328863,0.0,0.001383,0.000479,0.002189,0.009517,8.4e-05,0.000646,...,0.00062,0.000795,0.001109,0.000104,0.002723,0.005719,0.000603,0.000326,0.000682,0.000218
B009IESEBQ,0.3236776,0.323951,0.332007,0.001383,0.0,0.000318,0.000114,0.003727,0.001175,0.000201,...,0.000221,0.001779,5.4e-05,0.000968,0.000253,0.001523,0.000202,0.000496,0.000615,0.00064


In [114]:
search = 'B01HI5YYN8'

for num in master_product.loc[master_product['asin'].str.contains(search), 'asin']:
    print(num)
    #print('Average rating', pivot.loc[title, :].mean())
    #print('Number of ratings', pivot.T[title].count())
    print('')
    print('10 closest products')
    print(round(recommender_bcn[num].sort_values(),6)[1:11])
    print('')
    print('*******************************************************************************************')
    print('')

B01HI5YYN8

10 closest products
asin
B00TBI3G1C    0.000023
B014E9QLGE    0.000028
B01FECB318    0.000031
B0117RFP6I    0.000033
B00LT295L8    0.000033
B00YBC8LRM    0.000037
B00NG0QJDI    0.000039
B01B0C5MHM    0.000039
B00W8YAWDQ    0.000039
B01CU4PRTW    0.000039
Name: B01HI5YYN8, dtype: float64

*******************************************************************************************



In [115]:
search = 'Security Camera'

print(asin_name[asin_name['product_name'].str.contains(search)].iloc[0,0])
print(asin_name[asin_name['product_name'].str.contains(search)].iloc[0,1])
print(' ')
print('Recommendation')
print('10 closest product')
indexs = list(recommender_bcn[asin_name[asin_name['product_name'].str.contains(search)].iloc[0,0]].sort_values()[1:6].index)
for i in indexs:
    print (i,asin_dict[i])
    print ('---------------------')

B01HDZM6N8
 
Recommendation
10 closest product
B00TGAJ9X4 Nikon WU-1a Wireless Mobile Adapter 27081 for Nikon Df, Nikon 1 S2, COOLPIX P530, D3300, COOLPIX P7800, COOLPIX P330, COOLPIX A, D7100, COOLPIX P520, D5200, D3200 (Renewed)
---------------------
B00DUNYW7G Fujifilm Instax Mini Rainbow Film - 10 Exposures
---------------------
B015IY34FA Fujifilm Instax Mini Instant Film, 10 Sheets×5 Pack(Total 50 Shoots)
---------------------
B00HQ4W58I Nikon COOLPIX S3600 20.1 MP Digital Camera with 8x Zoom NIKKOR Lens and 720p HD Video (Silver) (Discontinued by Manufacturer)
---------------------
B00HLDFNKQ Canon PowerShot ELPH 340 HS 16MP Digital Camera (Black)
---------------------


In [85]:
pickle.dump(recommender_bcn, open('../flask_app_complete/recommender_bcn.p', 'wb'))

## This is practical when there is enough RAM to find a proper K

In [None]:
# scores = []
# for k in range(2000,4000):
#     cl = KMeans(n_clusters=k)
#     cl.fit(term_matrix)
#     inertia = cl.inertia_
#     sil = silhouette_score(term_matrix, cl.labels_)
#     scores.append([k, inertia, sil])
    
# score_df = pd.DataFrame(scores)
# score_df.columns = ['k', 'inertia', 'silhouette']

# User-based Recommender

In [160]:
review = pd.read_csv('../data/year_2018.csv')

In [161]:
review.head()

Unnamed: 0.1,Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewText,summary
0,6739449,5.0,,True,2018-10-04,A2IXGBWKX73O21,B01HI7D4VY,,"quality, very bright","for the price, excellent"
1,6739257,5.0,,True,2018-10-04,AHEX3FYTJ2AX6,B01HH36F74,{'Color:': ' CA100'},Works as expected.,Works as expected.
2,6738877,3.0,,False,2018-10-03,A2J77LP43D7QMX,B01HF0YGCK,{'Color:': ' Black'},For a 3 pack they had a great price when I bou...,Great for the price
3,6739102,5.0,,True,2018-10-03,A32FN9RNY65CKH,B01HGF6XTI,{'Size:': ' Charger'},Went to Boston and forgot my charger.\nOrdered...,Worked fine - got me through in a pinch
4,6735704,5.0,,True,2018-10-02,A2ZE26LV7HVSAD,B01GUITK24,{'Style Name:': ' Bookshelf Speaker Mount 2 Pa...,I had to use my own drywall anchors which were...,Solid and kind of heavy


In [162]:
review.shape

(377430, 10)

In [163]:
review.dtypes

Unnamed: 0      int64
overall       float64
vote          float64
verified         bool
reviewTime     object
reviewerID     object
asin           object
style          object
reviewText     object
summary        object
dtype: object

In [164]:
review.drop(['Unnamed: 0'], axis = 1,inplace = True)

In [165]:
review.rename(columns={'overall':'individual_rating'}, inplace=True)

In [172]:
review.head(5)

Unnamed: 0,individual_rating,vote,verified,reviewTime,reviewerID,asin,style,reviewText,summary
0,5.0,,True,2018-10-04,A2IXGBWKX73O21,B01HI7D4VY,,"quality, very bright","for the price, excellent"
1,5.0,,True,2018-10-04,AHEX3FYTJ2AX6,B01HH36F74,{'Color:': ' CA100'},Works as expected.,Works as expected.
2,3.0,,False,2018-10-03,A2J77LP43D7QMX,B01HF0YGCK,{'Color:': ' Black'},For a 3 pack they had a great price when I bou...,Great for the price
3,5.0,,True,2018-10-03,A32FN9RNY65CKH,B01HGF6XTI,{'Size:': ' Charger'},Went to Boston and forgot my charger.\nOrdered...,Worked fine - got me through in a pinch
4,5.0,,True,2018-10-02,A2ZE26LV7HVSAD,B01GUITK24,{'Style Name:': ' Bookshelf Speaker Mount 2 Pa...,I had to use my own drywall anchors which were...,Solid and kind of heavy


In [173]:
review.tail(5)

Unnamed: 0,individual_rating,vote,verified,reviewTime,reviewerID,asin,style,reviewText,summary
377425,5.0,,True,2018-01-01,A1F022GE3LCJHC,B00TE8XKIS,"{'Color:': ' White', 'Style:': ' Printer Only'}",Great product at great price,Five Stars
377426,5.0,,True,2018-01-01,A2BW5YKVJOZQSU,B000WHDGGQ,,Great!,Great!
377427,4.0,3.0,True,2018-01-01,AXUN2LN46O7R6,B01BESQYJW,{'Style:': ' Lens'},Too bulky!! Too heavy!! Otherwise a great lens,Otherwise a great
377428,5.0,,True,2018-01-01,A23FBKO33UK1HR,B016ZF44T6,,"hooked right up, no fuss. the horizontal bars...",exactly right. built well.
377429,5.0,,True,2018-01-01,A2Y5LRR5CO8V4X,B00GUX2YTQ,{'Size:': ' Batt'},Great!,Great!


In [175]:
mask=review[['individual_rating','reviewerID','asin']]
mask.head()

Unnamed: 0,individual_rating,reviewerID,asin
0,5.0,A2IXGBWKX73O21,B01HI7D4VY
1,5.0,AHEX3FYTJ2AX6,B01HH36F74
2,3.0,A2J77LP43D7QMX,B01HF0YGCK
3,5.0,A32FN9RNY65CKH,B01HGF6XTI
4,5.0,A2ZE26LV7HVSAD,B01GUITK24


In [177]:
mask.shape

(377430, 3)

In [171]:
pivot = pd.pivot_table(mask, index='asin', columns='reviewerID', values='individual_rating')
pivot.head()

IndexError: index 940183479 is out of bounds for axis 0 with size 940136683

In [121]:
review['vote'].fillna(0, inplace = True)

In [122]:
review['verified'] = review['verified'].astype(int)

In [123]:
review.head()

Unnamed: 0,individual_rating,vote,verified,reviewTime,reviewerID,asin,style,reviewText,summary
0,5.0,0.0,1,2018-10-04,A2IXGBWKX73O21,B01HI7D4VY,,"quality, very bright","for the price, excellent"
1,5.0,0.0,1,2018-10-04,AHEX3FYTJ2AX6,B01HH36F74,{'Color:': ' CA100'},Works as expected.,Works as expected.
2,3.0,0.0,0,2018-10-03,A2J77LP43D7QMX,B01HF0YGCK,{'Color:': ' Black'},For a 3 pack they had a great price when I bou...,Great for the price
3,5.0,0.0,1,2018-10-03,A32FN9RNY65CKH,B01HGF6XTI,{'Size:': ' Charger'},Went to Boston and forgot my charger.\nOrdered...,Worked fine - got me through in a pinch
4,5.0,0.0,1,2018-10-02,A2ZE26LV7HVSAD,B01GUITK24,{'Style Name:': ' Bookshelf Speaker Mount 2 Pa...,I had to use my own drywall anchors which were...,Solid and kind of heavy


In [124]:
review.fillna('empty_text',inplace = True)

In [125]:
review['content']=review['summary']+review['reviewText']

In [126]:
review[review['content']=='empty_textempty_text'].count() 

individual_rating    37
vote                 37
verified             37
reviewTime           37
reviewerID           37
asin                 37
style                37
reviewText           37
summary              37
content              37
dtype: int64

In [127]:
review.isnull().sum()

individual_rating    0
vote                 0
verified             0
reviewTime           0
reviewerID           0
asin                 0
style                0
reviewText           0
summary              0
content              0
dtype: int64

## for user based recommendation system

In [37]:
#review.fillna(0, inplace = True)

In [181]:
review.drop('reviewTime', axis=1, inplace=True)
master_product.drop(['category','description'], axis=1, inplace=True)

KeyError: "['reviewTime'] not found in axis"

In [182]:
df = pd.merge(master_product, review, on='asin')
df.shape

(161216, 14)

In [183]:
df.head(10)

Unnamed: 0,index,asin,product_name,product_brand,product_rating,product_rate_count,rating*count,individual_rating,vote,verified,reviewerID,style,reviewText,summary
0,639,B01HJDR9DQ,Geilienergy BT-17333 BT-27333 Handset Telephon...,GEILIENERGY,4.4,66.0,290.4,5.0,,True,A36X6GTJ2CQY94,{'Color:': ' 2PCS BT17333 Batteries'},Very good.,Five Stars
1,639,B01HJDR9DQ,Geilienergy BT-17333 BT-27333 Handset Telephon...,GEILIENERGY,4.4,66.0,290.4,5.0,,True,AFAI8GK0O6ZZG,{'Color:': ' 1PCS BT17333 Battery'},Fits the old cordless phones just fine and wor...,Fits and works. No issues.
2,639,B01HJDR9DQ,Geilienergy BT-17333 BT-27333 Handset Telephon...,GEILIENERGY,4.4,66.0,290.4,4.0,,True,A2HEUG9KBZL6KA,{'Color:': ' 2PCS BT17333 Batteries'},Worked as specified.,Four Stars
3,639,B01HJDR9DQ,Geilienergy BT-17333 BT-27333 Handset Telephon...,GEILIENERGY,4.4,66.0,290.4,5.0,,True,A3EKPQO4POAPU3,{'Color:': ' 1PCS BT17333 Battery'},good for the price (new),Five Stars
4,371,B01HJA3OUG,LETOUR DC 5V 30A Power Supply 150W AC 110V/220...,LETOUR,4.2,36.0,151.2,4.0,,True,A3BBXKRW6RRYBD,{'Color:': ' 5V 30A'},"It's doing its job, so I can't really complain...","Arrived Damaged, Still Works -- Wouldn't Excee..."
5,371,B01HJA3OUG,LETOUR DC 5V 30A Power Supply 150W AC 110V/220...,LETOUR,4.2,36.0,151.2,5.0,,True,AE50B0MLAS1B9,{'Color:': ' 5V 30A'},Works as advertised,Great product
6,205,B01HIZEW1C,AmazonBasics DSLR Camera and Laptop Backpack B...,AmazonBasics,4.3,66.0,283.8,5.0,,True,AO0PMFJ9VX8QY,,This is the perfect carry-on backpack; Amazon ...,Best AmazonBasics Backpack Yet!
7,205,B01HIZEW1C,AmazonBasics DSLR Camera and Laptop Backpack B...,AmazonBasics,4.3,66.0,283.8,5.0,,True,A2RU0H9MD4IH5M,,"Great backpack fits my camera, lens, gimbal, m...",good all around starter camera bag
8,205,B01HIZEW1C,AmazonBasics DSLR Camera and Laptop Backpack B...,AmazonBasics,4.3,66.0,283.8,5.0,,True,A2L12USPGEMCTM,,Love this backbag better then cheaper version....,Love this backbag better then cheaper version
9,696,B01HIURQWE,Dmax Armor for LG G Pad X 10.1 Screen Protecto...,Dmax Armor,4.1,86.0,352.6,5.0,,True,A5M0NUR4XYRLE,,"Fits perfectly, provides protection",Quality product


In [184]:
pivot = pd.pivot_table(df, index='asin', columns='reviewerID', values='individual_rating')
pivot.head()

MemoryError: Unable to allocate array with shape (10910, 101473) and data type float64

In [132]:
sparse_pivot = sparse.csr_matrix(pivot.fillna(0))
print(sparse_pivot)

In [143]:
recommender_user = pairwise_distances(sparse_pivot, metric='cosine')

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [142]:
recommender_user = pd.DataFrame(recommender_user, columns=pivot.index, index=pivot.index)
recommender_user.head()

asin,B0048ODJ3Y,B004IA9XLU,B005ODJO32,B008ITQIMO,B009IESEBQ,B009ZIILLI,B009ZRBVDE,B00A926XLE,B00A9AQPUU,B00AI3RRAA,...,B01HI5YYN8,B01HI76QG4,B01HIL1XZY,B01HIPOCYE,B01HIS30OY,B01HIS5O7A,B01HIURQWE,B01HIZEW1C,B01HJA3OUG,B01HJDR9DQ
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B0048ODJ3Y,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
B004IA9XLU,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
B005ODJO32,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
B008ITQIMO,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
B009IESEBQ,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [213]:
pickle.dump(recommender_user, open('../flask_app_complete/recommender_user.p', 'wb'))

In [147]:
search = 'B01HI5YYN8'

for num in master_product.loc[master_product['asin'].str.contains(search), 'asin']:
    print(num)
    #print('Average rating', pivot.loc[title, :].mean())
    #print('Number of ratings', pivot.T[title].count())
    print('')
    print('10 closest products')
    print(round(recommender_user[num].sort_values(),12)[1:1000])
    print('')
    print('*******************************************************************************************')
    print('')

B01HI5YYN8

10 closest products


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [212]:
search = 'Security Camera'

print(asin_name[asin_name['product_name'].str.contains(search)].iloc[0,0])
print(asin_name[asin_name['product_name'].str.contains(search)].iloc[0,1])
print(' ')
print('Recommendation')
print('10 closest product')
indexs = list(recommender_user[asin_name[asin_name['product_name'].str.contains(search)].iloc[0,0]].sort_values()[1:6].index)
for i in indexs:
    print (i,asin_dict[i])
    print ('---------------------')




B01HDZM6N8
 
Recommendation
10 closest product
B019ASJGGG HD 1080P Wireless IP Camera WiFi Home Security Cameras Baby/Elderly Monitor Nanny Cam Dog Camera Pan/Tilt with Two-Way Audio and Night Vision
---------------------
B01G76VBBW Safstar Photography Softbox Lighting Kit 24"x16" Socket Ligh Photo Portrait Studio Lighting Diffuser Soft Box Equipment (2 Softbox)
---------------------
B01BLG9DNG VonHaus Black Universal Wall Mount Speaker Brackets x 4
---------------------
B00TTLGHSK Arlo Accessory - Indoor/Outdoor Mount - White | Compatible with Arlo, Pro, Pro 2 | (VMA1000)
---------------------
B00VU2Z3J0 Apple 29W USB-C Power Adapter (MJ262LL/A) (Cable Not Included)
---------------------
