# Visualizing Amazon Products

In [1]:
import logging
import os

import h5py
import numpy as np
import pandas as pd
from fse.models import SIF

# Emit INFO-level log records with a timestamp, thread name and level prefix.
logging.basicConfig(
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [2]:
# Directories holding the raw review files and the pickled product metadata.
# The defaults are the original absolute paths; set AMAZON_REVIEWS_DATA_DIR /
# AMAZON_REVIEWS_META_DIR to run the notebook on another machine without
# editing this cell.
# os.path.join(..., "") guarantees the trailing separator that get_data_path
# and get_meta_path rely on when they append the file name directly.
PATH_DATA = os.path.join(
    os.environ.get("AMAZON_REVIEWS_DATA_DIR", "/Volumes/Ext_HDD/Research_Data/RAW/"),
    "",
)
PATH_META = os.path.join(
    os.environ.get("AMAZON_REVIEWS_META_DIR", "/Volumes/Ext_HDD/Research_Data/Meta/"),
    "",
)

# Names of the datasets that load_data reads from the review file.
DATA_COLUMNS = ["asin", "reviewText"]
# Metadata field names (not referenced by the other cells visible here).
META_COLUMNS = ['asin', 'brand', 'title', 'price', 'categories', 'description']

def load_data(category, asins=None):
    """Load the review columns of one product category into a DataFrame.

    The file at ``get_data_path(PATH_DATA, category)`` is opened read-only
    with h5py, and every dataset named in ``DATA_COLUMNS`` is assigned to a
    DataFrame column of the same name.

    Parameters
    ----------
    category
        Category name; interpolated into the file name by ``get_data_path``.
    asins : optional
        When not ``None``, only rows whose ``asin`` value is contained in it
        are returned.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``DATA_COLUMNS``.
    """
    frame = pd.DataFrame(columns=DATA_COLUMNS)
    source_path = get_data_path(PATH_DATA, category)
    with h5py.File(source_path, 'r') as store:
        # Assign while the file is still open so the datasets can be read.
        for column in DATA_COLUMNS:
            frame[column] = store[column]
    if asins is None:
        return frame
    # "@asins" makes query() look up the local variable of that name.
    return frame.query("asin in @asins")

def get_data_path(path_data, category):
    """Return the review file path for ``category``.

    The result is ``path_data`` immediately followed by
    ``reviews_<category>``; no separator is inserted, so ``path_data`` has to
    end with one already.
    """
    return "{}reviews_{}".format(path_data, category)

def get_meta_path(path_meta, category):
    """Return the metadata pickle path for ``category``.

    The result is ``path_meta`` immediately followed by
    ``<category>.pickle``; no separator is inserted, so ``path_meta`` has to
    end with one already.
    """
    return "{}{}.pickle".format(path_meta, category)

In [3]:
# Product category to analyse; it selects both the metadata pickle and the
# review file that the following cells load.
cat = "Electronics"

In [4]:
# Product metadata for the selected category.
# NOTE(review): unpickling can execute arbitrary code, so only point this at
# metadata files from a trusted source.
meta_path = get_meta_path(PATH_META, cat)
meta = pd.read_pickle(meta_path)

In [5]:
# Load the reviews, restricted to ASINs that appear in the metadata index.
known_asins = meta.index.unique()
data = load_data(cat, asins=known_asins)

In [6]:
# Count the reviews per ASIN and select the ASINs with at least 10 of them.
out = np.unique(data.asin, return_counts=True)
unique_asins, review_counts = out
sel = unique_asins[review_counts >= 10]

In [7]:
# Restrict both tables to the ASINs selected in the previous cell.
data = data[data.asin.isin(sel)]
meta = meta[meta.index.isin(sel)]
# Shuffle the reviews. The fixed random_state makes the row order repeatable
# across kernel restarts (an unseeded sample() yields a new order every run);
# 42 matches the seed passed to fast_tsne later in the notebook.
data = data.sample(frac=1, random_state=42)
# Number of metadata rows that survived the filter.
print(len(meta))

97249


In [8]:
from fse import SplitCIndexedList
from collections import defaultdict

# Position <-> ASIN lookups; both follow the row order of meta.index.
IDX_TO_ASIN = list(meta.index)
ASIN_TO_IDX = {asin: position for position, asin in enumerate(IDX_TO_ASIN)}

# Every review text is paired with the position of its product, so the
# custom index has one entry per review.
review_indices = [ASIN_TO_IDX[asin] for asin in data.asin]
indexed_reviews = SplitCIndexedList(data.reviewText.values, custom_index=review_indices)

# Number of reviews per ASIN.
ASIN_TO_COUNT = defaultdict(int)
for asin in data.asin:
    ASIN_TO_COUNT[asin] = ASIN_TO_COUNT[asin] + 1

In [10]:
from gensim.models.keyedvectors import FastTextKeyedVectors
# Load the fastText keyed vectors from a saved gensim model file.
FT_MODEL_PATH = "/home/ubuntu/fsedev/models/ft_crawl_300d_2m.model"
ft = FastTextKeyedVectors.load(FT_MODEL_PATH)

2019-09-11 09:21:20,896 : MainThread : INFO : loading FastTextKeyedVectors object from /home/ubuntu/fsedev/models/ft_crawl_300d_2m.model
2019-09-11 09:21:28,273 : MainThread : INFO : loading vectors from /home/ubuntu/fsedev/models/ft_crawl_300d_2m.model.vectors.npy with mmap=None
2019-09-11 09:21:53,556 : MainThread : INFO : loading vectors_vocab from /home/ubuntu/fsedev/models/ft_crawl_300d_2m.model.vectors_vocab.npy with mmap=None
2019-09-11 09:22:19,370 : MainThread : INFO : loading vectors_ngrams from /home/ubuntu/fsedev/models/ft_crawl_300d_2m.model.vectors_ngrams.npy with mmap=None
2019-09-11 09:22:45,435 : MainThread : INFO : setting ignored attribute vectors_norm to None
2019-09-11 09:22:45,442 : MainThread : INFO : setting ignored attribute vectors_vocab_norm to None
2019-09-11 09:22:45,444 : MainThread : INFO : setting ignored attribute vectors_ngrams_norm to None
2019-09-11 09:22:45,446 : MainThread : INFO : setting ignored attribute buckets_word to None
2019-09-11 09:22:45,

In [11]:
# Build a SIF model on top of the fastText vectors and train it on the
# indexed reviews. train() stays the last expression so that its return
# value is shown as the cell output.
model = SIF(
    ft,
    components=10,
    workers=4,
)
model.train(indexed_reviews)

2019-09-11 09:22:45,484 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-09-11 09:22:50,486 : MainThread : INFO : SCANNING : finished 599965 sentences with 51660946 words
2019-09-11 09:22:55,486 : MainThread : INFO : SCANNING : finished 1246719 sentences with 107503549 words
2019-09-11 09:23:00,486 : MainThread : INFO : SCANNING : finished 1890246 sentences with 163003891 words
2019-09-11 09:23:05,486 : MainThread : INFO : SCANNING : finished 2535917 sentences with 218701908 words
2019-09-11 09:23:10,486 : MainThread : INFO : SCANNING : finished 3209470 sentences with 276692279 words
2019-09-11 09:23:15,486 : MainThread : INFO : SCANNING : finished 3843162 sentences with 331480729 words
2019-09-11 09:23:20,486 : MainThread : INFO : SCANNING : finished 4489106 sentences with 387356471 words
2019-09-11 09:23:25,486 : MainThread : INFO : SCANNING : finished 5187271 sentences with 447601765 words
2019-09-11 09:23:30,486 : MainThread : INFO : SCANNING : finish

(6875530, 593774622)

In [12]:
# Replace the model's sentence vectors with a float64 version
# (presumably the precision the t-SNE step that follows expects — TODO confirm).
model.sv.vectors = model.sv.vectors.astype('float64')

In [13]:
# The fast_tsne wrapper is imported from a local FIt-SNE checkout, so that
# directory is appended to sys.path before the import.
import sys
sys.path.append('/home/ubuntu/FIt-SNE')
from fast_tsne import fast_tsne

# Run t-SNE on the sentence vectors with a fixed seed; the first two columns
# of the result are used as plot coordinates later on.
mapping = fast_tsne(model.sv.vectors, perplexity=50, seed=42)

In [14]:
import matplotlib.pyplot as plt

# Ten-colour palette (not passed to the scatter call below).
col = np.array([
    '#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99',
    '#e31a1c', '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a',
])

# Scatter of the first two embedding columns, one small marker per row.
xs, ys = mapping[:, 0], mapping[:, 1]
plt.figure(figsize=(5, 5))
plt.scatter(xs, ys, s=1)
plt.tight_layout()

In [15]:
def filter_cat(input):
    """Return the second entry of the first category path, or "None".

    Parameters
    ----------
    input
        A sequence of category paths, each itself a sequence
        (e.g. ``[["Electronics", "Camera & Photo"]]``). ``None`` and float
        values (NaN is what a left join leaves for missing metadata) are
        accepted and treated as "no categories".

    Returns
    -------
    ``input[0][1]`` when it exists, otherwise the string ``"None"``.
    """
    # The parameter name shadows the builtin but is kept for keyword callers.
    # Missing values have no len(); handle them before the emptiness check.
    if input is None or isinstance(input, float):
        return "None"
    if not len(input):
        return "None"
    try:
        return input[0][1]
    except IndexError:
        # The first path has no second level (only the top category).
        return "None"

In [16]:
# Embedding coordinates, labelled with the ASIN stored at the same position.
coords_index = IDX_TO_ASIN[:len(mapping)]
df = pd.DataFrame({"x": mapping[:, 0], "y": mapping[:, 1]}, index=coords_index)

# Attach the product metadata by ASIN; the left join keeps every coordinate row.
meta_fields = ["title", "price", "brand", "description", "categories"]
viz = df.join(meta[meta_fields], how="left")
# Derived columns: the category picked by filter_cat and the review count.
viz["prime_cat"] = viz["categories"].apply(filter_cat)
viz["reviews"] = [ASIN_TO_COUNT[asin] for asin in viz.index]

In [17]:
import html


def _unescape_if_str(value):
    """Decode HTML entities in ``value``; non-string values are returned unchanged."""
    return html.unescape(value) if isinstance(value, str) else value


# Decode HTML entities (e.g. "&amp;") in the text columns.
# Each column is replaced as a whole: writing through `viz[col].iat[i] = ...`
# is chained assignment, which does not update `viz` when pandas
# Copy-on-Write is active. The isinstance check keeps non-string cells (such
# as NaN) untouched without a bare `except` that would hide every other error.
# The loop variable is not named `col`, so it does not overwrite the colour
# palette defined in the plotting cell.
for text_col in ["title", "brand"]:
    viz[text_col] = viz[text_col].map(_unescape_if_str)

In [18]:
# Write the assembled table to an Excel workbook in the working directory
# (presumably for loading into Tableau, going by the file name).
viz.to_excel("tableau.xlsx")