In [2]:
from urllib.request import urlopen
from collections import defaultdict
from tqdm import tqdm

import os
import json
import gzip
import pickle
import pandas as pd


### load the meta data

data = []
with gzip.open('meta_Electronics.json.gz') as f:
        for l in f:
            data.append(json.loads(l.strip()))

df = pd.DataFrame.from_dict(data)
df3 = df.fillna('')
df4 = df3[df3.title.str.contains('getTime')] # unformatted rows
df5 = df3[~df3.title.str.contains('getTime')] # filter those unformatted rows

### load the QA

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

qa_df = getDF('qa_Electronics.json.gz')

final_df = pd.merge(df5, qa_df, on='asin')
print('Overlap for {} items'.format(len(final_df)))
final_df = final_df.drop_duplicates(subset=['question'])


Overlap for 196917 items


In [3]:
only_product_df = final_df.drop_duplicates(subset=['asin'])

category_dict = defaultdict(lambda: defaultdict(int))
for i, row in tqdm(only_product_df.iterrows()):
    category = row['category']
    for i, c in enumerate(category):
        category_dict[i][c] += 1

print(len(category_dict))

23172it [00:02, 9258.44it/s]13



In [4]:
assigned_cluster = []
assigned_cluster_level = []
for i, row in tqdm(final_df.iterrows()):
    category = row['category']
    for i, c in enumerate(category):
        if category_dict[i][c] < 400:
            assigned_cluster.append(c)
            assigned_cluster_level.append(i)
            break
        elif i == len(category)-1:            
            assigned_cluster.append(c)
            assigned_cluster_level.append(i)

print(len(assigned_cluster))
print(len(final_df))
final_df['assigned_cluster'] = assigned_cluster
final_df['assigned_cluster_level'] = assigned_cluster_level
print(final_df.head(2)[['category', 'assigned_cluster']])


179577it [00:19, 9417.61it/s]
179577
179577
                                     category             assigned_cluster
0  [Electronics, eBook Readers & Accessories]  eBook Readers & Accessories
1  [Electronics, eBook Readers & Accessories]  eBook Readers & Accessories


In [6]:
final_df['assigned_cluster'].unique()

array(['eBook Readers & Accessories', 'TV Accessories & Parts',
       'Computer Cable Adapters', 'Tablets', 'Cases', 'Monitors',
       'Video Surveillance', 'External Components',
       'MP3 & MP4 Player Accessories', 'Memory Cards',
       'Batteries, Chargers & Accessories', 'Video Cables', 'USB Cables',
       'Cleaning & Repair', 'Blank Video Media', 'Audio Cables',
       'Telephone Accessories', 'Ethernet Cables', 'External Zip Drives',
       'Power Strips & Surge Protectors', 'Input Devices',
       'On-Ear Headphones', 'Earbud Headphones', 'CB & Two-Way Radios',
       'Floppy & Tape Drives', 'Video', 'VCRs', 'Home Theater',
       'Office Electronics Accessories', 'Binoculars & Scopes',
       'Cable Security Devices', 'Film Photography',
       'Over-Ear Headphones', 'Audio & Video Accessories', 'Radios',
       'Accessories & Supplies', 'Internal Hard Drives', 'Switches',
       'Security & Surveillance', 'Hubs', 'GPS, Finders & Accessories',
       'Flashes', 'Routers',

In [3]:
final_df.columns

Index(['category', 'description', 'title', 'image', 'brand', 'feature', 'rank',
       'main_cat', 'date', 'price', 'asin', 'also_buy', 'also_view',
       'similar_item', 'tech1', 'tech2', 'details', 'fit', 'questionType',
       'answerTime', 'unixTime', 'question', 'answerType', 'answer',
       'assigned_cluster', 'assigned_cluster_level'],
      dtype='object')

In [9]:
import yake
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize

VERB_TAGS = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

kw_extractor = yake.KeywordExtractor()

index = 1 # change this for other questions 
q = final_df['question'].iloc(index)
product_title = final_df['title'].iloc(index)
product_title = final_df['assigned_cluster'].iloc(index)

keywords = kw_extractor.extract_keywords(q)

for kw in keywords:
    

tokenized_q = word_tokenize(q)
pos_tags = nltk.pos_tag(tokenized_q)

print('Title: ' + product_title + '\n')
print('Question: ' + q + '\n')
print('Keywords: ' + keywords_list + '\n')
print('POS tags: ' + pos_tags + '\n')



[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/t-bmajum/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
 44%|████▎     | 85895/196917 [13:10<14:47, 125.14it/s]

ValueError: max() arg is an empty sequence

In [1]:
text = "Easy to Learn, Work, and Play Easy to use with one click interface 7.5 hour battery life allows for \"One Day Computing\" Travel light, weighting only 2.94lbs Drop tested, Shock-Proof design Wireless 802.11b/g/n connectivity Save time with fast 15 sec boot time Preloaded with over 50 applications including Open Office Wide Multi-Touchpad for comfort and convenient Glossy IMR Pearl White color for elegant look Connect with friend with built-in Camera ZBD Guaranteed for best quality LCD Store & Share your files with 20G internet storage Protect your investment with sleeve case"

import yake

kw_extractor = yake.KeywordExtractor()
keywords = kw_extractor.extract_keywords(text)
for kw in keywords:
    print(kw)


('open office wide', 0.00010860644121698296)
('glossy imr pearl', 0.00010860644121698296)
('imr pearl white', 0.00010860644121698296)
('camera zbd guaranteed', 0.00010860644121698296)
('quality lcd store', 0.00014985524694779634)
('office wide multi-touchpad', 0.00017963776274479537)
('pearl white color', 0.00017963776274479537)
('including open office', 0.0001796377627447954)
('convenient glossy imr', 0.0001796377627447954)
('built-in camera zbd', 0.0001796377627447954)
('shock-proof design wireless', 0.00019891072292280027)
('applications including open', 0.00023846771841820355)
('internet storage protect', 0.00023846771841820355)
('connectivity save time', 0.00030535855075990463)
('boot time preloaded', 0.00030535855075990463)
('hour battery life', 0.0003949446519670444)
('sec boot time', 0.0004055412934082321)
('day computing', 0.0018959639534652487)
('lcd store', 0.0018959639534652487)
('easy to learn', 0.0020748961992834336)


In [1]:
# !pip install spacy
import spacy
from nltk import Tree
# !python -m spacy download en_core_web_lg

en_nlp = spacy.load('en_core_web_lg')

doc = en_nlp("Does any other cable beside the 30 pin work to charge HD+?")

def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
    else:
        return node.orth_


[to_nltk_tree(sent.root).pretty_print() for sent in doc.sents]


Does                                      
  ___|___________                              
 |             cable                          
 |    ___________|____________________         
 |   |     |         beside           |       
 |   |     |           |              |        
 |   |     |          work            |       
 |   |     |      _____|_____         |        
 |   |     |     |          pin     charge    
 |   |     |     |           |    ____|_____   
 ?  any  other  the          30  to        HD+



[None]

In [1]:
import stanfordnlp
# stanfordnlp.download('en')  
nlp = stanfordnlp.Pipeline()
doc = nlp("Is it possible to read using this product at night?")
a  = doc.sentences[0]
dir(a)
a.print_tokens()
a.print_dependencies()

print('Is it possible to read using this product at night?')

Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/t-bmajum/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/home/t-bmajum/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/home/t-bmajum/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/home/t-bmajum/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/home/t-bmajum/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/home/t-bmajum/stanfordnlp_resources/en_ew

In [33]:
dir(a)
a.tokens[0].words[0].xpos

# b = a.print_dependencies()
# for dep_edge in a.dependencies:
#     print((int(dep_edge[2].index), int(dep_edge[0].index), dep_edge[1]))

'VBZ'

In [4]:
for dep_edge in a.dependencies:
        print('{} -> {} | Relation {}'.format(dep_edge[0].text, dep_edge[2].text, dep_edge[1]))

possible -> Is | Relation cop
possible -> it | Relation expl
ROOT -> possible | Relation root
read -> to | Relation mark
possible -> read | Relation csubj
read -> using | Relation xcomp
product -> this | Relation det
using -> product | Relation obj
night -> at | Relation case
using -> night | Relation obl
possible -> ? | Relation punct


In [7]:
import networkx as nx
import yake
import stanfordnlp
import itertools

nlp_pipeline = stanfordnlp.Pipeline()
kw_extractor = yake.KeywordExtractor(n=2)

def build_schema(text, nlp_pipeline, kw_extractor):

    # run a sent tokenizer

    # run the following for each sent

    # VERB_TAGS = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    VERB_TAGS = ['VB', 'VBG', 'VBZ']

    # run StanfordNLP pipeline
    doc = nlp_pipeline(text)
    sent  = doc.sentences[0]

    # all tokens + root
    tokens = ['root']
    tokens += [t.words[0].text.lower() for t in sent.tokens]

    # obtain all verbs and their indices
    verbs = []
    verb_indices = []
    for t in sent.tokens:
        if t.words[0].xpos in VERB_TAGS:
            verbs.append(t.words[0].text)
            verb_indices.append(int(t.words[0].index))

    # remove all verbs from keywords
    try:
        keywords = kw_extractor.extract_keywords(text)
    except Exception:
        keywords = []

    unigram_keywords_map = {}
    for kw in keywords:
        unigram_keywords_map[kw[0].lower()] = kw[0].lower().split()
    
    unigram_keywords = list(itertools.chain(*list(unigram_keywords_map.values())))

    # after this everything will be unigrams

    keywords_wo_verbs = [kw for kw in unigram_keywords if kw not in verbs]

    # obtain all keyword indices
    keyword_indices = [tokens.index(kw) for kw in keywords_wo_verbs if kw in tokens]

    # initialize dependency tree
    G = nx.DiGraph()
    for dep_edge in sent.dependencies:
        G.add_edge(int(dep_edge[0].index), int(dep_edge[2].index), relation=dep_edge[1])

    relation_edge_dict = nx.get_edge_attributes(G,'relation')

    schema = {}

    tuple_schema = []

    for v in verb_indices:
        # find a path from verb v to each keywords
        for k in keyword_indices:
            # we are finding shortest paths
            try:
                path = nx.shortest_path(G, source=v, target=k)
            except:
                # print('No path obtained')
                continue
                # check is path contains more than 2 nodes
            if len(path) > 2:
                # walk backward from the target to source
                for i, node in reversed(list(enumerate(path))):
                    if node in verb_indices:
                        # retrieve the first parent verb of the keyword
                        # obtain the relation with its child on the path
                        tuple_schema.append((tokens[v], tokens[k], relation_edge_dict[(node, path[i+1])], len(path)))
                        break
            else:
                # default case for an one-hop path between verb and keyword
                tuple_schema.append((tokens[v], tokens[k], relation_edge_dict[(v, k)], 2))

    # retain only the closest verb for each keyword
    for kw in keywords_wo_verbs:
        kw_tuples = [t for t in tuple_schema if t[1] == kw]
        if kw_tuples:
            final_tuple = min(kw_tuples, key = lambda t: t[1])  
            schema[kw] = final_tuple[:-1] # drop the path length
        else:
            schema[kw] = kw

    merged_schema = {}
    captured_uni_kw = []
    for kw, uni_kw in unigram_keywords_map.items():
        if uni_kw[0] in keywords_wo_verbs:
            if uni_kw[0] not in captured_uni_kw:
                # collect tuples based on the first unigram entry
                merged_schema[kw] = schema[uni_kw[0]]
                captured_uni_kw.extend(uni_kw)
    
    for kw, t in merged_schema.items():
        if isinstance(t, tuple):  
            merged_schema[kw] = (t[0], kw, t[-1])
        else:
            merged_schema[kw] = kw

    merged_schema = list(merged_schema.values())

    # default case, no schema, hence keep all keywords including verbs
    if len(tuple_schema) == 0:
        merged_schema = [kw[0].lower() for kw in keywords]

    merged_schema = list(set(merged_schema))

    return merged_schema



Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/t-bmajum/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/home/t-bmajum/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/home/t-bmajum/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/home/t-bmajum/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/home/t-bmajum/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/home/t-bmajum/stanfordnlp_resources/en_ew

In [17]:
text = 'Is it possible to read using this product using graphical interface?'
build_schema(text, nlp_pipeline, kw_extractor)

[('read', 'product', 'obj'), ('read', 'graphical interface', 'obj')]

In [8]:
# build schema for product descriptions

kw_extractor = yake.KeywordExtractor(n=2)

def build_schema_from_desc(text, kw_extractor):
    # remove all verbs from keywords
    try:
        keywords = kw_extractor.extract_keywords(text)
    except Exception:
        keywords = []

    unigram_keywords_map = {}
    for kw in keywords:
        unigram_keywords_map[kw[0].lower()] = kw[0].lower().split()

    merged_schema = []

    captured_uni_kw = []
    for kw, uni_kw in unigram_keywords_map.items():
        if uni_kw[0] not in captured_uni_kw:
            # collect tuples based on the first unigram entry
            merged_schema.append(kw)
            captured_uni_kw.extend(uni_kw)
    
    return merged_schema
    
text = ' '.join(final_df.iloc[1]['description'])

build_schema_from_desc(text, kw_extractor)


['nook color',
 'dessin cover',
 'noir cover',
 'tablet',
 'standard nook',
 'e-reader',
 'style',
 'covers',
 'protect',
 'leather']

In [9]:
# get table schema

def build_schema_from_table(table):
    if table:
        return list(table.keys())
    else:
        return []

build_schema_from_table(final_df.iloc[1]['tech1'])

['Brand Name', 'Item Weight', 'Product Dimensions']

In [18]:
# text = 'Is it possible to read using this product at night?'

schemas = []

for i in range(100):
    text = final_df['question'].iloc[i].lower()
    schema = build_schema(text, nlp_pipeline, kw_extractor)

    schemas.append(
        {'question': text,
        'schema': schema}
    )

In [11]:
# save with schema

import json
prod_dict = {}
cat = "Laptops"
cat_df = final_df[final_df['assigned_cluster'] == cat]
grps = cat_df.groupby(cat_df['asin'])
for name, g in grps:
    item_dict = {}
    qa = []
    item_id = g['asin'].iloc[0]
    for i, r in g.iterrows():
        item_dict['title'] = r['title']
        # item_dict['description'] = r['description']
        item_dict['description_schema'] = build_schema_from_desc(' '.join(r['description']), kw_extractor)
        # item_dict['table1'] = r['tech1']
        # item_dict['table2'] = r['tech2']
        table_schema = []
        table_schema.extend(build_schema_from_table(r['tech1']))
        table_schema.extend(build_schema_from_table(r['tech2']))
        item_dict['table_schema'] = table_schema
        schema = build_schema(r['question'].lower(), nlp_pipeline, kw_extractor)
        qa.append({'question': r['question'], 'schema': schema})
    item_dict['questions'] = qa
    prod_dict[item_id] = item_dict

print(len(prod_dict))

with open('{}_schema.json'.format(cat), 'w') as fp:
    json.dump(prod_dict, fp, indent=4)

26


In [16]:
# grouping to obtain global schema

# creating a simple product schema object
# key: product id, value: schema

simple_prod_dict = {}
for p, s in prod_dict.items():
    local_schema = []
    local_schema.extend(s['description_schema'])
    local_schema.extend(s['table_schema'])
    for q in s['questions']:
        local_schema.extend(q['schema'])
    simple_prod_dict[p] = local_schema

dict_keys(['B00191QN6O', 'B001BY97IU', 'B001BY97JO', 'B001BYB5ZS', 'B001BYB620', 'B001FWXCFM', 'B001GCUOFC', 'B001GIPSAM', 'B001GNBD8I', 'B0021AG0RY', 'B00284CBKS', 'B0029PQFT4', 'B002DYIXMS', 'B002MUCC52', 'B002P3KMXA', 'B0030LQ438', 'B00322PYUY', 'B003JZC5NI', 'B003TPKDY6', 'B0041G5XFQ', 'B0042TS7GE', 'B004G8QZQK', 'B0092IBD8Y', 'B00CTHQORA', 'B00FK0BOUU', 'B00HIY9I7C'])

In [None]:
# grouping based on keywords
# then bigram
# then unigram

# take out good bigram
# for each bad bigram, tokenize each bigram, and see an unigram cluster to merge

corpus = list(itertools.chain.from_iterable(list(simple_prod_dict.values())))
only_keywords = [t[1].lower() if isinstance(t, tuple) else t.lower() for t in corpus]
print(len(set(only_keywords)))

vocabulary = defaultdict(int)
for t in only_keywords:
    vocabulary[t] += 1

sorted_vocab = sorted(vocabulary.items(), key=lambda kv: kv[1], reverse=True)

import collections

vocabulary = collections.OrderedDict(sorted_vocab)

# for k, v in vocabulary.items():
#     if len(k.split()) > 1:
#         print(k, v)

bigram_vocab = defaultdict(int)
unigram_vocab = defaultdict(int)
for k, v in vocabulary.items():
    if len(k.split()) == 2:
        bigram_vocab[k] = v
    if len(k.split()) == 1:
        unigram_vocab[k] = v

print(len(bigram_vocab))
print(len(unigram_vocab))

print(bigram_vocab)

counter = 0
for k, v in bigram_vocab.items():
    if v < 3:
        token_0, token_1 = k.split()
        value_0, value_1 = unigram_vocab[token_0], unigram_vocab[token_1]
        if value_0 + value_1 != 0:
            if value_1 >= value_0:
                unigram_vocab[token_1] += 1
                counter += 1 
            else:
                unigram_vocab[token_0] += 1
                counter += 1


unigram_vocab




In [23]:
import json
prod_dict = {}
cat = "Cameras"
cat_df = final_df[final_df['assigned_cluster'] == cat]
grps = cat_df.groupby(cat_df['asin'])
for name, g in grps:
    item_dict = {}
    qa = []
    item_id = g['asin'].iloc[0]
    for i, r in g.iterrows():
        item_dict['title'] = r['title']
        item_dict['description'] = r['description']
        item_dict['table1'] = r['tech1']
        item_dict['table2'] = r['tech2']
        schema = build_schema(r['question'].lower(), nlp_pipeline, kw_extractor)
        qa.append({'question': r['question'], 'schema': schema, 'answer': r['answer']})
    item_dict['qa'] = qa
    prod_dict[item_id] = item_dict

with open('{}_schema.json'.format(cat), 'w') as fp:
    json.dump(prod_dict, fp, indent=4)

In [4]:
import json
import random

prod_dict = {}
# cat = "Cameras"
# cat_df = final_df[final_df['assigned_cluster'] == cat]
grps = final_df.groupby(final_df['asin'])

sampled_products = random.sample(list(final_df['asin']), 1000)

for name, g in grps:
    if g['asin'].iloc[0] not in sampled_products:
        continue
    else:
        item_dict = {}
        qa = []
        item_id = g['asin'].iloc[0]
        for i, r in g.iterrows():
            item_dict['title'] = r['title']
            item_dict['category'] = r['assigned_cluster']
            item_dict['description'] = r['description']
            item_dict['table1'] = r['tech1']
            item_dict['table2'] = r['tech2']
            # schema = build_schema(r['question'].lower(), nlp_pipeline, kw_extractor)
            qa.append({'question': r['question']})
        item_dict['questions'] = qa
        prod_dict[item_id] = item_dict

with open('{}.json'.format('Sampled_1000'), 'w') as fp:
    json.dump(prod_dict, fp, indent=4)

In [23]:
nx.shortest_path(G, source=5, target=10)

[5, 6, 10]

In [4]:
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
text = word_tokenize(final_df.iloc[0]['question'])
nltk.pos_tag(text)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/t-bmajum/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('Is', 'VBZ'),
 ('this', 'DT'),
 ('cover', 'NN'),
 ('the', 'DT'),
 ('one', 'NN'),
 ('that', 'WDT'),
 ('fits', 'VBZ'),
 ('the', 'DT'),
 ('old', 'JJ'),
 ('nook', 'NN'),
 ('color', 'NN'),
 ('?', '.'),
 ('Which', 'NNP'),
 ('I', 'PRP'),
 ('believe', 'VBP'),
 ('is', 'VBZ'),
 ('8x5', 'CD'),
 ('.', '.')]

In [1]:
import yake
kw_extractor = yake.KeywordExtractor()

masked_question = []
for q in fina

In [5]:
text = "Does this work with sony EX1 and can i read the sdhc card on my macbook pro through the sd card slot?"

kw_extractor = yake.KeywordExtractor()
keywords = kw_extractor.extract_keywords(text)

for kw in keywords:
    print(kw)


('work with sony', 0.020435055290363522)
('card slot', 0.02966460909236746)
('read the sdhc', 0.033892441937102495)
('macbook pro', 0.033892441937102495)
('sdhc card', 0.04949487345881267)
('sony', 0.11060549338282699)
('slot', 0.11060549338282699)
('card', 0.13023752997463905)
('work', 0.18105634546484617)
('read', 0.18105634546484617)
('sdhc', 0.18105634546484617)
('macbook', 0.18105634546484617)
('pro', 0.18105634546484617)


In [None]:
# use YAKE
# use POS taggging and remove verbs
# add any additional NN, NNP, NNS