# Topic Modeling
## Metadata and Review Data
This notebook reads metadata for Amazon products and extracts the identifying keys ('asins') of Sperry Top-Sider products. It then extracts the reviews whose asins match and saves those reviews as a JSON file.

## Metadata

In [1]:
# open the product metadata file for reading; the handle is iterated line by
# line further down and is not explicitly closed in the visible cells
metajson = open('meta_Clothing_Shoes_and_Jewelry.json', mode='r')

In [2]:
# containers filled while reading the metadata:
#   allproducts    -- product record keyed by its asin
#   listcategories -- occurrence count keyed by category label
allproducts = dict()
listcategories = dict()

In [3]:
# create counter to show read progress (incremented once per line read)
count = 0

In [4]:
# read in data: index every product by its asin and tally category labels
import ast

for line in metajson:
    count += 1
    # progress report every 100,000 lines
    if count % 100000 == 0:
        print(count)
    # NOTE: the lines come from an external file, so they are parsed with
    # ast.literal_eval rather than eval: eval would execute any expression
    # found in the file, while literal_eval only accepts Python literals
    # (dicts, lists, strings, numbers, ...) and raises on anything else.
    # Each line is expected to be a dict literal.
    product = ast.literal_eval(line)

    allproducts[product['asin']] = product

    # 'categories' is iterated as a list of lists of category labels
    for categories in product['categories']:
        for category in categories:
            # single lookup: start at 0 for a label not seen before
            listcategories[category] = listcategories.get(category, 0) + 1

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000


In [5]:
# restart counter so the next loop's progress reporting starts from zero
count = 0

In [6]:
# extract asins into set: keep the asin of every product that has at least
# one category label containing 'sperry top-sider' (case-insensitive)
asins = set()

# the dict is not modified during the loop, so its size is computed once
total_products = len(allproducts)

# iterate the records directly instead of looking each key up again
for theproduct in allproducts.values():
    count += 1
    # every 100,000 products, print the fraction processed so far
    if count % 100000 == 0:
        print(count/total_products)
    # any() stops at the first matching label; further matches for the same
    # product would only re-add the same asin to the set
    if any(
        'sperry top-sider' in category.lower()
        for categories in theproduct['categories']
        for category in categories
    ):
        asins.add(theproduct['asin'])

0.06651660520532346
0.13303321041064692
0.19954981561597038
0.26606642082129384
0.33258302602661727
0.39909963123194075
0.4656162364372642
0.5321328416425877
0.5986494468479111
0.6651660520532345
0.7316826572585581
0.7981992624638815
0.8647158676692049
0.9312324728745284
0.9977490780798518


In [7]:
# write the string form of the asins set to asins.txt (overwrites the file)
with open("asins.txt", mode="w") as handle:
    handle.write('%s' % (asins,))

## Review Data

In [8]:
# open the review file for reading; the handle is iterated line by line
# further down and is not explicitly closed in the visible cells
reviewjson = open('reviews_Clothing_Shoes_and_Jewelry.json', mode='r')

In [9]:
# empty dictionary that will hold the selected reviews
reviews = dict()

# progress counter starts from zero again for the review file
count = 0

In [10]:
# extract reviews into dict, keeping only reviews whose asin is in `asins`
import ast

for line in reviewjson:
    count += 1
    # progress report every 100,000 lines
    if count % 100000 == 0:
        print(count)
    # NOTE: the lines come from an external file, so they are parsed with
    # ast.literal_eval rather than eval: eval would execute any expression
    # found in the file, while literal_eval only accepts Python literals
    # and raises on anything else. Each line is expected to be a dict literal.
    review = ast.literal_eval(line)
    theasin = review['asin']
    reviewer = review['reviewerID']

    if theasin in asins:
        # key has the form "<asin>.<reviewerID>"; if the same pair occurs
        # again, the later review replaces the earlier one
        key = '%s.%s' % (theasin, reviewer)
        reviews[key] = review

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
5400000
5500000
5600000
5700000


In [11]:
# export json file
import json

# use a context manager so the output file is flushed and closed as soon as
# the dump finishes, instead of leaving an unclosed handle behind
with open('sperryreviews.json', 'w') as outfile:
    json.dump(reviews, outfile)