In [17]:
import os, json
import pandas as pd
import numpy as np
import gzip
from collections import Counter
from PIL import Image
import requests
from io import BytesIO
import math

In [2]:
def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    d = {}
    for f in parse(path):
        d[i] = f
        i = i+1
    return pd.DataFrame.from_dict(d, orient = 'index')

def preprocessing(df):
    df['categories']= df.categories.apply(lambda x: x[0])
    return df

def removeCate(df, lis):
    for l in lis:
        df = df.loc[df.categories.apply(lambda x: l not in x), :]
    return df

def filterCate(df, lis):
    for l in lis:
        col = 'category_'+l
        df = df.loc[df[col] == 1, :]
    return df

def basicInfo(df):
    return df.shape, df.columns, df.isnull().sum(axis = 0), df.apply(lambda x: x.dtype, axis = 0)

def subSet(df, num, num_most_common):
    sub = df.loc[df.categories.apply(lambda x: len(x) == num), :]
    return Counter([i for j in sub.categories for i in j]).most_common(num_most_common)

def binaryCate(df, lis):
    for i in lis:
        df['category_' + i] = df.categories.apply(lambda x: 1 if i in x else 0)
    return df

def downloadPicSimple(df, path):
    asins = []
    classes = []
    result = pd.DataFrame()
    for i in range(df.shape[0]):
        url = df.iloc[i, 5]
        if pd.isna(url):
            continue
        try:
            response = requests.get(url)
            img = Image.open(BytesIO(response.content))
            if img.mode == 'P':
                img = img.convert('RGB')
            asin = df.iloc[i,0]
            cate = ",".join(df.iloc[i,7])
            img.save(path+asin+'_'+cate+'.jpg')
            asins.append(asin)
            classes.append(cate)
        except OSError:
            continue
    result['asin'] = asins
    result['class'] = classes
    return result

def downloadPic(df, path, label, num_per_category = 10000000):
    asins = []
    classes = []
    result = pd.DataFrame()
    flag = False
    for i in range(df.shape[0]):
        if i == num_per_category:
            break
        url = df.iloc[i, 5]
        if pd.isna(url):
            continue
        try:
            response = requests.get(url)
            img = Image.open(BytesIO(response.content))
            if flag != True:
                if img.mode == 'P':
                    img = img.convert('RGB')
                asin = df.iloc[i,0]
                cate = ",".join(df.iloc[i,7])
                img.save(path+asin+'_'+cate+'.jpg')
                asins.append(asin)
                classes.append(label)
        except OSError:
            flag = True
            continue
#         asin = df.iloc[i,0]
#         img.save(path+asin+'.jpg')
#         asins.append(asin)
#         classes.append(label)
        if flag == True:
            flag = False
    result['asin'] = asins
    result['class'] = classes
    return result
        
def downLoadCategories(df, lis, path, num_per_category):
    labels = pd.DataFrame({'asin':[], 'class': []})
    for l in lis:
        temp = df.loc[df.categories.apply(lambda x: l in x), :]
        d = downloadPic(temp, path, l, num_per_category)
        labels = labels.append(d)
        
    return labels#.to_csv('labels.csv')
        
def testProduct(df, asin_num):
    list1 = df.loc[df.asin==asin_num,:]
    return list1.categories.tolist(), list1.reviewTime

In [3]:
review = getDF("meta_Toys_and_Games.json.gz")
review = preprocessing(review)
review = removeCate(review, ['Grown-Up Toys'])

In [4]:
reviews = pd.read_json('reviews_Toys_and_Games.json', lines = True)

In [8]:
#small1 = review.iloc[0:10, :] #small2 = reviews.iloc[0:10, :] #df = pd.merge(small1, small2, how='left', on='asin', validate="one_to_many")#small1 = small1.merge(small2, left_on = 'asin', right_on = 'asin')
merged = pd.merge(review, reviews, how='left', on=['asin'])

In [None]:
df = review.copy()
df = preprocessing(df)
df = removeCate(df, ['Grown-Up Toys'])

In [9]:
def productInfo(df, asin_num = '0131358936'):
    try:
        subset = df.loc['asin' == asin_num, :]
        review_num = len(Counter(subset.reviewText))
    except KeyError:
        pass
    return df.isnull().sum(axis = 0), df.shape

In [10]:
def showImg(df, row, col1, col2):
    url = df.iloc[row, col1]
    review = df.iloc[row, col2]
    response = requests.get(url)
    img = Image.open(BytesIO(response.content))
    if img.mode == 'P':
        img = img.convert('RGB')
    return img, review
# img, comment = showImg(temp1, 100, 5, 12)
# print(comment)
# img

In [11]:
cleaned = merged.copy()
def filtered(df, removelist):
    for l in removelist:
        df = df.loc[pd.notnull(df[l]), :]
    return df

cleaned = filtered(cleaned, ['title', 'price', 'reviewerID', 'description']) # reviewID represent the number of missing reviews

In [12]:
productInfo(cleaned, '0131358936')

(asin                   0
 description            0
 title                  0
 price                  0
 salesRank          41764
 imUrl                  0
 brand             259647
 categories             0
 related            23840
 reviewerID             0
 reviewerName       11277
 helpful                0
 reviewText             0
 overall                0
 summary                0
 unixReviewTime         0
 reviewTime             0
 dtype: int64, (1812934, 17))

In [18]:
# how many different products and how many different categories
len(cleaned.asin.unique()), cleaned.apply(lambda x: x.dtype)

(214049, asin              object
 description       object
 title             object
 price             object
 salesRank         object
 imUrl             object
 brand             object
 categories        object
 related           object
 reviewerID        object
 reviewerName      object
 helpful           object
 reviewText        object
 overall           object
 summary           object
 unixReviewTime    object
 reviewTime        object
 dtype: object)

In [13]:
def fillNAs(df, lis):
    for l in lis:
        df[l] = df[l].fillna('unknown')
    return df
cleaned = fillNAs(cleaned, ['salesRank', 'brand', 'related', 'reviewerID', 'reviewerName'])

In [43]:
# eda: how many categories
#binaryCate(df, lis)
#total categories
cleaned.isnull().sum(axis = 0)

asin              0
description       0
title             0
price             0
salesRank         0
imUrl             0
brand             0
categories        0
related           0
reviewerID        0
reviewerName      0
helpful           0
reviewText        0
overall           0
summary           0
unixReviewTime    0
reviewTime        0
dtype: int64

In [132]:
user_gr10 = list(dict(Counter(cleaned.asin).most_common(34000)).values())
key_gr10 = list(dict(Counter(cleaned.asin).most_common(34000)).keys())
df_gr10 = cleaned.loc[cleaned.asin.apply(lambda x: x in key_gr10), :]

In [None]:
from 

In [34]:
cleaned_train = cleaned.loc[0: math.floor(0.7 * cleaned.shape[0]), :]
cleaned_test = cleaned.loc[math.floor(0.7 * cleaned.shape[0]): -1, :]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [48]:
cleaned_1st.loc[cleaned_1st.asin == 'B000W3XEQM', 'price']

622197    13.93
622198    13.93
622199    13.93
622200    13.93
622201    13.93
          ...  
623891    13.93
623892    13.93
623893    13.93
623894    13.93
623895    13.93
Name: price, Length: 1699, dtype: float64

In [16]:
cleaned.loc[:, ['reviewerID', 'overall']].groupby('reviewerID').agg('mean').sort

Unnamed: 0_level_0,overall
reviewerID,Unnamed: 1_level_1
A0001528BGUBOEVR6T5U,5.0
A00018041RRVMCICCAP79,5.0
A00033963SSKB1BG7AL7C,1.0
A0004308GGXSGC3XLKV9,5.0
A00053701B3QMPT0WJYFN,4.5
...,...
AZZZOVIBXHGDR,4.0
AZZZRGMYLGFLM,5.0
AZZZU2TD7Q3ET,5.0
AZZZYAYJQSDOJ,4.5


In [None]:
temp = merged.dropna()

In [None]:
temp.shape, len(Counter(temp.asin))

In [None]:
most_common3 = list(dict(subSet(df, 3, 30)).keys())

In [None]:
temp1 = temp.loc[temp.reviewText.str.contains('inexpensive'), :]

In [None]:
temp1.shape, len(Counter(temp1.asin))

In [None]:
temp1.loc[0,12]

In [None]:
allpics = downloadPicSimple(df, './picall/')

In [None]:
labels = downLoadCategories(df, most_common3, './picall/', 5)

In [None]:
labels.to_csv('labels.csv')

In [None]:
#testProduct(reviews, '0735335192')
reviews.loc[reviews.asin =='0000191639', 'reviewTime']

In [None]:
keep_list = ['Learning & Education', 'Early Development Toys', 'Sorting & Stacking', 'Pretend Play', 'Building Toys'\
            'Toy Sports', 'Trains & Accessories', 'Kitchen Toys', 'Stacking Blocks', 'Beauty & Fashion','Tricycles, Scooters & Wagons'\
            ]
games = df.loc[df.categories.apply(lambda x: 'Games' in x), :]
puzzle = df.loc[df.categories.apply(lambda x: 'Jigsaw Puzzles' in x), :]
education = df.loc[df.categories.apply(lambda x: keep_list[7] in x), :]
#puzzle.iloc[0, 5]
education.iloc[100,5]

In [58]:
import requests
response = requests.get('https://openapi.etsy.com/v2/users/etsystore?api_key=h3fhkkwc15jzoo7rp8laituj')

In [130]:
#response = requests.get('https://openapi.etsy.com/v2/listings/696105354.json?api_key=h3fhkkwc15jzoo7rp8laituj?')
#response = requests.get('https://openapi.etsy.com/v2/listings/696105354.json?api_key=h3fhkkwc15jzoo7rp8laituj?')
response = requests.get('https://openapi.etsy.com/v2/listings/active?api_key=h3fhkkwc15jzoo7rp8laituj&keywords=toys&limit=100&offset=50000&includes=Images,Shop,User,inventory')

In [131]:
print(response.status_code)

200


In [132]:

# print(response.json())
result = response.json()    

In [141]:
results[0]

{'listing_id': 635346229,
 'state': 'active',
 'user_id': 55345573,
 'category_id': 69166491,
 'title': 'Custom Christmas Elf Cheerleader Sweater Clothing Photo Prop',
 'description': 'Item is NOT for children or pets & cannot be made to fit children or pets.\n\nItems fit the popular Christmas Elf and some items fit 12 inch dolls as well. \n\nFlat rate shipping for all orders placed at the same time.  No matter how many \nitems you purchase your shipping stays the same!\n\nItem is slip on no velcro is needed or used to make these items.\nMost items you must have removed the stitches holding the hands \ntogether in order to dress your elf.\n\nI sell year round except for a break after Christmas so you can plan &  prepare\naccordingly.  \n\nPlease remember items are custom made unless stated otherwise.  Pre-made\nitems can be found in the ready to ship & sale sections of my store.\n\nProcessing time is needed for custom work & is shown in the listing.\nExpect a few business days to two w

In [220]:
results = result['results']
data = pd.DataFrame()
for i in range(len(results)):
    temp = results[i]
    for k, v in temp.items():  
        if isinstance(v, list) and len(v) != 0 and isinstance(v[0], dict):
            img = v[0]
            for k1, v1 in img.items():
                data[k1] = v1
        elif isinstance(v, list):
            if len(v) == 0:
                data[k] = None
            else:
                print()
                data.loc[i, k] = [v]
        elif isinstance(v, dict):
            for k1, v1 in img.items():
                if isinstance(v1, list):
                    data[k1] = [v1]
                else:
                    data[k1] = v1
        else:
            print(v)
            data.loc[i, k] = v

635346229
active
55345573
69166491
Custom Christmas Elf Cheerleader Sweater Clothing Photo Prop
Item is NOT for children or pets & cannot be made to fit children or pets.

Items fit the popular Christmas Elf and some items fit 12 inch dolls as well. 

Flat rate shipping for all orders placed at the same time.  No matter how many 
items you purchase your shipping stays the same!

Item is slip on no velcro is needed or used to make these items.
Most items you must have removed the stitches holding the hands 
together in order to dress your elf.

I sell year round except for a break after Christmas so you can plan &  prepare
accordingly.  

Please remember items are custom made unless stated otherwise.  Pre-made
items can be found in the ready to ship & sale sections of my store.

Processing time is needed for custom work & is shown in the listing.
Expect a few business days to two weeks Jan-Sept & 1-3 weeks Oct - November.  
December 1st I will remove all custom items and only in stock i

ValueError: Must have equal len keys and value when setting with an ndarray

In [206]:
d = pd.DataFrame()
d['1'] = [[]]

In [207]:
d

Unnamed: 0,1
0,[]
