In [None]:
import os, json
import pandas as pd
import numpy as np
import gzip
from collections import Counter
from PIL import Image
import requests
from io import BytesIO
import math

In [None]:
def parse(path):
    """Yield one record per line of a gzip-compressed file.

    Parameters
    ----------
    path : str
        Location of the gzip file; every line must hold one Python expression.

    Yields
    ------
    object
        The value obtained by evaluating each line.
    """
    # The context manager closes the file handle once the generator is
    # exhausted or closed; the handle was previously left open.
    with gzip.open(path, 'r') as g:
        for l in g:
            # SECURITY: eval() executes whatever the line contains. Only use this
            # on trusted dumps; ast.literal_eval is the safer choice when every
            # record is a plain literal.
            yield eval(l)

def getDF(path):
    """Load every record produced by parse(path) into a DataFrame.

    Records are numbered 0, 1, 2, ... in file order and that number becomes
    the row label; the keys of each record become the columns.
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

def preprocessing(df):
    """Return a copy of df whose `categories` holds the first entry of each value.

    The input frame is left untouched: the previous version overwrote the
    caller's column in place, so any other reference to the same frame was
    silently changed as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a `categories` column of indexable, non-empty values.

    Returns
    -------
    pandas.DataFrame
        New frame with `categories` replaced by `value[0]` for every row.

    Raises
    ------
    IndexError
        If a `categories` value is empty.
    """
    first_entries = df.categories.apply(lambda x: x[0])
    return df.assign(categories=first_entries)

def removeCate(df, lis):
    """Drop every row whose `categories` value contains any entry of lis.

    The entries are applied one after another, each pass narrowing the rows
    kept by the previous one.
    """
    remaining = df
    for unwanted in lis:
        def lacks_unwanted(cats, unwanted=unwanted):
            return unwanted not in cats
        keep_mask = remaining.categories.apply(lacks_unwanted)
        remaining = remaining.loc[keep_mask, :]
    return remaining

def filterCate(df, lis):
    """Keep only rows whose `category_<name>` column equals 1 for every name in lis."""
    selected = df
    for name in lis:
        flag_column = 'category_' + name
        is_flagged = selected[flag_column] == 1
        selected = selected.loc[is_flagged, :]
    return selected

def basicInfo(df):
    """Return (shape, column index, null count per column, dtype per column) of df."""
    dimensions = df.shape
    column_names = df.columns
    null_counts = df.isnull().sum(axis = 0)
    column_dtypes = df.apply(lambda column: column.dtype, axis = 0)
    return dimensions, column_names, null_counts, column_dtypes

def subSet(df, num, num_most_common):
    """Count category entries among rows whose `categories` has exactly num items.

    Returns the num_most_common most frequent entries as (entry, count)
    pairs, most frequent first.
    """
    has_num_items = df.categories.apply(lambda cats: len(cats) == num)
    tally = Counter()
    for cats in df.loc[has_num_items, :].categories:
        tally.update(cats)
    return tally.most_common(num_most_common)

def binaryCate(df, lis):
    """Add a 0/1 column `category_<name>` to df for every name in lis.

    The column is 1 where the row's `categories` value contains the name.
    df is modified in place and also returned.
    """
    for label in lis:
        membership = df.categories.apply(lambda cats: int(label in cats))
        df['category_' + label] = membership
    return df

def downloadPicSimple(df, path, timeout=30):
    """Download one image per row of df and save it as a JPEG file.

    Columns are read by position: column 0 is used as the asin, column 5 as
    the image URL and column 7 as an iterable of strings that is joined with
    commas. NOTE(review): these positions are assumed to match the metadata
    frame built elsewhere in the notebook; confirm before reuse.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows.
    path : str
        Prefix prepended (by plain string concatenation) to each file name.
    timeout : float, optional
        Seconds to wait for the server. Without a timeout a single stalled
        connection blocks the whole loop indefinitely.

    Returns
    -------
    pandas.DataFrame
        Columns `asin` and `class` (the joined categories), one row per image
        that was saved. Rows with a missing URL, and rows whose download,
        decoding or saving raises OSError, are skipped.
    """
    asins = []
    classes = []
    for i in range(df.shape[0]):
        url = df.iloc[i, 5]
        if pd.isna(url):
            continue
        try:
            # requests' own exceptions (timeouts included) derive from OSError.
            response = requests.get(url, timeout=timeout)
            img = Image.open(BytesIO(response.content))
            if img.mode == 'P':
                # Palette images cannot be written as JPEG.
                img = img.convert('RGB')
            asin = df.iloc[i, 0]
            cate = ",".join(df.iloc[i, 7])
            img.save(path + asin + '_' + cate + '.jpg')
        except OSError:
            continue
        asins.append(asin)
        classes.append(cate)
    return pd.DataFrame({'asin': asins, 'class': classes})

def downloadPic(df, path, label, num_per_category = 10000000, timeout=30):
    """Download images for the leading rows of df and tag them all with label.

    Columns are read by position: column 0 is used as the asin, column 5 as
    the image URL and column 7 as an iterable of strings that is joined with
    commas for the file name. NOTE(review): these positions are assumed to
    match the metadata frame built elsewhere in the notebook; confirm.

    A failed download now affects only its own row. The previous flag-based
    bookkeeping also discarded the first image fetched successfully after
    every failure.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows.
    path : str
        Prefix prepended (by plain string concatenation) to each file name.
    label : object
        Value recorded in the `class` column for every saved image.
    num_per_category : int, optional
        Number of leading rows to attempt. Skipped and failed rows count
        towards this limit, so fewer images may be saved.
    timeout : float, optional
        Seconds to wait for the server before the row is given up.

    Returns
    -------
    pandas.DataFrame
        Columns `asin` and `class`, one row per image that was saved.
    """
    asins = []
    classes = []
    for i in range(df.shape[0]):
        if i == num_per_category:
            break
        url = df.iloc[i, 5]
        if pd.isna(url):
            continue
        try:
            # requests' own exceptions (timeouts included) derive from OSError.
            response = requests.get(url, timeout=timeout)
            img = Image.open(BytesIO(response.content))
            if img.mode == 'P':
                # Palette images cannot be written as JPEG.
                img = img.convert('RGB')
            asin = df.iloc[i, 0]
            cate = ",".join(df.iloc[i, 7])
            img.save(path + asin + '_' + cate + '.jpg')
        except OSError:
            continue
        asins.append(asin)
        classes.append(label)
    return pd.DataFrame({'asin': asins, 'class': classes})
        
def downLoadCategories(df, lis, path, num_per_category):
    """Run downloadPic once per category name and stack the label tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a `categories` column supporting the `in` operator.
    lis : iterable of str
        Category names; rows whose `categories` contains the name are selected.
    path : str
        File-name prefix handed to downloadPic.
    num_per_category : int
        Row limit handed to downloadPic for each category.

    Returns
    -------
    pandas.DataFrame
        Columns `asin` and `class`. Row labels restart for every category,
        as they did before.
    """
    # DataFrame.append was removed in pandas 2.0; gather the parts and
    # concatenate once instead of growing a frame inside the loop.
    parts = []
    for l in lis:
        in_category = df.loc[df.categories.apply(lambda x: l in x), :]
        parts.append(downloadPic(in_category, path, l, num_per_category))
    if not parts:
        return pd.DataFrame({'asin': [], 'class': []})
    return pd.concat(parts)
        
def testProduct(df, asin_num):
    list1 = df.loc[df.asin==asin_num,:]
    return list1.categories.tolist(), list1.reviewTime

In [None]:
# Load the product metadata, reduce each `categories` value to its first entry,
# then drop every product whose categories mention 'Grown-Up Toys'.
review = removeCate(
    preprocessing(getDF("meta_Toys_and_Games.json.gz")),
    ['Grown-Up Toys'],
)

In [None]:
# Read the line-delimited JSON file of reviews (one record per line) into a DataFrame.
reviews = pd.read_json('reviews_Toys_and_Games.json', lines = True)

In [None]:
# Attach every review to its product on `asin`; the left join keeps products
# that have no review (their review columns are then null).
merged = review.merge(reviews, on='asin', how='left')

In [None]:
# `review` already went through preprocessing() and removeCate() when it was
# loaded. Running preprocessing() again would replace each category list with
# its first element (a single string), which breaks the later length and
# membership checks on `categories`, so only a working copy is taken here.
df = review.copy()

In [None]:
def productInfo(df, asin_num = '0131358936'):
    """Summarise the rows of df that belong to one product.

    The previous lookup compared the literal string 'asin' with asin_num,
    which is a constant False; the resulting KeyError was swallowed and the
    whole frame was summarised instead of the product.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. If it has no `asin` column the whole frame is
        summarised, which is what the function did before.
    asin_num : str, optional
        Product identifier to select.

    Returns
    -------
    tuple
        (null count per column, shape) of the selected rows.
    """
    if 'asin' in df.columns:
        subset = df.loc[df['asin'] == asin_num, :]
    else:
        subset = df
    return subset.isnull().sum(axis = 0), subset.shape

In [None]:
def showImg(df, row, col1, col2):
    """Fetch the image whose URL sits at df.iloc[row, col1].

    Returns (image, df.iloc[row, col2]); a palette-mode image is converted
    to RGB first.
    """
    image_url, review_text = df.iloc[row, col1], df.iloc[row, col2]
    payload = requests.get(image_url).content
    picture = Image.open(BytesIO(payload))
    return (picture.convert('RGB') if picture.mode == 'P' else picture), review_text
# img, comment = showImg(temp1, 100, 5, 12)
# print(comment)
# img

In [None]:
# Start the cleaning from a copy so that `merged` itself is not modified.
cleaned = merged.copy()
def filtered(df, removelist):
    """Keep only rows that are non-null in every column named in removelist."""
    kept = df
    for column in removelist:
        is_present = kept[column].notnull()
        kept = kept.loc[is_present, :]
    return kept

# Drop rows missing a title, price, reviewer ID or description. After the left
# merge, a null reviewerID marks a product that has no review at all.
cleaned = filtered(cleaned, ['title', 'price', 'reviewerID', 'description'])

In [None]:
# Display the summary that productInfo returns for ASIN 0131358936.
productInfo(cleaned, '0131358936')

In [None]:
# Number of distinct products (ASINs), followed by the dtype of every column.
cleaned.asin.nunique(dropna=False), cleaned.dtypes

In [None]:
def fillNAs(df, lis):
    """Replace missing values with 'unknown' in each column named in lis.

    df is modified in place and also returned.
    """
    for column in lis:
        filled = df[column].fillna('unknown')
        df[column] = filled
    return df
# Replace the remaining missing values in these columns with the placeholder 'unknown'.
cleaned = fillNAs(cleaned, ['salesRank', 'brand', 'related', 'reviewerID', 'reviewerName'])

In [None]:
# Missing values still present in each column of `cleaned`.
cleaned.isna().sum()

In [None]:
# Tally the rows per product once (the Counter was previously built twice) and
# keep the 34000 most frequent ASINs together with their row counts.
top_products = Counter(cleaned.asin).most_common(34000)
key_gr10 = [asin for asin, _count in top_products]
user_gr10 = [count for _asin, count in top_products]
# isin() hashes the keys instead of scanning the 34000-entry list for every row.
df_gr10 = cleaned.loc[cleaned.asin.isin(key_gr10), :]

In [None]:
# NOTE(review): this cell held only an unfinished `from` statement, which is a
# SyntaxError and stops "Run All". Restore the intended import here if needed.

In [None]:
# Positional 70/30 split in row order (no shuffling). iloc is required here:
# after filtering, the row labels of `cleaned` are no longer 0..n-1, a label
# slice ending at -1 does not mean "up to the last row", and label slices
# include their end point, which put the boundary row in both parts.
split_at = math.floor(0.7 * cleaned.shape[0])
cleaned_train = cleaned.iloc[:split_at, :]
cleaned_test = cleaned.iloc[split_at:, :]


In [None]:
# NOTE(review): `cleaned_1st` is never defined in this notebook (NameError on a
# fresh kernel), so the lookup is pointed at `cleaned`; confirm this is the
# frame that was meant.
cleaned.loc[cleaned.asin == 'B000W3XEQM', 'price']

In [None]:
# Mean `overall` rating per reviewer, lowest first. DataFrame has no `.sort`
# attribute (and it was never called), so the cell raised instead of sorting.
cleaned.loc[:, ['reviewerID', 'overall']].groupby('reviewerID').agg('mean').sort_values('overall')

In [None]:
# Keep only the merged rows that have no missing value in any column.
temp = merged.dropna()

In [None]:
# Size of the fully populated subset and the number of distinct products in it.
temp.shape, temp.asin.nunique()

In [None]:
# Names of the 30 most frequent entries among rows whose `categories` has length 3.
most_common3 = [name for name, _count in subSet(df, 3, 30)]

In [None]:
# Rows whose review text mentions 'inexpensive'.
temp1 = temp[temp.reviewText.str.contains('inexpensive')]

In [None]:
# Size of the 'inexpensive' subset and the number of distinct products in it.
temp1.shape, temp1.asin.nunique()

In [None]:
# First row, thirteenth column, selected by position. `.loc[0, 12]` looked up
# the *labels* 0 and 12, which fails because the columns are named and row
# label 0 need not survive the filtering.
# NOTE(review): position 12 is presumed to be the review text; confirm.
temp1.iloc[0, 12]

In [None]:
# Download the image of every row of `df` into ./picall/ and keep the returned
# asin/class table. This issues one HTTP request per row with a URL.
allpics = downloadPicSimple(df, './picall/')

In [None]:
# For each name in most_common3, download images for the matching products
# (at most the first 5 rows per name are attempted) and collect the labels.
labels = downLoadCategories(df, most_common3, './picall/', 5)

In [None]:
# Persist the asin/class label table next to the notebook.
labels.to_csv('labels.csv')

In [None]:
# Review dates recorded for ASIN 0000191639.
reviews.loc[reviews.asin =='0000191639', 'reviewTime']

In [None]:
# Category names of interest. Each entry needs its own comma: Python joins
# adjacent string literals, which previously fused 'Building Toys' and
# 'Toy Sports' into the single bogus name 'Building ToysToy Sports'.
keep_list = [
    'Learning & Education', 'Early Development Toys', 'Sorting & Stacking',
    'Pretend Play', 'Building Toys', 'Toy Sports', 'Trains & Accessories',
    'Kitchen Toys', 'Stacking Blocks', 'Beauty & Fashion',
    'Tricycles, Scooters & Wagons',
]
games = df.loc[df.categories.apply(lambda x: 'Games' in x), :]
puzzle = df.loc[df.categories.apply(lambda x: 'Jigsaw Puzzles' in x), :]
# Named explicitly instead of keep_list[7]: 'Stacking Blocks' is what index 7
# resolved to while the list was one entry short, so the selection is unchanged.
# NOTE(review): confirm this is the category meant for `education`.
education = df.loc[df.categories.apply(lambda x: 'Stacking Blocks' in x), :]
education.iloc[100,5]

In [None]:
import requests
# Query the Etsy v2 users endpoint for 'etsystore'.
# NOTE(review): the api_key query parameter is empty in this URL.
response = requests.get('https://openapi.etsy.com/v2/users/etsystore?api_key=')

In [None]:
# Fetch active Etsy listings matching 'toys'. The API key is read from the
# ETSY_API_KEY environment variable so that it is never stored in the notebook;
# the commented-out requests that embedded a key were removed (that key should
# be treated as exposed). An unset variable sends an empty key, as before.
response = requests.get(
    'https://openapi.etsy.com/v2/listings/active',
    params={
        'api_key': os.environ.get('ETSY_API_KEY', ''),
        'keywords': 'toys',
        'limit': 100,
        'offset': 50000,
        'includes': 'Images,Shop,User,inventory',
    },
)

In [None]:
# Print the HTTP status code of the most recent response.
print(response.status_code)

In [None]:

# Decode the JSON body of the response into Python objects.
result = response.json()

In [None]:
# Peek at the first listing. `results` is only assigned in a later cell, so the
# decoded payload is indexed directly to keep this cell runnable in order.
result['results'][0]

In [None]:
def _flatten_listing(listing):
    """Flatten one listing dict into a single-level record (one table row).

    - a non-empty list of dicts contributes the items of its first dict;
    - an empty list becomes None, any other list is stored as-is;
    - a nested dict contributes its own items;
    - every other value is stored under its own key.

    Nested keys are not prefixed, so a nested key that matches another key
    replaces it, as in the earlier version of this cell.
    """
    record = {}
    for key, value in listing.items():
        if isinstance(value, list) and len(value) != 0 and isinstance(value[0], dict):
            record.update(value[0])
        elif isinstance(value, list):
            record[key] = value if value else None
        elif isinstance(value, dict):
            # The earlier loop iterated a stale `img` variable here instead of
            # the dict actually being visited.
            record.update(value)
        else:
            record[key] = value
    return record


# One record per listing, assembled into the frame in a single step. The earlier
# loop assigned whole columns (`data[k] = ...`) for every listing, so each
# listing overwrote the values stored for the ones before it.
results = result['results']
data = pd.DataFrame([_flatten_listing(listing) for listing in results])

In [None]:
# Scratch check: a one-row frame whose single cell holds an empty list.
d = pd.DataFrame({'1': [[]]})