In [1]:
# List the objects and prefixes under the Amazon Fashion dataset location on S3.
!aws s3 ls s3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/

                           PRE amazon_fashion_interaction.parquet/
                           PRE amazon_fashion_item.parquet/
                           PRE amazon_fashion_user.parquet/
2022-09-08 14:11:37 3247339254 amazon_fashion_product.json
2022-09-08 14:13:55 3059298908 amazon_fashion_review.json


In [2]:
import os
import subprocess
import yaml
import argparse
import sys 
import importlib
import metaspore as ms
import pandas as pd
import warnings

# Silence every Python warning and remove pandas' row/column display limits
# so DataFrame previews are rendered without truncation.
warnings.filterwarnings(action='ignore')
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

In [3]:
def init_spark(local, app_name, batch_size, worker_count, server_count,
               worker_memory, server_memory, coordinator_memory, **kwargs):
    """Package the project's ``python`` directory and start a MetaSpore Spark session.

    The ``python`` directory found three levels above the current working
    directory is zipped into ``python.zip`` in the current working directory,
    and that archive is passed to Spark through ``spark.submit.pyFiles``.

    Args:
        local, app_name, batch_size, worker_count, server_count,
        worker_memory, server_memory, coordinator_memory:
            Forwarded unchanged to ``ms.spark.get_session``.
        **kwargs: Accepted so callers can pass a wider parameter dict, but
            ignored: nothing in ``kwargs`` is forwarded to ``get_session``.
            NOTE(review): callers that pass ``worker_cpu`` / ``server_cpu``
            should be aware those values have no effect here.

    Returns:
        The session object returned by ``ms.spark.get_session``.

    Raises:
        subprocess.CalledProcessError: if the ``zip`` command exits non-zero.
    """
    archive_path = os.path.join(os.getcwd(), 'python.zip')
    # check=True: fail loudly if packaging breaks instead of submitting a job
    # with a stale or missing python.zip.
    subprocess.run(['zip', '-r', archive_path, 'python'], cwd='../../../', check=True)
    spark_confs = {
        "spark.kubernetes.namespace": "sunkai",
        "spark.network.timeout": "500",
        "spark.submit.pyFiles": "python.zip",
        "spark.ui.showConsoleProgress": "true",
        "spark.kubernetes.executor.deleteOnTermination": "true",
    }
    spark = ms.spark.get_session(
        local=local,
        app_name=app_name,
        batch_size=batch_size,
        worker_count=worker_count,
        server_count=server_count,
        worker_memory=worker_memory,
        server_memory=server_memory,
        coordinator_memory=coordinator_memory,
        spark_confs=spark_confs)
    sc = spark.sparkContext
    print('Debug -- spark init')
    print('Debug -- version:', sc.version)
    print('Debug -- applicaitonId:', sc.applicationId)
    print('Debug -- uiWebUrl:', sc.uiWebUrl)
    return spark

def stop_spark(spark):
    """Print a shutdown marker, then stop the SparkContext behind ``spark``."""
    print('Debug -- spark stop')
    context = spark.sparkContext
    context.stop()

def read_dataset(spark, dataset_path):
    """Read ``dataset_path`` with ``spark.read.json`` and print its row count.

    The printed label is always 'item dataset count', whichever dataset is
    being loaded.

    Returns:
        The object returned by ``spark.read.json(dataset_path)``.
    """
    loaded = spark.read.json(dataset_path)
    row_total = loaded.count()
    print('Debug -- item dataset count:', row_total)
    return loaded

In [4]:
# Spark session settings, unpacked into init_spark(**params) below.
# NOTE: init_spark does not name 'worker_cpu' / 'server_cpu' in its signature,
# so those two entries end up in its **kwargs.
params = {
    'local': False,
    'app_name': 'Amazon Fashion Dataset',
    'batch_size': 512,
    'worker_count': 4,
    'worker_cpu': 4,
    'server_count': 4,
    'server_cpu': 4,
    'worker_memory': '6G',
    'server_memory': '6G',
    'coordinator_memory': '2G',
}

# Raw JSON inputs.
product_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_product.json'
review_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_review.json'

# Parquet outputs written at the end of the notebook.
user_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_user.parquet'
item_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_item.parquet'
interaction_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_interaction.parquet'

spark = init_spark(**params)

updating: python/ (stored 0%)
updating: python/scripts/ (stored 0%)
updating: python/scripts/consul/ (stored 0%)
updating: python/scripts/consul/create_consul_watch.sh (deflated 17%)
updating: python/scripts/consul/consul_watch_load.py (deflated 72%)
updating: python/scripts/consul/build.sh (deflated 30%)
updating: python/scripts/consul/Dockerfile (deflated 44%)
updating: python/scripts/preprocessing/ (stored 0%)
updating: python/scripts/preprocessing/preprocessor_service.py (deflated 69%)
updating: python/scripts/preprocessing/example_preprocessor.py (deflated 54%)
updating: python/scripts/preprocessing/example_requirements.txt (stored 0%)
updating: python/scripts/preprocessing/test_example_preprocessor.py (deflated 51%)
updating: python/tests/ (stored 0%)
updating: python/tests/sparse_two_tower_export_demo.py (deflated 73%)
updating: python/tests/sparse_wdl_grpc_test.py (deflated 52%)
updating: python/tests/dense_xgboost_grpc_test.py (deflated 44%)
updating: python/tests/mnist_mlp.py

22/09/08 19:44:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Debug -- spark init
Debug -- version: 3.1.2
Debug -- applicaitonId: spark-application-1662637491977
Debug -- uiWebUrl: http://172.16.0.154:4040


In [5]:
# Load the raw product JSON; read_dataset also prints the row count.
product_dataset = read_dataset(spark, product_path)



Debug -- item dataset count: 2685059


                                                                                

In [6]:
# Show the DataFrame repr (column names and types) of the product data.
product_dataset

DataFrame[brand: string, category: array<string>, category_levels: bigint, description: array<string>, image: string, item_id: string, price: string, title: string, url: string]

In [7]:
# Preview up to 20 product rows, collected into pandas for display.
product_dataset.limit(20).toPandas()

Unnamed: 0,brand,category,category_levels,description,image,item_id,price,title,url
0,Vintage,"[Clothing, Shoes & Jewelry, Men, Uniforms, Wor...",5,,,B001L51GRO,$8.88,4518 Vintage Olive Drab w/ Red Star Fatigue Ca...,https://www.amazon.com/dp/B001L51GRO
1,,"[Clothing, Shoes & Jewelry, Men, Uniforms, Wor...",5,,,B001L52YW0,$13.99,var aPageStart = (new Date()).getTime();\nvar ...,https://www.amazon.com/dp/B001L52YW0
2,PearlsOnly,"[Clothing, Shoes & Jewelry, Women, Jewelry, Ri...",5,,,B001L53NQ6,$85.00,Chantel Black 9-10mm AA Quality Freshwater 925...,https://www.amazon.com/dp/B001L53NQ6
3,Wild Syde,"[Clothing, Shoes & Jewelry, Men, Uniforms, Wor...",5,,,B001L51C78,$26.99,Military Style Mini Alice Pack Black Pack,https://www.amazon.com/dp/B001L51C78
4,Government Issue,"[Clothing, Shoes & Jewelry, Women, Uniforms, W...",5,[This scarf is made from 100 Percent Soft Wool...,,B001L51C9G,$11.69,var aPageStart = (new Date()).getTime();\nvar ...,https://www.amazon.com/dp/B001L51C9G
5,Wild Syde,"[Clothing, Shoes & Jewelry, Novelty & More, Cl...",5,[Supreme low profile military insignia cap. co...,,B001L4ZBFI,,9266 US Marine Corps Veteran Cap,https://www.amazon.com/dp/B001L4ZBFI
6,PearlsOnly,"[Clothing, Shoes & Jewelry, Women, Jewelry, Ri...",5,,,B001L53O3I,$85.00,Fiona Lavender 9-10mm AA Quality Freshwater 92...,https://www.amazon.com/dp/B001L53O3I
7,Wild Syde,"[Clothing, Shoes & Jewelry, Men, Uniforms, Wor...",5,"[A Perfect Fashionable Attire, Built Durably T...",,B001L536UO,$10.99,Camouflage Caps ACU Digital Camo Fatigue Cap XLRG,https://www.amazon.com/dp/B001L536UO
8,Wild Syde,"[Clothing, Shoes & Jewelry, Men, Uniforms, Wor...",5,"[DRAWSTRING BOTTOMS, REINFORCED SEAT & KNEES, ...",,B001L4ZB4Y,$39.99,Camouflage Pants City Camo Vintage Paratrooper...,https://www.amazon.com/dp/B001L4ZB4Y
9,PearlsOnly,"[Clothing, Shoes & Jewelry, Women, Jewelry, Ri...",5,,,B001L53OB0,$85.00,Sadie Black 9-10mm AA Quality Freshwater 925 S...,https://www.amazon.com/dp/B001L53OB0


In [8]:
# Load the raw review JSON. read_dataset prints the row count under its fixed
# 'item dataset count' label, even though these are reviews.
review_dataset = read_dataset(spark, review_path)



Debug -- item dataset count: 32292099


                                                                                

In [9]:
import pyspark.sql.functions as F

# Keep only products whose `category` array contains both an element equal to
# 'Clothing' and an element equal to 'Women'.
product_subset = product_dataset.filter(
    F.array_contains('category', 'Clothing') & F.array_contains('category', 'Women')
)

In [10]:
# Preview up to 10 rows of the category-filtered products.
product_subset.limit(10).toPandas()

Unnamed: 0,brand,category,category_levels,description,image,item_id,price,title,url
0,Government Issue,"[Clothing, Shoes & Jewelry, Women, Uniforms, W...",5,[This scarf is made from 100 Percent Soft Wool...,,B001L51C9G,$11.69,var aPageStart = (new Date()).getTime();\nvar ...,https://www.amazon.com/dp/B001L51C9G
1,Flip Flop Socks,"[Clothing, Shoes & Jewelry, Women, Clothing]",3,,,B001L5AMQU,$9.99,Crew Flip Flop Tabi Casual Big Toe Socks,https://www.amazon.com/dp/B001L5AMQU
2,,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,[IZOD is premium clothing for everyday wear],,B001L6O1IY,,IZOD Women's Long Sleeve Striped Henley,https://www.amazon.com/dp/B001L6O1IY
3,,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,[IZOD is premium clothing for everyday wear],,B001L6PNX6,,IZOD Women's Raglan Long Sleeve Square Neck Wi...,https://www.amazon.com/dp/B001L6PNX6
4,,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,[IZOD is premium clothing for everyday wear],,B001L6X0VI,,IZOD Women's 3/4 Sleeve Shawl Collar Solid Pul...,https://www.amazon.com/dp/B001L6X0VI
5,,"[Clothing, Shoes & Jewelry, Women, Clothing, P...",5,[IZOD is premium clothing for everyday wear],,B001L715NC,,IZOD Women's Flap Pocket Colored Pant,https://www.amazon.com/dp/B001L715NC
6,SockGuy,"[Clothing, Shoes & Jewelry, Novelty & More, Cl...",5,[Made with 75% ultra-wicking micro denier acry...,,B001L6LC9U,$4.99 - $25.55,SockGuy Men's Chains Socks,https://www.amazon.com/dp/B001L6LC9U
7,,"[Clothing, Shoes & Jewelry, Women, Clothing, F...",5,[IZOD is premium clothing for everyday wear],,B001L711NQ,,var aPageStart = (new Date()).getTime();\nvar ...,https://www.amazon.com/dp/B001L711NQ
8,A&E Designs,"[Clothing, Shoes & Jewelry, Novelty & More, Cl...",5,[This James Dean T-shirt is a great 100% presh...,,B001L7JCLY,$21.99,James Dean T-shirt Dream Live Adult Black Tee ...,https://www.amazon.com/dp/B001L7JCLY
9,Unknown,"[Clothing, Shoes & Jewelry, Novelty & More, Cl...",5,[SeXy 3 Piece Fishnet Set with lace trim. Whit...,,B001L7UQKU,,SeXy BLACK Plus Size Bustier Camisette g-Strin...,https://www.amazon.com/dp/B001L7UQKU


In [11]:
# Preview up to 10 raw review rows.
review_dataset.limit(10).toPandas()

Unnamed: 0,item_id,rating,timestamp,user_id
0,B006CBAOQ6,1.0,1424304000,A1AG1NX2VXKY2T
1,B006CBAOQ6,5.0,1424304000,A3B1USQFGCWH15
2,B006CBAOQ6,5.0,1424044800,AXL1XBXHN1AQ1
3,B006CBAOQ6,4.0,1424044800,AP3CLPX9J5PAV
4,B006CBAOQ6,4.0,1423958400,A3REHYIAGCIFTB
5,B006CBAOQ6,5.0,1423267200,A2YICSVD18YH1R
6,B006CBAOQ6,3.0,1423094400,A3U28J1QPXC8SC
7,B006CBAOQ6,4.0,1422489600,ADZOZ3PZ6RLL7
8,B006CBAOQ6,5.0,1422489600,A1CSXM2VZLZPPA
9,B006CBAOQ6,5.0,1422316800,A2U3SUU8SNP8IW


In [12]:
# Show the DataFrame repr (column names and types) of the review data.
review_dataset

DataFrame[item_id: string, rating: double, timestamp: bigint, user_id: string]

In [13]:
# Expose both DataFrames to Spark SQL. createOrReplaceTempView is the
# supported replacement for registerTempTable, which has been deprecated
# since Spark 2.0.
review_dataset.createOrReplaceTempView('review_dataset')
product_subset.createOrReplaceTempView('product_subset')

# Select users with at least 20 distinct (item_id, timestamp) reviews on items
# that are present in product_subset.
# NOTE: the result is named user_5core, but the threshold applied here is 20.
query = """
select 
    ta.user_id
from
(
    select
        ta.user_id,
        count(distinct ta.item_id, ta.timestamp) as review_count
    from
        review_dataset ta
    join
        product_subset tb
    on 
        ta.item_id=tb.item_id
    group by ta.user_id
) ta
where ta.review_count >= 20
"""
user_5core = spark.sql(query)

In [14]:
# Number of users that met the review-count threshold.
user_5core.count()

                                                                                

11047

In [15]:
# Preview up to 20 of the selected user ids.
user_5core.limit(20).toPandas()

                                                                                

Unnamed: 0,user_id
0,A3DSSCVD3ODGQA
1,A3DVUL185PD63D
2,A21D8YZ9CDZQD7
3,A2VUZN3Q0H6BY7
4,A1YK47ZUFDBQ8Q
5,A2SXQ8QP7LORDO
6,A2ALF1CXASLJLC
7,A1JUWBYJ6A6RU0
8,APZAG6JETO1MR
9,A220STV60KRN7W


In [16]:
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0.
user_5core.createOrReplaceTempView('user_5core')

# Collect the distinct review rows written by the selected users.
# NOTE(review): the join is against the unfiltered review_dataset view, so the
# result can contain item_ids that are not in product_subset -- confirm that
# keeping those interactions is intended.
query = """
select distinct
    tb.*
from
    user_5core ta
join
    review_dataset tb
on ta.user_id=tb.user_id
"""
review_5core = spark.sql(query)

In [17]:
# Number of distinct review rows kept for the selected users.
review_5core.count()

                                                                                

614734

In [18]:
# Preview up to 20 of the retained review rows.
review_5core.limit(20).toPandas()

                                                                                

Unnamed: 0,item_id,rating,timestamp,user_id
0,B01E6POSZM,3.0,1503360000,A110EEWWSOHAIS
1,B01EM71COE,1.0,1503360000,A110EEWWSOHAIS
2,B00BNBB3ME,5.0,1418515200,A110EEWWSOHAIS
3,B00BZ4AN4S,1.0,1503360000,A110EEWWSOHAIS
4,B01457U2E2,5.0,1503360000,A110EEWWSOHAIS
5,B01C4LEWJ2,4.0,1488412800,A110EEWWSOHAIS
6,B002RAKPME,5.0,1503360000,A110EEWWSOHAIS
7,B003I8548O,5.0,1454371200,A110EEWWSOHAIS
8,B0048W5IHQ,4.0,1488153600,A110EEWWSOHAIS
9,B00NJ8C2NI,3.0,1503360000,A110EEWWSOHAIS


In [20]:
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0.
review_5core.createOrReplaceTempView('review_5core')

# Keep the product_subset rows whose item_id appears in the retained reviews.
# `distinct` collapses the duplicates produced when one item matches several
# review rows in the join.
query = """
select distinct
    tb.*
from
    review_5core ta
join
    product_subset tb
on ta.item_id=tb.item_id
"""
product_5core = spark.sql(query)

In [21]:
# Preview up to 5 of the retained product rows.
product_5core.limit(5).toPandas()

                                                                                

Unnamed: 0,brand,category,category_levels,description,image,item_id,price,title,url
0,My Michelle,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,,,B0008G21LM,,My Michelle Juniors Floral Print Ruffle Tank,https://www.amazon.com/dp/B0008G21LM
1,Olga,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",5,[This new minimizing bra provides shape and bo...,,B0009HN7WM,,Olga Women's Sheer Tapestry Underwire Minimize...,https://www.amazon.com/dp/B0009HN7WM
2,Warm Things,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",5,[Wrap yourself in the cozy comfort of the fine...,,B000B85LQE,,Warm Things Quilted Down Bed Jacket,https://www.amazon.com/dp/B000B85LQE
3,,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",5,[Lovely embroidered mesh center Embroidered de...,,B000EU0OSY,,Lilyette by Bali Women's Minimizer Bra #427,https://www.amazon.com/dp/B000EU0OSY
4,,"[Clothing, Shoes & Jewelry, Women, Clothing, 9...",5,[Today's danskin enjoys status as the ultimate...,,B000NCIGVK,,Danskin Women's Revival Stovepipe Bootleg Pant,https://www.amazon.com/dp/B000NCIGVK


In [22]:
# Number of distinct product rows retained.
product_5core.count()

                                                                                

107093

In [23]:
# Persist the selected user ids as parquet, replacing any existing output.
user_5core.write.mode('overwrite').parquet(user_path)

                                                                                

In [24]:
# Persist the retained products as parquet, replacing any existing output.
product_5core.write.mode('overwrite').parquet(item_path)

                                                                                

In [25]:
# Persist the retained reviews as parquet, replacing any existing output.
review_5core.write.mode('overwrite').parquet(interaction_path)

                                                                                

In [26]:
# Re-list the dataset location after the parquet writes.
!aws s3 ls s3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/

                           PRE amazon_fashion_interaction.parquet/
                           PRE amazon_fashion_item.parquet/
                           PRE amazon_fashion_user.parquet/
2022-09-08 14:11:37 3247339254 amazon_fashion_product.json
2022-09-08 14:13:55 3059298908 amazon_fashion_review.json


In [27]:
# Shut down the Spark context now that all outputs have been written.
stop_spark(spark)

Debug -- spark stop


22/09/08 20:04:48 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed (this is expected if the application is shutting down.)
