In [1]:
# List the dataset prefix on S3 before running the pipeline.
!aws s3 ls s3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/

                           PRE amazon_fashion_interaction.parquet/
                           PRE amazon_fashion_item.parquet/
                           PRE amazon_fashion_user.parquet/
2022-09-08 14:11:37 3247339254 amazon_fashion_product.json
2022-09-08 14:13:55 3059298908 amazon_fashion_review.json


In [2]:
import os
import subprocess
import yaml
import argparse
import sys 
import importlib
import metaspore as ms
import pandas as pd
import warnings
import pyspark.sql.functions as F

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
def init_spark(local, app_name, batch_size, worker_count, server_count,
               worker_memory, server_memory, coordinator_memory, **kwargs):
    """Package the local `python` sources and open a MetaSpore Spark session.

    The `python` directory located three levels above the working directory
    is zipped into `python.zip` inside the working directory; that archive is
    named in `spark.submit.pyFiles`.

    Args:
        local, app_name, batch_size, worker_count, server_count,
        worker_memory, server_memory, coordinator_memory: forwarded unchanged
            to `ms.spark.get_session`.
        **kwargs: accepted so a wider settings dict can be expanded into the
            call; these values are NOT forwarded to the session.

    Returns:
        The session object returned by `ms.spark.get_session`.

    Raises:
        subprocess.CalledProcessError: if the `zip` command exits non-zero.
    """
    # NOTE(review): extra keys such as `worker_cpu` / `server_cpu` end up in
    # **kwargs and are silently dropped. Forward them if get_session supports
    # them -- TODO confirm against the MetaSpore API.
    archive_path = os.path.join(os.getcwd(), 'python.zip')
    # check=True: fail here rather than start a session that ships a missing
    # or stale archive when packaging goes wrong.
    subprocess.run(['zip', '-r', archive_path, 'python'], cwd='../../../', check=True)
    spark_confs={
        "spark.kubernetes.namespace":"wanggen",
        "spark.network.timeout":"500",
        "spark.submit.pyFiles":"python.zip",
        "spark.ui.showConsoleProgress": "true",
        "spark.kubernetes.executor.deleteOnTermination":"true",
    }
    spark = ms.spark.get_session(
        local=local,
        app_name=app_name,
        batch_size=batch_size,
        worker_count=worker_count,
        server_count=server_count,
        worker_memory=worker_memory,
        server_memory=server_memory,
        coordinator_memory=coordinator_memory,
        spark_confs=spark_confs)
    sc = spark.sparkContext
    print('Debug -- spark init')
    print('Debug -- version:', sc.version)
    print('Debug -- applicaitonId:', sc.applicationId)
    print('Debug -- uiWebUrl:', sc.uiWebUrl)
    return spark

def stop_spark(spark):
    """Announce the shutdown, then stop the SparkContext owned by `spark`."""
    print('Debug -- spark stop')
    context = spark.sparkContext
    context.stop()

def read_dataset(spark, dataset_path, label='item'):
    """Load a JSON dataset and print its row count.

    Args:
        spark: session object exposing `read.json`.
        dataset_path: location handed to `spark.read.json`.
        label: word used in the debug line to say which dataset was counted.
            Defaults to 'item', which reproduces the previously fixed message;
            pass e.g. 'review' when loading a different dataset.

    Returns:
        Whatever `spark.read.json(dataset_path)` returns.
    """
    dataset = spark.read.json(dataset_path)
    # count() is evaluated only for this log line.
    print('Debug -- %s dataset count:' % label, dataset.count())
    return dataset

In [4]:
# Spark session settings; expanded into init_spark(**params) at the end of this cell.
params = {
    'local': False,
    'app_name': 'Amazon Fashion Dataset',
    'batch_size': 512,
    'worker_count': 4,
    'worker_cpu': 4,
    'server_count': 4,
    'server_cpu': 4,
    'worker_memory': '6G',
    'server_memory': '6G',
    'coordinator_memory': '2G',
}

# Inputs: raw product/review JSON dumps and the item_id -> image URL lookup (TSV).
product_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_product.json'
review_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_review.json'
item2image = 's3://dmetasoul-bucket/xwb/tmp/amazon_fashion_id2img.tsv'

# Outputs: parquet locations for the user, item and interaction tables.
user_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_user.parquet'
item_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_item.parquet'
interaction_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_interaction.parquet'

spark = init_spark(**params)

updating: python/ (stored 0%)
updating: python/algos/ (stored 0%)
updating: python/algos/deepfm_net.py (deflated 69%)
updating: python/algos/feature/ (stored 0%)
updating: python/algos/feature/woe_encoder.py (deflated 64%)
updating: python/algos/feature/.ipynb_checkpoints/ (stored 0%)
updating: python/algos/feature/.ipynb_checkpoints/target_encoder-checkpoint.py (deflated 51%)
updating: python/algos/feature/sequential_encoder.py (deflated 60%)
updating: python/algos/feature/target_encoder.py (deflated 51%)
updating: python/algos/feature/neg_sampler.py (deflated 66%)
updating: python/algos/feature/__pycache__/ (stored 0%)
updating: python/algos/feature/__pycache__/woe_encoder.cpython-38.pyc (deflated 43%)
updating: python/algos/feature/__pycache__/target_encoder.cpython-38.pyc (deflated 36%)
updating: python/algos/feature/__pycache__/__init__.cpython-38.pyc (deflated 29%)
updating: python/algos/feature/__pycache__/sequential_encoder.cpython-38.pyc (deflated 53%)
updating: python/algos/f

22/09/22 15:20:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Debug -- spark init
Debug -- version: 3.1.2
Debug -- applicaitonId: spark-application-1663831241179
Debug -- uiWebUrl: http://172.16.0.156:4040


In [5]:
# Load the product JSON, then drop exact duplicate rows.
# The count printed by read_dataset is taken before de-duplication.
product_dataset = read_dataset(spark, product_path).distinct()



Debug -- item dataset count: 2685059


                                                                                

In [6]:
# Inspect the schema Spark inferred from the product JSON.
product_dataset.printSchema()

root
 |-- brand: string (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category_levels: long (nullable = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- image: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- price: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)



In [7]:
# Load the item_id -> image URL lookup from the `item2image` path set in the
# configuration cell, instead of repeating the S3 literal here.
# The file is a headerless TSV, so Spark names the columns _c0 / _c1.
item2image_df = (
    spark.read
    .option("delimiter", "\t")
    .csv(item2image)
    .withColumnRenamed('_c0', 'item_id')
    .withColumnRenamed('_c1', 'image')
    .distinct()
)
item2image_df.printSchema()
item2image_df.show(20, False)

root
 |-- item_id: string (nullable = true)
 |-- image: string (nullable = true)





+----------+--------------------------------------------------------------------+
|item_id   |image                                                               |
+----------+--------------------------------------------------------------------+
|B00AQRLD4E|null                                                                |
|B00AQSTK0C|https://images-na.ssl-images-amazon.com/images/I/51tTiTgGKvL.jpg    |
|B00AQXS59Y|https://images-na.ssl-images-amazon.com/images/I/51FkqXp5LTL.jpg    |
|B00AR7X39Q|https://images-na.ssl-images-amazon.com/images/I/31S%2BRAjGI3L.jpg  |
|B00AR80NS4|https://images-na.ssl-images-amazon.com/images/I/41uxgLCTctL.jpg    |
|B00ARALGS8|null                                                                |
|B00ARAZGY8|https://images-na.ssl-images-amazon.com/images/I/41n%2BD%2BQUwsL.jpg|
|B00ARBSDVA|https://images-na.ssl-images-amazon.com/images/I/41vvxWot6ML.jpg    |
|B00ARCZH9A|https://images-na.ssl-images-amazon.com/images/I/51gAOcvHzGL.jpg    |
|B00ARE2PMU|http

                                                                                

In [8]:
# Replace the `image` column from the JSON with the URL from the lookup table,
# then keep only products that have a non-empty image URL and a non-empty title.
product_dataset = (
    product_dataset
    .drop('image')
    .join(item2image_df, on='item_id', how='leftouter')
    .filter(F.col("image").isNotNull())
    .filter(F.col("image") != '')
    .filter(F.col("title") != '')
)

# Print the count explicitly: only the last expression of a cell is displayed,
# so a bare count() here would run a full job and discard the result.
print('Debug -- product dataset count after image/title filter:', product_dataset.count())
product_dataset.limit(20).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B0000E02V7,,"[Clothing, Shoes & Jewelry, Men, Shoes, Boots,...",5,"[Amazon carries Timberland footwear for men, w...",$107.94 - $188.00,Timberland Men's Premium Wp Chukka Newman Boot,https://www.amazon.com/dp/B0000E02V7,https://images-na.ssl-images-amazon.com/images...
1,B0001GSYVA,FALKE,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,[Incredibly soft sock made with skin-friendly ...,$15.75 - $24.00,Falke Women's Sensitive London Sock,https://www.amazon.com/dp/B0001GSYVA,https://images-na.ssl-images-amazon.com/images...
2,B0001N5TDE,NRS,"[Clothing, Shoes & Jewelry, Men, Clothing, Act...",5,[FEATURES of the NRS HydroSkin Wetsocks The 0....,,NRS HydroSkin Socks,https://www.amazon.com/dp/B0001N5TDE,https://images-na.ssl-images-amazon.com/images...
3,B0001NM3E2,Western Pack,"[Clothing, Shoes & Jewelry, Luggage & Travel G...",5,[This is an all-purpose duffel that keeps on g...,,"Western Pack RB Series 18"" Duffel Bag (Black)",https://www.amazon.com/dp/B0001NM3E2,https://images-na.ssl-images-amazon.com/images...
4,B00023DSE0,Amerileather,"[Clothing, Shoes & Jewelry, Luggage & Travel G...",5,[Carry two bottles of wine to a dinner party o...,$7.83,AmeriLeather Leather Double Wine Case Holder,https://www.amazon.com/dp/B00023DSE0,https://images-na.ssl-images-amazon.com/images...
5,B00027M0SQ,Jewels For Me,"[Clothing, Shoes & Jewelry, Women, Jewelry, Ri...",5,[This ring features a 7x5mm pear-cut center st...,$199.00,14kt Gold Aquamarine and Diamond 7x5mm Pear Go...,https://www.amazon.com/dp/B00027M0SQ,https://images-na.ssl-images-amazon.com/images...
6,B0002L9H8S,WONDERLUX,"[Clothing, Shoes & Jewelry, Baby, Baby Girls, ...",5,"[Highest quality -100% cotton rib shortsleeve,...",,Miniwear 5 Pack Short Sleeve Bodysuit in Pink:...,https://www.amazon.com/dp/B0002L9H8S,https://images-na.ssl-images-amazon.com/images...
7,B0002UP9FO,Kerusso,"[Clothing, Shoes & Jewelry, Novelty & More, Cl...",5,[The love and passion of Christ has built a br...,,"Build A Bridge - Christian T-Shirt, White Large",https://www.amazon.com/dp/B0002UP9FO,https://images-na.ssl-images-amazon.com/images...
8,B000302AJQ,,"[Clothing, Shoes & Jewelry, Women, Jewelry, Ne...",5,[This Sterling Silver medal is solid sterling ...,$16.50,Dainty Sterling Silver St Christopher Medal Ne...,https://www.amazon.com/dp/B000302AJQ,https://images-na.ssl-images-amazon.com/images...
9,B00062B1MI,Propper,"[Clothing, Shoes & Jewelry, Men, Uniforms, Wor...",5,[The Proper BDU trouser is sewn to military sp...,$10.88 - $75.69,Propper Men's Bdu Trouser – Button Fly - 65/35...,https://www.amazon.com/dp/B00062B1MI,https://images-na.ssl-images-amazon.com/images...


In [9]:
# Load the review JSON.
# NOTE: read_dataset labels its count line "item dataset" even though this is the review file.
review_dataset = read_dataset(spark, review_path)



Debug -- item dataset count: 32292099


                                                                                

In [10]:
# Keep products whose `category` array contains both 'Clothing' and 'Women'
# as exact elements. `F` comes from the notebook's import cell, so it is not
# re-imported here.
product_subset = product_dataset.where(
    F.array_contains(F.col("category"), 'Clothing')
    & F.array_contains(F.col('category'), 'Women')
)

In [11]:
# Preview ten rows of the filtered product subset as a pandas frame.
product_subset.limit(10).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B0000866GL,Goddess,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",5,[Goddess Style 304. Soft cup full coverage bra...,$39.99,Goddess Crepeset Soft Cup Bra,https://www.amazon.com/dp/B0000866GL,https://images-na.ssl-images-amazon.com/images...
1,B00080VG96,Corvette Central,"[Clothing, Shoes & Jewelry, Novelty & More, Cl...",5,[Bring everyone to attention in this great loo...,$43.95,Corvette C6 Women's Jacket Cadet Black,https://www.amazon.com/dp/B00080VG96,https://images-na.ssl-images-amazon.com/images...
2,B0008G21LM,My Michelle,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,,,My Michelle Juniors Floral Print Ruffle Tank,https://www.amazon.com/dp/B0008G21LM,https://images-na.ssl-images-amazon.com/images...
3,B000B85LQE,Warm Things,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",5,[Wrap yourself in the cozy comfort of the fine...,,Warm Things Quilted Down Bed Jacket,https://www.amazon.com/dp/B000B85LQE,https://images-na.ssl-images-amazon.com/images...
4,B000E93X7Y,Carole,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",5,"[For a cool, comfortable start to your day or ...",$28.95,Carole Batiste Embroidered Nightgown,https://www.amazon.com/dp/B000E93X7Y,https://images-na.ssl-images-amazon.com/images...
5,B000FUD27I,Hello Kitty,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",5,[Hello Kitty Plus Size 2piece pajama set. Each...,,Hello Kitty Junior's Comfy N' Cozy Pink White ...,https://www.amazon.com/dp/B000FUD27I,https://images-na.ssl-images-amazon.com/images...
6,B000H70KQK,Nordic Fall,"[Clothing, Shoes & Jewelry, Women, Clothing, C...",5,,,Nordic Fall Women's Basic Soft Shell Jacket,https://www.amazon.com/dp/B000H70KQK,https://images-na.ssl-images-amazon.com/images...
7,B000J4CYMO,Tourmaster,"[Clothing, Shoes & Jewelry, Women, Clothing, A...",5,[Tourmaster Women's Polyester Black Venture Pa...,,Tourmaster VENTURE MOTORCYCLE PANTS BLACK WOME...,https://www.amazon.com/dp/B000J4CYMO,https://images-na.ssl-images-amazon.com/images...
8,B000K0ZUM8,Dakine,"[Clothing, Shoes & Jewelry, Novelty & More, Cl...",5,,$37.95 - $100.04,Dakine Womens Flow Loose Fit Hooded Long Sleeve,https://www.amazon.com/dp/B000K0ZUM8,https://images-na.ssl-images-amazon.com/images...
9,B000KJ6HD0,,"[Clothing, Shoes & Jewelry, Women, Clothing, D...",5,,,Roxy Juniors Savage Dress,https://www.amazon.com/dp/B000KJ6HD0,https://images-na.ssl-images-amazon.com/images...


In [12]:
# Preview ten review rows as a pandas frame.
review_dataset.limit(10).toPandas()

Unnamed: 0,item_id,rating,timestamp,user_id
0,B006CBAOQ6,1.0,1424304000,A1AG1NX2VXKY2T
1,B006CBAOQ6,5.0,1424304000,A3B1USQFGCWH15
2,B006CBAOQ6,5.0,1424044800,AXL1XBXHN1AQ1
3,B006CBAOQ6,4.0,1424044800,AP3CLPX9J5PAV
4,B006CBAOQ6,4.0,1423958400,A3REHYIAGCIFTB
5,B006CBAOQ6,5.0,1423267200,A2YICSVD18YH1R
6,B006CBAOQ6,3.0,1423094400,A3U28J1QPXC8SC
7,B006CBAOQ6,4.0,1422489600,ADZOZ3PZ6RLL7
8,B006CBAOQ6,5.0,1422489600,A1CSXM2VZLZPPA
9,B006CBAOQ6,5.0,1422316800,A2U3SUU8SNP8IW


In [13]:
# Show the DataFrame repr (column names and types) without running a job.
review_dataset

DataFrame[item_id: string, rating: double, timestamp: bigint, user_id: string]

In [14]:
# Expose both DataFrames to Spark SQL. createOrReplaceTempView is the
# supported replacement for registerTempTable, deprecated since Spark 2.0.
review_dataset.createOrReplaceTempView('review_dataset')
product_subset.createOrReplaceTempView('product_subset')

# Select users with at least 20 distinct (item_id, timestamp) reviews on items
# in product_subset. Note the variable is named "5core" but the threshold
# applied in the query is 20.
query ="""
select 
    ta.user_id
from
(
    select
        ta.user_id,
        count(distinct ta.item_id, ta.timestamp) as review_count
    from
        review_dataset ta
    join
        product_subset tb
    on 
        ta.item_id=tb.item_id
    group by ta.user_id
) ta
where ta.review_count >= 20
"""
user_5core = spark.sql(query)

In [15]:
# Number of users that passed the review-count threshold.
user_5core.count()

                                                                                

9441

In [16]:
# Preview up to twenty selected user ids.
user_5core.limit(20).toPandas()

                                                                                

Unnamed: 0,user_id
0,A1I34N9LFOSCX7
1,A3Q1QHUK0JDLIE
2,A1ODZ1IJJ4UWQK
3,A2EOPS340QPGVN
4,ARABNSS9CV3SR
5,A1S5S2GB4AT18H
6,A1ML6RV84LYP4T
7,ASPEHKL0HQMWT
8,AENSGPP7TSU4V
9,A3S256AULRHUUD


In [17]:
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
user_5core.createOrReplaceTempView('user_5core')

# Keep every distinct review written by a selected user.
# NOTE(review): the join is against the full review_dataset, so reviews of
# items outside product_subset are kept too -- confirm this is intended.
query ="""
select distinct
    tb.*
from
    user_5core ta
join
    review_dataset tb
on ta.user_id=tb.user_id
"""
review_5core = spark.sql(query)

In [18]:
# Number of distinct reviews kept for the selected users.
review_5core.count()

                                                                                

547472

In [19]:
# Preview up to twenty of the kept reviews.
review_5core.limit(20).toPandas()

                                                                                

Unnamed: 0,item_id,rating,timestamp,user_id
0,B000SOPLF2,5.0,1525046400,A1005AVG5ETV5F
1,B01FWLKHSG,5.0,1525046400,A1005AVG5ETV5F
2,B007GCIXQI,5.0,1525046400,A1005AVG5ETV5F
3,B007NJ1424,5.0,1525046400,A1005AVG5ETV5F
4,B001AV8JN8,5.0,1525046400,A1005AVG5ETV5F
5,B000UPQMES,5.0,1525046400,A1005AVG5ETV5F
6,B001A68V0O,5.0,1525046400,A1005AVG5ETV5F
7,B001CY0TYA,5.0,1525046400,A1005AVG5ETV5F
8,B00245YDKW,5.0,1525046400,A1005AVG5ETV5F
9,B00246C7X6,5.0,1525046400,A1005AVG5ETV5F


In [20]:
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
review_5core.createOrReplaceTempView('review_5core')

# Products from product_subset referenced by at least one kept review.
# `distinct` collapses the one-row-per-review fan-out produced by the join.
query ="""
select distinct
    tb.*
from
    review_5core ta
join
    product_subset tb
on ta.item_id=tb.item_id
"""
product_5core = spark.sql(query)

In [21]:
# Preview five of the selected products.
product_5core.limit(5).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B000QWG2RM,,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",5,"[Invisible under even the clingiest dresses, t...",,Calvin Klein Women's Seamless Thong Panty,https://www.amazon.com/dp/B000QWG2RM,https://images-na.ssl-images-amazon.com/images...
1,B0014GY2H6,Devon & Jones,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,"[Refined, feminine, lightweight warmth in 86% ...",$17.49 - $28.44,Devon & Jones Pink Everyday Cardigan Sweater D...,https://www.amazon.com/dp/B0014GY2H6,https://images-na.ssl-images-amazon.com/images...
2,B001ANMXXI,Principle Business Enterprises,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,[Terries Slip Resistant Socks not only provide...,$6.52,Terries Slip Resistant Socks XX-LARGE - DOUBLE...,https://www.amazon.com/dp/B001ANMXXI,https://images-na.ssl-images-amazon.com/images...
3,B001B16M7M,Sugar Lips,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,[The original SUGARLIPS ribbed seamless tank t...,$14.95,Sugarlips Women's Misses Original Seamless Rib...,https://www.amazon.com/dp/B001B16M7M,https://images-na.ssl-images-amazon.com/images...
4,B001D6AUPK,,"[Clothing, Shoes & Jewelry, Women, Clothing, P...",5,[Slim fit. Sits slightly below waist. Cotton/S...,,Dickies Women's Slim Fit Boot Cut Leg Twill Pa...,https://www.amazon.com/dp/B001D6AUPK,https://images-na.ssl-images-amazon.com/images...


In [22]:
# Number of products referenced by the kept reviews.
product_5core.count()

                                                                                

91157

In [23]:
# Write the selected user ids as parquet to user_path; mode='overwrite' replaces existing output there.
user_5core.write.parquet(user_path, mode='overwrite')

                                                                                

In [24]:
# Write the selected products as parquet to item_path; mode='overwrite' replaces existing output there.
product_5core.write.parquet(item_path, mode='overwrite')

                                                                                

In [25]:
# Write the kept reviews as parquet to interaction_path; mode='overwrite' replaces existing output there.
review_5core.write.parquet(interaction_path, mode='overwrite')

                                                                                

In [26]:
# List the dataset prefix again to check that the three parquet outputs are present.
!aws s3 ls s3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/

                           PRE amazon_fashion_interaction.parquet/
                           PRE amazon_fashion_item.parquet/
                           PRE amazon_fashion_user.parquet/
2022-09-08 14:11:37 3247339254 amazon_fashion_product.json
2022-09-08 14:13:55 3059298908 amazon_fashion_review.json


In [27]:
# Stop the SparkContext now that all outputs have been written.
stop_spark(spark)

Debug -- spark stop


22/09/22 15:31:32 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed (this is expected if the application is shutting down.)
