In [1]:
# List the dataset prefix on S3 (inputs plus any outputs written by earlier runs).
!aws s3 ls s3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/

                           PRE amazon_fashion_interaction.csv/
                           PRE amazon_fashion_interaction.parquet/
                           PRE amazon_fashion_interaction.small.parquet/
                           PRE amazon_fashion_item.csv/
                           PRE amazon_fashion_item.parquet/
                           PRE amazon_fashion_item.small.parquet/
                           PRE amazon_fashion_user.csv/
                           PRE amazon_fashion_user.parquet/
                           PRE amazon_fashion_user.small.parquet/
2022-09-08 14:11:37 3247339254 amazon_fashion_product.json
2022-09-08 14:13:55 3059298908 amazon_fashion_review.json


In [2]:
import os
import subprocess
import yaml
import argparse
import sys 
import importlib
import metaspore as ms
import pandas as pd
import warnings
import pyspark.sql.functions as F

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Remove pandas' row/column display limits so displayed frames are not elided.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
def init_spark(local, app_name, batch_size, worker_count, server_count,
               worker_memory, server_memory, coordinator_memory, **kwargs):
    """Zip the local ``python`` package and open a MetaSpore Spark session.

    Args:
        local: Forwarded to ``ms.spark.get_session`` as ``local``.
        app_name: Forwarded as ``app_name``.
        batch_size: Forwarded as ``batch_size``.
        worker_count: Forwarded as ``worker_count``.
        server_count: Forwarded as ``server_count``.
        worker_memory: Forwarded as ``worker_memory``.
        server_memory: Forwarded as ``server_memory``.
        coordinator_memory: Forwarded as ``coordinator_memory``.
        **kwargs: Accepted so that a larger settings dict can be splatted in.
            Extra keys (for example ``worker_cpu`` / ``server_cpu``) are
            ignored and never reach ``ms.spark.get_session``.
            NOTE(review): forward them if the session should honour CPU
            sizing -- confirm ``get_session`` accepts those names first.

    Returns:
        The session object returned by ``ms.spark.get_session``.
    """
    # Bundle ../../../python into ./python.zip; the archive is referenced
    # below through spark.submit.pyFiles.
    zip_result = subprocess.run(
        ['zip', '-r', os.getcwd() + '/python.zip', 'python'], cwd='../../../')
    if zip_result.returncode != 0:
        # Best effort: keep going, but make a failed packaging step visible
        # instead of silently discarding the exit status.
        print('Debug -- zip exited with code:', zip_result.returncode)

    spark_confs = {
        "spark.kubernetes.namespace": "wanggen",
        "spark.network.timeout": "500",
        "spark.submit.pyFiles": "python.zip",
        "spark.ui.showConsoleProgress": "true",
        "spark.kubernetes.executor.deleteOnTermination": "true",
    }
    spark = ms.spark.get_session(
        local=local,
        app_name=app_name,
        batch_size=batch_size,
        worker_count=worker_count,
        server_count=server_count,
        worker_memory=worker_memory,
        server_memory=server_memory,
        coordinator_memory=coordinator_memory,
        spark_confs=spark_confs)

    sc = spark.sparkContext
    print('Debug -- spark init')
    print('Debug -- version:', sc.version)
    print('Debug -- applicationId:', sc.applicationId)
    print('Debug -- uiWebUrl:', sc.uiWebUrl)
    return spark

def stop_spark(spark):
    """Print a debug line, then stop the SparkContext that backs ``spark``."""
    print('Debug -- spark stop')
    context = spark.sparkContext
    context.stop()

def read_dataset(spark, dataset_path):
    """Read the JSON data at ``dataset_path`` and print how many rows it has.

    Returns the DataFrame produced by ``spark.read.json``.
    """
    frame = spark.read.json(dataset_path)
    row_total = frame.count()
    # The label says "item" for every dataset this helper is used on.
    print('Debug -- item dataset count:', row_total)
    return frame

In [4]:
# Spark session settings, passed to init_spark as keyword arguments.
params = {
    'local': False,
    'app_name': 'Amazon Fashion Dataset',
    'batch_size': 512,
    'worker_count': 4,
    'worker_cpu': 4,
    'server_count': 4,
    'server_cpu': 4,
    'worker_memory': '6G',
    'server_memory': '6G',
    'coordinator_memory': '2G',
}

# Input locations.
product_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_product.json'
review_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_review.json'
item2image_path = 's3://dmetasoul-bucket/xwb/tmp/amazon_fashion_id2img.valid.tsv'
category_valid_path = 's3://dmetasoul-bucket/xwb/tmp/amazon_fashion_category_valid.parquet'

# Output locations.
user_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_user.parquet'
item_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_item.parquet'
interaction_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_interaction.parquet'

spark = init_spark(**params)

updating: python/ (stored 0%)
updating: python/algos/ (stored 0%)
updating: python/algos/deepfm_net.py (deflated 69%)
updating: python/algos/feature/ (stored 0%)
updating: python/algos/feature/woe_encoder.py (deflated 64%)
updating: python/algos/feature/.ipynb_checkpoints/ (stored 0%)
updating: python/algos/feature/.ipynb_checkpoints/target_encoder-checkpoint.py (deflated 51%)
updating: python/algos/feature/sequential_encoder.py (deflated 60%)
updating: python/algos/feature/target_encoder.py (deflated 51%)
updating: python/algos/feature/neg_sampler.py (deflated 66%)
updating: python/algos/feature/__pycache__/ (stored 0%)
updating: python/algos/feature/__pycache__/woe_encoder.cpython-38.pyc (deflated 43%)
updating: python/algos/feature/__pycache__/target_encoder.cpython-38.pyc (deflated 36%)
updating: python/algos/feature/__pycache__/__init__.cpython-38.pyc (deflated 29%)
updating: python/algos/feature/__pycache__/sequential_encoder.cpython-38.pyc (deflated 53%)
updating: python/algos/f

22/10/08 17:55:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Debug -- spark init
Debug -- version: 3.1.2
Debug -- applicaitonId: spark-application-1665222907475
Debug -- uiWebUrl: http://172.16.0.156:4040


In [5]:
# Load the product JSON (read_dataset prints the row count before
# de-duplication), then drop exact duplicate rows.
product_dataset = read_dataset(spark, product_path).distinct()



Debug -- item dataset count: 2685059


                                                                                

In [6]:
# Print the schema Spark inferred for the product JSON.
product_dataset.printSchema()

root
 |-- brand: string (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category_levels: long (nullable = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- image: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- price: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)



In [7]:
# Item-id -> image-URL table. The tab-separated file is read without a header
# option, so the auto-named columns _c0/_c1 are renamed before duplicate rows
# are dropped.
item2image_df = (
    spark.read
    .option("delimiter", "\t")
    .csv(item2image_path)
    .withColumnRenamed('_c0', 'item_id')
    .withColumnRenamed('_c1', 'image')
    .distinct()
)
item2image_df.printSchema()
item2image_df.show(20, False)

root
 |-- item_id: string (nullable = true)
 |-- image: string (nullable = true)





+----------+------------------------------------------------------------------+
|item_id   |image                                                             |
+----------+------------------------------------------------------------------+
|B00FF71V42|https://images-na.ssl-images-amazon.com/images/I/41RdM1kkkiL.jpg  |
|B00FFAAMCG|https://images-na.ssl-images-amazon.com/images/I/31cxgIJ0XJL.jpg  |
|B00FFD3NII|https://images-na.ssl-images-amazon.com/images/I/31fGtVd%2BaSL.jpg|
|B00FFE4EV2|https://images-na.ssl-images-amazon.com/images/I/41nIYM6I57L.jpg  |
|B00FFISZSQ|https://images-na.ssl-images-amazon.com/images/I/41cLwbPIXZL.jpg  |
|B00FFKF0UK|https://images-na.ssl-images-amazon.com/images/I/51gaKyg0xEL.jpg  |
|B00FFL917C|https://images-na.ssl-images-amazon.com/images/I/41DyBGtg68L.jpg  |
|B00FFM13OA|https://images-na.ssl-images-amazon.com/images/I/41rJGolhD2L.jpg  |
|B00FFQKWZC|https://images-na.ssl-images-amazon.com/images/I/31ZenUSLJbL.jpg  |
|B00FFRJRVG|https://images-na.ssl-images

                                                                                

In [8]:
# Load the category table and preview its schema and first 20 rows
# (untruncated); cell 9 filters it on valid_level and joins on category_path.
category_valid = spark.read.parquet(category_valid_path)
category_valid.printSchema()
category_valid.show(20, False)

root
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category_path: string (nullable = true)
 |-- valid_level: long (nullable = true)



[Stage 7:>                                                          (0 + 1) / 1]

+------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------+-----------+
|category                                                                                        |category_path                                                                                 |valid_level|
+------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------+-----------+
|[Clothing, Shoes & Jewelry, Luggage & Travel Gear, Umbrellas, Folding Umbrellas]                |Clothing, Shoes & Jewelry->Luggage & Travel Gear->Umbrellas->Folding Umbrellas                |3          |
|[Clothing, Shoes & Jewelry, Men, Accessories, Hats & Caps, Baseball Caps]                       |Clothing, Shoes & Jewelry->Men->Accessories->Hats & Caps->Baseball Caps       

                                                                                

In [9]:
# Swap the raw `image` field for the URL listed in item2image_df. An inner join
# keeps only items that have an entry there, which is what the previous
# left-outer join plus not-null filter amounted to.
product_dataset = product_dataset.drop('image').join(item2image_df, on='item_id', how='inner')

# Keep products with a usable image URL and a non-empty title of at most
# 300 characters. Comparisons against NULL evaluate to NULL, so rows with a
# missing title are dropped as well.
product_dataset = product_dataset.filter(
    F.col('image').isNotNull()
    & (F.col('image') != '')
    & (F.col('title') != '')
    & (F.length(F.col('title')) <= 300)
)

# Keep products whose full category path ('->'-joined) appears in
# category_valid with valid_level > 0; the helper column is dropped afterwards.
product_dataset = product_dataset.withColumn('category_path', F.concat_ws('->', 'category'))
product_dataset = category_valid.filter(F.col('valid_level') > 0).select('category_path').distinct()\
                    .join(product_dataset, on='category_path', how='inner').drop('category_path')

print(product_dataset.count())



1123952


                                                                                

In [10]:
# Load the review JSON; read_dataset also prints its row count.
review_dataset = read_dataset(spark, review_path)



Debug -- item dataset count: 32292099


                                                                                

In [11]:
import pyspark.sql.functions as F
# Restrict to products whose category array contains both 'Clothing' and
# 'Women' as exact elements.
product_subset = product_dataset.filter(
    F.array_contains(F.col("category"), 'Clothing')
    & F.array_contains(F.col('category'), 'Women')
)

In [12]:
# Preview 10 rows of the filtered product subset as a pandas frame.
product_subset.limit(10).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B000H70KQK,Nordic Fall,"[Clothing, Shoes & Jewelry, Women, Clothing, C...",5,,,Nordic Fall Women's Basic Soft Shell Jacket,https://www.amazon.com/dp/B000H70KQK,https://images-na.ssl-images-amazon.com/images...
1,B000KJ6HD0,,"[Clothing, Shoes & Jewelry, Women, Clothing, D...",5,,,Roxy Juniors Savage Dress,https://www.amazon.com/dp/B000KJ6HD0,https://images-na.ssl-images-amazon.com/images...
2,B000OMWJXU,,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,"[Brand New, Spanx, Higher Power. Just how high...",$25.00 - $49.99,SPANX Higher Power New & Slimproved,https://www.amazon.com/dp/B000OMWJXU,https://images-na.ssl-images-amazon.com/images...
3,B000TTTK94,,"[Clothing, Shoes & Jewelry, Women, Clothing, C...",5,"[This waterproof, insulated jacket has slimmin...",,Columbia Sportswear Women's Rosella Ridge Jacket,https://www.amazon.com/dp/B000TTTK94,https://images-na.ssl-images-amazon.com/images...
4,B000UB3JHK,,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,[Dickies women's short sleeve stretch oxford s...,,Dickies Women's Short Sleeve Stretch Oxford Shirt,https://www.amazon.com/dp/B000UB3JHK,https://images-na.ssl-images-amazon.com/images...
5,B001C46ND6,Best Friends Animal Society,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,,,Best Friends Animal Society Dog Adoption T-Shi...,https://www.amazon.com/dp/B001C46ND6,https://images-na.ssl-images-amazon.com/images...
6,B001D77GI8,MICHAEL Michael Kors,"[Clothing, Shoes & Jewelry, Women, Clothing, C...",5,"[Michael Kors is renowned for his classic, chi...",,MICHAEL Michael Kors Women's Single Breasted ...,https://www.amazon.com/dp/B001D77GI8,https://images-na.ssl-images-amazon.com/images...
7,B001DKOP9S,,"[Clothing, Shoes & Jewelry, Women, Clothing, C...",5,[This shapely style features a slimming silhou...,,Columbia Sportswear Women's Giverny Frost Jacket,https://www.amazon.com/dp/B001DKOP9S,https://images-na.ssl-images-amazon.com/images...
8,B001EHDBQS,,"[Clothing, Shoes & Jewelry, Women, Clothing, C...",5,[Elegant but not pretentious. Polished and per...,,Jones New York Women's Petite Single Breasted ...,https://www.amazon.com/dp/B001EHDBQS,https://images-na.ssl-images-amazon.com/images...
9,B001EZX7NM,,"[Clothing, Shoes & Jewelry, Women, Clothing, C...",5,[Shell is made of 100% cotton two-toned canvas...,,Woolrich Women's West Brook Jacket,https://www.amazon.com/dp/B001EZX7NM,https://images-na.ssl-images-amazon.com/images...


In [13]:
# Preview 10 review rows (item_id, rating, timestamp, user_id).
review_dataset.limit(10).toPandas()

Unnamed: 0,item_id,rating,timestamp,user_id
0,B001AESJOE,5.0,1370649600,A2ACWR0YQ2E1JV
1,B001AESJOE,5.0,1370390400,A3ARXNAGFEYM0Z
2,B001AESJOE,5.0,1369785600,A30FZXZRWL6IHX
3,B001AESJOE,5.0,1369440000,A77HIRDFBD81R
4,B001AESJOE,5.0,1368576000,A5V79SDA58MYC
5,B001AESJOE,5.0,1368489600,AK1J8IT6PP47E
6,B001AESJOE,5.0,1367020800,AXJWP32O6WRJN
7,B001AESJOE,5.0,1366416000,A6QKY66UJXD0S
8,B001AESJOE,5.0,1365465600,AS1OQJ4PF3DLW
9,B001AESJOE,3.0,1365379200,A23PMJDW2K6W2G


In [14]:
# Show the DataFrame repr, i.e. its column names and types.
review_dataset

DataFrame[item_id: string, rating: double, timestamp: bigint, user_id: string]

In [15]:
# Expose both frames to Spark SQL. createOrReplaceTempView is the supported
# replacement for registerTempTable, which has been deprecated since Spark 2.0.
review_dataset.createOrReplaceTempView('review_dataset')
product_subset.createOrReplaceTempView('product_subset')

# Users with at least 20 distinct (item_id, timestamp) reviews on products in
# product_subset. NOTE: despite the "5core" name, the threshold applied is 20.
query ="""
select 
    ta.user_id
from
(
    select
        ta.user_id,
        count(distinct ta.item_id, ta.timestamp) as review_count
    from
        review_dataset ta
    join
        product_subset tb
    on 
        ta.item_id=tb.item_id
    group by ta.user_id
) ta
where ta.review_count >= 20
"""
user_5core = spark.sql(query)

In [16]:
# Number of users that passed the review_count >= 20 filter.
user_5core.count()

                                                                                

1212

In [17]:
# Preview up to 20 of the selected user ids.
user_5core.limit(20).toPandas()

                                                                                

Unnamed: 0,user_id
0,A1N0KPRDV8M95
1,A165P3MOJV3OVZ
2,A8KBJDU553RF2
3,A6H4H1E3OIIOT
4,A3HWECWIGPTN67
5,A159IR0ZIDS388
6,A2LFM653MMEC9L
7,A1RXX1LVOQFH3I
8,A2IVKM8ZHFN7BE
9,A3EBPHKGHMBPPI


In [18]:
# createOrReplaceTempView replaces the deprecated registerTempTable.
user_5core.createOrReplaceTempView('user_5core')

# All distinct review rows written by the selected users. The join is against
# the full review_dataset, so reviews of items outside product_subset are
# included as well.
query ="""
select distinct
    tb.*
from
    user_5core ta
join
    review_dataset tb
on ta.user_id=tb.user_id
"""
review_5core = spark.sql(query)

In [19]:
# Number of distinct review rows belonging to the selected users.
review_5core.count()

                                                                                

105155

In [20]:
# Preview up to 20 of the selected review rows.
review_5core.limit(20).toPandas()

                                                                                

Unnamed: 0,item_id,rating,timestamp,user_id
0,B00DXTYH7W,4.0,1442016000,A3KRRVHYFDDU59
1,B00EFFXG5W,5.0,1473897600,A3KRRVHYFDDU59
2,B00IIKSOFM,3.0,1449100800,A3KRRVHYFDDU59
3,B00ISBD1FO,5.0,1436400000,A3KRRVHYFDDU59
4,B00J5AEKT8,5.0,1473292800,A3KRRVHYFDDU59
5,B00JGI9UDK,5.0,1434931200,A3KRRVHYFDDU59
6,B00LZW04N0,5.0,1473292800,A3KRRVHYFDDU59
7,B00M5EUB4E,4.0,1444003200,A3KRRVHYFDDU59
8,B00NG4BXBM,5.0,1477958400,A3KRRVHYFDDU59
9,B00M3ZJWR2,5.0,1473292800,A3KRRVHYFDDU59


In [21]:
# createOrReplaceTempView replaces the deprecated registerTempTable.
review_5core.createOrReplaceTempView('review_5core')

# Distinct products from product_subset that are referenced by at least one
# of the selected reviews.
query ="""
select distinct
    tb.*
from
    review_5core ta
join
    product_subset tb
on ta.item_id=tb.item_id
"""
product_5core = spark.sql(query)

In [22]:
# Preview 5 of the selected products.
product_5core.limit(5).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B0069J1Y2Y,,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,,$21.59 - $26.99,PattyBoutik Women's Off Shoulder Long Sleeve Top,https://www.amazon.com/dp/B0069J1Y2Y,https://images-na.ssl-images-amazon.com/images...
1,B006MQR4QY,,"[Clothing, Shoes & Jewelry, Women, Clothing, C...",5,,,AK Anne Klein Women's Ruffle Trench Coat,https://www.amazon.com/dp/B006MQR4QY,https://images-na.ssl-images-amazon.com/images...
2,B006UCPQAQ,,"[Clothing, Shoes & Jewelry, Women, Clothing, D...",5,,,"HE02713BK10, Black, 8US, Ever Pretty Chic Lace...",https://www.amazon.com/dp/B006UCPQAQ,https://images-na.ssl-images-amazon.com/images...
3,B0078LJ5KU,,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,,,XOXO Juniors Smock Bottom Embellished Printed ...,https://www.amazon.com/dp/B0078LJ5KU,https://images-na.ssl-images-amazon.com/images...
4,B007P5S2JY,,"[Clothing, Shoes & Jewelry, Women, Clothing, P...",5,[Self fabric waistband with smooth interior el...,,PUMA Women's Velour Pant,https://www.amazon.com/dp/B007P5S2JY,https://images-na.ssl-images-amazon.com/images...


In [23]:
# Number of distinct products referenced by the selected reviews.
product_5core.count()

                                                                                

16883

In [24]:
# Disabled experiment: the whole cell is one string literal, so nothing below
# runs; the cell merely evaluates to (and displays) that string. The text
# sketches a UDF that keeps only products whose image URL has an image MIME
# type and answers an HTTP request.
# NOTE(review): the inner docstring says "between 200-300" while the code
# checks `range(200, 209)` -- reconcile the two if this is ever re-enabled.
'''
import mimetypes, urllib
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf

def is_url_image(url):    
    mimetype,encoding = mimetypes.guess_type(url)
    return (mimetype and mimetype.startswith('image'))

def check_url(url):
    """Returns True if the url returns a response code between 200-300,
       otherwise return False.
    """
    try:
        headers = {
            "Range": "bytes=0-10",
            "User-Agent": "MyTestAgent",
            "Accept": "*/*"
        }

        req = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(req)
        print('response.code: ', response.code)
        return response.code in range(200, 209)
    except Exception as e:
        print('exception: ', e)
        return False

is_image_and_ready_udf = udf(lambda image: is_url_image(image) and check_url(image), BooleanType())


review_5core = review_5core.limit(10000)

product_5core = review_5core.select('item_id').distinct().join(product_5core, on='item_id', how='inner')
product_5core = product_5core.filter(is_image_and_ready_udf('image'))
product_5core.cache()

review_5core = product_5core.select('item_id').distinct().join(review_5core, on='item_id', how='inner')
review_5core.cache()

user_5core = review_5core.select('user_id').distinct().join(user_5core, on='user_id', how='inner')
user_5core.cache()
'''

'\nimport mimetypes, urllib\nfrom pyspark.sql.types import BooleanType\nfrom pyspark.sql.functions import udf\n\ndef is_url_image(url):    \n    mimetype,encoding = mimetypes.guess_type(url)\n    return (mimetype and mimetype.startswith(\'image\'))\n\ndef check_url(url):\n    """Returns True if the url returns a response code between 200-300,\n       otherwise return False.\n    """\n    try:\n        headers = {\n            "Range": "bytes=0-10",\n            "User-Agent": "MyTestAgent",\n            "Accept": "*/*"\n        }\n\n        req = urllib.request.Request(url, headers=headers)\n        response = urllib.request.urlopen(req)\n        print(\'response.code: \', response.code)\n        return response.code in range(200, 209)\n    except Exception as e:\n        print(\'exception: \', e)\n        return False\n\nis_image_and_ready_udf = udf(lambda image: is_url_image(image) and check_url(image), BooleanType())\n\n\nreview_5core = review_5core.limit(10000)\n\nproduct_5core = re

In [25]:
# review_5core = review_5core.orderBy(F.rand()).limit(10000)
# Shrink the interaction set to 3000 rows and cache the result.
# NOTE(review): limit() is applied without an ordering, so which 3000 rows are
# kept is presumably arbitrary and may differ between runs -- confirm.
review_5core = review_5core.limit(3000)
review_5core.cache()

# Keep only the users that still appear in the reduced interaction set.
user_5core = review_5core.select('user_id').distinct().join(user_5core, on='user_id', how='inner')
user_5core.cache()

# Keep only the products that still appear in the reduced interaction set.
# cache() is the cell's last expression, so its DataFrame repr is displayed.
product_5core = review_5core.select('item_id').distinct().join(product_5core, on='item_id', how='inner')
product_5core.cache()

DataFrame[item_id: string, brand: string, category: array<string>, category_levels: bigint, description: array<string>, price: string, title: string, url: string, image: string]

In [26]:
from pyspark.sql.types import ArrayType

def concat_list_field(dataset, array_join_sep=u'\u0001'):
    """Flatten every array-typed column of ``dataset`` into a single string.

    Each column whose schema type is ``ArrayType`` is replaced, under the same
    name, by its elements joined with ``array_join_sep`` via ``F.concat_ws``.
    All other columns are left untouched. Returns the resulting DataFrame.
    """
    # Collect the target names from the incoming schema before rewriting anything.
    names = []
    for field in dataset.schema.fields:
        if isinstance(field.dataType, ArrayType):
            names.append(field.name)

    flattened = dataset
    for name in names:
        joined = F.concat_ws(array_join_sep, F.col(name))
        flattened = flattened.withColumn(name, joined)
    return flattened

# Flatten array columns in all three output frames (array elements are joined
# with concat_list_field's default separator).
review_5core, user_5core, product_5core = (
    concat_list_field(frame)
    for frame in (review_5core, user_5core, product_5core)
)

In [27]:
# Row counts of the three frames after down-sampling; each count() runs a Spark job.
print('review_5core: ', review_5core.count())
print('user_5core: ', user_5core.count())
print('product_5core: ', product_5core.count())

                                                                                

review_5core:  3000


                                                                                

user_5core:  38




product_5core:  928


                                                                                

In [28]:
# Point the output paths at the ".small" variants; this overrides the values
# assigned to the same names in the configuration cell.
user_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_user.small.parquet'
item_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_item.small.parquet'
interaction_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_interaction.small.parquet'

In [29]:
# Inspect up to 100 products after array columns were flattened to strings.
product_5core.limit(100).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B00HGT81WW,Dinamit Jeans,"Clothing, Shoes & JewelryWomenClothingJeans",4,,$9.99 - $14.99,Dinamit Jeans Retro Flower Printed Skinny Pants,https://www.amazon.com/dp/B00HGT81WW,https://images-na.ssl-images-amazon.com/images...
1,B00V4GB1IE,Urban CoCo,"Clothing, Shoes & JewelryWomenClothingTops,...",5,"Size chart XS;Bust 33"" --Sleeve 9.4"" --Length...",$9.99 - $15.95,Women's Vogue Shoulder Off Wide Hem Design Top...,https://www.amazon.com/dp/B00V4GB1IE,https://images-na.ssl-images-amazon.com/images...
2,B0101114BG,VIRGIN ONLY,"Clothing, Shoes & JewelryWomenClothingPants...",5,There is nothing sexier and more flattering th...,$19.99 - $23.99,VIRGIN ONLY Women's Light Blue Tencel Pants,https://www.amazon.com/dp/B0101114BG,https://images-na.ssl-images-amazon.com/images...
3,B011DU3MU0,Meaneor,"Clothing, Shoes & JewelryWomenClothingSweat...",5,Measurements: 1. Use similar clothing to compa...,$12.99 - $18.99,Meaneor Women's Long Sleeve Waterfall Asymmetr...,https://www.amazon.com/dp/B011DU3MU0,https://images-na.ssl-images-amazon.com/images...
4,B0177VIGCS,Zeagoo,"Clothing, Shoes & JewelryWomenClothingTops,...",5,,$4.49 - $24.49,Zeagoo Women's Casual Floral Print Short Sleev...,https://www.amazon.com/dp/B0177VIGCS,https://images-na.ssl-images-amazon.com/images...
5,B019DLEIHC,Graceful.u,"Clothing, Shoes & JewelryWomenClothingDress...",5,Graceful.u is a registered brand and sold only...,,Graceful.u Round Neck Cap Sleeve Lace Spliced ...,https://www.amazon.com/dp/B019DLEIHC,https://images-na.ssl-images-amazon.com/images...
6,B01C8PLLSO,Jiqiuguer,"Clothing, Shoes & JewelryWomenClothingSweat...",5,,$65.00 - $66.99,Jiqiuguer Women's Lightweight Linen Jacket 3/4...,https://www.amazon.com/dp/B01C8PLLSO,https://images-na.ssl-images-amazon.com/images...
7,B00P8VODO0,v28,"Clothing, Shoes & JewelryWomenClothingSocks...",5,,$9.99,V28 Women Juniors 80s Eighty's Ribbed Leg Warm...,https://www.amazon.com/dp/B00P8VODO0,https://images-na.ssl-images-amazon.com/images...
8,B00PVWWNFC,Lovaru,"Clothing, Shoes & JewelryWomenClothingTops,...",5,,,Adogirl Women's Long-sleeve Chiffon Fashion Sl...,https://www.amazon.com/dp/B00PVWWNFC,https://images-na.ssl-images-amazon.com/images...
9,B00VU1KRA6,,"Clothing, Shoes & JewelryWomenClothingTops,...",5,Tank top features a tank fit with a flowing bo...,$16.99,Sakkas Rachel Verigated Embroidered Neck Picot...,https://www.amazon.com/dp/B00VU1KRA6,https://images-na.ssl-images-amazon.com/images...


In [30]:
# user_5core.repartition(1).write.option("header", True).csv(user_path, mode='overwrite')
# Write the user table to user_path as Parquet from a single partition,
# overwriting any existing output.
user_5core.repartition(1).write.parquet(user_path, mode='overwrite')

                                                                                

In [31]:
# product_5core.repartition(1).write.option("header", True).csv(item_path, mode='overwrite')
# Write the item table to item_path as Parquet from a single partition,
# overwriting any existing output.
product_5core.repartition(1).write.parquet(item_path, mode='overwrite')

                                                                                

In [32]:
# review_5core.repartition(1).write.option("header", True).csv(interaction_path, mode='overwrite')
# Write the interaction table to interaction_path as Parquet from a single
# partition, overwriting any existing output.
review_5core.repartition(1).write.parquet(interaction_path, mode='overwrite')

                                                                                

In [33]:
# List the dataset prefix again to check the outputs after the writes.
!aws s3 ls s3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/

                           PRE amazon_fashion_interaction.csv/
                           PRE amazon_fashion_interaction.parquet/
                           PRE amazon_fashion_interaction.small.parquet/
                           PRE amazon_fashion_item.csv/
                           PRE amazon_fashion_item.parquet/
                           PRE amazon_fashion_item.small.parquet/
                           PRE amazon_fashion_user.csv/
                           PRE amazon_fashion_user.parquet/
                           PRE amazon_fashion_user.small.parquet/
2022-09-08 14:11:37 3247339254 amazon_fashion_product.json
2022-09-08 14:13:55 3059298908 amazon_fashion_review.json


In [34]:
# Shut down the SparkContext now that all outputs are written.
stop_spark(spark)

Debug -- spark stop


22/10/08 18:05:01 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed (this is expected if the application is shutting down.)
