In [1]:
!aws s3 ls s3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/

                           PRE amazon_fashion_interaction.csv/
                           PRE amazon_fashion_interaction.parquet/
                           PRE amazon_fashion_interaction.small.parquet/
                           PRE amazon_fashion_item.csv/
                           PRE amazon_fashion_item.parquet/
                           PRE amazon_fashion_item.small.parquet/
                           PRE amazon_fashion_user.csv/
                           PRE amazon_fashion_user.parquet/
                           PRE amazon_fashion_user.small.parquet/
2022-09-08 14:11:37 3247339254 amazon_fashion_product.json
2022-09-08 14:13:55 3059298908 amazon_fashion_review.json


In [2]:
import os
import subprocess
import yaml
import argparse
import sys 
import importlib
import metaspore as ms
import pandas as pd
import warnings
import pyspark.sql.functions as F

# Silence every Python warning for the rest of the session (deprecation notices included).
warnings.filterwarnings('ignore')
# Lift pandas' display limits so previewed frames show all rows and all columns.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [3]:
def init_spark(local, app_name, batch_size, worker_count, server_count,
               worker_memory, server_memory, coordinator_memory, **kwargs):
    """Zip the project's ``python`` directory and open a MetaSpore Spark session.

    The ``python`` directory located three levels above the working directory
    is zipped into ``<cwd>/python.zip``; the same archive name is handed to
    Spark through ``spark.submit.pyFiles``.

    Args:
        local, app_name, batch_size, worker_count, server_count,
        worker_memory, server_memory, coordinator_memory: forwarded unchanged
            to ``ms.spark.get_session``.
        **kwargs: accepted and ignored, so a wider settings dict can be
            splatted into this function.
            NOTE(review): the notebook's ``worker_cpu`` / ``server_cpu``
            settings land here and are never forwarded - confirm whether
            that is intended.

    Returns:
        The session object returned by ``ms.spark.get_session``.
    """
    archive_path = os.getcwd() + '/python.zip'
    # The exit status of zip is not inspected; a failed zip does not stop the cell.
    subprocess.run(['zip', '-r', archive_path, 'python'], cwd='../../../')

    session_options = dict(
        local=local,
        app_name=app_name,
        batch_size=batch_size,
        worker_count=worker_count,
        server_count=server_count,
        worker_memory=worker_memory,
        server_memory=server_memory,
        coordinator_memory=coordinator_memory,
    )
    session_options['spark_confs'] = {
        "spark.kubernetes.namespace": "wanggen",
        "spark.network.timeout": "500",
        "spark.submit.pyFiles": "python.zip",
        "spark.ui.showConsoleProgress": "true",
        "spark.kubernetes.executor.deleteOnTermination": "true",
    }
    spark = ms.spark.get_session(**session_options)

    # Echo a few identifying facts about the running application.
    context = spark.sparkContext
    print('Debug -- spark init')
    print('Debug -- version:', context.version)
    print('Debug -- applicaitonId:', context.applicationId)
    print('Debug -- uiWebUrl:', context.uiWebUrl)
    return spark

def stop_spark(spark):
    """Print a debug line, then stop the SparkContext attached to ``spark``."""
    print('Debug -- spark stop')
    context = spark.sparkContext
    context.stop()

def read_dataset(spark, dataset_path, dataset_name='item'):
    """Read a JSON dataset through ``spark`` and print its row count.

    Args:
        spark: object exposing ``read.json(path)`` (the Spark session).
        dataset_path: location handed to ``spark.read.json``.
        dataset_name: label used in the debug line. It defaults to ``'item'``
            so two-argument calls print exactly what they printed before;
            pass e.g. ``'review'`` when loading a different table.

    Returns:
        Whatever ``spark.read.json(dataset_path)`` returned.
    """
    dataset = spark.read.json(dataset_path)
    # count() is evaluated here purely for the debug line.
    print('Debug -- %s dataset count:' % dataset_name, dataset.count())
    return dataset

In [4]:
# Spark session settings, splatted into init_spark() at the end of this cell.
# worker_cpu / server_cpu are not named parameters of init_spark, so they are
# absorbed by its **kwargs.
params = {
    'local': False,
    'app_name': 'Amazon Fashion Dataset',
    'batch_size': 512,
    'worker_count': 4,
    'worker_cpu': 4,
    'server_count': 4,
    'server_cpu': 4,
    'worker_memory': '6G',
    'server_memory': '6G',
    'coordinator_memory': '2G',
}

# Inputs: raw product / review JSON dumps plus two helper tables.
product_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_product.json'
review_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_review.json'
item2image = 's3://dmetasoul-bucket/xwb/tmp/amazon_fashion_id2img.tsv'
category_valid_path = 's3://dmetasoul-bucket/xwb/tmp/amazon_fashion_category_valid.parquet'

# Outputs: one parquet location per table (user / item / interaction).
user_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_user.parquet'
item_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_item.parquet'
interaction_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_interaction.parquet'

# Start the session used by every later cell.
spark = init_spark(**params)

updating: python/ (stored 0%)
updating: python/algos/ (stored 0%)
updating: python/algos/deepfm_net.py (deflated 69%)
updating: python/algos/feature/ (stored 0%)
updating: python/algos/feature/woe_encoder.py (deflated 64%)
updating: python/algos/feature/.ipynb_checkpoints/ (stored 0%)
updating: python/algos/feature/.ipynb_checkpoints/target_encoder-checkpoint.py (deflated 51%)
updating: python/algos/feature/sequential_encoder.py (deflated 60%)
updating: python/algos/feature/target_encoder.py (deflated 51%)
updating: python/algos/feature/neg_sampler.py (deflated 66%)
updating: python/algos/feature/__pycache__/ (stored 0%)
updating: python/algos/feature/__pycache__/woe_encoder.cpython-38.pyc (deflated 43%)
updating: python/algos/feature/__pycache__/target_encoder.cpython-38.pyc (deflated 36%)
updating: python/algos/feature/__pycache__/__init__.cpython-38.pyc (deflated 29%)
updating: python/algos/feature/__pycache__/sequential_encoder.cpython-38.pyc (deflated 53%)
updating: python/algos/f

22/10/08 17:01:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Debug -- spark init
Debug -- version: 3.1.2
Debug -- applicaitonId: spark-application-1665219682319
Debug -- uiWebUrl: http://172.16.0.156:4040


In [5]:
# Load the raw product JSON (read_dataset prints the row count as read, i.e.
# before de-duplication), then drop rows that are exact duplicates.
product_dataset = read_dataset(spark, product_path).distinct()



Debug -- item dataset count: 2685059


                                                                                

In [6]:
# Print the column names and types of the product frame.
product_dataset.printSchema()

root
 |-- brand: string (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category_levels: long (nullable = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- image: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- price: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)



In [7]:
# Load the item_id -> image URL table: tab-separated, read without a header,
# so Spark names the columns _c0 / _c1 and they are renamed here.
# The location comes from `item2image`, declared with the other input paths,
# instead of repeating the same literal in this cell.
item2image_df = (
    spark.read
    .option("delimiter", "\t")
    .csv(item2image)
    .withColumnRenamed('_c0', 'item_id')
    .withColumnRenamed('_c1', 'image')
    .distinct()  # drops exact duplicate (item_id, image) rows only
)
item2image_df.printSchema()
item2image_df.show(20, False)

root
 |-- item_id: string (nullable = true)
 |-- image: string (nullable = true)





+----------+--------------------------------------------------------------------+
|item_id   |image                                                               |
+----------+--------------------------------------------------------------------+
|B00LCB5ILM|https://images-na.ssl-images-amazon.com/images/I/31gV8c3LKpL.jpg    |
|B00LCBA35S|https://images-na.ssl-images-amazon.com/images/I/31NlMQY0LzL.jpg    |
|B00LCC3GP6|null                                                                |
|B00LCM9ZOW|null                                                                |
|B00LCRRVG6|https://images-na.ssl-images-amazon.com/images/I/31XtVXufKgL.jpg    |
|B00LCS2IXQ|https://images-na.ssl-images-amazon.com/images/I/31PSBrGYulL.jpg    |
|B00LCV9S2C|https://images-na.ssl-images-amazon.com/images/I/41APRaXYmgL.jpg    |
|B00LCVK0QK|null                                                                |
|B00LCWCKQ2|https://images-na.ssl-images-amazon.com/images/I/51vK%2BuVBM9L.jpg  |
|B00LCWJSG2|http

                                                                                

In [8]:
# Category whitelist table: a category array, its '->'-joined category_path
# and a numeric valid_level per row.
category_valid = spark.read.format('parquet').load(category_valid_path)
category_valid.printSchema()
category_valid.show(n=20, truncate=False)

root
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category_path: string (nullable = true)
 |-- valid_level: long (nullable = true)

+------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------+-----------+
|category                                                                                        |category_path                                                                                 |valid_level|
+------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------+-----------+
|[Clothing, Shoes & Jewelry, Luggage & Travel Gear, Umbrellas, Folding Umbrellas]                |Clothing, Shoes & Jewelry->Luggage & Travel Gear->Umbrellas->Folding Umbrellas                |3         

                                                                                

In [9]:
# Replace the image column that came with the product JSON by the URL from
# the id -> image table, then keep only rows that are usable downstream.
# NOTE(review): if item2image_df holds several URLs for one item_id, this join
# yields one product row per URL - confirm the mapping is unique per item.
product_dataset = (
    product_dataset
    .drop('image')
    .join(item2image_df, on='item_id', how='leftouter')
    # A product must have a non-empty image URL ...
    .filter(F.col("image").isNotNull())
    .filter(F.col("image") != '')
    # ... and a non-empty title of at most 300 characters.
    .filter(F.col("title") != '')
    .filter(F.length(F.col("title")) <= 300)
    # Same '->'-joined form as category_valid.category_path, used as join key below.
    .withColumn('category_path', F.concat_ws('->', 'category'))
)

# Keep only products whose category path is whitelisted (valid_level > 0);
# the helper key is dropped again afterwards.
product_dataset = (
    category_valid
    .filter(F.col('valid_level') > 0)
    .select('category_path')
    .distinct()
    .join(product_dataset, on='category_path', how='inner')
    .drop('category_path')
)

# Only the row count is materialised here. A mid-cell `.limit(20).toPandas()`
# would run a Spark job whose result is neither assigned nor displayed.
print(product_dataset.count())



1263769


                                                                                

In [10]:
# Load the raw review JSON. The debug line this call prints is labelled
# "item" even though the rows are reviews.
review_dataset = read_dataset(spark, review_path)



Debug -- item dataset count: 32292099


                                                                                

In [11]:
import pyspark.sql.functions as F  # same alias as in the imports cell

# Keep the products whose category array contains both 'Clothing' and 'Women'.
product_subset = product_dataset.filter(
    F.array_contains('category', 'Clothing') & F.array_contains('category', 'Women')
)

In [12]:
# Preview ten rows of the subset as a pandas frame.
product_subset.limit(10).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B0001GSYVA,FALKE,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,[Incredibly soft sock made with skin-friendly ...,$15.75 - $24.00,Falke Women's Sensitive London Sock,https://www.amazon.com/dp/B0001GSYVA,https://images-na.ssl-images-amazon.com/images...
1,B000G0P2I4,,"[Clothing, Shoes & Jewelry, Women, Clothing, J...",5,[Lee Relaxed Fit jeans are cut to fit your nat...,,Lee Women's Relaxed Fit Straight Leg Jean,https://www.amazon.com/dp/B000G0P2I4,https://images-na.ssl-images-amazon.com/images...
2,B000I38P74,Maison Jules,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,,$39.99,Maison Jules Women's Polka Dot Crew-Neck Pullo...,https://www.amazon.com/dp/B000I38P74,https://images-na.ssl-images-amazon.com/images...
3,B000WASW3K,,"[Clothing, Shoes & Jewelry, Women, Clothing, J...",4,"[From YMI wannabettabutt line, say hello to th...",,YMI Women's Wannabettabutt Heave Stitch Triple...,https://www.amazon.com/dp/B000WASW3K,https://images-na.ssl-images-amazon.com/images...
4,B0010ESQXS,Cashmere Boutique,"[Clothing, Shoes & Jewelry, Women, Clothing, C...",5,"[Luxurious, soft and really warm, this pea coa...",$229.00,Women's Cashmere Pea Coat,https://www.amazon.com/dp/B0010ESQXS,https://images-na.ssl-images-amazon.com/images...
5,B0014GY2H6,Devon & Jones,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,"[Refined, feminine, lightweight warmth in 86% ...",$17.49 - $28.44,Devon & Jones Pink Everyday Cardigan Sweater D...,https://www.amazon.com/dp/B0014GY2H6,https://images-na.ssl-images-amazon.com/images...
6,B0016SJP9W,Bigmansland,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,,$21.99,"Bigmansland Ladies Short Sleeve Easy Care, Soi...",https://www.amazon.com/dp/B0016SJP9W,https://images-na.ssl-images-amazon.com/images...
7,B001ANMXXI,Principle Business Enterprises,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,[Terries Slip Resistant Socks not only provide...,$6.52,Terries Slip Resistant Socks XX-LARGE - DOUBLE...,https://www.amazon.com/dp/B001ANMXXI,https://images-na.ssl-images-amazon.com/images...
8,B001B16M7M,Sugar Lips,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,[The original SUGARLIPS ribbed seamless tank t...,$14.95,Sugarlips Women's Misses Original Seamless Rib...,https://www.amazon.com/dp/B001B16M7M,https://images-na.ssl-images-amazon.com/images...
9,B001CL8JXQ,,"[Clothing, Shoes & Jewelry, Women, Clothing, C...",5,,,AK Anne Klein Women's Single Breasted Swing Coat,https://www.amazon.com/dp/B001CL8JXQ,https://images-na.ssl-images-amazon.com/images...


In [13]:
# Preview ten raw review rows as a pandas frame.
review_dataset.limit(10).toPandas()

Unnamed: 0,item_id,rating,timestamp,user_id
0,B001AESJOE,5.0,1370649600,A2ACWR0YQ2E1JV
1,B001AESJOE,5.0,1370390400,A3ARXNAGFEYM0Z
2,B001AESJOE,5.0,1369785600,A30FZXZRWL6IHX
3,B001AESJOE,5.0,1369440000,A77HIRDFBD81R
4,B001AESJOE,5.0,1368576000,A5V79SDA58MYC
5,B001AESJOE,5.0,1368489600,AK1J8IT6PP47E
6,B001AESJOE,5.0,1367020800,AXJWP32O6WRJN
7,B001AESJOE,5.0,1366416000,A6QKY66UJXD0S
8,B001AESJOE,5.0,1365465600,AS1OQJ4PF3DLW
9,B001AESJOE,3.0,1365379200,A23PMJDW2K6W2G


In [14]:
# Bare expression: the cell output is the DataFrame's repr (column names and types).
review_dataset

DataFrame[item_id: string, rating: double, timestamp: bigint, user_id: string]

In [15]:
# Expose both frames to Spark SQL under the names used in the query.
# createOrReplaceTempView replaces the deprecated registerTempTable.
review_dataset.createOrReplaceTempView('review_dataset')
product_subset.createOrReplaceTempView('product_subset')

# Users with at least 20 distinct (item_id, timestamp) reviews on items that
# belong to product_subset.
# NOTE(review): the result is named user_5core although the threshold is 20.
query ="""
select 
    ta.user_id
from
(
    select
        ta.user_id,
        count(distinct ta.item_id, ta.timestamp) as review_count
    from
        review_dataset ta
    join
        product_subset tb
    on 
        ta.item_id=tb.item_id
    group by ta.user_id
) ta
where ta.review_count >= 20
"""
user_5core = spark.sql(query)

In [16]:
# Number of users that passed the review-count threshold.
user_5core.count()

                                                                                

2304

In [17]:
# Preview up to twenty of the selected user ids.
user_5core.limit(20).toPandas()

                                                                                

Unnamed: 0,user_id
0,AENSGPP7TSU4V
1,ARABNSS9CV3SR
2,A8NQVRADRP408
3,A3DVLJYY2MWCF
4,A111X4MKU6RAM8
5,A3KRRVHYFDDU59
6,A37DS11GFDJEVL
7,A165P3MOJV3OVZ
8,A125XVN95UDZWO
9,A1N0KPRDV8M95


In [18]:
# createOrReplaceTempView replaces the deprecated registerTempTable.
user_5core.createOrReplaceTempView('user_5core')

# Every distinct review row written by a selected user. The join is against
# the full review_dataset, so reviews of items outside product_subset are
# retained as well.
query ="""
select distinct
    tb.*
from
    user_5core ta
join
    review_dataset tb
on ta.user_id=tb.user_id
"""
review_5core = spark.sql(query)

In [19]:
# Number of review rows belonging to the selected users.
review_5core.count()

                                                                                

185134

In [20]:
# Preview up to twenty of the selected review rows.
review_5core.limit(20).toPandas()

                                                                                

Unnamed: 0,item_id,rating,timestamp,user_id
0,B00NWGRB4W,1.0,1416441600,A111X4MKU6RAM8
1,B00CPQN4RI,1.0,1453420800,A111X4MKU6RAM8
2,B00CI36ATM,1.0,1389830400,A111X4MKU6RAM8
3,B00GAI69LU,4.0,1389830400,A111X4MKU6RAM8
4,B001CVVRT4,5.0,1464220800,A111X4MKU6RAM8
5,B01167VMXE,5.0,1481587200,A111X4MKU6RAM8
6,B012OURN9E,5.0,1470096000,A111X4MKU6RAM8
7,B00C5KIVHC,1.0,1483056000,A111X4MKU6RAM8
8,B00EKD4LIA,1.0,1396569600,A111X4MKU6RAM8
9,B00ESFANUU,3.0,1460419200,A111X4MKU6RAM8


In [21]:
# createOrReplaceTempView replaces the deprecated registerTempTable.
review_5core.createOrReplaceTempView('review_5core')

# Distinct product_subset rows for the items that occur in review_5core.
query ="""
select distinct
    tb.*
from
    review_5core ta
join
    product_subset tb
on ta.item_id=tb.item_id
"""
product_5core = spark.sql(query)

In [22]:
# Preview five of the selected product rows.
product_5core.limit(5).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B0014GY2H6,Devon & Jones,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,"[Refined, feminine, lightweight warmth in 86% ...",$17.49 - $28.44,Devon & Jones Pink Everyday Cardigan Sweater D...,https://www.amazon.com/dp/B0014GY2H6,https://images-na.ssl-images-amazon.com/images...
1,B001B16M7M,Sugar Lips,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,[The original SUGARLIPS ribbed seamless tank t...,$14.95,Sugarlips Women's Misses Original Seamless Rib...,https://www.amazon.com/dp/B001B16M7M,https://images-na.ssl-images-amazon.com/images...
2,B001D6AUPK,,"[Clothing, Shoes & Jewelry, Women, Clothing, P...",5,[Slim fit. Sits slightly below waist. Cotton/S...,,Dickies Women's Slim Fit Boot Cut Leg Twill Pa...,https://www.amazon.com/dp/B001D6AUPK,https://images-na.ssl-images-amazon.com/images...
3,B002A3GR6G,FineBrandShop,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,,,Thermo T-Shirt Thermal Grey 29 inches long.,https://www.amazon.com/dp/B002A3GR6G,https://images-na.ssl-images-amazon.com/images...
4,B004MW3S6A,,"[Clothing, Shoes & Jewelry, Women, Clothing, J...",4,[Miraclebody jeans signature samantha bootcut ...,$49.49,Miraclebody by Miraclesuit Women's Samantha Bo...,https://www.amazon.com/dp/B004MW3S6A,https://images-na.ssl-images-amazon.com/images...


In [23]:
# Number of selected product rows.
product_5core.count()

                                                                                

38162

In [24]:
# Disabled experiment kept as a bare string literal: nothing inside it runs,
# and the notebook simply echoes the text as this cell's output.
# The enclosed code would cap review_5core at 10000 rows and keep only the
# products whose image URL passes a UDF check, then re-derive the review and
# user tables from the surviving products.
'''
import mimetypes, urllib
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf

def is_url_image(url):    
    mimetype,encoding = mimetypes.guess_type(url)
    return (mimetype and mimetype.startswith('image'))

def check_url(url):
    """Returns True if the url returns a response code between 200-300,
       otherwise return False.
    """
    try:
        headers = {
            "Range": "bytes=0-10",
            "User-Agent": "MyTestAgent",
            "Accept": "*/*"
        }

        req = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(req)
        print('response.code: ', response.code)
        return response.code in range(200, 209)
    except Exception as e:
        print('exception: ', e)
        return False

is_image_and_ready_udf = udf(lambda image: is_url_image(image) and check_url(image), BooleanType())


review_5core = review_5core.limit(10000)

product_5core = review_5core.select('item_id').distinct().join(product_5core, on='item_id', how='inner')
product_5core = product_5core.filter(is_image_and_ready_udf('image'))
product_5core.cache()

review_5core = product_5core.select('item_id').distinct().join(review_5core, on='item_id', how='inner')
review_5core.cache()

user_5core = review_5core.select('user_id').distinct().join(user_5core, on='user_id', how='inner')
user_5core.cache()
'''

'\nimport mimetypes, urllib\nfrom pyspark.sql.types import BooleanType\nfrom pyspark.sql.functions import udf\n\ndef is_url_image(url):    \n    mimetype,encoding = mimetypes.guess_type(url)\n    return (mimetype and mimetype.startswith(\'image\'))\n\ndef check_url(url):\n    """Returns True if the url returns a response code between 200-300,\n       otherwise return False.\n    """\n    try:\n        headers = {\n            "Range": "bytes=0-10",\n            "User-Agent": "MyTestAgent",\n            "Accept": "*/*"\n        }\n\n        req = urllib.request.Request(url, headers=headers)\n        response = urllib.request.urlopen(req)\n        print(\'response.code: \', response.code)\n        return response.code in range(200, 209)\n    except Exception as e:\n        print(\'exception: \', e)\n        return False\n\nis_image_and_ready_udf = udf(lambda image: is_url_image(image) and check_url(image), BooleanType())\n\n\nreview_5core = review_5core.limit(10000)\n\nproduct_5core = re

In [25]:
# Down-sample to 3000 review rows and cache the result. Shuffled variant, left disabled:
# review_5core = review_5core.orderBy(F.rand()).limit(10000)
# NOTE(review): no ordering is applied before limit(), so which rows are kept
# is presumably not reproducible between runs - confirm.
review_5core = review_5core.limit(3000).cache()

# Narrow the user table to the users present in the sampled reviews.
user_5core = (
    review_5core.select('user_id').distinct()
    .join(user_5core, on='user_id', how='inner')
    .cache()
)

# Narrow the item table to the items present in the sampled reviews.
product_5core = (
    review_5core.select('item_id').distinct()
    .join(product_5core, on='item_id', how='inner')
)
# Kept as the cell's final expression so the frame's repr is still displayed.
product_5core.cache()

DataFrame[item_id: string, brand: string, category: array<string>, category_levels: bigint, description: array<string>, price: string, title: string, url: string, image: string]

In [26]:
from pyspark.sql.types import ArrayType

def concat_list_field(dataset, array_join_sep=u'\u0001'):
    """Replace every array-typed column of ``dataset`` by a joined string.

    Args:
        dataset: frame whose ``schema.fields`` are inspected.
        array_join_sep: separator handed to ``F.concat_ws``; defaults to the
            ``\\u0001`` control character.

    Returns:
        A frame in which each ``ArrayType`` column has been overwritten, under
        the same name, by ``F.concat_ws(array_join_sep, column)``. Columns of
        any other type are left untouched.
    """
    flattened = dataset
    # The field list is taken from the incoming frame once, before any column is rewritten.
    for field in dataset.schema.fields:
        if not isinstance(field.dataType, ArrayType):
            continue
        flattened = flattened.withColumn(
            field.name,
            F.concat_ws(array_join_sep, F.col(field.name)),
        )
    return flattened

# Flatten the array columns of all three tables (interaction, user, item).
review_5core, user_5core, product_5core = [
    concat_list_field(frame)
    for frame in (review_5core, user_5core, product_5core)
]

In [27]:
# Print the row count of each of the three tables.
print('review_5core: ', review_5core.count())
print('user_5core: ', user_5core.count())
print('product_5core: ', product_5core.count())

                                                                                

review_5core:  3000


                                                                                

user_5core:  37




product_5core:  1250


                                                                                

In [28]:
# Re-point the three output paths at the '.small' variants; this overrides the
# values assigned to the same names in the configuration cell.
user_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_user.small.parquet'
item_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_item.small.parquet'
interaction_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_interaction.small.parquet'

In [29]:
# Preview up to 100 item rows as a pandas frame.
product_5core.limit(100).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B00HGT81WW,Dinamit Jeans,"Clothing, Shoes & JewelryWomenClothingJeans",4,,$9.99 - $14.99,Dinamit Jeans Retro Flower Printed Skinny Pants,https://www.amazon.com/dp/B00HGT81WW,https://images-na.ssl-images-amazon.com/images...
1,B00NVPOLMO,,"Clothing, Shoes & JewelryWomenClothingDress...",5,Textured contrast inset waist fit and flare wi...,,Ivy & Blu Women's Sleeveless Textured Colorblo...,https://www.amazon.com/dp/B00NVPOLMO,https://images-na.ssl-images-amazon.com/images...
2,B00RHY325W,prAna,"Clothing, Shoes & JewelryWomenClothingJeans",4,The prAna kara jean is a Fitted pant cut with ...,$9.97 - $170.38,prAna Kara Jean,https://www.amazon.com/dp/B00RHY325W,https://images-na.ssl-images-amazon.com/images...
3,B00V4GB1IE,Urban CoCo,"Clothing, Shoes & JewelryWomenClothingTops,...",5,"Size chart XS;Bust 33"" --Sleeve 9.4"" --Length...",$9.99 - $15.95,Women's Vogue Shoulder Off Wide Hem Design Top...,https://www.amazon.com/dp/B00V4GB1IE,https://images-na.ssl-images-amazon.com/images...
4,B0177VIGCS,Zeagoo,"Clothing, Shoes & JewelryWomenClothingTops,...",5,,$4.49 - $24.49,Zeagoo Women's Casual Floral Print Short Sleev...,https://www.amazon.com/dp/B0177VIGCS,https://images-na.ssl-images-amazon.com/images...
5,B019DLEIHC,Graceful.u,"Clothing, Shoes & JewelryWomenClothingDress...",5,Graceful.u is a registered brand and sold only...,,Graceful.u Round Neck Cap Sleeve Lace Spliced ...,https://www.amazon.com/dp/B019DLEIHC,https://images-na.ssl-images-amazon.com/images...
6,B01C8PLLSO,Jiqiuguer,"Clothing, Shoes & JewelryWomenClothingSweat...",5,,$65.00 - $66.99,Jiqiuguer Women's Lightweight Linen Jacket 3/4...,https://www.amazon.com/dp/B01C8PLLSO,https://images-na.ssl-images-amazon.com/images...
7,B00GGC0A4W,,"Clothing, Shoes & JewelryWomenClothingCoats...",5,Trendy hooded anorak with novelty gold trim. G...,,Via Spiga Women's Anorak with Hood and Zipper,https://www.amazon.com/dp/B00GGC0A4W,https://images-na.ssl-images-amazon.com/images...
8,B00KZZ8VPG,Kenneth Cole,"Clothing, Shoes & JewelryWomenClothingCoats...",5,,$89.99 - $109.99,Kenneth Cole New York Women's Chevron Down Coa...,https://www.amazon.com/dp/B00KZZ8VPG,https://images-na.ssl-images-amazon.com/images...
9,B00P8VODO0,v28,"Clothing, Shoes & JewelryWomenClothingSocks...",5,,$9.99,V28 Women Juniors 80s Eighty's Ribbed Leg Warm...,https://www.amazon.com/dp/B00P8VODO0,https://images-na.ssl-images-amazon.com/images...


In [30]:
# Write the user table to user_path as parquet after repartition(1), overwriting
# whatever is already there. CSV variant, left disabled:
# user_5core.repartition(1).write.option("header", True).csv(user_path, mode='overwrite')
user_5core.repartition(1).write.mode('overwrite').parquet(user_path)

                                                                                

In [31]:
# Write the item table to item_path as parquet after repartition(1), overwriting
# whatever is already there. CSV variant, left disabled:
# product_5core.repartition(1).write.option("header", True).csv(item_path, mode='overwrite')
product_5core.repartition(1).write.mode('overwrite').parquet(item_path)

                                                                                

In [32]:
# Write the interaction table to interaction_path as parquet after repartition(1),
# overwriting whatever is already there. CSV variant, left disabled:
# review_5core.repartition(1).write.option("header", True).csv(interaction_path, mode='overwrite')
review_5core.repartition(1).write.mode('overwrite').parquet(interaction_path)

                                                                                

In [33]:
!aws s3 ls s3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/

                           PRE amazon_fashion_interaction.csv/
                           PRE amazon_fashion_interaction.parquet/
                           PRE amazon_fashion_interaction.small.parquet/
                           PRE amazon_fashion_item.csv/
                           PRE amazon_fashion_item.parquet/
                           PRE amazon_fashion_item.small.parquet/
                           PRE amazon_fashion_user.csv/
                           PRE amazon_fashion_user.parquet/
                           PRE amazon_fashion_user.small.parquet/
2022-09-08 14:11:37 3247339254 amazon_fashion_product.json
2022-09-08 14:13:55 3059298908 amazon_fashion_review.json


In [34]:
# Stop the Spark context now that the three tables have been written.
stop_spark(spark)

Debug -- spark stop


22/10/08 17:12:10 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed (this is expected if the application is shutting down.)
