In [1]:
# List the dataset directory on S3 before the run (shell escape; requires the aws CLI).
!aws s3 ls s3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/

                           PRE amazon_fashion_interaction.csv/
                           PRE amazon_fashion_interaction.parquet/
                           PRE amazon_fashion_interaction.small.parquet/
                           PRE amazon_fashion_item.csv/
                           PRE amazon_fashion_item.parquet/
                           PRE amazon_fashion_item.small.parquet/
                           PRE amazon_fashion_user.csv/
                           PRE amazon_fashion_user.parquet/
                           PRE amazon_fashion_user.small.parquet/
2022-09-08 14:11:37 3247339254 amazon_fashion_product.json
2022-09-08 14:13:55 3059298908 amazon_fashion_review.json


In [2]:
import os
import subprocess
import yaml
import argparse
import sys 
import importlib
import metaspore as ms
import pandas as pd
import warnings
import pyspark.sql.functions as F

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Lift the pandas display caps so toPandas() previews are never truncated by
# row/column count; keep previews small (limit(...)) to avoid huge outputs.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
def init_spark(local, app_name, batch_size, worker_count, server_count,
               worker_memory, server_memory, coordinator_memory, **kwargs):
    """Package the local `python` sources and open a MetaSpore Spark session.

    The `python` directory found three levels above the working directory is
    zipped into `<cwd>/python.zip`, which is then shipped to the cluster via
    `spark.submit.pyFiles`.

    Args:
        local, app_name, batch_size, worker_count, server_count,
        worker_memory, server_memory, coordinator_memory: forwarded unchanged
            to `ms.spark.get_session`.
        **kwargs: accepted and ignored, so a wider params dict can be splatted
            into this function.
            NOTE(review): this means `worker_cpu` / `server_cpu` supplied by
            the caller are silently dropped -- confirm whether they should be
            forwarded to `get_session`.

    Returns:
        The session object returned by `ms.spark.get_session`.
    """
    # The return code of `zip` is deliberately not checked here.
    archive_path = os.getcwd() + '/python.zip'
    subprocess.run(['zip', '-r', archive_path, 'python'], cwd='../../../')

    session_options = dict(
        local=local,
        app_name=app_name,
        batch_size=batch_size,
        worker_count=worker_count,
        server_count=server_count,
        worker_memory=worker_memory,
        server_memory=server_memory,
        coordinator_memory=coordinator_memory,
        spark_confs={
            # NOTE(review): namespace is hard-coded to one user.
            "spark.kubernetes.namespace": "wanggen",
            "spark.network.timeout": "500",
            "spark.submit.pyFiles": "python.zip",
            "spark.ui.showConsoleProgress": "true",
            "spark.kubernetes.executor.deleteOnTermination": "true",
        },
    )
    spark = ms.spark.get_session(**session_options)

    context = spark.sparkContext
    print('Debug -- spark init')
    print('Debug -- version:', context.version)
    print('Debug -- applicaitonId:', context.applicationId)
    print('Debug -- uiWebUrl:', context.uiWebUrl)
    return spark

def stop_spark(spark):
    """Log a debug line, then stop the SparkContext behind `spark`."""
    print('Debug -- spark stop')
    context = spark.sparkContext
    context.stop()

def read_dataset(spark, dataset_path):
    """Read a JSON dataset and log how many rows it contains.

    Args:
        spark: session object exposing `read.json(path)`.
        dataset_path: location of the JSON data to load.

    Returns:
        The DataFrame produced by `spark.read.json(dataset_path)`.
    """
    dataset = spark.read.json(dataset_path)
    # The label names the path that was read; the previous fixed "item dataset"
    # label was wrong whenever this helper loaded anything else (e.g. reviews).
    # count() is an action, so this line runs a job over the dataset.
    print('Debug -- dataset count [{}]:'.format(dataset_path), dataset.count())
    return dataset

In [4]:
# Spark session settings, splatted into init_spark below.
# NOTE(review): init_spark absorbs worker_cpu / server_cpu through **kwargs
# and does not forward them, so those two entries currently have no effect.
params = {
    'local': False,
    'app_name': 'Amazon Fashion Dataset',
    'batch_size': 512,
    'worker_count': 4,
    'worker_cpu': 4,
    'server_count': 4,
    'server_cpu': 4,
    'worker_memory': '6G',
    'server_memory': '6G',
    'coordinator_memory': '2G',
}

# Input locations.
product_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_product.json'
review_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_review.json'
item2image_path = 's3://dmetasoul-bucket/xwb/tmp/amazon_fashion_id2img.valid.tsv'
category_valid_path = 's3://dmetasoul-bucket/xwb/tmp/amazon_fashion_category_valid.parquet'

# Output locations.
# NOTE(review): these three names are rebound to the `.small.parquet` paths
# later in the notebook, before anything is written.
user_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_user.parquet'
item_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_item.parquet'
interaction_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_interaction.parquet'

spark = init_spark(**params)

updating: python/ (stored 0%)
updating: python/algos/ (stored 0%)
updating: python/algos/deepfm_net.py (deflated 69%)
updating: python/algos/feature/ (stored 0%)
updating: python/algos/feature/woe_encoder.py (deflated 64%)
updating: python/algos/feature/.ipynb_checkpoints/ (stored 0%)
updating: python/algos/feature/.ipynb_checkpoints/target_encoder-checkpoint.py (deflated 51%)
updating: python/algos/feature/sequential_encoder.py (deflated 60%)
updating: python/algos/feature/target_encoder.py (deflated 51%)
updating: python/algos/feature/neg_sampler.py (deflated 66%)
updating: python/algos/feature/__pycache__/ (stored 0%)
updating: python/algos/feature/__pycache__/woe_encoder.cpython-38.pyc (deflated 43%)
updating: python/algos/feature/__pycache__/target_encoder.cpython-38.pyc (deflated 36%)
updating: python/algos/feature/__pycache__/__init__.cpython-38.pyc (deflated 29%)
updating: python/algos/feature/__pycache__/sequential_encoder.cpython-38.pyc (deflated 53%)
updating: python/algos/f

22/10/09 13:35:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Debug -- spark init
Debug -- version: 3.1.2
Debug -- applicaitonId: spark-application-1665293739682
Debug -- uiWebUrl: http://172.16.0.156:4040


In [5]:
# Load the product JSON and drop exact duplicate rows. The count printed by
# read_dataset is taken before the de-duplication.
product_dataset = read_dataset(spark, product_path).distinct()



Debug -- item dataset count: 2685059


                                                                                

In [6]:
# Show the schema inferred from the product JSON.
product_dataset.printSchema()

root
 |-- brand: string (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category_levels: long (nullable = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- image: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- price: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)



In [7]:
# Tab-separated, header-less file: the first two columns (_c0, _c1) are named
# item_id and image; exact duplicate rows are removed.
item2image_df = (
    spark.read.option("delimiter", "\t").csv(item2image_path)
    .withColumnRenamed('_c0', 'item_id')
    .withColumnRenamed('_c1', 'image')
    .distinct()
)
item2image_df.printSchema()
item2image_df.show(20, False)

root
 |-- item_id: string (nullable = true)
 |-- image: string (nullable = true)



[Stage 4:>                                                          (0 + 8) / 8]

+----------+----------------------------------------------------------------+
|item_id   |image                                                           |
+----------+----------------------------------------------------------------+
|B00L2MGOOQ|https://images-na.ssl-images-amazon.com/images/I/51NfBS18gmL.jpg|
|B00L365V68|https://images-na.ssl-images-amazon.com/images/I/31RM0BVDz0L.jpg|
|B00L398ZPY|https://images-na.ssl-images-amazon.com/images/I/61QB2H9lzlL.jpg|
|B00L3CTP9Q|https://images-na.ssl-images-amazon.com/images/I/31AggIZ0QAL.jpg|
|B00L3GNUI4|https://images-na.ssl-images-amazon.com/images/I/41y0MnBi9EL.jpg|
|B00L3MTAO6|https://images-na.ssl-images-amazon.com/images/I/31m2ykMCfhL.jpg|
|B00L3TYFBM|https://images-na.ssl-images-amazon.com/images/I/41buRfTf6zL.jpg|
|B00L3VGP04|https://images-na.ssl-images-amazon.com/images/I/21uIMDohqJL.jpg|
|B00L3YEZF8|https://images-na.ssl-images-amazon.com/images/I/313wI-77gnL.jpg|
|B00L43LJYI|https://images-na.ssl-images-amazon.com/images/I/518

                                                                                

In [8]:
# Load the category table (category array, its '->'-joined path, valid_level)
# and preview it without truncating long values.
category_valid = spark.read.parquet(category_valid_path)
category_valid.printSchema()
category_valid.show(20, False)

root
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category_path: string (nullable = true)
 |-- valid_level: long (nullable = true)



[Stage 7:>                                                          (0 + 1) / 1]

+------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------+-----------+
|category                                                                                        |category_path                                                                                 |valid_level|
+------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------+-----------+
|[Clothing, Shoes & Jewelry, Luggage & Travel Gear, Umbrellas, Folding Umbrellas]                |Clothing, Shoes & Jewelry->Luggage & Travel Gear->Umbrellas->Folding Umbrellas                |3          |
|[Clothing, Shoes & Jewelry, Men, Accessories, Hats & Caps, Baseball Caps]                       |Clothing, Shoes & Jewelry->Men->Accessories->Hats & Caps->Baseball Caps       

                                                                                

In [9]:
# Swap the product's own `image` column for the one supplied by item2image_df.
product_dataset = product_dataset.drop('image')
product_dataset = product_dataset.join(item2image_df, on='item_id', how='leftouter')

# Keep only products with a non-empty image and a non-empty title of at most
# 300 characters.
product_dataset = product_dataset.filter(F.col("image").isNotNull())
product_dataset = product_dataset.filter(F.col("image") != '')
product_dataset = product_dataset.filter(F.col("title") != '')
product_dataset = product_dataset.filter(F.length(F.col("title")) <= 300)

# Keep only products whose '->'-joined category path appears in category_valid
# with valid_level > 0; the helper column is dropped again after the join.
product_dataset = product_dataset.withColumn('category_path', F.concat_ws('->', 'category'))
product_dataset = category_valid.filter(F.col('valid_level') > 0).select('category_path').distinct()\
                    .join(product_dataset, on='category_path', how='inner').drop('category_path')

# A mid-cell `limit(20).toPandas()` whose result was discarded used to sit
# here; it only triggered an extra Spark job, so it has been removed.
print(product_dataset.count())



1123952


                                                                                

In [10]:
# Load the review JSON; read_dataset also prints its row count.
review_dataset = read_dataset(spark, review_path)



Debug -- item dataset count: 32292099


                                                                                

In [11]:
import pyspark.sql.functions as F

# Keep products whose category array contains both the element 'Clothing'
# and the element 'Women'.
category_col = F.col("category")
has_clothing = F.array_contains(category_col, 'Clothing')
has_women = F.array_contains(category_col, 'Women')
product_subset = product_dataset.filter(has_clothing & has_women)

In [12]:
# Preview ten rows of the filtered product subset.
product_subset.limit(10).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B0002LK9UI,Alia,"[Clothing, Shoes & Jewelry, Women, Clothing, P...",5,[Ultra soft Feathertouch crafts this easy-care...,,Feathertouch Pull-On Pant,https://www.amazon.com/dp/B0002LK9UI,https://images-na.ssl-images-amazon.com/images...
1,B0002NYPWO,,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,[Feel absolutely fabulous with these Hanes pan...,$3.93 - $34.15,Hanes Women's Absolutely Ultra Sheer Control T...,https://www.amazon.com/dp/B0002NYPWO,https://images-na.ssl-images-amazon.com/images...
2,B0007V0ZYS,Hot Fash,"[Clothing, Shoes & Jewelry, Women, Clothing, D...",5,[Stop him in his tracks with this stunning hal...,,Sleeveless Flattering Skirt Stylish Formal Hal...,https://www.amazon.com/dp/B0007V0ZYS,https://images-na.ssl-images-amazon.com/images...
3,B000FZVLQW,,"[Clothing, Shoes & Jewelry, Women, Clothing, C...",5,[Fashion and funtion meet in this stylish reve...,,Columbia Sportswear Women's Double the Fun Vest,https://www.amazon.com/dp/B000FZVLQW,https://images-na.ssl-images-amazon.com/images...
4,B0018QSSKO,Mudd,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,[MUDD Junior's Striped Short is the perfect li...,,MUDD Juniors Striped Short,https://www.amazon.com/dp/B0018QSSKO,https://images-na.ssl-images-amazon.com/images...
5,B001E77NBW,,"[Clothing, Shoes & Jewelry, Women, Clothing, C...",5,[Offering products for every aspect of the act...,,Roxy Juniors Calypso Hooded Fleece,https://www.amazon.com/dp/B001E77NBW,https://images-na.ssl-images-amazon.com/images...
6,B001FWYZ62,,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,[Stylish clothes for everyday wear.],,XOXO Juniors Fold And Tuck Blouse,https://www.amazon.com/dp/B001FWYZ62,https://images-na.ssl-images-amazon.com/images...
7,B001GCVL4K,,"[Clothing, Shoes & Jewelry, Women, Clothing, J...",5,[Stretch extend tab embroidered back pocket jean],,Gloria Vanderbilt Women's Toby Jean,https://www.amazon.com/dp/B001GCVL4K,https://images-na.ssl-images-amazon.com/images...
8,B001SYRSDE,eVogues Apparel,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,,,eVogues Women's Sexy O-Ring Necklace Sleeveles...,https://www.amazon.com/dp/B001SYRSDE,https://images-na.ssl-images-amazon.com/images...
9,B00243FOIY,,"[Clothing, Shoes & Jewelry, Women, Clothing, D...",5,[A.Byer Bold blossom floral fitted dress. This...,,A. Byer Juniors Bold Blossom Floral Print Belt...,https://www.amazon.com/dp/B00243FOIY,https://images-na.ssl-images-amazon.com/images...


In [13]:
# Preview ten review rows.
review_dataset.limit(10).toPandas()

                                                                                

Unnamed: 0,item_id,rating,timestamp,user_id
0,B006CBAOQ6,1.0,1424304000,A1AG1NX2VXKY2T
1,B006CBAOQ6,5.0,1424304000,A3B1USQFGCWH15
2,B006CBAOQ6,5.0,1424044800,AXL1XBXHN1AQ1
3,B006CBAOQ6,4.0,1424044800,AP3CLPX9J5PAV
4,B006CBAOQ6,4.0,1423958400,A3REHYIAGCIFTB
5,B006CBAOQ6,5.0,1423267200,A2YICSVD18YH1R
6,B006CBAOQ6,3.0,1423094400,A3U28J1QPXC8SC
7,B006CBAOQ6,4.0,1422489600,ADZOZ3PZ6RLL7
8,B006CBAOQ6,5.0,1422489600,A1CSXM2VZLZPPA
9,B006CBAOQ6,5.0,1422316800,A2U3SUU8SNP8IW


In [14]:
# Echo the DataFrame repr to see the review column names and types.
review_dataset

DataFrame[item_id: string, rating: double, timestamp: bigint, user_id: string]

In [15]:
# Expose both frames to Spark SQL. createOrReplaceTempView is the supported
# replacement for the deprecated registerTempTable.
review_dataset.createOrReplaceTempView('review_dataset')
product_subset.createOrReplaceTempView('product_subset')
# Select users having at least 20 distinct (item_id, timestamp) reviews on
# items of product_subset.
# NOTE(review): the result is called "5core" although the threshold is 20.
query ="""
select 
    ta.user_id
from
(
    select
        ta.user_id,
        count(distinct ta.item_id, ta.timestamp) as review_count
    from
        review_dataset ta
    join
        product_subset tb
    on 
        ta.item_id=tb.item_id
    group by ta.user_id
) ta
where ta.review_count >= 20
"""
user_5core = spark.sql(query)

In [16]:
# Number of users that passed the review-count threshold.
user_5core.count()

                                                                                

1212

In [17]:
# Preview up to twenty of the selected user ids.
user_5core.limit(20).toPandas()

                                                                                

Unnamed: 0,user_id
0,AENSGPP7TSU4V
1,A3KRRVHYFDDU59
2,A3VG66KUM1WH4Z
3,A2SXQ8QP7LORDO
4,A2XTRTRAPHO821
5,A309G6ZUPNXXZU
6,A2GXHN4EU46RM
7,A220STV60KRN7W
8,A1S5OQUVLF7HK
9,A1FEMGOF4SS3PY


In [18]:
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable.
user_5core.createOrReplaceTempView('user_5core')

# All distinct review rows written by the selected users. The join is against
# the full review_dataset, so reviews of items outside product_subset are
# included as well.
query ="""
select distinct
    tb.*
from
    user_5core ta
join
    review_dataset tb
on ta.user_id=tb.user_id
"""
review_5core = spark.sql(query)

In [19]:
# Number of review rows kept for the selected users.
review_5core.count()

                                                                                

105155

In [20]:
# Preview up to twenty of the kept review rows.
review_5core.limit(20).toPandas()

                                                                                

Unnamed: 0,item_id,rating,timestamp,user_id
0,B000B6LO6W,5.0,1521417600,A3KRRVHYFDDU59
1,B000H0V944,4.0,1392508800,A3KRRVHYFDDU59
2,B01CGV1FTK,5.0,1527552000,A3KRRVHYFDDU59
3,B01D1R5Q3O,4.0,1521417600,A3KRRVHYFDDU59
4,B01DBJQB18,5.0,1521417600,A3KRRVHYFDDU59
5,B01E9GP8RK,5.0,1473897600,A3KRRVHYFDDU59
6,B01FJ8OJK4,5.0,1510704000,A3KRRVHYFDDU59
7,B01FY9XN00,5.0,1527552000,A3KRRVHYFDDU59
8,B00OA0HIHO,5.0,1472428800,A3KRRVHYFDDU59
9,B00Q5L44I2,5.0,1472428800,A3KRRVHYFDDU59


In [21]:
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable.
review_5core.createOrReplaceTempView('review_5core')

# Distinct product_subset rows that are referenced by at least one kept review.
query ="""
select distinct
    tb.*
from
    review_5core ta
join
    product_subset tb
on ta.item_id=tb.item_id
"""
product_5core = spark.sql(query)

In [22]:
# Preview five of the kept products.
product_5core.limit(5).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B0069J1Y2Y,,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,,$21.59 - $26.99,PattyBoutik Women's Off Shoulder Long Sleeve Top,https://www.amazon.com/dp/B0069J1Y2Y,https://images-na.ssl-images-amazon.com/images...
1,B006MQR4QY,,"[Clothing, Shoes & Jewelry, Women, Clothing, C...",5,,,AK Anne Klein Women's Ruffle Trench Coat,https://www.amazon.com/dp/B006MQR4QY,https://images-na.ssl-images-amazon.com/images...
2,B006UCPQAQ,,"[Clothing, Shoes & Jewelry, Women, Clothing, D...",5,,,"HE02713BK10, Black, 8US, Ever Pretty Chic Lace...",https://www.amazon.com/dp/B006UCPQAQ,https://images-na.ssl-images-amazon.com/images...
3,B0078LJ5KU,,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,,,XOXO Juniors Smock Bottom Embellished Printed ...,https://www.amazon.com/dp/B0078LJ5KU,https://images-na.ssl-images-amazon.com/images...
4,B007P5S2JY,,"[Clothing, Shoes & Jewelry, Women, Clothing, P...",5,[Self fabric waistband with smooth interior el...,,PUMA Women's Velour Pant,https://www.amazon.com/dp/B007P5S2JY,https://images-na.ssl-images-amazon.com/images...


In [23]:
# Number of products referenced by the kept reviews.
product_5core.count()

                                                                                

16883

In [24]:
# Disabled experiment: the whole cell is one bare string literal, so none of
# the code inside runs; the cell merely echoes the text.
# NOTE(review): if this is ever re-enabled, note that `range(200, 209)` is
# narrower than the "200-300" promised by the inner docstring, and that the
# code relies on `urllib.request` being reachable after `import urllib`.
'''
import mimetypes, urllib
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf

def is_url_image(url):    
    mimetype,encoding = mimetypes.guess_type(url)
    return (mimetype and mimetype.startswith('image'))

def check_url(url):
    """Returns True if the url returns a response code between 200-300,
       otherwise return False.
    """
    try:
        headers = {
            "Range": "bytes=0-10",
            "User-Agent": "MyTestAgent",
            "Accept": "*/*"
        }

        req = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(req)
        print('response.code: ', response.code)
        return response.code in range(200, 209)
    except Exception as e:
        print('exception: ', e)
        return False

is_image_and_ready_udf = udf(lambda image: is_url_image(image) and check_url(image), BooleanType())


review_5core = review_5core.limit(10000)

product_5core = review_5core.select('item_id').distinct().join(product_5core, on='item_id', how='inner')
product_5core.cache()

review_5core = product_5core.select('item_id').distinct().join(review_5core, on='item_id', how='inner')
review_5core.cache()

user_5core = review_5core.select('user_id').distinct().join(user_5core, on='user_id', how='inner')
user_5core.cache()
'''

'\nimport mimetypes, urllib\nfrom pyspark.sql.types import BooleanType\nfrom pyspark.sql.functions import udf\n\ndef is_url_image(url):    \n    mimetype,encoding = mimetypes.guess_type(url)\n    return (mimetype and mimetype.startswith(\'image\'))\n\ndef check_url(url):\n    """Returns True if the url returns a response code between 200-300,\n       otherwise return False.\n    """\n    try:\n        headers = {\n            "Range": "bytes=0-10",\n            "User-Agent": "MyTestAgent",\n            "Accept": "*/*"\n        }\n\n        req = urllib.request.Request(url, headers=headers)\n        response = urllib.request.urlopen(req)\n        print(\'response.code: \', response.code)\n        return response.code in range(200, 209)\n    except Exception as e:\n        print(\'exception: \', e)\n        return False\n\nis_image_and_ready_udf = udf(lambda image: is_url_image(image) and check_url(image), BooleanType())\n\n\nreview_5core = review_5core.limit(10000)\n\nproduct_5core = re

In [25]:
# Cut the interactions down to 10000 rows, then restrict users and products
# to the ids that still occur in that sample.
# NOTE(review): limit() without an ordering does not define which rows are
# kept, and this cell rebinds its own inputs, so re-running it narrows the
# frames again -- confirm this is acceptable for the "small" dataset.
# Random alternative: review_5core.orderBy(F.rand()).limit(10000)
review_5core = review_5core.limit(10000)
review_5core.cache()

sampled_user_ids = review_5core.select('user_id').distinct()
user_5core = sampled_user_ids.join(user_5core, on='user_id', how='inner')
user_5core.cache()

sampled_item_ids = review_5core.select('item_id').distinct()
product_5core = sampled_item_ids.join(product_5core, on='item_id', how='inner')
product_5core.cache()

DataFrame[item_id: string, brand: string, category: array<string>, category_levels: bigint, description: array<string>, price: string, title: string, url: string, image: string]

In [26]:
from pyspark.sql.types import ArrayType

def concat_list_field(dataset, array_join_sep=u'\u0001'):
    """Replace every array-typed column of `dataset` by its joined string.

    Args:
        dataset: DataFrame whose schema is scanned for ArrayType columns.
        array_join_sep: separator handed to `concat_ws` (default U+0001).

    Returns:
        A DataFrame in which each ArrayType column has been overwritten, under
        the same name, with `concat_ws(array_join_sep, column)`.
    """
    # The field list is taken from the incoming schema once, before any column
    # is rewritten.
    for field in dataset.schema.fields:
        if not isinstance(field.dataType, ArrayType):
            continue
        joined = F.concat_ws(array_join_sep, F.col(field.name))
        dataset = dataset.withColumn(field.name, joined)
    return dataset

# Flatten array columns to delimited strings in all three output frames.
review_5core = concat_list_field(review_5core)
user_5core = concat_list_field(user_5core)
product_5core = concat_list_field(product_5core)

In [27]:
# Report the final row count of each output frame, in a fixed order.
for frame_label, frame in (
    ('review_5core: ', review_5core),
    ('user_5core: ', user_5core),
    ('product_5core: ', product_5core),
):
    print(frame_label, frame.count())

                                                                                

review_5core:  10000


                                                                                

user_5core:  109




product_5core:  2673


                                                                                

In [28]:
# Rebind the output paths to the `.small.parquet` locations; the writes below
# use these values, not the ones assigned in the configuration cell.
user_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_user.small.parquet'
item_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_item.small.parquet'
interaction_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_interaction.small.parquet'


In [29]:
# Preview the products after array columns were flattened to strings.
product_5core.limit(100).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B0069J1Y2Y,,"Clothing, Shoes & JewelryWomenClothingTops,...",5,,$21.59 - $26.99,PattyBoutik Women's Off Shoulder Long Sleeve Top,https://www.amazon.com/dp/B0069J1Y2Y,https://images-na.ssl-images-amazon.com/images...
1,B008EYPQFI,,"Clothing, Shoes & JewelryWomenClothingJeans",4,Look one size smaller in this Not Your Daughte...,$37.81 - $158.00,NYDJ Women's Petite Size Alina Legging Jean,https://www.amazon.com/dp/B008EYPQFI,https://images-na.ssl-images-amazon.com/images...
2,B00B1ZNI1G,,"Clothing, Shoes & JewelryWomenClothingTops,...",5,Women's embroidered short sleeve scoop neck bl...,,Sakkas Embroidered 100% Cotton Scoop Neck Semi...,https://www.amazon.com/dp/B00B1ZNI1G,https://images-na.ssl-images-amazon.com/images...
3,B00BHB82NS,,"Clothing, Shoes & JewelryWomenClothingTops,...",5,,,Democracy Women's Rayon Lawn Print 3/4 Sleeve ...,https://www.amazon.com/dp/B00BHB82NS,https://images-na.ssl-images-amazon.com/images...
4,B00BMHE5W4,,"Clothing, Shoes & JewelryWomenClothingJeans",4,,$27.99 - $40.00,WallFlower Women's Juniors Basic Legendary Str...,https://www.amazon.com/dp/B00BMHE5W4,https://images-na.ssl-images-amazon.com/images...
5,B00ES83MGO,,"Clothing, Shoes & JewelryWomenClothingTops,...",5,PattyBoutik Sweetheart Cross-front V Neck Ruch...,$29.99,PattyBoutik Women's Cross Front V Neck Ruched ...,https://www.amazon.com/dp/B00ES83MGO,https://images-na.ssl-images-amazon.com/images...
6,B00GUGX3WA,Lindy Bop,"Clothing, Shoes & JewelryWomenClothingDress...",5,STYLE NAME:BRIGITTE A ELEGANT JERSEY PENCIL DR...,,Lindy Bop 'Brigitte' Vintage Office Secretary ...,https://www.amazon.com/dp/B00GUGX3WA,https://images-na.ssl-images-amazon.com/images...
7,B00H3RIRLM,Hering,"Clothing, Shoes & JewelryWomenClothingTops,...",5,Spaghetti Strap Tank topNon adjustable strapsB...,,Junior Women's Basic Wirefree Shelf Bra Tank T...,https://www.amazon.com/dp/B00H3RIRLM,https://images-na.ssl-images-amazon.com/images...
8,B00HGT81WW,Dinamit Jeans,"Clothing, Shoes & JewelryWomenClothingJeans",4,,$9.99 - $14.99,Dinamit Jeans Retro Flower Printed Skinny Pants,https://www.amazon.com/dp/B00HGT81WW,https://images-na.ssl-images-amazon.com/images...
9,B00JGVAR34,Glamour Empire,"Clothing, Shoes & JewelryWomenClothingDress...",5,Beautiful jersey cocktail / summer dress with ...,$9.72 - $24.23,Glamour Empire Women's Knee Length Short Sleev...,https://www.amazon.com/dp/B00JGVAR34,https://images-na.ssl-images-amazon.com/images...


In [30]:
# Write the users as a single-partition parquet dataset, replacing whatever is
# already at user_path. (CSV alternative: .write.option("header", True).csv(...))
user_5core.repartition(1).write.mode('overwrite').parquet(user_path)

                                                                                

In [31]:
# Write the items as a single-partition parquet dataset, replacing whatever is
# already at item_path. (CSV alternative: .write.option("header", True).csv(...))
product_5core.repartition(1).write.mode('overwrite').parquet(item_path)

                                                                                

In [32]:
# Write the interactions as a single-partition parquet dataset, replacing
# whatever is already at interaction_path.
# (CSV alternative: .write.option("header", True).csv(...))
review_5core.repartition(1).write.mode('overwrite').parquet(interaction_path)

                                                                                

In [33]:
# List the dataset directory again to confirm the outputs are present.
!aws s3 ls s3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/

                           PRE amazon_fashion_interaction.csv/
                           PRE amazon_fashion_interaction.parquet/
                           PRE amazon_fashion_interaction.small.parquet/
                           PRE amazon_fashion_item.csv/
                           PRE amazon_fashion_item.parquet/
                           PRE amazon_fashion_item.small.parquet/
                           PRE amazon_fashion_user.csv/
                           PRE amazon_fashion_user.parquet/
                           PRE amazon_fashion_user.small.parquet/
2022-09-08 14:11:37 3247339254 amazon_fashion_product.json
2022-09-08 14:13:55 3059298908 amazon_fashion_review.json


In [34]:
# Shut the Spark context down now that all outputs have been written.
stop_spark(spark)

Debug -- spark stop


22/10/09 13:51:48 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed (this is expected if the application is shutting down.)
