In [1]:
!aws s3 ls s3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/

                           PRE amazon_fashion_interaction.csv/
                           PRE amazon_fashion_interaction.parquet/
                           PRE amazon_fashion_interaction.small.parquet/
                           PRE amazon_fashion_item.csv/
                           PRE amazon_fashion_item.parquet/
                           PRE amazon_fashion_item.small.parquet/
                           PRE amazon_fashion_user.csv/
                           PRE amazon_fashion_user.parquet/
                           PRE amazon_fashion_user.small.parquet/
2022-09-08 14:11:37 3247339254 amazon_fashion_product.json
2022-09-08 14:13:55 3059298908 amazon_fashion_review.json


In [2]:
import os
import subprocess
import yaml
import argparse
import sys 
import importlib
import metaspore as ms
import pandas as pd
import warnings
import pyspark.sql.functions as F

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
def init_spark(local, app_name, batch_size, worker_count, server_count,
               worker_memory, server_memory, coordinator_memory, **kwargs):
    """Package the project's ``python`` directory and open a MetaSpore Spark session.

    Args:
        local: forwarded to ``ms.spark.get_session(local=...)``.
        app_name: Spark application name.
        batch_size: forwarded to ``ms.spark.get_session``.
        worker_count / server_count: forwarded to ``ms.spark.get_session``.
        worker_memory / server_memory / coordinator_memory: memory settings,
            forwarded to ``ms.spark.get_session``.
        **kwargs: optional extras. ``worker_cpu`` and ``server_cpu`` are forwarded
            to ``ms.spark.get_session`` when present; ``spark_confs`` (a dict) is
            merged over the default Spark configuration. Other keys are ignored.

    Returns:
        The Spark session returned by ``ms.spark.get_session``.

    Raises:
        subprocess.CalledProcessError: if the ``zip`` command exits non-zero.
    """
    # Build python.zip in the current working directory from the `python` folder
    # three levels up; it is referenced below through spark.submit.pyFiles.
    # check=True makes a failed packaging step visible instead of continuing with
    # a missing or stale archive.
    archive_path = os.getcwd() + '/python.zip'
    subprocess.run(['zip', '-r', archive_path, 'python'], cwd='../../../', check=True)

    spark_confs = {
        "spark.kubernetes.namespace": "wanggen",
        "spark.network.timeout": "500",
        "spark.submit.pyFiles": "python.zip",
        "spark.ui.showConsoleProgress": "true",
        "spark.kubernetes.executor.deleteOnTermination": "true",
    }
    # Let callers override or extend the defaults (e.g. the Kubernetes namespace).
    extra_confs = kwargs.get('spark_confs')
    if extra_confs:
        spark_confs.update(extra_confs)

    session_kwargs = dict(
        local=local,
        app_name=app_name,
        batch_size=batch_size,
        worker_count=worker_count,
        server_count=server_count,
        worker_memory=worker_memory,
        server_memory=server_memory,
        coordinator_memory=coordinator_memory,
        spark_confs=spark_confs,
    )
    # CPU settings used to be swallowed by **kwargs and never reached the session.
    # NOTE(review): assumes ms.spark.get_session accepts worker_cpu / server_cpu --
    # confirm against the installed MetaSpore version.
    for cpu_key in ('worker_cpu', 'server_cpu'):
        if cpu_key in kwargs:
            session_kwargs[cpu_key] = kwargs[cpu_key]

    spark = ms.spark.get_session(**session_kwargs)
    sc = spark.sparkContext
    print('Debug -- spark init')
    print('Debug -- version:', sc.version)
    print('Debug -- applicaitonId:', sc.applicationId)
    print('Debug -- uiWebUrl:', sc.uiWebUrl)
    return spark

def stop_spark(spark):
    """Print a debug marker, then stop the SparkContext behind ``spark``."""
    print('Debug -- spark stop')
    context = spark.sparkContext
    context.stop()

def read_dataset(spark, dataset_path):
    """Read a JSON dataset into a Spark DataFrame and print its row count.

    Args:
        spark: Spark session whose ``read.json`` is used to load the data.
        dataset_path: location passed unchanged to ``spark.read.json``.

    Returns:
        The loaded DataFrame.
    """
    dataset = spark.read.json(dataset_path)
    # Label the debug line with the file name: this helper is used for both the
    # product and the review files, so a fixed "item dataset" label was misleading.
    if isinstance(dataset_path, str):
        label = dataset_path.rstrip('/').rsplit('/', 1)[-1] or dataset_path
    else:
        label = str(dataset_path)
    # count() triggers a Spark job over the data just to report this number.
    print('Debug -- %s count:' % label, dataset.count())
    return dataset

In [4]:
# Spark session settings; the whole dict is unpacked into init_spark below.
params = {
    'local': False,
    'app_name': 'Amazon Fashion Dataset',
    'batch_size': 512,
    'worker_count': 4,
    'worker_cpu': 4,
    'server_count': 4,
    'server_cpu': 4,
    'worker_memory': '6G',
    'server_memory': '6G',
    'coordinator_memory': '2G',
}

# Input locations: raw product / review JSON and the item_id -> image URL TSV.
product_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_product.json'
review_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_review.json'
item2image = 's3://dmetasoul-bucket/xwb/tmp/amazon_fashion_id2img.tsv'

# Output locations for the user, item and interaction tables.
user_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_user.parquet'
item_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_item.parquet'
interaction_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_interaction.parquet'

spark = init_spark(**params)

updating: python/ (stored 0%)
updating: python/algos/ (stored 0%)
updating: python/algos/deepfm_net.py (deflated 69%)
updating: python/algos/feature/ (stored 0%)
updating: python/algos/feature/woe_encoder.py (deflated 64%)
updating: python/algos/feature/.ipynb_checkpoints/ (stored 0%)
updating: python/algos/feature/.ipynb_checkpoints/target_encoder-checkpoint.py (deflated 51%)
updating: python/algos/feature/sequential_encoder.py (deflated 60%)
updating: python/algos/feature/target_encoder.py (deflated 51%)
updating: python/algos/feature/neg_sampler.py (deflated 66%)
updating: python/algos/feature/__pycache__/ (stored 0%)
updating: python/algos/feature/__pycache__/woe_encoder.cpython-38.pyc (deflated 43%)
updating: python/algos/feature/__pycache__/target_encoder.cpython-38.pyc (deflated 36%)
updating: python/algos/feature/__pycache__/__init__.cpython-38.pyc (deflated 29%)
updating: python/algos/feature/__pycache__/sequential_encoder.cpython-38.pyc (deflated 53%)
updating: python/algos/f

22/09/26 18:14:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Debug -- spark init
Debug -- version: 3.1.2
Debug -- applicaitonId: spark-application-1664187272369
Debug -- uiWebUrl: http://172.16.0.156:4040


In [5]:
# Load the raw product records and drop rows that are exact duplicates.
product_dataset = read_dataset(spark, product_path).distinct()



Debug -- item dataset count: 2685059


                                                                                

In [6]:
product_dataset.printSchema()

root
 |-- brand: string (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category_levels: long (nullable = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- image: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- price: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)



In [7]:
# Load the item_id -> image URL mapping from the tab-separated file configured in
# `item2image`, instead of repeating the S3 URI here where it could drift from the
# config cell. The columns arrive with Spark's default names (_c0, _c1) and are renamed.
item2image_df = (
    spark.read
    .option("delimiter", "\t")
    .csv(item2image)
    .withColumnRenamed('_c0', 'item_id')
    .withColumnRenamed('_c1', 'image')
    .distinct()
)
item2image_df.printSchema()
item2image_df.show(20, False)

root
 |-- item_id: string (nullable = true)
 |-- image: string (nullable = true)





+----------+--------------------------------------------------------------------+
|item_id   |image                                                               |
+----------+--------------------------------------------------------------------+
|B00LCB5ILM|https://images-na.ssl-images-amazon.com/images/I/31gV8c3LKpL.jpg    |
|B00LCBA35S|https://images-na.ssl-images-amazon.com/images/I/31NlMQY0LzL.jpg    |
|B00LCC3GP6|null                                                                |
|B00LCM9ZOW|null                                                                |
|B00LCRRVG6|https://images-na.ssl-images-amazon.com/images/I/31XtVXufKgL.jpg    |
|B00LCS2IXQ|https://images-na.ssl-images-amazon.com/images/I/31PSBrGYulL.jpg    |
|B00LCV9S2C|https://images-na.ssl-images-amazon.com/images/I/41APRaXYmgL.jpg    |
|B00LCVK0QK|null                                                                |
|B00LCWCKQ2|https://images-na.ssl-images-amazon.com/images/I/51vK%2BuVBM9L.jpg  |
|B00LCWJSG2|http

                                                                                

In [8]:
# Replace the original `image` column with the URL from item2image_df, then keep
# only products that have a non-empty image URL and a non-empty title.
product_dataset = (
    product_dataset
    .drop('image')
    .join(item2image_df, on='item_id', how='leftouter')
    .filter(F.col("image").isNotNull())
    .filter(F.col("image") != '')
    .filter(F.col("title") != '')
)

# The count used to be a bare expression in the middle of the cell, so the Spark
# job ran but its result was thrown away; print it so the work is not wasted.
print('Debug -- product dataset count after image/title filtering:', product_dataset.count())
product_dataset.limit(20).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B000050B0Q,,"[Clothing, Shoes & Jewelry, Costumes & Accesso...",4,[Rubie's Costume Company has designed quality ...,$4.94,Rubie's Pirate Pak With Earring And Eye Patch,https://www.amazon.com/dp/B000050B0Q,https://images-na.ssl-images-amazon.com/images...
1,B0000AHE2V,Amazon Collection,"[Clothing, Shoes & Jewelry, Women, Jewelry, Ea...",4,[Chandelier earrings are versatile enough to b...,,14k Gold Two-Tone Diamond-Cut Chandelier Post ...,https://www.amazon.com/dp/B0000AHE2V,https://images-na.ssl-images-amazon.com/images...
2,B0000WL750,,"[Clothing, Shoes & Jewelry, Men, Uniforms, Wor...",5,[Denim bib overall is the perfect workwear cho...,$54.99 - $104.67,Carhartt Men's Denim Unlined Bib Overall R08,https://www.amazon.com/dp/B0000WL750,https://images-na.ssl-images-amazon.com/images...
3,B0001N6C7Q,Western Pack,"[Clothing, Shoes & Jewelry, Luggage & Travel G...",3,"[Western Pack Pony Series 15"" Mini Duffel Bag ...",,"Western Pack Pony Series 15"" Mini Duffel Bag (...",https://www.amazon.com/dp/B0001N6C7Q,https://images-na.ssl-images-amazon.com/images...
4,B0001ZVGE8,JewelryWeb,"[Clothing, Shoes & Jewelry, Women, Jewelry, Ne...",5,[Baguette and Round Diamond Cross - 0.25 Carat...,,Baguette and Round Diamond Cross,https://www.amazon.com/dp/B0001ZVGE8,https://images-na.ssl-images-amazon.com/images...
5,B0002UU1OS,Pleaser,"[Clothing, Shoes & Jewelry, Women, Shoes, Boot...",5,[<b>Heel Height: Approx. 4 3/4'' Tall. Provoca...,$75.39 - $120.00,Pleaser Demonia Women's Slush-225 Boot,https://www.amazon.com/dp/B0002UU1OS,https://images-na.ssl-images-amazon.com/images...
6,B0002Z1J58,Pleaser,"[Clothing, Shoes & Jewelry, Women, Shoes, Boots]",4,[Shaft Height: Approx. 16 1/4'' Tall. Heel Hei...,$63.95 - $87.00,Pleaser Women's Electra-2020 Boot,https://www.amazon.com/dp/B0002Z1J58,https://images-na.ssl-images-amazon.com/images...
7,B00067TZ26,OSCAR DE LA RENTA,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",5,"[Flouncy and flirty, this chemise is gorgeous!...",,OSCAR DE LA RENTA Chemise,https://www.amazon.com/dp/B00067TZ26,https://images-na.ssl-images-amazon.com/images...
8,B0006Q6H1O,Freestyle,"[Clothing, Shoes & Jewelry, Women, Watches, Wr...",5,"[A brand-new, unused, and unworn item, in the ...",,FREESTYLE Tabu Women's Silver-tone watch with ...,https://www.amazon.com/dp/B0006Q6H1O,https://images-na.ssl-images-amazon.com/images...
9,B000783NRW,,"[Clothing, Shoes & Jewelry, Men, Clothing, Act...",5,"[Long sleeve crew neck pull over fleece, gener...",$42.95 - $47.95,Russell Athletic Men's Big & Tall Basic Crew N...,https://www.amazon.com/dp/B000783NRW,https://images-na.ssl-images-amazon.com/images...


In [9]:
# Load the raw review records; read_dataset also prints the row count.
review_dataset = read_dataset(spark, review_path)



Debug -- item dataset count: 32292099


                                                                                

In [10]:
# Keep only products whose `category` array contains both a 'Clothing' element and
# a 'Women' element. `F` is already imported in the notebook's import cell, so it
# is not re-imported here.
is_clothing = F.array_contains(F.col("category"), 'Clothing')
is_women = F.array_contains(F.col('category'), 'Women')
product_subset = product_dataset.where(is_clothing & is_women)

In [11]:
product_subset.limit(10).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B0001GSYVA,FALKE,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,[Incredibly soft sock made with skin-friendly ...,$15.75 - $24.00,Falke Women's Sensitive London Sock,https://www.amazon.com/dp/B0001GSYVA,https://images-na.ssl-images-amazon.com/images...
1,B00099Z6QU,Fitness Wear,"[Clothing, Shoes & Jewelry, Women, Clothing, A...",5,[Capris have contrast color piping. Elastic wa...,,Womens Sport Capri by Fitness Wear in Black wi...,https://www.amazon.com/dp/B00099Z6QU,https://images-na.ssl-images-amazon.com/images...
2,B000FCA43Q,Naturana,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",5,"[A gorgeous and very feminine underwired bra, ...",$5.05 - $34.99,Naturana Women's Seamless,https://www.amazon.com/dp/B000FCA43Q,https://images-na.ssl-images-amazon.com/images...
3,B000G0P2I4,,"[Clothing, Shoes & Jewelry, Women, Clothing, J...",5,[Lee Relaxed Fit jeans are cut to fit your nat...,,Lee Women's Relaxed Fit Straight Leg Jean,https://www.amazon.com/dp/B000G0P2I4,https://images-na.ssl-images-amazon.com/images...
4,B000I38P74,Maison Jules,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,,$39.99,Maison Jules Women's Polka Dot Crew-Neck Pullo...,https://www.amazon.com/dp/B000I38P74,https://images-na.ssl-images-amazon.com/images...
5,B000NWKOUQ,,"[Clothing, Shoes & Jewelry, Women, Clothing]",3,[Instant gratification without dieting! Praise...,,"NYDJ Women's Sarah 5 Pocket Bootcut Jeans, Bla...",https://www.amazon.com/dp/B000NWKOUQ,https://images-na.ssl-images-amazon.com/images...
6,B000Q60ZEE,Vanca Craft Japan,"[Clothing, Shoes & Jewelry, Novelty & More, Cl...",5,[The key chains for the cat-lovers of the worl...,,VANCA Craft Petit Mascot Leather Mini Animal K...,https://www.amazon.com/dp/B000Q60ZEE,https://images-na.ssl-images-amazon.com/images...
7,B000QWG2RM,,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",5,"[Invisible under even the clingiest dresses, t...",,Calvin Klein Women's Seamless Thong Panty,https://www.amazon.com/dp/B000QWG2RM,https://images-na.ssl-images-amazon.com/images...
8,B000U7RM36,National,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",5,[Our T-Back Front Hook Bra provides exceptiona...,,"National T-Back Bra, White, 46D",https://www.amazon.com/dp/B000U7RM36,https://images-na.ssl-images-amazon.com/images...
9,B000W6P8PO,,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",5,[Bulky» Our long underwear is anything but! Ma...,,Jockey Women's Long Underwear No Panty Line Pr...,https://www.amazon.com/dp/B000W6P8PO,https://images-na.ssl-images-amazon.com/images...


In [12]:
review_dataset.limit(10).toPandas()

Unnamed: 0,item_id,rating,timestamp,user_id
0,B00943ZPV8,4.0,1361923200,AKEK15M5CAVRU
1,B00943ZPV8,3.0,1360022400,A2O6V16I3DATXR
2,B00943ZPV8,5.0,1359331200,A245G62BIJ7WCN
3,B00943ZPV8,5.0,1359072000,A188O1Q8FRGOIF
4,B00943ZPV8,5.0,1357776000,AH0NF9YS81E1U
5,B00944CDWG,1.0,1374192000,A2DV54Y1RWKYLC
6,B00944CDWG,5.0,1370044800,AEI4FXKTQMF4D
7,B00944CDWG,4.0,1368144000,A29K6RN1I6FYPB
8,B00944CDWG,3.0,1367971200,A3KOG6FQUU1I21
9,B00944CDWG,5.0,1461456000,A20HN8RIH78NXR


In [13]:
review_dataset

DataFrame[item_id: string, rating: double, timestamp: bigint, user_id: string]

In [14]:
# Expose both DataFrames to Spark SQL. createOrReplaceTempView replaces the
# deprecated registerTempTable.
review_dataset.createOrReplaceTempView('review_dataset')
product_subset.createOrReplaceTempView('product_subset')

# Users with at least 20 distinct (item_id, timestamp) reviews on products that are
# in product_subset.
# NOTE(review): despite the `_5core` name, the threshold used here is 20 -- confirm
# which one is intended.
query ="""
select 
    ta.user_id
from
(
    select
        ta.user_id,
        count(distinct ta.item_id, ta.timestamp) as review_count
    from
        review_dataset ta
    join
        product_subset tb
    on 
        ta.item_id=tb.item_id
    group by ta.user_id
) ta
where ta.review_count >= 20
"""
user_5core = spark.sql(query)

In [15]:
user_5core.count()

                                                                                

9441

In [16]:
user_5core.limit(20).toPandas()

                                                                                

Unnamed: 0,user_id
0,A29RZ6MEYBCX57
1,A3FRGRLR6NPXX8
2,AT4TEQ216VX3K
3,A165P3MOJV3OVZ
4,A3OQ4I8U6TYCVE
5,A1N0KPRDV8M95
6,A125XVN95UDZWO
7,A3FMK5TW8HVBZZ
8,A1RNXMX5N153K
9,A321SOUJ80WDP4


In [17]:
# createOrReplaceTempView replaces the deprecated registerTempTable.
user_5core.createOrReplaceTempView('user_5core')

# All distinct review rows written by the selected users.
# NOTE(review): this joins against the full review_dataset, so reviews of items
# outside product_subset are included as well -- confirm that is intended.
query ="""
select distinct
    tb.*
from
    user_5core ta
join
    review_dataset tb
on ta.user_id=tb.user_id
"""
review_5core = spark.sql(query)

In [18]:
review_5core.count()

                                                                                

547472

In [19]:
review_5core.limit(20).toPandas()

                                                                                

Unnamed: 0,item_id,rating,timestamp,user_id
0,B0018R3A0G,4.0,1387238400,A120RH58WVY4W6
1,B005I72AA8,5.0,1345852800,A120RH58WVY4W6
2,B00BNREL1I,4.0,1394236800,A120RH58WVY4W6
3,B00BOHLDAO,4.0,1400025600,A120RH58WVY4W6
4,B0073JM9WI,5.0,1353888000,A120RH58WVY4W6
5,B0073LV4OU,5.0,1352073600,A120RH58WVY4W6
6,B00780YE7A,5.0,1427932800,A120RH58WVY4W6
7,B007HLCWQK,5.0,1427932800,A120RH58WVY4W6
8,B007V039FU,5.0,1416441600,A120RH58WVY4W6
9,B0084FZX14,3.0,1428537600,A120RH58WVY4W6


In [20]:
# createOrReplaceTempView replaces the deprecated registerTempTable.
review_5core.createOrReplaceTempView('review_5core')

# Distinct product rows from product_subset that appear in at least one selected review.
query ="""
select distinct
    tb.*
from
    review_5core ta
join
    product_subset tb
on ta.item_id=tb.item_id
"""
product_5core = spark.sql(query)

In [21]:
product_5core.limit(5).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B0008G21LM,My Michelle,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,,,My Michelle Juniors Floral Print Ruffle Tank,https://www.amazon.com/dp/B0008G21LM,https://images-na.ssl-images-amazon.com/images...
1,B000B85LQE,Warm Things,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",5,[Wrap yourself in the cozy comfort of the fine...,,Warm Things Quilted Down Bed Jacket,https://www.amazon.com/dp/B000B85LQE,https://images-na.ssl-images-amazon.com/images...
2,B000NCIGVK,,"[Clothing, Shoes & Jewelry, Women, Clothing, 9...",5,[Today's danskin enjoys status as the ultimate...,,Danskin Women's Revival Stovepipe Bootleg Pant,https://www.amazon.com/dp/B000NCIGVK,https://images-na.ssl-images-amazon.com/images...
3,B000TTTK94,,"[Clothing, Shoes & Jewelry, Women, Clothing, C...",5,"[This waterproof, insulated jacket has slimmin...",,Columbia Sportswear Women's Rosella Ridge Jacket,https://www.amazon.com/dp/B000TTTK94,https://images-na.ssl-images-amazon.com/images...
4,B000UB3JHK,,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,[Dickies women's short sleeve stretch oxford s...,,Dickies Women's Short Sleeve Stretch Oxford Shirt,https://www.amazon.com/dp/B000UB3JHK,https://images-na.ssl-images-amazon.com/images...


In [22]:
product_5core.count()

                                                                                

91157

In [23]:
# Down-sample to at most 10000 interactions. No ordering is applied before the
# limit; a random sample would need e.g. orderBy(F.rand()) first.
review_5core = review_5core.limit(10000).cache()

# Keep only the users that still occur in the sampled interactions.
sampled_user_ids = review_5core.select('user_id').distinct()
user_5core = sampled_user_ids.join(user_5core, on='user_id', how='inner').cache()

# Keep only the products that still occur in the sampled interactions.
sampled_item_ids = review_5core.select('item_id').distinct()
product_5core = sampled_item_ids.join(product_5core, on='item_id', how='inner').cache()
product_5core

DataFrame[item_id: string, brand: string, category: array<string>, category_levels: bigint, description: array<string>, price: string, title: string, url: string, image: string]

In [24]:
from pyspark.sql.types import ArrayType

def concat_list_field(dataset, array_join_sep=u'\u0001'):
    """Return ``dataset`` with every array-typed column joined into one string.

    Each column whose schema type is ``ArrayType`` is replaced, under the same
    name, by ``F.concat_ws(array_join_sep, column)``; other columns are left as is.
    """
    flattened = dataset
    for field in dataset.schema.fields:
        if not isinstance(field.dataType, ArrayType):
            continue
        flattened = flattened.withColumn(
            field.name, F.concat_ws(array_join_sep, F.col(field.name)))
    return flattened

# Join the array columns of all three tables into delimiter-separated strings.
review_5core, user_5core, product_5core = (
    concat_list_field(table)
    for table in (review_5core, user_5core, product_5core)
)

In [25]:
# Report the row count of each of the three tables.
for label, frame in (('review_5core', review_5core),
                     ('user_5core', user_5core),
                     ('product_5core', product_5core)):
    print(label + ': ', frame.count())

                                                                                

review_5core:  10000


                                                                                

user_5core:  5750




product_5core:  3866


                                                                                

In [26]:
# Point the three output paths at the `.small.parquet` locations. This rebinds
# user_path / item_path / interaction_path, so the writes below go to these
# locations rather than to the paths assigned in the configuration cell.
user_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_user.small.parquet'
item_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_item.small.parquet'
interaction_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_interaction.small.parquet'

In [27]:
product_5core.limit(100).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B005GVQ0D4,,"Clothing, Shoes & JewelryWomenClothingLeggi...",5,These cropped stretch leggings with slit hems ...,$18.05 - $18.12,Woman Within Plus Size Stretch Cotton Capri Le...,https://www.amazon.com/dp/B005GVQ0D4,https://images-na.ssl-images-amazon.com/images...
1,B005SW4YO8,,"Clothing, Shoes & JewelryWomenUniforms, Work...",5,Dickies misses 11 inch relaxed fit cotton carg...,$20.35 - $94.20,Dickies Women's 11-Inch Relaxed Cargo Short St...,https://www.amazon.com/dp/B005SW4YO8,https://images-na.ssl-images-amazon.com/images...
2,B0072LD9QW,ToBeInStyle,"Clothing, Shoes & JewelryWomenClothingLeggings",4,These Leggings that complete nearly any outfit...,$6.95 - $9.95,ToBeInStyle Women's Footless Elastic Legging,https://www.amazon.com/dp/B0072LD9QW,https://images-na.ssl-images-amazon.com/images...
3,B008QVAEDI,My Michelle,"Clothing, Shoes & JewelryWomenClothingTops,...",5,,,My Michelle Juniors Ruffle Tight Fit Cami,https://www.amazon.com/dp/B008QVAEDI,https://images-na.ssl-images-amazon.com/images...
4,B00BBW9EEE,,"Clothing, Shoes & JewelryWomenClothingTops,...",5,"100% cotton preshrunk jersey, high-density fab...",$2.60 - $14.99,Fruit of the Loom Womens 5 Oz. 100% Heavy Cott...,https://www.amazon.com/dp/B00BBW9EEE,https://images-na.ssl-images-amazon.com/images...
5,B00DHW2BIM,Soho Girls,"Clothing, Shoes & JewelryWomenClothingSkirt...",5,,,Soho Apparel Junior Ladies Floral Lace Overlay...,https://www.amazon.com/dp/B00DHW2BIM,https://images-na.ssl-images-amazon.com/images...
6,B00IVGHASU,,"Clothing, Shoes & JewelryWomenClothingDress...",5,This versatile high-low maxi offers an effortl...,$198.00,BCBGMAXAZRIA Women's Tara Tiered Asymmetrical-...,https://www.amazon.com/dp/B00IVGHASU,https://images-na.ssl-images-amazon.com/images...
7,B00KDK1HW2,,"Clothing, Shoes & JewelryWomenClothingActiv...",5,Today's Danskin enjoys status as the ultimate ...,$18.00,Danskin Women Side Tie Hot Short,https://www.amazon.com/dp/B00KDK1HW2,https://images-na.ssl-images-amazon.com/images...
8,B00KZZ8VPG,Kenneth Cole,"Clothing, Shoes & JewelryWomenClothingCoats...",5,,$89.99 - $109.99,Kenneth Cole New York Women's Chevron Down Coa...,https://www.amazon.com/dp/B00KZZ8VPG,https://images-na.ssl-images-amazon.com/images...
9,B00P8VODO0,v28,"Clothing, Shoes & JewelryWomenClothingSocks...",5,,$9.99,V28 Women Juniors 80s Eighty's Ribbed Leg Warm...,https://www.amazon.com/dp/B00P8VODO0,https://images-na.ssl-images-amazon.com/images...


In [28]:
# Write the user table to user_path as Parquet, overwriting existing output;
# repartition(1) collapses the data to a single partition before the write.
# CSV alternative, kept for reference:
# user_5core.repartition(1).write.option("header", True).csv(user_path, mode='overwrite')
user_5core.repartition(1).write.parquet(user_path, mode='overwrite')

                                                                                

In [29]:
# Write the item table to item_path as Parquet, overwriting existing output;
# repartition(1) collapses the data to a single partition before the write.
# CSV alternative, kept for reference:
# product_5core.repartition(1).write.option("header", True).csv(item_path, mode='overwrite')
product_5core.repartition(1).write.parquet(item_path, mode='overwrite')

                                                                                

In [30]:
# Write the interaction table to interaction_path as Parquet, overwriting existing
# output; repartition(1) collapses the data to a single partition before the write.
# CSV alternative, kept for reference:
# review_5core.repartition(1).write.option("header", True).csv(interaction_path, mode='overwrite')
review_5core.repartition(1).write.parquet(interaction_path, mode='overwrite')

In [31]:
!aws s3 ls s3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/

                           PRE amazon_fashion_interaction.csv/
                           PRE amazon_fashion_interaction.parquet/
                           PRE amazon_fashion_interaction.small.parquet/
                           PRE amazon_fashion_item.csv/
                           PRE amazon_fashion_item.parquet/
                           PRE amazon_fashion_item.small.parquet/
                           PRE amazon_fashion_user.csv/
                           PRE amazon_fashion_user.parquet/
                           PRE amazon_fashion_user.small.parquet/
2022-09-08 14:11:37 3247339254 amazon_fashion_product.json
2022-09-08 14:13:55 3059298908 amazon_fashion_review.json


In [32]:
stop_spark(spark)

Debug -- spark stop


22/09/26 18:25:38 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed (this is expected if the application is shutting down.)
