In [1]:
!aws s3 ls s3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/

                           PRE amazon_fashion_interaction.100K.extra.parquet/
                           PRE amazon_fashion_interaction.100K.parquet/
                           PRE amazon_fashion_interaction.csv/
                           PRE amazon_fashion_interaction.parquet/
                           PRE amazon_fashion_interaction.small.parquet/
                           PRE amazon_fashion_item.100K.extra.parquet/
                           PRE amazon_fashion_item.100K.parquet/
                           PRE amazon_fashion_item.csv/
                           PRE amazon_fashion_item.parquet/
                           PRE amazon_fashion_item.small.parquet/
                           PRE amazon_fashion_user.100K.extra.parquet/
                           PRE amazon_fashion_user.100K.parquet/
                           PRE amazon_fashion_user.csv/
                           PRE amazon_fashion_user.parquet/
                           PRE amazon_fashion_user.small.parquet/
2022-09-08 

In [2]:
import os
import subprocess
import yaml
import argparse
import sys 
import importlib
import metaspore as ms
import pandas as pd
import warnings
import pyspark.sql.functions as F

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
def init_spark(local, app_name, batch_size, worker_count, server_count,
               worker_memory, server_memory, coordinator_memory, **kwargs):
    subprocess.run(['zip', '-r', os.getcwd() + '/python.zip', 'python'], cwd='../../../')
    spark_confs={
        "spark.kubernetes.namespace":"wanggen",
        "spark.network.timeout":"500",
        "spark.submit.pyFiles":"python.zip",
        "spark.ui.showConsoleProgress": "true",
        "spark.kubernetes.executor.deleteOnTermination":"true",
    }
    spark = ms.spark.get_session(
        local=local,
        app_name=app_name,
        batch_size=batch_size,
        worker_count=worker_count,
        server_count=server_count,
        worker_memory=worker_memory,
        server_memory=server_memory,
        coordinator_memory=coordinator_memory,
        spark_confs=spark_confs)
    sc = spark.sparkContext
    print('Debug -- spark init')
    print('Debug -- version:', sc.version)   
    print('Debug -- applicaitonId:', sc.applicationId)
    print('Debug -- uiWebUrl:', sc.uiWebUrl)
    return spark

def stop_spark(spark):
    print('Debug -- spark stop')
    spark.sparkContext.stop()

def read_dataset(spark, dataset_path):
    dataset = spark.read.json(dataset_path)
    print('Debug -- item dataset count:', dataset.count())
    return dataset

In [4]:
# spark
params = {}
params['local'] = False
params['app_name'] = 'Amazon Fashion Dataset'
params['batch_size'] = 512
params['worker_count'] = 4
params['worker_cpu'] = 4
params['server_count'] = 4
params['server_cpu'] = 4
params['worker_memory'] = '6G'
params['server_memory'] = '6G'
params['coordinator_memory'] = '2G'

# input files
product_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_product.json'
review_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_review.json'
item2image_path = 's3://dmetasoul-bucket/xwb/tmp/amazon_fashion_id2img.valid.tsv'
category_valid_path = 's3://dmetasoul-bucket/xwb/tmp/amazon_fashion_category_valid.parquet'
item_ids_aug_path = 's3://dmetasoul-bucket/xwb/tmp/item_ids.aug.tsv'

# output files
user_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_user.parquet'
item_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_item.parquet'
interaction_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_interaction.parquet'

spark = init_spark(**params)

updating: python/ (stored 0%)
updating: python/algos/ (stored 0%)
updating: python/algos/deepfm_net.py (deflated 69%)
updating: python/algos/feature/ (stored 0%)
updating: python/algos/feature/woe_encoder.py (deflated 64%)
updating: python/algos/feature/.ipynb_checkpoints/ (stored 0%)
updating: python/algos/feature/.ipynb_checkpoints/target_encoder-checkpoint.py (deflated 51%)
updating: python/algos/feature/sequential_encoder.py (deflated 60%)
updating: python/algos/feature/target_encoder.py (deflated 51%)
updating: python/algos/feature/neg_sampler.py (deflated 66%)
updating: python/algos/feature/__pycache__/ (stored 0%)
updating: python/algos/feature/__pycache__/woe_encoder.cpython-38.pyc (deflated 43%)
updating: python/algos/feature/__pycache__/target_encoder.cpython-38.pyc (deflated 36%)
updating: python/algos/feature/__pycache__/__init__.cpython-38.pyc (deflated 29%)
updating: python/algos/feature/__pycache__/sequential_encoder.cpython-38.pyc (deflated 53%)
updating: python/algos/f

22/10/13 18:25:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Debug -- spark init
Debug -- version: 3.1.2
Debug -- applicaitonId: spark-application-1665656716536
Debug -- uiWebUrl: http://172.16.0.156:4040


In [5]:
product_dataset = read_dataset(spark, product_path)
product_dataset = product_dataset.distinct()



Debug -- item dataset count: 2685059


                                                                                

In [6]:
product_dataset.printSchema()

root
 |-- brand: string (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category_levels: long (nullable = true)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- image: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- price: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)



In [7]:
item2image_df = spark.read.option("delimiter", "\t").csv(item2image_path)
item2image_df = item2image_df.withColumnRenamed('_c0', 'item_id')
item2image_df = item2image_df.withColumnRenamed('_c1', 'image')
item2image_df = item2image_df.distinct()
item2image_df.printSchema()
item2image_df.show(20, False)

root
 |-- item_id: string (nullable = true)
 |-- image: string (nullable = true)





+----------+----------------------------------------------------------------+
|item_id   |image                                                           |
+----------+----------------------------------------------------------------+
|7890000286|https://images-na.ssl-images-amazon.com/images/I/41ZTltTw8HL.jpg|
|9830030296|https://images-na.ssl-images-amazon.com/images/I/413guZeLSjL.jpg|
|B0000731JZ|https://images-na.ssl-images-amazon.com/images/I/31VOsl2X9JL.jpg|
|B0000E09PI|https://images-na.ssl-images-amazon.com/images/I/515sEqEExkL.jpg|
|B00023T290|https://images-na.ssl-images-amazon.com/images/I/41jPArV9aFL.jpg|
|B00024WC7S|https://images-na.ssl-images-amazon.com/images/I/41vs4F7x-IL.jpg|
|B00024WLAQ|https://images-na.ssl-images-amazon.com/images/I/31Hcek6lg0L.jpg|
|B00024WOTY|https://images-na.ssl-images-amazon.com/images/I/41DX2nY6FfL.jpg|
|B00027MV3A|https://images-na.ssl-images-amazon.com/images/I/41BJMYXA8RL.jpg|
|B00028SNNQ|https://images-na.ssl-images-amazon.com/images/I/31M

                                                                                

In [8]:
category_valid = spark.read.parquet(category_valid_path)
category_valid.printSchema()
category_valid.show(20, False)

root
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- category_path: string (nullable = true)
 |-- valid_level: long (nullable = true)

+------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------+-----------+
|category                                                                                        |category_path                                                                                 |valid_level|
+------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------+-----------+
|[Clothing, Shoes & Jewelry, Luggage & Travel Gear, Umbrellas, Folding Umbrellas]                |Clothing, Shoes & Jewelry->Luggage & Travel Gear->Umbrellas->Folding Umbrellas                |3         

                                                                                

In [9]:
product_dataset = product_dataset.drop('image')
product_dataset = product_dataset.join(item2image_df, on='item_id', how='leftouter')

product_dataset = product_dataset.filter(F.col("image").isNotNull())
product_dataset = product_dataset.filter(F.col("image") != '')
product_dataset = product_dataset.filter(F.col("title") != '')
product_dataset = product_dataset.filter(F.length(F.col("title")) <= 300)
product_dataset = product_dataset.filter(F.length(F.col("title")) >= 5)

product_dataset = product_dataset.withColumn('category_path', F.concat_ws('->', 'category'))

product_dataset = category_valid.filter(F.col('valid_level') > 0).select('category_path').distinct()\
                    .join(product_dataset, on='category_path', how='inner').drop('category_path')

product_dataset.limit(20).toPandas()
print(product_dataset.count())




1123917


                                                                                

In [10]:
# product_dataset.filter(F.col('item_id') == 'B018448KC8').show(1,False)

In [11]:
review_dataset = read_dataset(spark, review_path)



Debug -- item dataset count: 32292099


                                                                                

In [12]:
import pyspark.sql.functions as F
product_subset = product_dataset.where(F.array_contains(F.col("category"), 'Clothing') & 
                                       F.array_contains(F.col('category'), 'Women'))

In [13]:
product_subset.limit(10).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B0001GSYVA,FALKE,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,[Incredibly soft sock made with skin-friendly ...,$15.75 - $24.00,Falke Women's Sensitive London Sock,https://www.amazon.com/dp/B0001GSYVA,https://images-na.ssl-images-amazon.com/images...
1,B000G0P2I4,,"[Clothing, Shoes & Jewelry, Women, Clothing, J...",5,[Lee Relaxed Fit jeans are cut to fit your nat...,,Lee Women's Relaxed Fit Straight Leg Jean,https://www.amazon.com/dp/B000G0P2I4,https://images-na.ssl-images-amazon.com/images...
2,B000I38P74,Maison Jules,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,,$39.99,Maison Jules Women's Polka Dot Crew-Neck Pullo...,https://www.amazon.com/dp/B000I38P74,https://images-na.ssl-images-amazon.com/images...
3,B0010ESQXS,Cashmere Boutique,"[Clothing, Shoes & Jewelry, Women, Clothing, C...",5,"[Luxurious, soft and really warm, this pea coa...",$229.00,Women's Cashmere Pea Coat,https://www.amazon.com/dp/B0010ESQXS,https://images-na.ssl-images-amazon.com/images...
4,B0014GY2H6,Devon & Jones,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,"[Refined, feminine, lightweight warmth in 86% ...",$17.49 - $28.44,Devon & Jones Pink Everyday Cardigan Sweater D...,https://www.amazon.com/dp/B0014GY2H6,https://images-na.ssl-images-amazon.com/images...
5,B0016SJP9W,Bigmansland,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,,$21.99,"Bigmansland Ladies Short Sleeve Easy Care, Soi...",https://www.amazon.com/dp/B0016SJP9W,https://images-na.ssl-images-amazon.com/images...
6,B001ANMXXI,Principle Business Enterprises,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",5,[Terries Slip Resistant Socks not only provide...,$6.52,Terries Slip Resistant Socks XX-LARGE - DOUBLE...,https://www.amazon.com/dp/B001ANMXXI,https://images-na.ssl-images-amazon.com/images...
7,B001B16M7M,Sugar Lips,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,[The original SUGARLIPS ribbed seamless tank t...,$14.95,Sugarlips Women's Misses Original Seamless Rib...,https://www.amazon.com/dp/B001B16M7M,https://images-na.ssl-images-amazon.com/images...
8,B001CL8JXQ,,"[Clothing, Shoes & Jewelry, Women, Clothing, C...",5,,,AK Anne Klein Women's Single Breasted Swing Coat,https://www.amazon.com/dp/B001CL8JXQ,https://images-na.ssl-images-amazon.com/images...
9,B001JBZ1MQ,Harriton,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,,$19.04 - $25.81,"Harriton Ladies' Long Sleeve Twill Shirt, White",https://www.amazon.com/dp/B001JBZ1MQ,https://images-na.ssl-images-amazon.com/images...


In [14]:
review_dataset.limit(10).toPandas()

Unnamed: 0,item_id,rating,timestamp,user_id
0,B002OB4234,5.0,1408492800,A3CWY12FRVB32X
1,B002OB4234,5.0,1408492800,A2A5EQASDHE46A
2,B002OB4234,5.0,1407888000,AGPF7572AYJL7
3,B002OB4234,4.0,1407715200,A161SE7XSZ6I7E
4,B002OB4234,4.0,1407715200,A1AG4H4O7U84KM
5,B002OB4234,5.0,1407715200,A255NTZA63ZYVU
6,B002OB4234,4.0,1407715200,AN12CK1QJDO5V
7,B002OB4234,5.0,1407628800,A116OI9U0XS14H
8,B002OF5AOU,5.0,1371513600,A146IBKVP081TC
9,B002OF5AOU,3.0,1370649600,A36V6MNBY07A5C


In [15]:
review_dataset

DataFrame[item_id: string, rating: double, timestamp: bigint, user_id: string]

In [16]:
review_dataset.registerTempTable('review_dataset') 
product_subset.registerTempTable('product_subset')
query ="""
select 
    ta.user_id
from
(
    select
        ta.user_id,
        count(distinct ta.item_id, ta.timestamp) as review_count
    from
        review_dataset ta
    join
        product_subset tb
    on 
        ta.item_id=tb.item_id
    group by ta.user_id
) ta
where ta.review_count >= 20
"""
user_5core = spark.sql(query)

In [17]:
user_5core.count()

                                                                                

1212

In [18]:
user_5core.limit(20).toPandas()

                                                                                

Unnamed: 0,user_id
0,AENSGPP7TSU4V
1,A3KRRVHYFDDU59
2,A165P3MOJV3OVZ
3,A1N0KPRDV8M95
4,A8KBJDU553RF2
5,A6H4H1E3OIIOT
6,A3HWECWIGPTN67
7,A159IR0ZIDS388
8,A2LFM653MMEC9L
9,A3EBPHKGHMBPPI


In [19]:
user_5core.registerTempTable('user_5core')

query ="""
select distinct
    tb.*
from
    user_5core ta
join
    review_dataset tb
on ta.user_id=tb.user_id
"""
review_5core = spark.sql(query)

In [20]:
review_5core.count()

                                                                                

105155

In [21]:
review_5core.limit(20).toPandas()

                                                                                

Unnamed: 0,item_id,rating,timestamp,user_id
0,B009MBQYVW,3.0,1435190400,A3KRRVHYFDDU59
1,B009Z1S6WY,5.0,1521417600,A3KRRVHYFDDU59
2,B00ATUWZPO,3.0,1444003200,A3KRRVHYFDDU59
3,B00BCRKVNQ,5.0,1444003200,A3KRRVHYFDDU59
4,B00BGVJUYY,5.0,1454457600,A3KRRVHYFDDU59
5,B00BMTT40U,5.0,1473897600,A3KRRVHYFDDU59
6,B00BNT7D20,4.0,1444003200,A3KRRVHYFDDU59
7,B00BU96BJO,5.0,1439942400,A3KRRVHYFDDU59
8,B00C01MI3E,4.0,1436400000,A3KRRVHYFDDU59
9,B01CGV1FTK,5.0,1527552000,A3KRRVHYFDDU59


In [22]:
review_5core.registerTempTable('review_5core')

query ="""
select distinct
    tb.*
from
    review_5core ta
join
    product_subset tb
on ta.item_id=tb.item_id
"""
product_5core = spark.sql(query)

In [23]:
product_5core.limit(5).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B0069J1Y2Y,,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,,$21.59 - $26.99,PattyBoutik Women's Off Shoulder Long Sleeve Top,https://www.amazon.com/dp/B0069J1Y2Y,https://images-na.ssl-images-amazon.com/images...
1,B006MQR4QY,,"[Clothing, Shoes & Jewelry, Women, Clothing, C...",5,,,AK Anne Klein Women's Ruffle Trench Coat,https://www.amazon.com/dp/B006MQR4QY,https://images-na.ssl-images-amazon.com/images...
2,B006UCPQAQ,,"[Clothing, Shoes & Jewelry, Women, Clothing, D...",5,,,"HE02713BK10, Black, 8US, Ever Pretty Chic Lace...",https://www.amazon.com/dp/B006UCPQAQ,https://images-na.ssl-images-amazon.com/images...
3,B0078LJ5KU,,"[Clothing, Shoes & Jewelry, Women, Clothing, T...",5,,,XOXO Juniors Smock Bottom Embellished Printed ...,https://www.amazon.com/dp/B0078LJ5KU,https://images-na.ssl-images-amazon.com/images...
4,B007P5S2JY,,"[Clothing, Shoes & Jewelry, Women, Clothing, P...",5,[Self fabric waistband with smooth interior el...,,PUMA Women's Velour Pant,https://www.amazon.com/dp/B007P5S2JY,https://images-na.ssl-images-amazon.com/images...


In [24]:
product_5core.count()

                                                                                

16883

In [25]:
'''
import mimetypes, urllib
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf

def is_url_image(url):    
    mimetype,encoding = mimetypes.guess_type(url)
    return (mimetype and mimetype.startswith('image'))

def check_url(url):
    """Returns True if the url returns a response code between 200-300,
       otherwise return False.
    """
    try:
        headers = {
            "Range": "bytes=0-10",
            "User-Agent": "MyTestAgent",
            "Accept": "*/*"
        }

        req = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(req)
        print('response.code: ', response.code)
        return response.code in range(200, 209)
    except Exception as e:
        print('exception: ', e)
        return False

is_image_and_ready_udf = udf(lambda image: is_url_image(image) and check_url(image), BooleanType())


review_5core = review_5core.limit(10000)

product_5core = review_5core.select('item_id').distinct().join(product_5core, on='item_id', how='inner')
product_5core = product_5core.filter(is_image_and_ready_udf('image'))
product_5core.cache()

review_5core = product_5core.select('item_id').distinct().join(review_5core, on='item_id', how='inner')
review_5core.cache()

user_5core = review_5core.select('user_id').distinct().join(user_5core, on='user_id', how='inner')
user_5core.cache()
'''

'\nimport mimetypes, urllib\nfrom pyspark.sql.types import BooleanType\nfrom pyspark.sql.functions import udf\n\ndef is_url_image(url):    \n    mimetype,encoding = mimetypes.guess_type(url)\n    return (mimetype and mimetype.startswith(\'image\'))\n\ndef check_url(url):\n    """Returns True if the url returns a response code between 200-300,\n       otherwise return False.\n    """\n    try:\n        headers = {\n            "Range": "bytes=0-10",\n            "User-Agent": "MyTestAgent",\n            "Accept": "*/*"\n        }\n\n        req = urllib.request.Request(url, headers=headers)\n        response = urllib.request.urlopen(req)\n        print(\'response.code: \', response.code)\n        return response.code in range(200, 209)\n    except Exception as e:\n        print(\'exception: \', e)\n        return False\n\nis_image_and_ready_udf = udf(lambda image: is_url_image(image) and check_url(image), BooleanType())\n\n\nreview_5core = review_5core.limit(10000)\n\nproduct_5core = re

In [26]:
item_ids_aug_df = spark.read.option("delimiter", "\t").csv(item_ids_aug_path)
item_ids_aug_df = item_ids_aug_df.withColumnRenamed('_c0', 'item_id')
item_ids_aug_df = item_ids_aug_df.withColumnRenamed('_c1', 'extra_item_id')
item_ids_aug_df = item_ids_aug_df.distinct()
item_ids_aug_df.printSchema()
item_ids_aug_df.show(20, False)

root
 |-- item_id: string (nullable = true)
 |-- extra_item_id: string (nullable = true)

+----------+-------------+
|item_id   |extra_item_id|
+----------+-------------+
|B0132JUHV2|B01DNUD5I2   |
|B015ISU4KO|B01E4WUADQ   |
|B016A5VW4Q|B01FNUPOF2   |
|B016Q0076O|B013IUBXFE   |
|B016ZDQEWI|B00VS4B1VY   |
|B018JABMHW|B00R4TJJOS   |
|B019IBMJKU|B01A4CDCLC   |
|B01B4M40D0|B00LO4N7LK   |
|B009FZA25Y|B00W3HY108   |
|B00MN7REHA|B0194ZDORM   |
|B00OB8UTQW|B01A9RP7WY   |
|B00PV6UUVM|B01E7QKE5S   |
|B00VYG4PZK|B013W7M3XE   |
|B00XJQ4QAW|B00YV59XKI   |
|B013GAHVVG|B00JGVAR34   |
|B01B29G3V2|B01GU62EJM   |
|B01CSXQ82A|B01FQWMFKO   |
|B01CZ2F7QM|B00LO4N7LK   |
|B00JDVH7QW|B01GGR0ICU   |
|B00K3B6SX4|B018QC4HHU   |
+----------+-------------+
only showing top 20 rows



In [27]:
# review_5core = review_5core.orderBy(F.rand()).limit(10000)
review_5core = review_5core.limit(100000)

In [28]:
from time import time 

extra_review = review_5core.join(item_ids_aug_df, on='item_id', how='inner')\
                .drop('item_id', 'timestamp')\
                .withColumnRenamed('extra_item_id', 'item_id')\
                .withColumn('timestamp', F.lit(int(time())))\
                .select(*review_5core.columns)
print('extra_review count: ', extra_review.count())

review_5core = review_5core.union(extra_review)



extra_review count:  52827


                                                                                

In [29]:
user_5core = review_5core.select('user_id').distinct().join(user_5core, on='user_id', how='inner')
user_5core.cache()

product_5core = review_5core.select('item_id').distinct().join(product_5core, on='item_id', how='inner')
product_5core.cache()

review_5core = review_5core.join(product_5core.select('item_id').distinct(), on='item_id', how='inner').select(*review_5core.columns)
review_5core.cache()

DataFrame[item_id: string, rating: double, timestamp: bigint, user_id: string]

In [30]:
from pyspark.sql.types import ArrayType

def concat_list_field(dataset, array_join_sep=u'\u0001'):
    array_cols = [f.name for f in dataset.schema.fields if isinstance(f.dataType, ArrayType)]
    for array_col in array_cols:
        dataset = dataset.withColumn(array_col, F.concat_ws(array_join_sep, F.col(array_col)))
    return dataset

review_5core = concat_list_field(review_5core)
user_5core = concat_list_field(user_5core)
product_5core = concat_list_field(product_5core)

In [31]:
print('review_5core: ', review_5core.count())
print('user_5core: ', user_5core.count())
print('product_5core: ', product_5core.count())

                                                                                ]

review_5core:  84611


                                                                                

user_5core:  1210
product_5core:  16283


                                                                                

In [32]:
user_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_user.100K.extra.parquet'
item_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_item.100K.extra.parquet'
interaction_path = 's3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/amazon_fashion_interaction.100K.extra.parquet'


In [33]:
product_5core.limit(10).toPandas()

                                                                                

Unnamed: 0,item_id,brand,category,category_levels,description,price,title,url,image
0,B0069J1Y2Y,,"Clothing, Shoes & JewelryWomenClothingTops,...",5,,$21.59 - $26.99,PattyBoutik Women's Off Shoulder Long Sleeve Top,https://www.amazon.com/dp/B0069J1Y2Y,https://images-na.ssl-images-amazon.com/images...
1,B006MQR4QY,,"Clothing, Shoes & JewelryWomenClothingCoats...",5,,,AK Anne Klein Women's Ruffle Trench Coat,https://www.amazon.com/dp/B006MQR4QY,https://images-na.ssl-images-amazon.com/images...
2,B0078LJ5KU,,"Clothing, Shoes & JewelryWomenClothingTops,...",5,,,XOXO Juniors Smock Bottom Embellished Printed ...,https://www.amazon.com/dp/B0078LJ5KU,https://images-na.ssl-images-amazon.com/images...
3,B007P5S2JY,,"Clothing, Shoes & JewelryWomenClothingPants...",5,Self fabric waistband with smooth interior ela...,,PUMA Women's Velour Pant,https://www.amazon.com/dp/B007P5S2JY,https://images-na.ssl-images-amazon.com/images...
4,B007TUODIE,,"Clothing, Shoes & JewelryWomenClothingDress...",5,,,MYNE Women's Gemma Dress,https://www.amazon.com/dp/B007TUODIE,https://images-na.ssl-images-amazon.com/images...
5,B008EYPQFI,,"Clothing, Shoes & JewelryWomenClothingJeans",4,Look one size smaller in this Not Your Daughte...,$37.81 - $158.00,NYDJ Women's Petite Size Alina Legging Jean,https://www.amazon.com/dp/B008EYPQFI,https://images-na.ssl-images-amazon.com/images...
6,B008VO5UES,D.E.P.T.,"Clothing, Shoes & JewelryWomenClothingCoats...",5,The classic military inspired coat is updated ...,,D.E.P.T. Women's Fabulous Coat,https://www.amazon.com/dp/B008VO5UES,https://images-na.ssl-images-amazon.com/images...
7,B009O90116,,"Clothing, Shoes & JewelryWomenClothingTops,...",5,,,Southpole Juniors' Lightweight Lace-Fabric Hen...,https://www.amazon.com/dp/B009O90116,https://images-na.ssl-images-amazon.com/images...
8,B00A85Z7MI,G2 Chic,"Clothing, Shoes & JewelryWomenClothingTops,...",5,,,G2 Chic Women's Crochet Accent Cape Loose Top,https://www.amazon.com/dp/B00A85Z7MI,https://images-na.ssl-images-amazon.com/images...
9,B00AEVHJZY,G2 Chic,"Clothing, Shoes & JewelryWomenClothingTops,...",5,"Round neck, oversized, quarter-sleeves, bumpy ...",,G2 Chic Women's Lightweight Solid Patterned La...,https://www.amazon.com/dp/B00AEVHJZY,https://images-na.ssl-images-amazon.com/images...


In [34]:
# user_5core.repartition(1).write.option("header", True).csv(user_path, mode='overwrite')
user_5core.repartition(1).write.parquet(user_path, mode='overwrite')

                                                                                

In [35]:
# product_5core.repartition(1).write.option("header", True).csv(item_path, mode='overwrite')
product_5core.repartition(1).write.parquet(item_path, mode='overwrite')

                                                                                

In [36]:
# review_5core.repartition(1).write.option("header", True).csv(interaction_path, mode='overwrite')
review_5core.repartition(1).write.parquet(interaction_path, mode='overwrite')

                                                                                

In [37]:
!aws s3 ls s3://dmetasoul-bucket/sass/ecommerce/amazonfashion/dataset/

                           PRE amazon_fashion_interaction.100K.extra.parquet/
                           PRE amazon_fashion_interaction.100K.parquet/
                           PRE amazon_fashion_interaction.csv/
                           PRE amazon_fashion_interaction.parquet/
                           PRE amazon_fashion_interaction.small.parquet/
                           PRE amazon_fashion_item.100K.extra.parquet/
                           PRE amazon_fashion_item.100K.parquet/
                           PRE amazon_fashion_item.csv/
                           PRE amazon_fashion_item.parquet/
                           PRE amazon_fashion_item.small.parquet/
                           PRE amazon_fashion_user.100K.extra.parquet/
                           PRE amazon_fashion_user.100K.parquet/
                           PRE amazon_fashion_user.csv/
                           PRE amazon_fashion_user.parquet/
                           PRE amazon_fashion_user.small.parquet/
2022-09-08 

In [38]:
# product_5core.filter(F.col('item_id') == 'B018448KC8').show(1,False)

In [39]:
stop_spark(spark)

Debug -- spark stop


22/10/13 18:40:28 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed (this is expected if the application is shutting down.)
