In [1]:
from datetime import datetime
from pyspark.sql import SparkSession

spark = (SparkSession.builder.appName("pyspark-rdd-demo-{}".format(datetime.today()))
        .master("spark://spark-master:7077")      
        .getOrCreate())

spark.sparkContext.getConf().getAll()

[('spark.app.id', 'app-20250104172114-0000'),
 ('spark.hadoop.fs.s3a.connection.ssl.enabled', 'false'),
 ('spark.jars',
  'file:///usr/local/spark-3.5.0-bin-hadoop3/jars/s3-2.29.43.jar,file:///usr/local/spark-3.5.0-bin-hadoop3/jars/aws-java-sdk-1.12.780.jar,file:///usr/local/spark-3.5.0-bin-hadoop3/jars/aws-java-sdk-bundle-1.12.262.jar,file:///usr/local/spark-3.5.0-bin-hadoop3/jars/delta-spark_2.12-3.2.1.jar,file:///usr/local/spark-3.5.0-bin-hadoop3/jars/delta-storage-3.2.1.jar,file:///usr/local/spark-3.5.0-bin-hadoop3/jars/hadoop-aws-3.3.4.jar,file:///usr/local/spark-3.5.0-bin-hadoop3/jars/postgresql-42.7.4.jar'),
 ('spark.hadoop.fs.s3a.access.key', 'minio'),
 ('spark.repl.local.jars',
  'file:///usr/local/spark-3.5.0-bin-hadoop3/jars/s3-2.29.43.jar,file:///usr/local/spark-3.5.0-bin-hadoop3/jars/aws-java-sdk-1.12.780.jar,file:///usr/local/spark-3.5.0-bin-hadoop3/jars/aws-java-sdk-bundle-1.12.262.jar,file:///usr/local/spark-3.5.0-bin-hadoop3/jars/delta-spark_2.12-3.2.1.jar,file:///usr/

In [32]:
users = spark.read.format("parquet").load("s3a://warehouse/gold/tiki/users.parquet")

In [3]:
sellers = spark.read.format("parquet").load("s3a://warehouse/silver/tiki/sellers.parquet")

In [5]:
sellers.toPandas()['store_level'].unique()

array(['OFFICIAL_STORE', 'NONE', 'TRUSTED_STORE'], dtype=object)

In [33]:
from pyspark.sql.functions import col, lit, when, from_unixtime, udf, explode
from pyspark.sql.types import IntegerType

In [5]:
products = spark.read.format("parquet").load("s3a://warehouse/gold/tiki/products.parquet")
# products = products.toPandas()
# products.info()


In [23]:
from pyspark.sql.functions import when, col, isnan

products = products.withColumn(
    "warranty_type",
    when(isnan(col("warranty_type")) | col("warranty_type").isNull(), "Không bảo hành")
    .otherwise(col("warranty_type"))
)

products = products.withColumn(
    "warranty_location",
    when(isnan(col("warranty_location")) | col("warranty_location").isNull(), "Không bảo hành")
    .otherwise(col("warranty_location"))
)
products = products.withColumn(
    "return_reason",
    when(col("return_reason") == "any_reason", "Bất cứ lý do gì")
    .when(col("return_reason") == "defective_product", "Sản phẩm hư hỏng")
    .when((col("return_reason") == "no_return") | col("return_reason").isNull(), "Không đổi trả")
    .otherwise(col("return_reason"))
)

# Hiển thị kết quả
products.toPandas()['return_reason'].value_counts()


return_reason
Bất cứ lý do gì     9455
Sản phẩm hư hỏng    1328
Không đổi trả        342
Name: count, dtype: int64

In [4]:
products = products.filsplna("Không bảo hành", subset=["warranty_type", "warranty_location"])
# products_spark_df = products_spark_df.fillna("no_return", subset=["return_reason"])
# products_spark_df = products_spark_df.fillna(0, subset=["quantity_sold"])

TypeError: DataFrame.fillna() got an unexpected keyword argument 'subset'

In [6]:
products_pd = products.toPandas()

# products_pd[products_pd['warranty_type'] == 'NaN']
products_pd['warranty_type'].value_counts()

warranty_type
Không bảo hành    7136
Điện tử           1528
Hóa đơn           1291
Phiếu bảo hành     774
Tem bảo hành       396
Name: count, dtype: int64

In [4]:
img_url = spark.read.format("parquet").load("s3a://warehouse/gold/tiki/images_url.parquet")
img_url = img_url.toPandas()
img_url.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69309 entries, 0 to 69308
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   product_id  69309 non-null  int64 
 1   seller_id   69309 non-null  int64 
 2   image_url   69309 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.6+ MB


In [5]:
img_url.head()

Unnamed: 0,product_id,seller_id,image_url
0,122012,1,https://salt.tikicdn.com/ts/product/24/85/1a/7...
1,122012,1,https://salt.tikicdn.com/media/catalog/product...
2,122012,1,https://salt.tikicdn.com/media/catalog/product...
3,122012,1,https://salt.tikicdn.com/media/catalog/product...
4,122012,1,https://salt.tikicdn.com/media/catalog/product...


In [35]:
users = users.toPandas()
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133909 entries, 0 to 133908
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   user_id       133909 non-null  int64  
 1   user_name     133054 non-null  object 
 2   avatar_url    133909 non-null  object 
 3   joined_day    133909 non-null  int32  
 4   joined_time   133869 non-null  object 
 5   total_review  133869 non-null  float64
 6   total_thank   133869 non-null  float64
dtypes: float64(2), int32(1), int64(1), object(3)
memory usage: 6.6+ MB


In [36]:
users['avatar_url'].value_counts()

avatar_url
//tiki.vn/assets/img/avatar.png                                                                                                                                                                                                                                       97664
http://s120.avatar.talk.zdn.vn/default                                                                                                                                                                                                                                   17
https://s120.avatar.talk.zdn.vn/default                                                                                                                                                                                                                                  11
https://scontent-iad3-1.xx.fbcdn.net/v/t31.0-1/c59.0.200.200a/p200x200/10733713_10150004552801937_4553731092814901385_o.jpg?_nc_cat=1&_nc_sid=12b3be&_nc_ohc=kbRb-3obZ7MAX9TZwXo&_nc_ht=s