In [22]:
#Start session Spark
from pyspark.sql import SparkSession
from pyspark import SparkContext
# Build (or reuse) the Spark session for this job; the BigQuery connector jar
# is pulled from the public spark-lib bucket.
spark = (
    SparkSession.builder
    .appName('clean_products_external')
    .config('spark.jars', 'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar')
    .getOrCreate()
)

# Turn on REPL eager evaluation for DataFrames in this session.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [23]:
##########################################################
#######Extract table external_products from BigQuery Staging ######
##########################################################

In [24]:
# Fully qualified name (dataset.table) of the staging table to read.
table_products = "becade_mgutierrez.stg_external_products"

# Read the staging table through the spark-bigquery connector.
stg_external_products = (
    spark.read
    .format("bigquery")
    .option("table", table_products)
    .load()
)

# Report how many rows came in, then print the inferred schema.
incoming_rows = stg_external_products.count()
print("lines incoming: " , incoming_rows)
stg_external_products.printSchema()

lines incoming:  9279
root
 |-- _airbyte_ab_id: string (nullable = true)
 |-- _airbyte_emitted_at: long (nullable = true)
 |-- app_sale_price: string (nullable = true)
 |-- app_sale_price_currency: string (nullable = true)
 |-- country: string (nullable = true)
 |-- evaluate_rate: string (nullable = true)
 |-- isbestseller: boolean (nullable = true)
 |-- isprime: boolean (nullable = true)
 |-- original_price: double (nullable = true)
 |-- product_detail_url: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_main_image_url: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- rowid: long (nullable = true)



In [25]:
from pyspark.sql.functions import col,when, regexp_extract , translate, regexp_replace, concat,lit
from pyspark.sql.types import IntegerType,DoubleType,DateType,StringType

In [26]:
# Keep only the columns this cleaning job works with.
raw_products = stg_external_products.select(
    'product_id',
    'country',
    'app_sale_price',
    'evaluate_rate',
    'isbestseller',
    'isprime',
    'app_sale_price_currency',
)

In [27]:
# Drop rows whose app_sale_price holds the literal text 'None'.
raw_products = raw_products.filter(col('app_sale_price') != 'None')

# Report the remaining row count.
remaining_rows = raw_products.count()
print("lines clean outgoing: " , remaining_rows)

lines clean outgoing:  8446


In [28]:
# Clean column evaluate_rate:
#   1. turn empty strings into null,
#   2. drop rows whose evaluate_rate is the literal text "None".
# Note: under SQL null semantics the `!= "None"` comparison is not true for
# null values, so the rows nulled in step 1 are filtered out as well.
df_raw_products = (
    raw_products
    .withColumn(
        "evaluate_rate",
        when(col("evaluate_rate") == "", None).otherwise(col("evaluate_rate")),
    )
    .where(col("evaluate_rate") != "None")
)

# Report the remaining row count.
print("lines clean outgoing: " , df_raw_products.count())

lines clean outgoing:  7929


In [29]:
# Remove rows that are exact duplicates across all selected columns.
df_raw_products = df_raw_products.distinct()

# Report the remaining row count.
deduplicated_rows = df_raw_products.count()
print("lines clean outgoing: " , deduplicated_rows)

lines clean outgoing:  6091


In [30]:
# Derive clean_rate from evaluate_rate: take the first "digit, '.' or ',', digit"
# token (e.g. "3.5" out of "3.5 out of 5 stars") and normalise ',' to '.'.
# regexp_extract yields an empty string when the pattern does not match.
rate_token = regexp_extract(col('evaluate_rate'), r'([0-9][\.\,][0-9])', 1)
df_clean_rate = df_raw_products.withColumn('clean_rate', translate(rate_token, ',', '.'))

# Preview a couple of rows.
df_clean_rate.show(2, truncate=True)

+----------+-------+--------------+------------------+------------+-------+-----------------------+----------+
|product_id|country|app_sale_price|     evaluate_rate|isbestseller|isprime|app_sale_price_currency|clean_rate|
+----------+-------+--------------+------------------+------------+-------+-----------------------+----------+
|B09KPDDFH6|     US|         14.27|3.5 out of 5 stars|        true|   true|                      $|       3.5|
|B09MGY11CG|     US|        255.00|3.5 out of 5 stars|        true|  false|                      $|       3.5|
+----------+-------+--------------+------------------+------------+-------+-----------------------+----------+
only showing top 2 rows



In [31]:
# Clean column app_sale_price: normalise separators, then split the price into
# an integer part (number_price) and a two-digit decimal part (decimal_price).
#
#   raw_price        : ',' replaced by '.', '￥' removed (translate deletes
#                      characters that have no counterpart in the replacement).
#   decimal_price    : trailing ".dd" if the value ends with exactly that,
#                      otherwise an empty string.
#   raw_number_price : the first number in raw_price, either a thousands-grouped
#                      value ("1.299", "12.999") or a plain run of digits
#                      ("5", "14", "1299").
#   number_price     : raw_number_price with the grouping dots stripped.
#
# The previous integer-part pattern `[0-9][\.][0-9]{3}|[0-9]{2,3}` required at
# least two digits and captured at most three, so "5.99" was read as "99"
# (-> 99.99), "1299.99" as "129" and "12,999.00" as "12". The pattern below
# accepts any digit count and any number of thousands groups.
df_raw_price = (
    df_clean_rate
    .withColumn('raw_price', translate(col('app_sale_price'), ',￥', '.'))
    .withColumn('decimal_price', regexp_extract(col('raw_price'), r'(\.[0-9]{2}$)', 1))
    .withColumn(
        'raw_number_price',
        regexp_extract(col('raw_price'), r'([0-9]{1,3}(?:\.[0-9]{3})+|[0-9]+)', 1),
    )
    .withColumn('number_price', translate(col('raw_number_price'), '.', ''))
)

# Preview the intermediate price columns.
df_raw_price.select(
    'product_id', 'country', 'app_sale_price', 'raw_price',
    'raw_number_price', 'number_price', 'decimal_price',
).show(3, truncate=False)

+----------+-------+--------------+---------+----------------+------------+-------------+
|product_id|country|app_sale_price|raw_price|raw_number_price|number_price|decimal_price|
+----------+-------+--------------+---------+----------------+------------+-------------+
|B09KPDDFH6|US     |14.27         |14.27    |14              |14          |.27          |
|B09MGY11CG|US     |255.00        |255.00   |255             |255         |.00          |
|B096VTTG7F|US     |142.89        |142.89   |142             |142         |.89          |
+----------+-------+--------------+---------+----------------+------------+-------------+
only showing top 3 rows



In [32]:
# Rebuild the price as text: number_price + decimal_price -> app_sale_price.
# The result replaces the original app_sale_price in the projected columns.
clean_price = concat(col('number_price'), col('decimal_price')).alias("app_sale_price")

df_clean_products_raw = df_raw_price.select(
    'product_id',
    'country',
    'isbestseller',
    'isprime',
    'app_sale_price_currency',
    'clean_rate',
    clean_price,
)

# Preview a few rows.
df_clean_products_raw.show(5)

+----------+-------+------------+-------+-----------------------+----------+--------------+
|product_id|country|isbestseller|isprime|app_sale_price_currency|clean_rate|app_sale_price|
+----------+-------+------------+-------+-----------------------+----------+--------------+
|B09KPDDFH6|     US|        true|   true|                      $|       3.5|         14.27|
|B09MGY11CG|     US|        true|  false|                      $|       3.5|        255.00|
|B096VTTG7F|     US|       false|   true|                      $|       3.7|        142.89|
|B092DYHW2K|     US|        true|   true|                      $|       3.9|         25.84|
|B08P5G3PY6|     US|       false|  false|                      $|       3.9|        589.99|
+----------+-------+------------+-------+-----------------------+----------+--------------+
only showing top 5 rows



In [43]:
# Add app_sale_price_us as a plain copy of app_sale_price (no conversion is applied here).
df_clean_products_raw = df_clean_products_raw.withColumn("app_sale_price_us", col("app_sale_price"))

In [44]:
# Rename the working columns to their product_* output names.
df_full_products = (
    df_clean_products_raw
    .withColumnRenamed('country', 'product_country')
    .withColumnRenamed('isbestseller', 'product_is_bestseller')
    .withColumnRenamed('isprime', 'product_is_prime')
    .withColumnRenamed('app_sale_price_currency', 'product_price_currency')
    .withColumnRenamed('clean_rate', 'product_rate')
    .withColumnRenamed('app_sale_price', 'product_price')
    .withColumnRenamed('app_sale_price_us', 'product_price_us')
)

# Preview the renamed products.
df_full_products.show(5)

+----------+---------------+---------------------+----------------+----------------------+------------+-------------+----------------+
|product_id|product_country|product_is_bestseller|product_is_prime|product_price_currency|product_rate|product_price|product_price_us|
+----------+---------------+---------------------+----------------+----------------------+------------+-------------+----------------+
|B09KPDDFH6|             US|                 true|            true|                     $|         3.5|        14.27|           14.27|
|B09MGY11CG|             US|                 true|           false|                     $|         3.5|       255.00|          255.00|
|B096VTTG7F|             US|                false|            true|                     $|         3.7|       142.89|          142.89|
|B092DYHW2K|             US|                 true|            true|                     $|         3.9|        25.84|           25.84|
|B08P5G3PY6|             US|                false|     

In [46]:
# Cast columns to their output types: prices and rate become doubles,
# the two boolean flags become strings.
df_full_products = (
    df_full_products
    .withColumn("product_price", col("product_price").cast(DoubleType()))
    .withColumn("product_rate", col("product_rate").cast(DoubleType()))
    .withColumn("product_is_bestseller", col("product_is_bestseller").cast(StringType()))
    .withColumn("product_is_prime", col("product_is_prime").cast(StringType()))
    .withColumn("product_price_us", col("product_price_us").cast(DoubleType()))
)

# Print the resulting schema.
df_full_products.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product_country: string (nullable = true)
 |-- product_is_bestseller: string (nullable = true)
 |-- product_is_prime: string (nullable = true)
 |-- product_price_currency: string (nullable = true)
 |-- product_rate: double (nullable = true)
 |-- product_price: double (nullable = true)
 |-- product_price_us: double (nullable = true)



In [47]:
# Preview the first 5 rows of the final products frame.
df_full_products.show(n=5)

+----------+---------------+---------------------+----------------+----------------------+------------+-------------+----------------+
|product_id|product_country|product_is_bestseller|product_is_prime|product_price_currency|product_rate|product_price|product_price_us|
+----------+---------------+---------------------+----------------+----------------------+------------+-------------+----------------+
|B09KPDDFH6|             US|                 true|            true|                     $|         3.5|        14.27|           14.27|
|B09MGY11CG|             US|                 true|           false|                     $|         3.5|        255.0|           255.0|
|B096VTTG7F|             US|                false|            true|                     $|         3.7|       142.89|          142.89|
|B092DYHW2K|             US|                 true|            true|                     $|         3.9|        25.84|           25.84|
|B08P5G3PY6|             US|                false|     

In [48]:
#####################################################################
########insert table pr_products to BigQuery Production #############
####################Products without price US #######################

In [49]:
# Append the cleaned products to the production table through the
# spark-bigquery connector, staging the data in the given GCS bucket.
# Mode is 'append': every execution of this cell adds the rows again.
(
    df_full_products.write
    .format("bigquery")
    .option("table", "becade_mgutierrez.pr_products_standard_price")
    .option("temporaryGcsBucket", "amazon_magdielgutierrez")
    .mode('append')
    .save()
)