In [4]:
#Start session Spark
from pyspark.sql import SparkSession
from pyspark import SparkContext
spark = SparkSession.builder \
  .appName('products_operations') \
  .config('spark.jars', 'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar') \
  .getOrCreate()

spark.conf.set("spark.sql.repl.eagerEval.enabled",True)

In [5]:
#name table pr_products_standard_price
table_standar_price = "becade_mgutierrez.pr_products_standard_price"

#load table
pr_products_price = spark.read \
  .format("bigquery") \
  .option("table", table_standar_price) \
  .load()

#show incoming lines
print("lines incoming: " , pr_products_price.count())

#show schema
pr_products_price.printSchema()

lines incoming:  9560
root
 |-- product_id: string (nullable = true)
 |-- product_is_bestseller: string (nullable = true)
 |-- product_is_prime: string (nullable = true)
 |-- product_price_currency: string (nullable = true)
 |-- product_rate: double (nullable = true)
 |-- product_price: double (nullable = true)
 |-- product_country: string (nullable = true)
 |-- product_price_us: double (nullable = true)



In [6]:
from pyspark.sql.functions import mean, countDistinct, min, max, first, last,col,avg


In [28]:
#calculated avg product_price_us && count product_country
df_avg_price=pr_products_price.select('product_id','product_price_us','product_country') \
        .groupBy('product_id') \
        .agg(mean('product_price_us').alias('product_avg_price_us'),countDistinct('product_country').alias('country_count')).sort('country_count', ascending=False)
#Show row products   
df_avg_price.show(3)


+----------+--------------------+-------------+
|product_id|product_avg_price_us|country_count|
+----------+--------------------+-------------+
|B00MNV8E0C|   24.73453126271878|            7|
|B007B9NV8Q|  20.634105088517593|            7|
|B00X4SCCFG|  17.748030361843725|            6|
+----------+--------------------+-------------+
only showing top 3 rows



In [8]:
#################################################################################
########Insert table pr_products_avg_price  to BigQuery Production ##############
#################################################################################

In [6]:
df_avg_price.write \
  .format("bigquery") \
  .option("table","becade_mgutierrez.pr_products_avg_price") \
  .option("temporaryGcsBucket", "amazon_magdielgutierrez") \
  .mode('overwrite') \
  .save()

In [9]:
#################################################################################
#################################################################################

In [10]:
#operations min_price && max_price
df_price_range=pr_products_price.select('product_id','product_country','product_price_us') \
        .groupBy('product_id') \
        .agg(min('product_price_us').alias('product_min_price'),first('product_country').alias('country_mix_price'), \
            max('product_price_us').alias('product_max_price'),last('product_country').alias('country_max_price'))         
#Show row products         
df_price_range.show(5)

+----------+-----------------+-----------------+-----------------+-----------------+
|product_id|product_min_price|country_mix_price|product_max_price|country_max_price|
+----------+-----------------+-----------------+-----------------+-----------------+
|1400223512|            15.49|               US|            15.49|               US|
|1936096862|            11.93|               US|            11.93|               US|
|9804370085|            14.99|               US|            14.99|               US|
|B003ZJ8MIU|            28.99|               US|            28.99|               US|
|B005G9E9AW|            48.48|               US|            48.48|               US|
+----------+-----------------+-----------------+-----------------+-----------------+
only showing top 5 rows



In [17]:
#InnerJoin
df_full_ranges = df_avg_price.alias('A').join(df_price_range.alias('B'), col('A.product_id') == col('B.product_id'), "inner") 

#Select columns
df_full_ranges= df_full_ranges.select('A.product_id','A.country_count','A.product_avg_price_us','B.product_max_price','B.country_max_price','B.product_min_price','B.country_mix_price') \
                 .sort('A.product_id', ascending=False)

#Show row mergedf_merge_rows
df_full_ranges.show(2)

#Show schema
df_full_ranges.printSchema()

+----------+-------------+--------------------+-----------------+-----------------+-----------------+-----------------+
|product_id|country_count|product_avg_price_us|product_max_price|country_max_price|product_min_price|country_mix_price|
+----------+-------------+--------------------+-----------------+-----------------+-----------------+-----------------+
|B09S5G7BXW|            1|                 0.0|              0.0|               US|              0.0|               US|
|B09S2RQ19K|            1|               99.99|            99.99|               US|            99.99|               US|
+----------+-------------+--------------------+-----------------+-----------------+-----------------+-----------------+
only showing top 2 rows

root
 |-- product_id: string (nullable = true)
 |-- country_count: long (nullable = false)
 |-- product_avg_price_us: double (nullable = true)
 |-- product_max_price: double (nullable = true)
 |-- country_max_price: string (nullable = true)
 |-- product_m

In [30]:
#List product test
pr_products_price.filter(pr_products_price.product_id == "B00MNV8E0C").show(truncate=False)

+----------+---------------------+----------------+----------------------+------------+-------------+---------------+------------------+
|product_id|product_is_bestseller|product_is_prime|product_price_currency|product_rate|product_price|product_country|product_price_us  |
+----------+---------------------+----------------+----------------------+------------+-------------+---------------+------------------+
|B00MNV8E0C|true                 |false           |                      |4.3         |1146.0       |JP             |10.732891667660974|
|B00MNV8E0C|true                 |false           |€                     |4.6         |76.76        |DE             |87.67501307815138 |
|B00MNV8E0C|true                 |true            |₹                     |4.6         |1179.0       |IN             |15.909874809931647|
|B00MNV8E0C|true                 |false           |€                     |4.6         |13.79        |IT             |15.750891484467267|
|B00MNV8E0C|false                |true   

In [31]:
#TEST PRODUCT
df_full_ranges.filter(df_full_ranges.product_id == "B00MNV8E0C").show(truncate=False)

+----------+-------------+--------------------+-----------------+-----------------+------------------+-----------------+
|product_id|country_count|product_avg_price_us|product_max_price|country_max_price|product_min_price |country_mix_price|
+----------+-------------+--------------------+-----------------+-----------------+------------------+-----------------+
|B00MNV8E0C|7            |24.73453126271878   |87.67501307815138|US               |10.732891667660974|JP               |
+----------+-------------+--------------------+-----------------+-----------------+------------------+-----------------+



In [32]:
#################################################################################
########Insert table pr_products_range_price to BigQuery Production #############
#################################################################################

In [16]:
df_full_ranges.write \
  .format("bigquery") \
  .option("table","becade_mgutierrez.pr_products_range_price") \
  .option("temporaryGcsBucket", "amazon_magdielgutierrez") \
  .mode('overwrite') \
  .save()

In [36]:
#################################################################################
#################################################################################

In [37]:
#operations avg evaluate_rate products
df_product_rate=pr_products_price.select('product_id','product_country','product_rate') \
        .groupBy('product_id') \
        .agg(avg('product_rate').alias('product_avg_rate'),countDistinct('product_country').alias('country_count'))         
        
df_product_rate.show(2)


+----------+----------------+-------------+
|product_id|product_avg_rate|country_count|
+----------+----------------+-------------+
|B08J3QQ11H|             4.6|            1|
|9804370085|             4.8|            1|
+----------+----------------+-------------+
only showing top 2 rows



In [38]:
##############################################################################
########Insert table pr_product_avg_rate to BigQuery Production ##############
##############################################################################

In [39]:
df_product_rate.write \
  .format("bigquery") \
  .option("table","becade_mgutierrez.pr_product_avg_rate") \
  .option("temporaryGcsBucket", "amazon_magdielgutierrez") \
  .mode('overwrite') \
  .save()