In [127]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
# Build (or reuse) the Spark session with the BigQuery connector jar attached.
spark = (
    SparkSession.builder
    .appName('load_data_amazon dailys')
    .config('spark.jars', 'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar')
    .getOrCreate()
)

# Turn on Spark's REPL eager-evaluation setting for this session.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [128]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType,StringType
from pyspark.sql.functions  import date_sub,current_date,date_trunc,add_months,col


In [129]:
# BigQuery source table holding the purchases ("compras").
# NOTE(review): the identifier ends with a trailing space — confirm it is intentional.
table_compras = "amazon_daily_updates.compras "

# Read the table through the BigQuery connector.
raw_compras = (
    spark.read
    .format("bigquery")
    .option("table", table_compras)
    .load()
)

raw_compras.printSchema()

# Report how many rows were read.
print("lines incoming: " , raw_compras.count())

root
 |-- id: string (nullable = false)
 |-- client_id: string (nullable = false)
 |-- product_id: string (nullable = false)
 |-- cantidad: long (nullable = false)
 |-- precio: double (nullable = false)
 |-- envio_id: string (nullable = true)
 |-- isprime: string (nullable = true)
 |-- fecha_compra: date (nullable = false)
 |-- metodo_pago: string (nullable = true)

lines incoming:  203003


In [4]:
##########################################################################
##########################daily jobs ####################################
#########################################################################

In [17]:
# Daily slice: keep only the purchases whose date is yesterday
# (current date minus one day).
raw_previus_day = raw_compras.where(
    col("fecha_compra") == date_sub(current_date(), 1)
)

In [18]:
# Project only the six columns kept for the daily load.
raw_previus_day = raw_previus_day.select(
    'fecha_compra',
    'client_id',
    'precio',
    'product_id',
    'cantidad',
    'isprime',
)

In [19]:
# Cast the quantity to integer and the prime flag to string.
raw_previus_day = (
    raw_previus_day
    .withColumn("cantidad", col("cantidad").cast(IntegerType()))
    .withColumn("isprime", col("isprime").cast(StringType()))
)

In [20]:
# Rename the Spanish source columns to their English names.
# client_id and product_id keep their original names.
raw_previus_final = (
    raw_previus_day
    .withColumnRenamed('fecha_compra', 'purchase_date')
    .withColumnRenamed('cantidad', 'product_quantity')
    .withColumnRenamed('precio', 'product_price')
    .withColumnRenamed('isprime', 'client_is_prime')
)

In [21]:
# Row count of yesterday's slice. It is counted on the un-renamed frame;
# renaming columns does not change the number of rows.
print("lines incoming: " , raw_previus_day.count())
# Preview the first five rows of the renamed frame.
raw_previus_final.show(5)


lines incoming:  192
+-------------+-----------------+-------------+----------+----------------+---------------+
|purchase_date|        client_id|product_price|product_id|product_quantity|client_is_prime|
+-------------+-----------------+-------------+----------+----------------+---------------+
|   2022-03-01|480-146888-22-806|        29.88|B09FCXXGT5|               1|          false|
|   2022-03-01|310-328278-65-945|         70.0|B01LZZ8UKK|               1|          false|
|   2022-03-01|323-462812-43-494|         43.7|B08X2K6B1Z|               1|          false|
|   2022-03-01|323-462812-43-494|        47.99|B098P1M628|               1|          false|
|   2022-03-01|323-462812-43-494|         8.99|B08ZS9PQ78|               1|          false|
+-------------+-----------------+-------------+----------+----------------+---------------+
only showing top 5 rows



In [22]:

# Print the schema of raw_previus_day, which still carries the source column
# names; the renamed copy lives in raw_previus_final.
raw_previus_day.printSchema()

root
 |-- fecha_compra: date (nullable = false)
 |-- client_id: string (nullable = false)
 |-- precio: double (nullable = false)
 |-- product_id: string (nullable = false)
 |-- cantidad: integer (nullable = false)
 |-- isprime: string (nullable = true)



In [23]:
######################################################################
########insert table pr_compras to BigQuery Production ###############
#####################################################################

In [25]:
# Append yesterday's purchases to the production table pr_compras.
# The renamed frame (raw_previus_final) is the one written: pr_compras uses the
# English column names (purchase_date, product_price, product_quantity,
# client_is_prime), whereas raw_previus_day still has the Spanish source names.
# "temporaryGcsBucket" names the GCS bucket handed to the connector for the write.
raw_previus_final.write \
  .format("bigquery") \
  .option("table","becade_mgutierrez.pr_compras") \
  .option("temporaryGcsBucket", "amazon_magdielgutierrez") \
  .mode('append') \
  .save()

In [26]:
#######################################################################

In [27]:
##########################################################################
##########################monthly jobs ####################################
#########################################################################

In [28]:
# Production purchases table that the daily job appends to.
table_current_year = "becade_mgutierrez.pr_compras"

# Read it back through the BigQuery connector.
raw_current_year = (
    spark.read
    .format("bigquery")
    .option("table", table_current_year)
    .load()
)

In [29]:
# Print the schema of the production table as read back from BigQuery.
raw_current_year.printSchema()

# Report how many rows the production table currently holds.
print("lines incoming: " , raw_current_year.count())

root
 |-- client_id: string (nullable = true)
 |-- product_price: double (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_quantity: long (nullable = true)
 |-- client_is_prime: string (nullable = true)
 |-- purchase_date: date (nullable = true)

lines incoming:  2168810


In [91]:
from pyspark.sql.functions  import year, month,col ,countDistinct,count,add_months

In [75]:
# Year-to-date slice: purchases made in the current calendar year,
# excluding the current month.
df_sales_current_year = (
    raw_current_year
    .filter(date_trunc("year", col("purchase_date")) == date_trunc("year", current_date()))
    .filter(date_trunc("month", col("purchase_date")) != date_trunc("month", current_date()))
)

print("lines incoming: " , df_sales_current_year.count())

lines incoming:  23850


In [76]:
# Preview the first five rows of the year-to-date slice.
df_sales_current_year.show(5)

+-----------------+-------------+----------+----------------+---------------+-------------+
|        client_id|product_price|product_id|product_quantity|client_is_prime|purchase_date|
+-----------------+-------------+----------+----------------+---------------+-------------+
|831-175061-77-427|       236.99|B00N69D6AS|               1|           true|   2022-01-02|
|831-175061-77-427|       236.99|B00N69D6AS|               1|           true|   2022-01-05|
|831-175061-77-427|       236.99|B00N69D6AS|               1|           true|   2022-01-27|
|831-175061-77-427|       236.99|B00N69D6AS|               1|           true|   2022-01-29|
|831-175061-77-427|       236.99|B00N69D6AS|               1|           true|   2022-02-08|
+-----------------+-------------+----------+----------------+---------------+-------------+
only showing top 5 rows



In [77]:

# Add calendar month and year columns derived from the purchase date.
df_new_sales = (
    df_sales_current_year
    .withColumn('month_sales', month(col('purchase_date')))
    .withColumn('year_sales', year(col('purchase_date')))
)

# Preview five rows, then report the row count.
df_new_sales.show(5)
print("lines source: " , df_new_sales.count())

+-----------------+-------------+----------+----------------+---------------+-------------+-----------+----------+
|        client_id|product_price|product_id|product_quantity|client_is_prime|purchase_date|month_sales|year_sales|
+-----------------+-------------+----------+----------------+---------------+-------------+-----------+----------+
|831-175061-77-427|       236.99|B00N69D6AS|               1|           true|   2022-01-02|          1|      2022|
|831-175061-77-427|       236.99|B00N69D6AS|               1|           true|   2022-01-05|          1|      2022|
|831-175061-77-427|       236.99|B00N69D6AS|               1|           true|   2022-01-27|          1|      2022|
|831-175061-77-427|       236.99|B00N69D6AS|               1|           true|   2022-01-29|          1|      2022|
|831-175061-77-427|       236.99|B00N69D6AS|               1|           true|   2022-02-08|          2|      2022|
+-----------------+-------------+----------+----------------+---------------+---

In [78]:
############# TEST #######

In [79]:
# df_new_sales = df_new_sales.select('year_sales','month_sales','product_id') \
#         .groupBy('year_sales','month_sales') \
#         .agg(count('product_id').alias('product_id')) \
#         .sort(['year_sales','month_sales'], ascending=True)

# print("lines source: " , df_new_sales.count())
# df_new_sales.show(13)

In [80]:
#######test#########

In [81]:
# Purchases per year: one row per (year, purchase date, client).
# client_id is itself a grouping key, so the distinct count is 1 on every row
# with a non-null client_id; the following aggregation counts these rows.
df_ordenes_year = (
    df_new_sales
    .select('year_sales', 'purchase_date', 'month_sales', 'client_id')
    .groupBy('year_sales', 'purchase_date', 'client_id')
    .agg(countDistinct('client_id').alias('total_compras'))
    .orderBy(['year_sales', 'purchase_date'], ascending=True)
)

df_ordenes_year.show(5)

+----------+-------------+-----------------+-------------+
|year_sales|purchase_date|        client_id|total_compras|
+----------+-------------+-----------------+-------------+
|      2022|   2022-01-01|801-809134-15-373|            1|
|      2022|   2022-01-01|323-079732-31-237|            1|
|      2022|   2022-01-01|435-318881-49-746|            1|
|      2022|   2022-01-01|602-921290-59-013|            1|
|      2022|   2022-01-01|480-225626-73-487|            1|
+----------+-------------+-----------------+-------------+
only showing top 5 rows



In [82]:


# Yearly total: number of (purchase date, client) rows in each year.
sum_ordenes_year = (
    df_ordenes_year
    .select('year_sales', 'total_compras')
    .groupBy('year_sales')
    .agg(count('total_compras').alias('total_compras'))
    .orderBy('year_sales', ascending=True)
)
sum_ordenes_year.show()

+----------+-------------+
|year_sales|total_compras|
+----------+-------------+
|      2022|         5025|
+----------+-------------+



In [83]:
from pyspark.sql.functions  import sum,avg

# Yearly revenue figures: the sum and the mean of product_price per year.
# `sum` and `avg` are the pyspark.sql.functions versions imported in this cell.
# NOTE(review): avg(product_price) is the mean price per row, not a per-month
# average, despite the 'avg_venta_mensual' alias — confirm the intended metric.
df_sales = (
    df_new_sales
    .select('year_sales', 'month_sales', 'product_price', 'client_id')
    .groupBy('year_sales')
    .agg(
        sum('product_price').alias('venta_total_year'),
        avg('product_price').alias('avg_venta_mensual'),
    )
    .orderBy('year_sales', ascending=True)
)

df_sales.show()

+----------+-----------------+------------------+
|year_sales| venta_total_year| avg_venta_mensual|
+----------+-----------------+------------------+
|      2022|2497929.040000331|104.73497023062185|
+----------+-----------------+------------------+



In [84]:
# Inner join of the yearly revenue figures (A) with the yearly purchase counts (B).
full_table_year = df_sales.alias('A').join(
    sum_ordenes_year.alias('B'),
    on=col('A.year_sales') == col('B.year_sales'),
    how="inner",
)

# Keep a single copy of the key plus the metrics from each side, ordered by year.
full_table_year = (
    full_table_year
    .select('A.year_sales', 'A.venta_total_year', 'A.avg_venta_mensual', 'B.total_compras')
    .orderBy('A.year_sales', ascending=True)
)

full_table_year.show()

+----------+-----------------+------------------+-------------+
|year_sales| venta_total_year| avg_venta_mensual|total_compras|
+----------+-----------------+------------------+-------------+
|      2022|2497929.040000331|104.73497023062185|         5025|
+----------+-----------------+------------------+-------------+



In [85]:
# Append the yearly summary to the pr_compras_anuales table in BigQuery.
# NOTE(review): append mode adds the summary rows again on every run —
# confirm that repeated runs within the same year are expected or de-duplicated.
(
    full_table_year.write
    .format("bigquery")
    .option("table","becade_mgutierrez.pr_compras_anuales")
    .option("temporaryGcsBucket", "amazon_magdielgutierrez")
    .mode('append')
    .save()
)

In [130]:
#################### MONTH ##############

In [154]:
# Monthly slice: keep the purchases whose month equals the month of
# (current date - 2 months).
# Truncating both sides to "month" compares year and month together, so no
# separate year condition is applied. Requiring the purchase year to equal the
# current year made the filter return nothing whenever the target month fell in
# the previous year (runs in January or February).
# NOTE(review): the offset is two months back although the frame is named
# "current_month" — confirm which month this job is meant to report.
df_sales_current_month = raw_current_year.filter(
    date_trunc("month", col("purchase_date")) == date_trunc("month", add_months(current_date(), -2))
)

print("lines incoming: " , df_sales_current_month.count())
df_sales_current_month.show(5)

lines incoming:  0
+---------+-------------+----------+----------------+---------------+-------------+
|client_id|product_price|product_id|product_quantity|client_is_prime|purchase_date|
+---------+-------------+----------+----------------+---------------+-------------+
+---------+-------------+----------+----------------+---------------+-------------+



In [148]:

# Add calendar month and year columns derived from this frame's own
# purchase_date. The column is referenced by name so it resolves against
# df_sales_current_month rather than against a column object taken from
# df_sales_current_year, a different DataFrame.
# NOTE(review): this rebinds df_new_sales, which earlier held the year-to-date
# frame; the cells below read the monthly version.
df_new_sales = (
    df_sales_current_month
    .withColumn('month_sales', month(col('purchase_date')))
    .withColumn('year_sales', year(col('purchase_date')))
)

# Preview five rows, then report the row count.
df_new_sales.show(5)
print("lines source: " , df_new_sales.count())

+-----------------+-------------+----------+----------------+---------------+-------------+-----------+----------+
|        client_id|product_price|product_id|product_quantity|client_is_prime|purchase_date|month_sales|year_sales|
+-----------------+-------------+----------+----------------+---------------+-------------+-----------+----------+
|831-175061-77-427|       236.99|B00N69D6AS|               1|           true|   2022-01-02|          1|      2022|
|831-175061-77-427|       236.99|B00N69D6AS|               1|           true|   2022-01-05|          1|      2022|
|831-175061-77-427|       236.99|B00N69D6AS|               1|           true|   2022-01-27|          1|      2022|
|831-175061-77-427|       236.99|B00N69D6AS|               1|           true|   2022-01-29|          1|      2022|
|520-181798-68-069|       236.99|B00N69D6AS|               1|           true|   2022-01-01|          1|      2022|
+-----------------+-------------+----------+----------------+---------------+---

In [139]:
# Input frame: df_new_sales.
# Purchases per month: one row per (year, month, purchase date, client).
# client_id is itself a grouping key, so the distinct count is 1 on every row
# with a non-null client_id; the following aggregation counts these rows.
df_ordenes_month = (
    df_new_sales
    .select('year_sales', 'purchase_date', 'month_sales', 'client_id')
    .groupBy('year_sales', 'month_sales', 'purchase_date', 'client_id')
    .agg(countDistinct('client_id').alias('total_compras'))
    .orderBy(['year_sales', 'purchase_date'], ascending=True)
)

df_ordenes_month.show(5)

+----------+-----------+-------------+-----------------+-------------+
|year_sales|month_sales|purchase_date|        client_id|total_compras|
+----------+-----------+-------------+-----------------+-------------+
|      2022|          1|   2022-01-01|602-878245-53-323|            1|
|      2022|          1|   2022-01-01|435-318881-49-746|            1|
|      2022|          1|   2022-01-01|702-991490-58-558|            1|
|      2022|          1|   2022-01-01|801-372658-24-641|            1|
|      2022|          1|   2022-01-01|801-014440-27-514|            1|
+----------+-----------+-------------+-----------------+-------------+
only showing top 5 rows



In [140]:
from pyspark.sql.functions  import count

# Monthly total: number of (purchase date, client) rows per (year, month).
sum_ordenes_month = (
    df_ordenes_month
    .select('year_sales', 'month_sales', 'total_compras')
    .groupBy('year_sales', 'month_sales')
    .agg(count('total_compras').alias('total_compras_mes'))
    .orderBy(['year_sales', 'month_sales'], ascending=True)
)

print("lines source: " , sum_ordenes_month.count())
sum_ordenes_month.show(13)

lines source:  1
+----------+-----------+-----------------+
|year_sales|month_sales|total_compras_mes|
+----------+-----------+-----------------+
|      2022|          1|             2575|
+----------+-----------+-----------------+



In [141]:
from pyspark.sql.functions  import sum,avg

# Monthly revenue: sum of product_price per (year, month).
# `sum` is the pyspark.sql.functions version imported in this cell.
df_month = (
    df_new_sales
    .select('year_sales', 'month_sales', 'product_price')
    .groupBy('year_sales', 'month_sales')
    .agg(sum('product_price').alias('venta_total_mes'))
    .orderBy(['year_sales', 'month_sales'], ascending=True)
)

print("lines source: " , df_month.count())
df_month.show(13)

lines source:  1
+----------+-----------+------------------+
|year_sales|month_sales|   venta_total_mes|
+----------+-----------+------------------+
|      2022|          1|1256540.4599999187|
+----------+-----------+------------------+



In [142]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag

# Attach the previous row's revenue as 'venta_total_mes_anterior' using lag().
# The window is ordered by year first and month second so that "previous row"
# means the chronologically previous month; ordering by month before year would
# interleave the same month of different years.
# The first row of the window has no predecessor, so lag() yields null there
# (always the case when df_month holds a single month).
df_month_raw = df_month.withColumn(
    'venta_total_mes_anterior',
    lag(df_month['venta_total_mes']).over(Window.orderBy("year_sales", "month_sales")),
)
print("lines source: " , df_month_raw.count())

lines source:  1


In [143]:
# Replace a missing previous-month revenue (no preceding row) with 0.
df_month_raw = df_month_raw.fillna(0, subset=["venta_total_mes_anterior"])
df_month_raw.show(5)

+----------+-----------+------------------+------------------------+
|year_sales|month_sales|   venta_total_mes|venta_total_mes_anterior|
+----------+-----------+------------------+------------------------+
|      2022|          1|1256540.4599999187|                     0.0|
+----------+-----------+------------------+------------------------+



In [110]:
# Inner join of the monthly revenue figures (A) with the monthly purchase
# counts (B) on the (year, month) key.
full_table_month = df_month_raw.alias('A').join(
    sum_ordenes_month.alias('B'),
    (col('A.month_sales') == col('B.month_sales')) & (col('A.year_sales') == col('B.year_sales')),
    "inner",
)

# Keep a single copy of the key plus the metrics from each side.
# Sort by year first and month second so multi-year output is chronological
# (sorting by month first would group the same month of different years together).
full_table_month = full_table_month.select(
    'A.year_sales', 'A.month_sales', 'A.venta_total_mes', 'venta_total_mes_anterior', 'B.total_compras_mes'
).sort(['year_sales', 'month_sales'], ascending=True)

print("lines source: " , full_table_month.count())
full_table_month.show(13)

lines source:  1
+----------+-----------+------------------+------------------------+-----------------+
|year_sales|month_sales|   venta_total_mes|venta_total_mes_anterior|total_compras_mes|
+----------+-----------+------------------+------------------------+-----------------+
|      2022|          1|1256540.4599999187|                     0.0|             2575|
+----------+-----------+------------------+------------------------+-----------------+



In [None]:
#