In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
# Build (or reuse) the Spark session for this job and register the BigQuery
# connector jar from the gs://spark-lib bucket.
# NOTE(review): the "latest" jar is not version-pinned — consider pinning a
# specific connector release for reproducible runs.
spark = (
    SparkSession.builder
    .appName('merge_tables_compras')
    .config('spark.jars', 'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar')
    .getOrCreate()
)

In [2]:
# Turn on eager evaluation so a DataFrame left as the last expression of a
# cell is rendered as a table in the notebook.
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)

In [3]:
## Extract the purchases table (stg_compras) from BigQuery staging ##

In [4]:
# Staging table (dataset.table) that holds the raw purchases
table_compras = "becade_mgutierrez.stg_compras"

# Read the staging table through the BigQuery connector
raw_compras = (
    spark.read
    .format("bigquery")
    .option("table", table_compras)
    .load()
)

# Print the schema of the loaded table
raw_compras.printSchema()


root
 |-- _airbyte_ab_id: string (nullable = true)
 |-- _airbyte_emitted_at: long (nullable = true)
 |-- cantidad: long (nullable = true)
 |-- client_id: string (nullable = true)
 |-- envio_id: string (nullable = true)
 |-- fecha_compra: timestamp (nullable = true)
 |-- id: string (nullable = true)
 |-- isprime: boolean (nullable = true)
 |-- metodo_pago: string (nullable = true)
 |-- precio: double (nullable = true)
 |-- product_id: string (nullable = true)
 |-- rowid: long (nullable = true)



In [5]:
# Count the rows read from staging (this triggers a Spark job) and report it
incoming_rows = raw_compras.count()
print("lines incoming: ", incoming_rows)

lines incoming:  536409


In [9]:
from pyspark.sql.types import IntegerType,BooleanType,DateType,StringType
from pyspark.sql.functions  import to_date, col

In [10]:
# Keep only the columns that are carried over to the production table
df_raw_compras = raw_compras.select(
    'fecha_compra',
    'client_id',
    'precio',
    'product_id',
    'cantidad',
    'isprime',
)

In [11]:

# Derive a date-only "datetime" column from the fecha_compra timestamp, and
# cast the quantity to integer and the prime flag to string.
df_raw_compras = (
    df_raw_compras
    .withColumn("datetime", to_date(col("fecha_compra")))
    .withColumn("cantidad", col("cantidad").cast(IntegerType()))
    .withColumn("isprime", col("isprime").cast(StringType()))
)

In [12]:
# Remove the original timestamp column; the date-only "datetime" column
# derived from it is the one that is kept.
df_raw_compras = df_raw_compras.drop('fecha_compra')

In [13]:
# Rename the staging columns to the names used in the production table
column_renames = {
    'datetime': 'purchase_date',
    'cantidad': 'product_quantity',
    'precio': 'product_price',
    'isprime': 'client_is_prime',
}
for old_name, new_name in column_renames.items():
    df_raw_compras = df_raw_compras.withColumnRenamed(old_name, new_name)

# Preview the first two rows
df_raw_compras.show(2)

+-----------------+-------------+----------+----------------+---------------+-------------+
|        client_id|product_price|product_id|product_quantity|client_is_prime|purchase_date|
+-----------------+-------------+----------+----------------+---------------+-------------+
|209-696678-32-117|       236.99|B00N69D6AS|               1|           true|   2018-04-18|
|209-696678-32-117|       236.99|B00N69D6AS|               1|           true|   2018-05-05|
+-----------------+-------------+----------+----------------+---------------+-------------+
only showing top 2 rows



In [24]:
# Print the schema after the casts and renames to confirm the final column
# names and types before writing.
df_raw_compras.printSchema()

root
 |-- client_id: string (nullable = true)
 |-- product_price: double (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_quantity: integer (nullable = true)
 |-- client_is_prime: string (nullable = true)
 |-- purchase_date: date (nullable = true)



In [14]:
######################################################################
######## Write table pr_compras to BigQuery production ###############
######################################################################

In [15]:
# Replace the contents of pr_compras with the transformed purchases.
# The connector is given a GCS bucket via the temporaryGcsBucket option.
(
    df_raw_compras.write
    .format("bigquery")
    .option("table", "becade_mgutierrez.pr_compras")
    .option("temporaryGcsBucket", "amazon_magdielgutierrez")
    .mode('overwrite')
    .save()
)

In [16]:
# Staging table (dataset.table) that holds the historical purchases
table_compras_historico = "becade_mgutierrez.stg_historico_compras"

# Read the historical staging table through the BigQuery connector
raw_compras_historico = (
    spark.read
    .format("bigquery")
    .option("table", table_compras_historico)
    .load()
)

# Print the schema of the loaded table
raw_compras_historico.printSchema()

# Count the rows read from staging (this triggers a Spark job) and report it
incoming_rows_historico = raw_compras_historico.count()
print("lines incoming: ", incoming_rows_historico)

root
 |-- id: string (nullable = true)
 |-- client_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- cantidad: long (nullable = true)
 |-- precio: double (nullable = true)
 |-- envio_id: string (nullable = true)
 |-- isprime: boolean (nullable = true)
 |-- fecha_compra: date (nullable = true)
 |-- metodo_pago: string (nullable = true)

lines incoming:  1429398


In [17]:
# Keep only the columns that are carried over to the production table
raw_compras_historico = raw_compras_historico.select(
    'fecha_compra',
    'client_id',
    'precio',
    'product_id',
    'cantidad',
    'isprime',
)

In [18]:
from pyspark.sql.types import IntegerType,StringType

# Cast the quantity to integer and the prime flag to string, the same types
# used for the current purchases written to pr_compras.
raw_compras_historico = (
    raw_compras_historico
    .withColumn("cantidad", raw_compras_historico["cantidad"].cast(IntegerType()))
    .withColumn("isprime", raw_compras_historico["isprime"].cast(StringType()))
)

In [27]:
# Rename the staging columns to the names used in pr_compras.
# fecha_compra is already a date in the historical table, so it is renamed
# directly to purchase_date. The former rename of a 'datetime' column was
# removed: this frame has no such column, so that call did nothing.
raw_compras_historico = (
    raw_compras_historico
    .withColumnRenamed('fecha_compra', 'purchase_date')
    .withColumnRenamed('cantidad', 'product_quantity')
    .withColumnRenamed('precio', 'product_price')
    .withColumnRenamed('isprime', 'client_is_prime')
)

# Preview the first two rows
raw_compras_historico.show(2)

+-------------+-----------------+-------------+----------+----------------+---------------+
|purchase_date|        client_id|product_price|product_id|product_quantity|client_is_prime|
+-------------+-----------------+-------------+----------+----------------+---------------+
|   2011-02-11|209-696678-32-117|       236.99|B00N69D6AS|               1|           true|
|   2011-03-07|209-696678-32-117|       236.99|B00N69D6AS|               1|           true|
+-------------+-----------------+-------------+----------+----------------+---------------+
only showing top 2 rows



In [28]:
# Print the schema after the casts and renames to confirm the final column
# names and types before appending.
raw_compras_historico.printSchema()

root
 |-- purchase_date: date (nullable = true)
 |-- client_id: string (nullable = true)
 |-- product_price: double (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_quantity: integer (nullable = true)
 |-- client_is_prime: string (nullable = true)



In [29]:
######################################################################
######## Append historical purchases to pr_compras (production) ######
######################################################################

In [30]:
# Append the historical purchases to pr_compras.
# NOTE(review): append mode adds rows on every run of this cell — re-running
# it without first re-running the overwrite cell will presumably duplicate
# the historical rows; confirm before re-executing.
(
    raw_compras_historico.write
    .format("bigquery")
    .option("table", "becade_mgutierrez.pr_compras")
    .option("temporaryGcsBucket", "amazon_magdielgutierrez")
    .mode('append')
    .save()
)