In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
spark = SparkSession.builder \
  .appName('clean_products') \
  .config('spark.jars', 'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar') \
  .getOrCreate()

In [2]:
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)

In [3]:
##extract table clients from BigQuery Staging ######

In [4]:
#name table clients
table_clients = "becade_mgutierrez.stg_clients"

#load table
raw_clients = spark.read \
  .format("bigquery") \
  .option("table", table_clients) \
  .load()

#show incoming lines
print("lines incoming: " , raw_clients.count())

#show schema
raw_clients.printSchema()



lines incoming:  5678
root
 |-- _airbyte_ab_id: string (nullable = true)
 |-- _airbyte_emitted_at: long (nullable = true)
 |-- direccion: string (nullable = true)
 |-- email: string (nullable = true)
 |-- id: string (nullable = true)
 |-- isprime: boolean (nullable = true)
 |-- nombre: string (nullable = true)
 |-- numero_tarjeta: string (nullable = true)
 |-- rowid: long (nullable = true)
 |-- telefono: string (nullable = true)



In [5]:
raw_clients = raw_clients.drop('_airbyte_ab_id','_airbyte_emitted_at','rowid')

In [11]:
#renamed columns 
raw_clients = raw_clients.withColumnRenamed('id','client_id') \
                           .withColumnRenamed('nombre','client_name') \
                            .withColumnRenamed('direccion','client_address') \
                           .withColumnRenamed('email','client_email') \
                           .withColumnRenamed('isprime','client_is_prime') \
                           .withColumnRenamed('numero_tarjeta','client_credit_card') \
                           .withColumnRenamed('telefono','client_phone')

In [12]:
from pyspark.sql.types import DateType,StringType

raw_clients= raw_clients.withColumn("client_is_prime",raw_clients.client_is_prime.cast(StringType())) 

In [14]:
#select columns from table
raw_clients.show(10)


#show outgoing lines
print("lines clean outgoing: " , raw_clients.count())
#show schema
raw_clients.printSchema()

+--------------------+--------------------+-----------------+---------------+-----------------+-------------------+-------------+
|      client_address|        client_email|        client_id|client_is_prime|      client_name| client_credit_card| client_phone|
+--------------------+--------------------+-----------------+---------------+-----------------+-------------------+-------------+
| 811 Bond Route, ...|KPearson9610@outl...|310-309280-59-926|          false|Katherine Pearson|5395-1908-9123-4506|(310)052-0159|
| 7781 John Locks,...|DBell5699@hotmail...|323-462812-43-494|          false|      Denise Bell|5145-8058-2252-3801|(323)377-0937|
| 2649 Calhoun Ove...|DHunt4398@outlook...|510-658906-47-860|          false|       David Hunt|4962-3792-3311-9337|(510)911-5136|
| 8650 Villa Ridge...|Christine-Sta@out...|661-024332-95-247|          false|  Christine Stark|4551-7274-4304-6403|(661)700-7605|
| 8596 Morgan Spri...|GOconnor5032@hotm...|480-146888-22-806|          false|   George Oco

In [15]:
#####################################################################
########insert table pr_clients to BigQuery Production ##############
#####################################################################

In [17]:
raw_clients.write \
  .format("bigquery") \
  .option("table","becade_mgutierrez.pr_clients") \
  .option("temporaryGcsBucket", "amazon_magdielgutierrez") \
  .mode('overwrite') \
  .save()