In [None]:
from pyspark.sql.types import StringType
import pyspark.sql.functions as F

In [None]:
from pyspark.sql import SparkSession 
# Keep a handle on the session builder under its own name.
builder = SparkSession.builder

# Build (or reuse, via getOrCreate) the session with three explicit settings:
# FIFO job scheduling, 15000m of memory per executor, and a 21000m cap on
# results collected back to the driver.
spark = (
    builder
    .config('spark.scheduler.mode', 'FIFO')
    .config('spark.executor.memory', '15000m')
    .config('spark.driver.maxResultSize', '21000m')
    .getOrCreate()
)

In [None]:
# Bare expression: show the session object's notebook repr as the cell output.
spark

In [None]:
# Configure a BigQuery reader for the `merchant` dataset's best-sellers table,
# restricted by the filter to partitions dated 2020-10-01 through 2020-10-31.
# This only builds the reader; loading happens where `.load()` is called.
reader = (
    spark.read
    .format('bigquery')
    .options(
        parentProject='tr-tech-innovation-dev',
        project='tr-tech-innovation-dev',
        dataset='merchant',
        table='BestSellers_TopProducts_8090258',
        filter='_PARTITIONDATE >= "2020-10-01" AND _PARTITIONDATE <= "2020-10-31"',
    )
)

In [None]:
# Turn the configured reader into a DataFrame.
top_products_tmp = reader.load()
# Register it under the name "top_products_db" so it can be queried with spark.sql.
top_products_tmp.createOrReplaceTempView("top_products_db")

In [None]:
# Project and rename the columns of the "top_products_db" view, then replace
# `product_name` with its element 0 (the SQL yields it from product_title.name).
# withColumn on an existing column overwrites it in place, so the column set
# and order are exactly those of the SQL projection.
top_products = (
    spark.sql('''SELECT
                                 rank_timestamp AS date,
                                 product_title.name AS product_name,
                                 brand AS product_brand,
                                 rank AS product_rank,
                                 previous_rank AS previous_product_rank,
                                 ranking_category AS product_ranking_category,
                                 ranking_country AS country,
                                 price_range.min AS product_min_price,
                                 price_range.max AS product_max_price,
                                 price_range.currency AS price_currency
                             FROM
                                 top_products_db
                         ''')
    .withColumn('product_name', F.col('product_name').getItem(0))
)

# Mark the frame as cached and run a count to materialize it; the row count
# is the value displayed by this cell.
top_products.cache().count()

In [None]:
# Preview: bring at most five rows to the driver as a pandas DataFrame for display.
top_products.limit(5).toPandas()

In [None]:
# Earliest `date` value present in the loaded data (global aggregate, printed).
top_products.select(F.min('date')).show()

In [None]:
# Latest `date` value present in the loaded data (global aggregate, printed).
top_products.select(F.max('date')).show()