In [None]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Location of the Google Merchant Centre top-products extract (file name: 10_2020).
path = 'gs://bkt-dataproc-prod-eu-notebookmatthieubritoantunes/google-merchant-centre/'
filename = 'top_products_gmc_10_2020.parquet'

# Read the extract, keep the full ' > '-separated category path under
# 'large_product_category', and expose only its first segment as
# 'product_category'. Exact duplicate rows are dropped afterwards.
df_merchant = (
    sqlContext.read.load(path + filename)
    .withColumnRenamed('product_category', 'large_product_category')
    .withColumn(
        'product_category',
        F.split('large_product_category', ' > ')[0],
    )
    .distinct()
)

# cache() is lazy; count() forces the read so the frame is materialised now.
df_merchant.cache().count()

In [None]:
# The work is carried out on UK data (country code 'GB') as a first example.
df_merchant_uk_ = df_merchant.where(
    F.col('country') == 'GB'
)

# Materialise the UK slice in the cache and show its row count.
df_merchant_uk_.cache().count()

In [None]:
# Daily UK view: one row per (date, product_name, product_brand, product_category)
# holding the mean rank (truncated to an integer by the cast), the max price, and
# the rank the same product had on its previous observed date
# (null for a product's first date).
product_keys = ['product_name', 'product_brand', 'product_category']

# The lag window is keyed on the same product columns as the aggregation, so each
# partition holds exactly one row per date. Partitioning on product_name alone
# lets rows that share a name but differ in brand/category tie on `date`, which
# makes the lagged rank arbitrary. When names are unique the result is unchanged.
rank_history = Window.partitionBy(*product_keys).orderBy('date')

# No orderBy before the window: evaluating the window re-shuffles rows by its
# partition keys, so a preceding global sort is paid for and then discarded.
df_merchant_uk = (
    df_merchant_uk_
    .groupBy('date', *product_keys)
    .agg(F.mean('product_rank').cast(IntegerType()).alias('product_rank'),
         F.max('product_price').alias('product_price'))
    .select('date',
            'product_name',
            'product_brand',
            'product_category',
            'product_price',
            'product_rank',
            F.lag('product_rank', 1).over(rank_history)
            .alias('previous_product_rank'))
)

# cache() is lazy; count() materialises the frame and displays its row count.
df_merchant_uk.cache().count()

In [None]:
# Release the cached UK slice before dropping the name: `del` only removes the
# Python reference, it does not free the data that cache() stored.
# df_merchant_uk has already been cached and materialised by its own count().
df_merchant_uk_.unpersist()
del df_merchant_uk_

In [None]:
# Preview: pull five rows to the driver and render them as a pandas frame.
(
    df_merchant_uk
    .limit(5)
    .toPandas()
)

# Catalogue Time

In [None]:
# Number of distinct dates on which each product name appears, highest first,
# collected to the driver as a pandas DataFrame.
#  - countDistinct rather than count: a product name can span several
#    (brand, category) rows on one date, and those must count as a single day.
#  - product_name is a secondary sort key so that ties on catalogue_days come
#    back in a reproducible order (the plot below takes the first ten rows).
#  - One orderBy only: the original sorted twice around a no-op select.
catalogue_days = (
    df_merchant_uk
    .groupBy('product_name')
    .agg(F.countDistinct('date').alias('catalogue_days'))
    .orderBy(F.desc('catalogue_days'), F.asc('product_name'))
    .toPandas()
)

In [None]:
# Bar chart of the first ten rows of catalogue_days (already sorted descending).
#
# Style and figure size are applied in ONE call. A bare sns.set(rc=...) resets
# the style to seaborn's default ('darkgrid'), which silently undid the
# sns.set_style('whitegrid') that used to precede it.
# seaborn is imported as `sns` in the notebook's import cell.
sns.set(style='whitegrid', rc={'figure.figsize': (12, 6)})

ax = sns.barplot(x='product_name', y='catalogue_days', data=catalogue_days.head(10))
ax.set(title='October 2020 spent days at catalogue by product in UK',
       xlabel='Product name',
       ylabel='Catalogue days')
# Draw the x tick labels (product names) vertically.
plt.xticks(rotation=90)
plt.show()

# Pricing, Ranking Evolution and Rising Stars

# Latest k

# Upcoming Trends