In [1]:
from pyspark.sql.types import StringType
import pyspark.sql.functions as F

In [7]:
from pyspark.sql import SparkSession 
builder = SparkSession.builder

spark = (builder
          .config('spark.driver.maxResultSize', '21000m')
          .config('spark.scheduler.mode', 'FIFO')
          .config('spark.executor.memory', '15000m')
         ).getOrCreate()

In [8]:
spark

In [9]:
raw_reader = (
  spark
  .read
  .option("parentProject" ,'tr-tech-innovation-dev')     
  .option('project','tr-tech-innovation-dev')
#   .option("credentialsFile", credentials_file)
  .format("bigquery")
  .option("dataset", "merchant")
)


reader = raw_reader.option("filter", "_PARTITIONDATE = '2020-10-26'")

In [11]:
products = (reader
  .option('table', 'Products_8215388')
  .load()
)

In [12]:
products.groupby('offer_id').count().groupby('count').count().show()

+-----+-----+
|count|count|
+-----+-----+
|    1|10156|
+-----+-----+



In [13]:
topBrands4weeks = (reader
  .option('table', 'BestSellers_TopBrands_8090258')
  .option('filter', '_PARTITIONDATE in ("2020-10-05","2020-10-12","2020-10-19","2020-10-26")')
  .load()
)

from pyspark.storagelevel import StorageLevel
# topBrands4weeks.persist(StorageLevel.DISK_ONLY).count()
topBrands4weeks.cache().count()

5359219

In [15]:
bucket = "bkt-dataproc-prod-eu-notebookmarouaneazlaf"

In [17]:
topBrands4weeks.write.parquet(f'gs://{bucket}/klarna/topBrands4weeks.parquet')

In [20]:
!gsutil cp gs://{bucket}/klarna/datalyer.json  .

Copying gs://bkt-dataproc-prod-eu-notebookmarouaneazlaf/klarna/datalyer.json...
/ [1 files][337.5 KiB/337.5 KiB]                                                
Operation completed over 1 objects/337.5 KiB.                                    


In [None]:
# gs://bkt-dataproc-prod-eu-notebookmarouaneazlaf/klarna/datalyer.json

In [21]:
import pandas as pd
import json

with open('datalyer.json') as f : 
    datalyer = json.load(f)

merchants = pd.DataFrame([l for d in datalyer  if 'block.cards' in d for l in d['block.cards']])    
merchants_df = spark.createDataFrame(merchants)



In [24]:
klarna_merchants = merchants_df.withColumnRenamed('title','name')
my_brands = topBrands4weeks.join(klarna_merchants,klarna_merchants.name == topBrands4weeks.brand,'inner')
my_brands.cache().count()


86925

In [27]:
us_brands = (my_brands
             .filter('ranking_country = "US" ')

            .select('*', 
                    F.explode('ranking_category_path').alias('cat')
            )
            .filter('cat.locale = "en-US"')
 .drop('ranking_category_path')
)

In [28]:
matching_cat_ids = {d['name']:d['ranking_category'] for d in (us_brands
#  .filter(~F.col('cat.name').contains('>'))
 .filter(F.size(F.split('cat.name','>')) < 3)
 .groupBy('cat.name','ranking_category')
 .agg(F.countDistinct('brand').alias('brands'))
 .orderBy('name')
).toPandas().to_dict('records')}

In [29]:
cat_matching = {'Electronics > Audio' : "Electronics",
'Apparel & Accessories > Clothing': "Clothes & fashion",
'Apparel & Accessories > Jewelry': "Clothes & fashion",
'Apparel & Accessories > Shoes': "Clothes & fashion",
'Health & Beauty > Health Care': "Health",
'Health & Beauty > Personal Care': "Beauty",
'Home & Garden > Decor': "Home",
'Home & Garden > Kitchen & Dining': "Home",
'Home & Garden > Linens & Bedding': "Home",
'Luggage & Bags': 'Luggage',
'Sporting Goods': 'Sport',
 "Toys & Games": 'Toys & entertainment',
 "Furniture": "Home"}

In [31]:
from functools import reduce
col2 = reduce(lambda y,srcdest: F.when(F.col('ranking_category') == matching_cat_ids.get(srcdest[0]),srcdest[1] ).otherwise(y),cat_matching.items(),None).alias('my_cat')


In [35]:
us_brands_2 = us_brands.select('*', col2)

In [33]:
other_brands = topBrands4weeks.select('rank','brand','ranking_country', col2)
other_brands_us = other_brands.filter('ranking_country = "US" ')

In [36]:
from itertools import zip_longest

for v in set(cat_matching.values()): 
    print(v)
    my_brands_str = (us_brands_2
     .filter(F.col('my_cat') == v  )
     .groupby(F.col('brand').alias('k_brands'), 'ranking_country')
     .agg(F.min('rank').alias('min_rank'), F.max('rank').alias('max_rank'))
     .orderBy(F.asc('min_rank'))
     .limit(10)
    )._jdf.showString(10,50,False)

    all_brands_str = (other_brands_us
     .filter(F.col('my_cat') == v  )
     .groupby(F.col('brand').alias('all_brands'), 'ranking_country')
     .agg(F.min('rank').alias('min_rank'), F.max('rank').alias('max_rank'))
     .orderBy(F.asc('min_rank'))
     .limit(10)
    )._jdf.showString(10,50,False)

    # rising_brands_str = (other_brands_us
    #  .filter(F.col('my_cat') == v  )
    #  .groupby(F.col('brand').alias('rising_brands'), 'ranking_country')
    #  .agg(F.min('rank').alias('min_rank'), F.max('rank').alias('max_rank'))
    #  .orderBy(F.asc('min_rank'))
    # )._jdf.showString(20,50,False)
    for a,b in zip_longest(my_brands_str.split('\n'),all_brands_str.split('\n')):
        print(b,'' if not a else a)


Beauty
+-----------------+---------------+--------+--------+ +-------------+---------------+--------+--------+
|       all_brands|ranking_country|min_rank|max_rank| |     k_brands|ranking_country|min_rank|max_rank|
+-----------------+---------------+--------+--------+ +-------------+---------------+--------+--------+
|     The Ordinary|             US|       1|       1| | Calvin Klein|             US|      62|      74|
|          L'Oréal|             US|       2|       4| |Mario Badescu|             US|     140|     165|
|          Philips|             US|       2|       3| |     Givenchy|             US|     151|     159|
|           Revlon|             US|       3|       6| |Beautycounter|             US|     203|     328|
|Bath & Body Works|             US|       4|       5| |        SHEIN|             US|     241|     283|
|            M·A·C|             US|       5|       8| |    Bio Ionic|             US|     273|     341|
|             Dior|             US|       6|      10| |  

In [39]:
us_brands_2.filter('brand = "JBL"').select('rank', 'rank_timestamp', 'my_cat').where('my_cat is not null').show()

+----+-------------------+-----------+
|rank|     rank_timestamp|     my_cat|
+----+-------------------+-----------+
|4046|2020-10-02 00:00:00|     Beauty|
|2907|2020-10-02 00:00:00|      Sport|
|2817|2020-10-02 00:00:00|       Home|
|   4|2020-10-02 00:00:00|Electronics|
|4421|2020-10-09 00:00:00|     Beauty|
|2941|2020-10-09 00:00:00|      Sport|
|   3|2020-10-09 00:00:00|Electronics|
|2966|2020-10-16 00:00:00|      Sport|
|3146|2020-10-16 00:00:00|       Home|
|   3|2020-10-16 00:00:00|Electronics|
|   3|2020-10-23 00:00:00|Electronics|
|2829|2020-10-23 00:00:00|      Sport|
+----+-------------------+-----------+



In [65]:
max_timestamp = "2020-10-23 00:00:00"

In [136]:
from pyspark.sql.window import Window

window = Window.partitionBy("brand","category","ranking_country").orderBy('rank_timestamp')
last_w1 = F.lag('rank',1).over(window).alias('rank_1w')
last_w2 = F.lag('rank',2).over(window).alias('rank_2w')
last_w3 = F.lag('rank',3).over(window).alias('rank_3w')

In [148]:


brands = (topBrands4weeks
          .select('*', 
                    F.explode('ranking_category_path').alias('cat')
            )
            .filter('cat.locale = "en-US"')
             .drop('ranking_category_path')          
          .select('rank','brand','ranking_country','rank_timestamp','ranking_category',F.col('cat.name').alias('category'), col2)
          .select('*', last_w1,last_w2,last_w3)
          .filter(F.col('rank_timestamp') == max_timestamp )
         )

In [149]:
my_brands = brands.join(klarna_merchants,klarna_merchants.name == brands.brand,'leftOuter')
my_brands.cache().count()


1374669

In [134]:
# my_brands.show()

In [139]:
# !gsutil -m rm  -r 'gs://{bucket}/klarna/my_brands_full.parquet'

In [150]:
my_brands.write.parquet(f'gs://{bucket}/klarna/my_brands_fullCategories.parquet')

In [151]:
my_brands_us = my_brands.where('ranking_country = "US" ')

In [152]:
# rising = my_brands_us.filter( (F.col('rank') <  0.8 * F.col('rank_1w')) &
#                    (F.col('rank_1w') <  0.8 * F.col('rank_2w')) &
#                     (F.col('rank_2w') < 0.8 * F.col('rank_3w')) &
#                     (F.col('rank') > 10 ) 
#                    ).select('rank','brand','my_cat','rank_1w','rank_2w', 'rank_3w','name')

In [206]:
ranks = F.concat_ws(' ◄ ','rank_1w','rank_2w', 'rank_3w').alias('ranks')

In [None]:
my_brands_us.select('category').distinct().count()

In [204]:
my_brands.head()

Row(rank=642, brand='1 Up Nutrition', ranking_country='US', rank_timestamp=datetime.datetime(2020, 10, 23, 0, 0), ranking_category=2890, category='Health & Beauty > Health Care > Fitness & Nutrition', my_cat=None, rank_1w=613, rank_2w=506, rank_3w=508, position=None, target=None, name=None)

In [207]:
spark._jsc.hadoopConfiguration().set('fs.gs.impl', 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem')
# This is required if you are using service account and set true, 
# spark._jsc.hadoopConfiguration().set('fs.gs.auth.service.account.enable', 'true')
# spark._jsc.hadoopConfiguration().set('google.cloud.auth.service.account.json.keyfile', credentials_file)


(my_brands.withColumn('ranks',ranks).coalesce(1).write
  .option("parentProject" ,'tr-tech-innovation-dev')     
  .option('project','tr-tech-innovation-dev')
#   .option("credentialsFile", credentials_file)
  .format("bigquery")
  .option("temporaryGcsBucket","tmp-tr-tech-innovation-dev")
  .save("merchant.brands")
)

In [208]:
res = []

In [210]:
filters = my_brands.select('category','ranking_country').distinct().collect()

In [222]:
window = Window.partitionBy("category","ranking_country").orderBy('rank')
my_rank = F.rank().over(window).alias('relative_rank')

DataFrame[false: boolean]

In [239]:
final_brands = (my_brands
 .select(my_rank,'brand',"rank","rank_timestamp","category","ranking_country",ranks,F.lit(False).alias('klarna'),F.lit(False).alias('rising'))
 .filter('relative_rank <=30')
 .distinct()
).unionAll(
(my_brands
 .filter('name is not null')
 .select(my_rank,'brand',"rank","rank_timestamp","category","ranking_country",ranks,F.lit(True).alias('klarna'),F.lit(False).alias('rising'))
 .filter('relative_rank <=30')
 .distinct()
)
).unionAll(
(my_brands
    .filter ( (F.col('rank') <     F.col('rank_3w') - 200) 
                & (F.col('rank_1w') <  F.col('rank_2w')) 
                & (F.col('rank_2w') <  F.col('rank_3w')) 
                & (F.col('rank') > 10 ) 
                & (F.col('rank') < 1000 ) )
    .select(my_rank,'brand',"rank","rank_timestamp","category","ranking_country",ranks,F.lit(False).alias('klarna'),F.lit(True).alias('rising'))
 .filter('relative_rank <=30')
 .distinct()
)
)

In [240]:
final_brands.cache().count()

458766

In [243]:
spark._jsc.hadoopConfiguration().set('fs.gs.impl', 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem')
# This is required if you are using service account and set true, 
# spark._jsc.hadoopConfiguration().set('fs.gs.auth.service.account.enable', 'true')
# spark._jsc.hadoopConfiguration().set('google.cloud.auth.service.account.json.keyfile', credentials_file)


(final_brands.coalesce(1).write
  .option("parentProject" ,'tr-tech-innovation-dev')     
  .option('project','tr-tech-innovation-dev')
#   .option("credentialsFile", credentials_file)
  .format("bigquery")
  .option("temporaryGcsBucket","tmp-tr-tech-innovation-dev")
  .mode("overwrite")
  .save("merchant.brands_top30")
)

In [213]:
# +-----------------+---------------+----+------------+
# |all_brands       |ranking_country|rank|ranks       |
# +-----------------+---------------+----+------------+
# |The Ordinary     |US             |1   |1 ◄ 1 ◄ 1   |
# |Philips          |US             |2   |2 ◄ 3 ◄ 3   |
# |Revlon           |US             |3   |4 ◄ 6 ◄ 5   |
# |L'Oréal          |US             |4   |3 ◄ 2 ◄ 2   |
# |Bath & Body Works|US             |5   |5 ◄ 4 ◄ 4   |
# |CeraVe           |US             |6   |6 ◄ 7 ◄ 10  |
# |Eyefusion        |US             |7   |17 ◄ 32 ◄ 62|
# |M·A·C            |US             |8   |7 ◄ 5 ◄ 8   |
# |Lancôme          |US             |9   |13 ◄ 13 ◄ 14|
# |Dior             |US             |10  |8 ◄ 8 ◄ 6   |
# +-----------------+---------------+----+------------+

35757

In [147]:
from itertools import zip_longest

for v in set(cat_matching.keys()): 
    print(v)
    my_brands_str = (my_brands_us
     .filter('name is not null')
     .filter(F.col('category') == v  )
     .groupby(F.col('brand').alias('k_brands'), 'ranking_country')
     .agg(F.min('rank').alias('rank'),ranks )
     .orderBy(F.asc('rank'))
     .limit(10)
    ).show(truncate=False)

    all_brands_str = (my_brands_us
     .filter(F.col('category') == v  )
     .groupby(F.col('brand').alias('all_brands'), 'ranking_country')
     .agg(F.min('rank').alias('rank'),ranks )
     .orderBy(F.asc('rank'))
     .limit(10)
    ).show(truncate=False)

    rising_brands_str = (my_brands_us
#     .filter( (F.col('rank') <  0.9 * F.col('rank_1w')) 
#             &    (F.col('rank_1w') <  0.8 * F.col('rank_2w')) 
#             &    (F.col('rank_2w') < 0.7 * F.col('rank_3w')) 
#               &  (F.col('rank') > 10 ) )
    .filter ( (F.col('rank') <    0.5 * F.col('rank_3w')) 
            &    (F.col('rank_1w') <  F.col('rank_2w')) 
            &    (F.col('rank_2w') <  F.col('rank_3w')) 
              &  (F.col('rank') > 10 ) 
              &  (F.col('rank') < 1000 ) )
                        
     .filter(F.col('category') == v  )
     .groupby(F.col('brand').alias('rising_brands'), 'ranking_country')
     .agg(F.min('rank').alias('rank'),ranks )
     .orderBy(F.asc('rank'))
     .limit(5)
    ).show(truncate=False)
#     for a,b,c in zip_longest(my_brands_str.split('\n'),all_brands_str.split('\n'),rising_brands_str.split('\n')):
#         print(b,'' if not a else a,'' if not c else c)


Health & Beauty > Personal Care
+-------------+---------------+----+---------------+
|k_brands     |ranking_country|rank|ranks          |
+-------------+---------------+----+---------------+
|Calvin Klein |US             |62  |73 ◄ 74 ◄ 72   |
|Givenchy     |US             |157 |157 ◄ 151 ◄ 159|
|Mario Badescu|US             |163 |165 ◄ 160 ◄ 140|
|SHEIN        |US             |241 |283 ◄ 270 ◄ 259|
|Beautycounter|US             |270 |203 ◄ 210 ◄ 328|
|Bio Ionic    |US             |341 |298 ◄ 273 ◄ 292|
|Perry Ellis  |US             |490 |491 ◄ 448 ◄ 421|
|Brookstone   |US             |532 |586 ◄ 607 ◄ 600|
|Nautica      |US             |544 |559 ◄ 561 ◄ 486|
|Hollister    |US             |567 |629 ◄ 621 ◄ 626|
+-------------+---------------+----+---------------+

+-----------------+---------------+----+------------+
|all_brands       |ranking_country|rank|ranks       |
+-----------------+---------------+----+------------+
|The Ordinary     |US             |1   |1 ◄ 1 ◄ 1   |
|Philips 

+--------------------+---------------+----+------------------+
|       rising_brands|ranking_country|rank|             ranks|
+--------------------+---------------+----+------------------+
|    Little Live Pets|             US|  33|      33 ◄ 51 ◄ 79|
|          Epic Games|             US|  79|    89 ◄ 137 ◄ 172|
|          Fun Wheels|             US| 324|   365 ◄ 612 ◄ 721|
|         FAO Schwarz|             US| 342|   450 ◄ 623 ◄ 705|
|Home Accents Holiday|             US| 376|   786 ◄ 788 ◄ 998|
|          HCM Kinzel|             US| 381|  372 ◄ 404 ◄ 1047|
|Magformers (Магфо...|             US| 455| 823 ◄ 1105 ◄ 1411|
|               Topps|             US| 545|  548 ◄ 846 ◄ 1114|
|                 TCG|             US| 559|1370 ◄ 1401 ◄ 1467|
|             Lortone|             US| 568|  571 ◄ 760 ◄ 1184|
+--------------------+---------------+----+------------------+



In [154]:
topProducts4weeks = (reader
  .option('table', 'BestSellers_TopProducts_8090258')
  .option('filter', '_PARTITIONDATE in ("2020-10-05","2020-10-12","2020-10-19","2020-10-26")')
  .load()
)

from pyspark.storagelevel import StorageLevel
# topBrands4weeks.persist(StorageLevel.DISK_ONLY).count()
topProducts4weeks.cache().count()

20363021

In [170]:

topProducts4weeks.write.parquet(f'gs://{bucket}/klarna/topProducts4weeks.parquet')

In [163]:
window2 = Window.partitionBy("product.name","brand","category","ranking_country").orderBy('rank_timestamp')
plast_w1 = F.lag('rank',1).over(window2).alias('rank_1w')
plast_w2 = F.lag('rank',2).over(window2).alias('rank_2w')
plast_w3 = F.lag('rank',3).over(window2).alias('rank_3w')

products = (topProducts4weeks
          .select('*', 
                    F.explode('ranking_category_path').alias('cat')
            )
            .filter('cat.locale = "en-US"')
             .drop('ranking_category_path')          
          .select('rank','brand','ranking_country','rank_timestamp',
                  'ranking_category',F.col('cat.name').alias('category'), col2,
                 'price_range',F.explode('product_title').alias('product'))
          
            .filter('product.locale = "en-US"')
          .select('*', plast_w1,plast_w2,plast_w3)
          .filter(F.col('rank_timestamp') == max_timestamp )
         )

In [188]:
products = products.drop("name")

In [189]:
my_products = products.join(klarna_merchants,klarna_merchants.name == products.brand,'leftOuter')
my_products.cache().count()


1399598

DataFrame[rank: bigint, brand: string, ranking_country: string, rank_timestamp: timestamp, ranking_category: bigint, category: string, my_cat: string, price_range: struct<min:decimal(38,9),max:decimal(38,9),currency:string>, product: struct<locale:string,name:string>, rank_1w: bigint, rank_2w: bigint, rank_3w: bigint]

In [186]:
products = spark.read.parquet(f'gs://{bucket}/klarna/my_products_fullCategories.parquet')

In [162]:
# !gsutil -m rm -r 'gs://{bucket}/klarna/my_products_fullCategories.parquet'

In [None]:
my_products.write.parquet(f'gs://{bucket}/klarna/my_products_fullCategories.parquet')

In [190]:
my_products_us = my_products.where('ranking_country = "US" ')

In [198]:
pranks = (F.concat_ws(' ◄ ','rank_1w','rank_2w', 'rank_3w')).alias('ranks')

In [199]:
my_products_us.withColumn('ranks',pranks)

DataFrame[rank: bigint, brand: string, ranking_country: string, rank_timestamp: timestamp, ranking_category: bigint, category: string, my_cat: string, price_range: struct<min:decimal(38,9),max:decimal(38,9),currency:string>, product: struct<locale:string,name:string>, rank_1w: bigint, rank_2w: bigint, rank_3w: bigint, position: bigint, target: string, name: string, ranks: string]

In [None]:
window = Window.partitionBy("category","ranking_country").orderBy('rank')
my_rank = F.rank().over(window).alias('relative_rank')

In [244]:
final_products = (my_products_us
 .select(my_rank,'brand','product.name' ,"rank","rank_timestamp","category","ranking_country",ranks,F.lit(False).alias('klarna'),F.lit(False).alias('rising'))
 .filter('relative_rank <=30')
 .distinct()
).unionAll(
(my_products_us
 .filter('name is not null')
 .select(my_rank,'brand','product.name' ,"rank","rank_timestamp","category","ranking_country",ranks,F.lit(True).alias('klarna'),F.lit(False).alias('rising'))
 .filter('relative_rank <=30')
 .distinct()
)
).unionAll(
(my_products_us
    .filter ( (F.col('rank') <     F.col('rank_3w') - 200) 
                & (F.col('rank_1w') <  F.col('rank_2w')) 
                & (F.col('rank_2w') <  F.col('rank_3w')) 
                & (F.col('rank') > 10 ) 
                & (F.col('rank') < 1000 ) )
    .select(my_rank,'brand','product.name' ,"rank","rank_timestamp","category","ranking_country",ranks,F.lit(False).alias('klarna'),F.lit(True).alias('rising'))
 .filter('relative_rank <=30')
 .distinct()
)
)

In [245]:
final_products.cache().count()

113068

In [249]:
spark._jsc.hadoopConfiguration().set('fs.gs.impl', 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem')
# This is required if you are using service account and set true, 
# spark._jsc.hadoopConfiguration().set('fs.gs.auth.service.account.enable', 'true')
# spark._jsc.hadoopConfiguration().set('google.cloud.auth.service.account.json.keyfile', credentials_file)


(final_products.coalesce(1).write
  .option("parentProject" ,'tr-tech-innovation-dev')     
  .option('project','tr-tech-innovation-dev')
#   .option("credentialsFile", credentials_file)
  .format("bigquery")
  .option("temporaryGcsBucket","tmp-tr-tech-innovation-dev")
  .mode("overwrite")
  .save("merchant.products_top30")
)

In [248]:
(final_products
 .filter('klarna')
 .filter('category = "Electronics > Audio" ')
 .filter('ranking_country = "US"')
 .select('brand','name','ranking_country','rank','ranks'   )
 .orderBy('rank')
).show()

+-----+--------------------+---------------+----+---------------+
|brand|                name|ranking_country|rank|          ranks|
+-----+--------------------+---------------+----+---------------+
| Bose|Bose QuietComfort...|             US|   4|    4 ◄ 61 ◄ 77|
|  JBL|          JBL Flip 5|             US|   9|    7 ◄ 11 ◄ 17|
| Bose|  Bose Sport Earbuds|             US|  21|               |
|  JBL|        JBL Charge 4|             US|  23|   19 ◄ 20 ◄ 49|
| Bose|Bose QuietComfort...|             US|  24|        18 ◄ 17|
| Bose|Bose SoundLink Re...|             US|  28| 30 ◄ 202 ◄ 189|
|  JBL|          JBL Clip 3|             US|  33|   34 ◄ 29 ◄ 72|
| Bose|Bose SoundLink Co...|             US|  41|   41 ◄ 62 ◄ 60|
| Bose|         Bose Solo 5|             US|  50|   64 ◄ 84 ◄ 85|
| Bose|Bose SoundSport W...|             US|  62|   69 ◄ 66 ◄ 98|
|  JBL|      JBL TUNE 500BT|             US|  65| 78 ◄ 114 ◄ 279|
| Bose|Bose QuietComfort...|             US|  70|             88|
|  JBL|   

In [203]:
spark._jsc.hadoopConfiguration().set('fs.gs.impl', 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem')
# This is required if you are using service account and set true, 
# spark._jsc.hadoopConfiguration().set('fs.gs.auth.service.account.enable', 'true')
# spark._jsc.hadoopConfiguration().set('google.cloud.auth.service.account.json.keyfile', credentials_file)


(my_products_us.withColumn('ranks',pranks).coalesce(1).write
  .option("parentProject" ,'tr-tech-innovation-dev')     
  .option('project','tr-tech-innovation-dev')
#   .option("credentialsFile", credentials_file)
  .format("bigquery")
  .option("temporaryGcsBucket","tmp-tr-tech-innovation-dev")
  .save("merchant.products")
)

In [192]:
my_products_us.cache().count()

1375890

In [196]:
my_products_us.head()

Row(rank=2386, brand='ASTRO', ranking_country='US', rank_timestamp=datetime.datetime(2020, 10, 23, 0, 0), ranking_category=223, category='Electronics > Audio', my_cat='Electronics', price_range=Row(min=Decimal('60.000000000'), max=Decimal('60.000000000'), currency='USD'), product=Row(locale='en-US', name='Astro Gaming A40 TR Mod Kit'), rank_1w=2620, rank_2w=5031, rank_3w=None, position=None, target=None, name=None)

In [None]:
    my_brands_str = (my_products_us
     .filter('name is not null')
     .filter(F.col('category') == v  )
     .groupby(F.col('brand').alias('klarna'),'product.name' ,'ranking_country')
     .agg(F.min('rank').alias('rank'),ranks )
     .orderBy(F.asc('rank'))
     .limit(10)
    ).show(truncate=False)

In [193]:
from itertools import zip_longest

for v in set(cat_matching.keys()): 
    print(v)
    my_brands_str = (my_products_us
     .filter('name is not null')
     .filter(F.col('category') == v  )
     .groupby(F.col('brand').alias('klarna'),'product.name' ,'ranking_country')
     .agg(F.min('rank').alias('rank'),ranks )
     .orderBy(F.asc('rank'))
     .limit(10)
    ).show(truncate=False)

    all_brands_str = (my_products_us
     .filter(F.col('category') == v  )
     .groupby(F.col('brand').alias('all'),'product.name', 'ranking_country')
     .agg(F.min('rank').alias('rank'),ranks )
     .orderBy(F.asc('rank'))
     .limit(10)
    ).show(truncate=False)

    rising_brands_str = (my_products_us
#     .filter( (F.col('rank') <  0.9 * F.col('rank_1w')) 
#             &    (F.col('rank_1w') <  0.8 * F.col('rank_2w')) 
#             &    (F.col('rank_2w') < 0.7 * F.col('rank_3w')) 
#               &  (F.col('rank') > 10 ) )
    .filter ( (F.col('rank') <    0.5 * F.col('rank_3w')) 
            &    (F.col('rank_1w') <  F.col('rank_2w')) 
            &    (F.col('rank_2w') <  F.col('rank_3w')) 
              &  (F.col('rank') > 10 ) 
              &  (F.col('rank') < 1000 ) )
                        
     .filter(F.col('category') == v  )
     .groupby(F.col('brand').alias('rising'),'product.name', 'ranking_country')
     .agg(F.min('rank').alias('rank'),ranks )
     .orderBy(F.asc('rank'))
     .limit(5)
    ).show(truncate=False)
#     for a,b,c in zip_longest(my_brands_str.split('\n'),all_brands_str.split('\n'),rising_brands_str.split('\n')):
#         print(b,'' if not a else a,'' if not c else c)


Health & Beauty > Personal Care
+-------------+---------------------------------------------+---------------+----+-----------------+
|klarna       |name                                         |ranking_country|rank|ranks            |
+-------------+---------------------------------------------+---------------+----+-----------------+
|Bose         |Bose Sleepbuds II                            |US             |250 |319              |
|Calvin Klein |Calvin Klein Euphoria                        |US             |253 |295 ◄ 555 ◄ 571  |
|Mario Badescu|Mario Badescu Drying Lotion                  |US             |268 |291 ◄ 306 ◄ 201  |
|Hot Topic    |Arctic Fox Semi-Permanent Hair Dye           |US             |345 |531 ◄ 1058 ◄ 475 |
|Love Wellness|Love Wellness pH Balancing Cleanser          |US             |416 |337 ◄ 265 ◄ 269  |
|Calvin Klein |Calvin Klein Eternity Men                    |US             |549 |371              |
|Calvin Klein |Calvin Klein Euphoria for Women Eau de Parfu