In [3]:
%load_ext autoreload
%autoreload 2
import os
import datetime
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from pyspark.sql import functions as F
from pyspark.sql.types import *

from distribution_cost.configuration import spark_config
from distribution_cost.configuration.app import AppConfig

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Database uri
app_config = AppConfig()
db_uri = app_config.db_uri_jdbc
db_uri

'jdbc:oracle:thin:BRC03_VMECA/8sUFYtvK@//pyox2k01:1521/BRCEX_PP2'

In [5]:
# Create spark session
spark_context, spark_session = spark_config.get_spark(app_name="app-distribution-cost",
                                                      executors=4, executor_cores=4, executor_mem='4g',
                                                      dynamic_allocation=True, max_executors=8)

In [6]:
def df_process(file_location):  
  file_type = 'csv'
  # CSV options
  infer_schema = "false"
  first_row_is_header = "true"
  delimiter = ";"

  # The applied options are for CSV files. For other file types, these will be ignored.
  df = spark_session.read.format(file_type) \
    .option("encoding", 'ISO-8859-1') \
    .option("inferSchema", infer_schema) \
    .option("header", first_row_is_header) \
    .option("sep", delimiter) \
    .load(file_location)

  #Eliminate special characters in column names
  df = df.select(*(F.col("`" + c+ "`").alias(c.replace('-', '')) for c in df.columns))
  df = df.select(*(F.col("`" + c+ "`").alias(c.replace(' ', '_')) for c in df.columns))
  df = df.select(*(F.col("`" + c+ "`").alias(c.replace('(', '_')) for c in df.columns))
  df = df.select(*(F.col("`" + c+ "`").alias(c.replace(')', '_')) for c in df.columns))
  df = df.select(*(F.col("`" + c+ "`").alias(c.replace('=', '_')) for c in df.columns))
  df = df.select(*(F.col("`" + c+ "`").alias(c.replace('.', '_')) for c in df.columns))
  df = df.select(*(F.col("`" + c+ "`").alias(c.replace('\'', '_')) for c in df.columns))
  df = df.select(*(F.col("`" + c+ "`").alias(c.replace(';', '_')) for c in df.columns))
  return df

In [7]:
dfPROMO_1= df_process("/user/e587247/data/landing/ope/Operations Commerciales - AP - client promotions.csv")
dfPROMO_2= df_process("/user/e587247/data/landing/ope/05-05-2020 - Network Remuneration - Base REF_PROMO - CSV.csv")

In [8]:
dfPROMO = dfPROMO_1.union(dfPROMO_2)

In [9]:
dfPROMO=dfPROMO.drop_duplicates()

In [10]:
column_value_cat= {'Autres':'Other','Buy Back':'Buy Back','Hors scope':'Out of scope','Reprise':'Trade-in','Evénements':'Events','Services':'Services','Produit':'Product','Stock':'Stock','Remise':'Discount','CRM':'CRM','Financement':'Funding'}
dfPROMO = dfPROMO.replace(to_replace=column_value_cat, subset=['Catégorie'])

In [11]:
dfPROMO = dfPROMO.withColumn('Classification',F.when(F.col('Catégorie').isin('Discount','Events','Product','Funding','Trade-in','Services','Stock','CRM','Buy Back','Other','Out of scope'), 'Client Promotions').otherwise('Network Remuneration'))

In [12]:
dfPROMO = dfPROMO.withColumn('Libellé_Opération_Commerciale',F.concat(F.col('Libellé_Opération_Commerciale'),F.lit(' - '), F.col('Classification')))
dfPROMO=dfPROMO.drop('Classification')

In [13]:
newCat = spark_session.createDataFrame([('Unknown','Unknown')], dfPROMO.columns)
dfPROMO = dfPROMO.union(newCat)

In [14]:
#import vinxpromo and do outer join on promo
dfVINPROMO = spark_session.read.load("/user/brc03/vmeca/data/refined/vinpromo/COUNTRY=France")
dfVINPROMO = dfVINPROMO.withColumn('COUNTRY',F.lit('France')).drop('year','month','day')

In [15]:
dfPROMO = dfPROMO.withColumnRenamed('Libellé_Opération_Commerciale','LIB_OPC')
dfPROMO = dfPROMO.withColumnRenamed('Catégorie','LIB_CAT')
dfPROMO = dfPROMO.withColumn('COUNTRY',F.lit('France'))

In [16]:
# flag promo as to be flagged (cat)
# append to ref promo
df_toFlagged= dfVINPROMO.filter(F.col('COUNTRY') == 'France').join(dfPROMO, on = ['LIB_OPC','COUNTRY'], how='left_outer')

In [17]:
df_toFlagged = df_toFlagged.select('LIB_OPC','LIB_CAT','COUNTRY').withColumn('LIB_CAT', F.lit('To be flagged'))

In [18]:
dfPROMO = dfPROMO.union(df_toFlagged)

In [19]:
dfPROMO=dfPROMO.drop_duplicates()

## Load madax on oracle

In [27]:
dfPROMO.write.option('truncate','true').jdbc(url=db_uri, table="SMKT005_REF_PROMO", mode="overwrite")

In [20]:
dfPROMO.write.mode("overwrite").partitionBy("COUNTRY").parquet("hdfs:///user/brc03/vmeca/data/refined/ope/")

In [21]:
spark_session.stop()