<img src="media/logo_psa.jpg" width="300">

<h1><center>Constructing MADAX Data</center></h1>

## Imports

In [14]:
%load_ext autoreload
%autoreload 2
import os
import datetime
import pandas as pd
# Raise pandas' display limits so long/wide frames are not truncated in cell output.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 100

from pyspark.sql import functions as F
from pyspark.sql.types import *

from distribution_cost.configuration import spark_config
from distribution_cost.configuration.app import AppConfig
from distribution_cost.configuration.data import DataConfig
from distribution_cost.infra import oracle
from distribution_cost.domain import kpis

import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Exadata connection settings and data perimeter (Optional)

In [15]:
# Application-level settings: the two Oracle connection URIs (JDBC and cx_Oracle flavours).
app_config = AppConfig()
db_uri = app_config.db_uri_jdbc
db_uri_cx_oracle = app_config.db_uri_cx_oracle

# Data-level settings: the vehicle perimeter, read once and then unpacked by key.
data_config = DataConfig()
vhls_perimeter = data_config.vhls_perimeter

# NOTE(review): none of the four values below is referenced by the later cells of this
# notebook (the MADAX query uses the hard-coded dateFrom/dateTo instead) - confirm whether
# the perimeter was meant to drive the extraction.
sites = vhls_perimeter["sites"]
start_date = vhls_perimeter["start_date"]
end_date = vhls_perimeter["end_date"]
genr_door = vhls_perimeter["genr_door"]

## Creating Spark Session

In [16]:
# Spark session used for the MADAX build: 3 executors x 5 cores x 8g each, with dynamic
# allocation allowed to grow up to 8 executors.
# (A previous, disabled configuration used app_name="app-distribution-cost" with
#  executors=4, executor_cores=4, executor_mem='16g' and the same dynamic allocation.)
spark_settings = dict(
    app_name="app-reduce-brc10sinqtfv4",
    executors=3,
    executor_cores=5,
    executor_mem='8g',
    dynamic_allocation=True,
    max_executors=8,
)
spark_context, spark_session = spark_config.get_spark(**spark_settings)

# Allow joins without an explicit condition.
# NOTE(review): presumably needed because the MADAX query lists two tables separated by a
# comma in its FROM clause - confirm.
spark_session.conf.set("spark.sql.crossJoin.enabled", "true")

# Dynamic partition overwrite is not enabled; the setting is kept here for reference:
# spark_session.conf.set("spark.sql.sources.partitionOverwriteMode","dynamic")

## Loading the tables from HDFS

In [17]:
# All source tables live under the same standardized BDS00 directory on HDFS.
BDS00_ROOT = "/user/brc06/data/standardized/bds00/"


def _load_bds00(table_name):
    """Read the BDS00 table stored in the sub-directory `table_name` of BDS00_ROOT."""
    return spark_session.read.load(BDS00_ROOT + table_name + "/")


df_rbvqtcdc = _load_bds00("rbvqtcdc")
df_rbvqttxm_arc = _load_bds00("rbvqttxm_arc")
df_rbvqtfll = _load_bds00("rbvqtfll")
df_rbvqttff = _load_bds00("rbvqttff")
df_rbvqtlm1 = _load_bds00("rbvqtlm1")
df_rbvqtveh = _load_bds00("rbvqtveh")
df_rbvqttut = _load_bds00("rbvqttut")
df_rbvqtcco = _load_bds00("rbvqtcco")
df_rbvqtcaf = _load_bds00("rbvqtcaf")
df_rbvqtm10 = _load_bds00("rbvqtm10")
df_rbvqtctp = _load_bds00("rbvqtctp")
df_rbvqtfam = _load_bds00("rbvqtfam")

In [18]:
# Expose every loaded frame to Spark SQL under a view named after its Python variable,
# so the MADAX query can reference them by name.
for view_name, frame in (
    ("df_rbvqtcdc", df_rbvqtcdc),
    ("df_rbvqttxm_arc", df_rbvqttxm_arc),
    ("df_rbvqtfll", df_rbvqtfll),
    ("df_rbvqttff", df_rbvqttff),
    ("df_rbvqtlm1", df_rbvqtlm1),
    ("df_rbvqtveh", df_rbvqtveh),
    ("df_rbvqttut", df_rbvqttut),
    ("df_rbvqtcco", df_rbvqtcco),
    ("df_rbvqtcaf", df_rbvqtcaf),
    ("df_rbvqtm10", df_rbvqtm10),
    ("df_rbvqtctp", df_rbvqtctp),
    ("df_rbvqtfam", df_rbvqtfam),
):
    frame.createOrReplaceTempView(view_name)

## Parameters

In [19]:
# Bounds of the DATVENT filter in the MADAX query, written as dd/MM/yyyy
# (the format the query passes to TO_DATE).
dateFrom, dateTo = "01/08/2017", "01/02/2020"

## SQL Query that constructs MADAX

In [20]:
# MADAX extraction query (Spark SQL over the temp views registered in the previous cells).
#
# - Driving table is df_rbvqttxm_arc (alias RBVQTTXM_ARC); the remaining views are joined
#   to it on QI_FILIAL plus a table-specific key.
# - df_rbvqttff is listed after a comma in FROM, i.e. it has no ON clause of its own; it is
#   only tied to the rest through the RBVQTFLL join, which requires both RBVQTTFF.QI_FILIAL
#   and RBVQTTXM_ARC.QI_FILIAL to equal RBVQTFLL.QC_FILIAL.
#   NOTE(review): this looks like the reason spark.sql.crossJoin.enabled is switched on -
#   confirm, and consider an explicit JOIN ... ON once the column types are checked.
# - RBVQTLM1 and RBVQTTUT_TXM are joined but none of their columns is selected.
#   NOTE(review): they may still change the row count if their join keys are not unique -
#   verify before removing them.
# - Table__129 reduces df_rbvqtctp to one label per CODPSA (MAX(LITCTP)).
# - DATIMMAT is taken from RBVQTM10.DATIMM when QC_FILIAL is 1 or 63, otherwise from
#   RBVQTVEH.DATIMMAT.
# - Rows are restricted to RBVQTFAM QI_FILIAL '83'/'84', the listed countries, the brands
#   AP/AC/DS, and DATVENT between the two dates (BETWEEN, so both bounds are included).
# - {0} and {1} are filled by str.format with dateFrom and dateTo (dd/MM/yyyy strings).
queryMADAX = """
(SELECT 
  RBVQTTXM_ARC.NOVIN NOVIN, 
  RBVQTTXM_ARC.DATCCLT DATCCLT, 
  RBVQTFAM_restriction.VPVU VPVU, 
  RBVQTM10.DATMEC DATMEC, 
  RBVQTTXM_ARC.TYPUTIL TYPUTIL, 
  RBVQTVEH.INDDEMO INDDEMO, 
  RBVQTM10.CO2MIXTE CO2MIXTE, 
  RBVQTTXM_ARC.HABEXTT HABEXTT, 
  RBVQTTXM_ARC.HABEXTC HABEXTC, 
  RBVQTFAM_restriction.FAMILLE FAMILLE, 
  RBVQTFAM_restriction.LIBFAMI LIBFAMI, 
  RBVQTTXM_ARC.COD_MOTOR COD_MOTOR, 
  RBVQTTXM_ARC.DATVENT DATVENT, 
  RBVQTTFF.LIBELLE_FRANCAIS LIBELLE_FRANCAIS, 
  RBVQTFAM_restriction.MARQUE MARQUE, 
  RBVQTM10.DATDEM DATDEM, 
  CASE WHEN ( RBVQTFLL.QC_FILIAL ) IN (1,63) THEN RBVQTM10.DATIMM ELSE RBVQTVEH.DATIMMAT END DATIMMAT,
  RBVQTCDC.CODOP1 CODOP1, 
  RBVQTCDC.CODOP2 CODOP2, 
  RBVQTCDC.CODOP3 CODOP3, 
  RBVQTCDC.CODOP4 CODOP4, 
  RBVQTCDC.CODOP5 CODOP5, 
  RBVQTCDC.CODPROM CODPROM, 
  RBVQTCDC.CODPROM2 CODPROM2,
  RBVQTCDC.REMIPOUR REMIPOUR,
  RBVQTVEH.DATMAD DATMAD, 
  RBVQTVEH.DATEXPC DATEXPC, 
  RBVQTVEH.DATARCR DATARCR, 
  RBVQTCCO.LIBCOMBIPACK LIBCOMBIPACK, 
  RBVQTTXM_ARC.CODCPRO CODCPRO, 
  Table__129.LIBELLE LIBELLE, 
  RBVQTVEH.DATPROD DATPROD, 
  RBVQTCAF.DATMRES DATMRES 
  
FROM 
  df_rbvqttxm_arc RBVQTTXM_ARC,
  df_rbvqttff RBVQTTFF
  LEFT OUTER JOIN df_rbvqtcdc RBVQTCDC ON RBVQTTXM_ARC.QI_FILIAL=RBVQTCDC.QI_FILIAL and RBVQTTXM_ARC.NUMCCLT=RBVQTCDC.NUMCCLT
  JOIN  df_rbvqtfll RBVQTFLL ON RBVQTTFF.QI_FILIAL=RBVQTFLL.QC_FILIAL and RBVQTTXM_ARC.QI_FILIAL=RBVQTFLL.QC_FILIAL
  LEFT OUTER JOIN df_rbvqtlm1 RBVQTLM1 ON RBVQTTXM_ARC.COD_MOTOR=RBVQTLM1.COD_MOTOR and RBVQTTXM_ARC.QI_FILIAL=RBVQTLM1.QI_FILIAL
  LEFT OUTER JOIN df_rbvqtveh RBVQTVEH ON RBVQTTXM_ARC.QI_FILIAL=RBVQTVEH.QI_FILIAL and RBVQTTXM_ARC.NOVIN=RBVQTVEH.NOVIN
  LEFT OUTER JOIN df_rbvqttut RBVQTTUT_TXM ON RBVQTTXM_ARC.TYPUTIL=RBVQTTUT_TXM.TYPUTIL and RBVQTTXM_ARC.QI_FILIAL=RBVQTTUT_TXM.QI_FILIAL
  LEFT OUTER JOIN df_rbvqtcco RBVQTCCO ON RBVQTTXM_ARC.QI_FILIAL=RBVQTCCO.QI_FILIAL and RBVQTTXM_ARC.CODCPER=RBVQTCCO.CODCPER and RBVQTTXM_ARC.VERSION_CCO=RBVQTCCO.VERSION
  LEFT OUTER JOIN df_rbvqtcaf RBVQTCAF ON RBVQTTXM_ARC.NOCAF=RBVQTCAF.NOCAF and RBVQTTXM_ARC.QI_FILIAL=RBVQTCAF.QI_FILIAL
  LEFT OUTER JOIN df_rbvqtm10 RBVQTM10 ON RBVQTTXM_ARC.QI_FILIAL=RBVQTM10.QI_FILIAL and RBVQTTXM_ARC.NOVIN=RBVQTM10.NOVIN
  LEFT OUTER JOIN (SELECT CODPSA as CODPSA,  MAX(LITCTP) as LIBELLE FROM df_rbvqtctp RBVQTCTP GROUP BY CODPSA ) Table__129 ON RBVQTTXM_ARC.CODCPRO=Table__129.CODPSA
  JOIN  df_rbvqtfam RBVQTFAM_restriction ON RBVQTTXM_ARC.VPVU=RBVQTFAM_restriction.VPVU and SUBSTR(RBVQTTXM_ARC.VERS14, 1, 4)=RBVQTFAM_restriction.FAMILLE
  
  WHERE  RBVQTFAM_restriction.QI_FILIAL IN ('83','84')
  AND RBVQTTFF.LIBELLE_FRANCAIS  IN  ( 'France' , 'Allemagne' , 'Portugal' , 'Autriche' , 'Belgique' , 'Italie' , 'Espagne' , 'Gde-Bretagne' , 'Pays Bas' , 'Pologne'  ) 
  AND RBVQTFAM_restriction.MARQUE  IN  ( 'AP' , 'AC', 'DS'  )  
  AND RBVQTTXM_ARC.DATVENT BETWEEN TO_DATE('{0}', 'dd/MM/yyyy') AND TO_DATE('{1}', 'dd/MM/yyyy')
  ORDER BY RBVQTTXM_ARC.DATVENT
  )
""".format(dateFrom, dateTo)

In [21]:
# Run the MADAX query, then add the calendar parts of DATVENT (year, month, day) and a
# COUNTRY column that copies LIBELLE_FRANCAIS.
datvent = F.col("DATVENT")
dfMADAX = (
    spark_session.sql(queryMADAX)
    .withColumn("year", F.year(datvent))
    .withColumn("month", F.month(datvent))
    .withColumn("day", F.dayofmonth(datvent))
    .withColumn('COUNTRY', F.col('LIBELLE_FRANCAIS'))
)

# Transformations

## Promo Codes Mapping

In [22]:
# Derive CODOP1_LIBELLE ... CODOP5_LIBELLE from the matching CODOPn column.
# Codes starting with 'C' or 'D' lose their first 3 and last 2 characters;
# every other code loses its first 2 and last 2 characters.
for position in range(1, 6):
    code_name = "CODOP{0}".format(position)
    code = F.col(code_name)
    code_length = F.length(code)
    first_char = F.substring(code_name, 1, 1)

    starts_with_c_or_d = (first_char == 'C') | (first_char == 'D')
    # substr(start, length) is 1-based: start at 4 / 3, keep length-5 / length-4 characters.
    label_for_c_or_d = code.substr(F.lit(4), code_length - 5)
    label_for_others = code.substr(F.lit(3), code_length - 4)

    dfMADAX = dfMADAX.withColumn(
        code_name + '_LIBELLE',
        F.when(starts_with_c_or_d, label_for_c_or_d).otherwise(label_for_others),
    )

## Mapping VPVU values from French abbreviations (VP, VU) to English abbreviations (PC, CV), and nulling '0' placeholders

In [23]:
# Value mapping applied to the VPVU column (French abbreviation -> English abbreviation).
column_name_VpVu = {'VP': 'PC', 'VU': 'CV'}


def _null_if_zero(column_name):
    """Return the column with the string '0' replaced by null, other values unchanged."""
    current = F.col(column_name)
    return F.when(current == '0', F.lit(None)).otherwise(current)


dfMADAXFinal = (
    dfMADAX
    .replace(to_replace=column_name_VpVu, subset=['VPVU'])
    .withColumn('REMIPOUR', _null_if_zero('REMIPOUR'))
    .withColumn('CODPROM', _null_if_zero('CODPROM'))
)

### Writing the result to HDFS, partitioned by COUNTRY, MARQUE, year and month

In [24]:
# Persist the final frame as parquet, one directory level per partition column.
# NOTE(review): this notebook does not enable dynamic partition overwrite, so unless
# get_spark configures it, mode("overwrite") is expected to replace everything already
# stored under the target path, not only the partitions present in this run - confirm
# that this is intended when the date range is narrowed.
madax_output_path = "hdfs:///user/brc03/vmeca/data/raw/madax/"
madax_partition_columns = ["COUNTRY", "MARQUE", "year", "month"]

(
    dfMADAXFinal.write
    .mode("overwrite")
    .partitionBy(*madax_partition_columns)
    .parquet(madax_output_path)
)

In [25]:
# Stop the Spark session now that the MADAX output has been written.
spark_session.stop()