<img src="media/logo_psa.jpg" width="300">

<h1><center>Constructing MADAX Data (1. MADAX)</center></h1>

### Imports

In [1]:
%load_ext autoreload
%autoreload 2
import os
import datetime
import pandas as pd
# Raise the pandas display limits so wide/long frames are rendered without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 100

from pyspark.sql import functions as F
import numpy as  np

from distribution_cost.configuration import spark_config
from distribution_cost.configuration.app import AppConfig
from distribution_cost.configuration.data import DataConfig
from distribution_cost.domain import kpis

/gpfs/user/e587246/dco00/conf/application.yml
/gpfs/user/e587246/dco00


In [2]:
# Database uri
app_config = AppConfig()

db_uri = app_config.db_uri_jdbc

# The JDBC URI embeds the account password ("...user/password@//host:port/service")
# and cell outputs are saved with the notebook, so only a masked copy is displayed.
# db_uri itself is left untouched for the JDBC reader.
_credentials, _at, _endpoint = db_uri.rpartition('@')
db_uri_masked = (_credentials.partition('/')[0] + '/********@' + _endpoint) if _at else db_uri
db_uri_masked

'jdbc:oracle:thin:BRC_E587247/********@//pyox2k01:1521/BRCEX_PP2'

In [3]:
# Data Config: load the project's data configuration and display its
# `vhls_perimeter` entry (sites, start/end dates and `genr_door`).
data_config = DataConfig()

data_config.vhls_perimeter

{'sites': ['PY', 'MU'],
 'start_date': '15/01/20',
 'end_date': '17/01/20',
 'genr_door': 'EMON'}

In [4]:
# Unpack the perimeter settings into notebook-level variables.
vhls_perimeter = data_config.vhls_perimeter
sites, start_date, end_date, genr_door = (
    vhls_perimeter[key] for key in ("sites", "start_date", "end_date", "genr_door")
)

In [5]:
# Create spark session
# Sizing options handed to the project's get_spark helper, one per line for readability.
spark_options = dict(
    app_name="app-distribution-cost",
    executors=2,
    executor_cores=4,
    executor_mem='5g',
    dynamic_allocation=True,
    max_executors=8,
)
spark_context, spark_session = spark_config.get_spark(**spark_options)

In [6]:
# Bounds of the DATVENT extraction window, formatted dd/mm/yyyy as expected by the
# TO_DATE calls in the MADAX query (BETWEEN makes both bounds inclusive).
dateFrom, dateTo = "01/01/2019", "30/06/2019"

In [7]:
# Oracle extraction query for the MADAX dataset. It is wrapped in parentheses so that it
# can be passed as the `table` argument of the Spark JDBC reader.
# - Driving table: BRC06_BDS00.RBVQTTXM_ARC. RBVQTCDC, RBVQTLM1, RBVQTVEH, RBVQTTUT,
#   RBVQTCCO, RBVQTCAF, RBVQTM10 and Table__129 are attached with Oracle "(+)" outer
#   joins, so a row is kept even when those lookups have no match.
# - RBVQTFLL, RBVQTTFF and RBVQTFAM (alias RBVQTFAM_restriction) use plain equality joins
#   and carry the filters: LIBELLE_FRANCAIS (country label), MARQUE and QI_FILIAL.
# - Table__129 keeps a single label per CODPSA (MAX(LITCTP)).
# - DATIMMAT comes from RBVQTM10.DATIMM when QC_FILIAL is 1 or 63, otherwise from
#   RBVQTVEH.DATIMMAT.
# - The two %s placeholders are the inclusive DATVENT window bounds (dd/mm/yyyy),
#   filled from dateFrom / dateTo.
# NOTE(review): RBVQTLM1 and RBVQTTUT_TXM are still joined although the columns that read
# from them are commented out in the SELECT list - confirm whether they are still needed.
# NOTE(review): the trailing ORDER BY presumably does not matter once the result is
# written as partitioned parquet - confirm before relying on row order.
queryMADAX = """
(SELECT 
  BRC06_BDS00.RBVQTTXM_ARC.NOVIN, 
  BRC06_BDS00.RBVQTTXM_ARC.DATCCLT, 
  RBVQTFAM_restriction.VPVU, 
  BRC06_BDS00.RBVQTM10.DATMEC, 
  BRC06_BDS00.RBVQTTXM_ARC.TYPUTIL, 
  --RBVQTTUT_TXM.LITERAL MDX_Libelle_Type_Utilisation, 
  BRC06_BDS00.RBVQTVEH.INDDEMO, 
  BRC06_BDS00.RBVQTM10.CO2MIXTE, 
  BRC06_BDS00.RBVQTTXM_ARC.HABEXTT, 
  RBVQTFAM_restriction.FAMILLE, 
  RBVQTFAM_restriction.LIBFAMI, 
  BRC06_BDS00.RBVQTTXM_ARC.COD_MOTOR, 
  --BRC06_BDS00.RBVQTLM1.LITERAL MDX_Libelle_Moteur, 
  --BRC06_BDS00.RBVQTLM1.ENERGIA, 
  BRC06_BDS00.RBVQTTXM_ARC.DATVENT, 
  RBVQTTFF.LIBELLE_FRANCAIS, 
  RBVQTFAM_restriction.MARQUE, 
  BRC06_BDS00.RBVQTM10.DATDEM, 
  CASE WHEN ( BRC06_BDS00.RBVQTFLL.QC_FILIAL ) IN (1,63) THEN BRC06_BDS00.RBVQTM10.DATIMM ELSE BRC06_BDS00.RBVQTVEH.DATIMMAT END AS DATIMMAT,
  BRC06_BDS00.RBVQTCDC.CODOP1, 
  BRC06_BDS00.RBVQTCDC.CODOP2, 
  BRC06_BDS00.RBVQTCDC.CODOP3, 
  BRC06_BDS00.RBVQTCDC.CODOP4, 
  BRC06_BDS00.RBVQTCDC.CODOP5, 
  BRC06_BDS00.RBVQTCDC.CODPROM, 
  BRC06_BDS00.RBVQTCDC.CODPROM2, 
  BRC06_BDS00.RBVQTCDC.REMIPOUR, 
  BRC06_BDS00.RBVQTVEH.DATMAD, 
  BRC06_BDS00.RBVQTVEH.DATEXPC, 
  BRC06_BDS00.RBVQTVEH.DATARCR, 
  --BRC06_BDS00.RBVQTCCO.COMBI, 
  BRC06_BDS00.RBVQTCCO.LIBCOMBI, 
  BRC06_BDS00.RBVQTTXM_ARC.CODCPRO, 
  Table__129.LIBELLE, 
  BRC06_BDS00.RBVQTVEH.DATPROD, 
  BRC06_BDS00.RBVQTCAF.DATMRES 
FROM 
  BRC06_BDS00.RBVQTCDC, 
  BRC06_BDS00.RBVQTTXM_ARC, 
  BRC06_BDS00.RBVQTFLL, 
  BRC06_BDS00.RBVQTTFF, 
  BRC06_BDS00.RBVQTLM1, 
  BRC06_BDS00.RBVQTVEH, 
  BRC06_BDS00.RBVQTTUT RBVQTTUT_TXM, 
  BRC06_BDS00.RBVQTCCO, 
  BRC06_BDS00.RBVQTCAF, 
  BRC06_BDS00.RBVQTM10, 
  (  
  SELECT CODPSA as CODPSA,  MAX(LITCTP) as LIBELLE FROM BRC06_BDS00.RBVQTCTP GROUP BY CODPSA 
  )  Table__129,
  BRC06_BDS00.RBVQTFAM  RBVQTFAM_restriction 
WHERE 
       ( BRC06_BDS00.RBVQTCDC.QI_FILIAL(+)=BRC06_BDS00.RBVQTTXM_ARC.QI_FILIAL  ) 
  AND  ( BRC06_BDS00.RBVQTTXM_ARC.NUMCCLT=BRC06_BDS00.RBVQTCDC.NUMCCLT(+)  ) 
  AND  ( BRC06_BDS00.RBVQTFLL.QC_FILIAL=BRC06_BDS00.RBVQTTFF.QI_FILIAL  ) 
  AND  ( BRC06_BDS00.RBVQTLM1.COD_MOTOR(+)=BRC06_BDS00.RBVQTTXM_ARC.COD_MOTOR  ) 
  AND  ( BRC06_BDS00.RBVQTLM1.QI_FILIAL(+)=BRC06_BDS00.RBVQTTXM_ARC.QI_FILIAL  ) 
  AND  ( BRC06_BDS00.RBVQTVEH.QI_FILIAL(+)=BRC06_BDS00.RBVQTTXM_ARC.QI_FILIAL  ) 
  AND  ( BRC06_BDS00.RBVQTVEH.NOVIN(+)=BRC06_BDS00.RBVQTTXM_ARC.NOVIN  ) 
  AND  ( BRC06_BDS00.RBVQTTXM_ARC.TYPUTIL=RBVQTTUT_TXM.TYPUTIL(+)  ) 
  AND  ( BRC06_BDS00.RBVQTTXM_ARC.QI_FILIAL=RBVQTTUT_TXM.QI_FILIAL(+)  ) 
  AND  ( BRC06_BDS00.RBVQTCCO.QI_FILIAL(+)=BRC06_BDS00.RBVQTTXM_ARC.QI_FILIAL  ) 
  AND  ( BRC06_BDS00.RBVQTTXM_ARC.CODCPER=BRC06_BDS00.RBVQTCCO.CODCPER(+)  ) 
  AND  ( BRC06_BDS00.RBVQTCCO.VERSION(+)=BRC06_BDS00.RBVQTTXM_ARC.VERSION_CCO  ) 
  AND  ( BRC06_BDS00.RBVQTCAF.NOCAF(+)=BRC06_BDS00.RBVQTTXM_ARC.NOCAF and BRC06_BDS00.RBVQTCAF.QI_FILIAL(+)=BRC06_BDS00.RBVQTTXM_ARC.QI_FILIAL  ) 
  AND  ( BRC06_BDS00.RBVQTTXM_ARC.QI_FILIAL=BRC06_BDS00.RBVQTM10.QI_FILIAL(+) and BRC06_BDS00.RBVQTTXM_ARC.NOVIN=BRC06_BDS00.RBVQTM10.NOVIN(+)  ) 
  AND  ( Table__129.CODPSA(+)=BRC06_BDS00.RBVQTTXM_ARC.CODCPRO  ) 
  AND  ( BRC06_BDS00.RBVQTFLL.QC_FILIAL=BRC06_BDS00.RBVQTTXM_ARC.QI_FILIAL  ) 
  AND  ( RBVQTFAM_restriction.VPVU=BRC06_BDS00.RBVQTTXM_ARC.VPVU and RBVQTFAM_restriction.FAMILLE=SUBSTR(BRC06_BDS00.RBVQTTXM_ARC.VERS14, 1, 4)  ) 
  AND  ( RBVQTFAM_restriction.QI_FILIAL IN ('83','84')  ) 
  AND  ( 
         BRC06_BDS00.RBVQTTFF.LIBELLE_FRANCAIS  IN  ( 'France' , 'Allemagne' , 'Portugal' , 'Autriche' , 'Belgique' , 'Italie' , 'Espagne' , 'Gde-Bretagne' , 'Pays Bas' , 'Pologne'  ) 
         AND 
         RBVQTFAM_restriction.MARQUE  IN  ( 'AP' , 'AC', 'DS'  ) 
       ) 
  AND BRC06_BDS00.RBVQTTXM_ARC.DATVENT BETWEEN TO_DATE('%s', 'dd/mm/yyyy') AND TO_DATE('%s', 'dd/mm/yyyy')
  ORDER BY BRC06_BDS00.RBVQTTXM_ARC.DATVENT)
""" % (dateFrom, dateTo)

In [8]:
# dfMADAXPandas = oracle.read_df_from_query(db_uri_cx_oracle, queryMADAX)

In [9]:
# Run the MADAX query on Oracle through JDBC (fetchsize: rows pulled per round trip)
# and mark the resulting DataFrame for caching.
jdbc_reader = spark_session.read.option("fetchsize", 10000)
dfMADAX = jdbc_reader.jdbc(db_uri, table=queryMADAX).cache()

# Tests

In [10]:
#dfMADAX.toPandas().head(10)

# Transformations

## Promo Codes Mapping

In [11]:
# Derive CODOP1_LIBELLE from CODOP1 by trimming a prefix and the last two characters:
# codes starting with 'C' or 'D' lose their first three characters, all other codes
# lose their first two.
codop1 = F.col("CODOP1")
codop1_length = F.length(codop1)
starts_with_c_or_d = codop1.substr(1, 1).isin("C", "D")

dfMADAX = dfMADAX.withColumn(
    "CODOP1_LIBELLE",
    F.when(starts_with_c_or_d, codop1.substr(F.lit(4), codop1_length - 5))
     .otherwise(codop1.substr(F.lit(3), codop1_length - 4)),
)

## Mapping VP/VU values from French abbreviations to English abbreviations (PC/CV)

In [12]:
# Translate the VPVU values from the French abbreviations to the English ones
# (VP -> PC, VU -> CV); only the VPVU column is touched.
column_name_VpVu = dict(VP='PC', VU='CV')
dfMADAX = dfMADAX.replace(column_name_VpVu, subset=['VPVU'])

In [13]:
# A value of '0' in REMIPOUR / CODPROM is treated as "no value" and blanked out.
# Use SQL NULL rather than np.nan: np.nan is injected as a floating-point NaN literal,
# which Spark does not consider null (isNull() stays False) and which makes the result
# type depend on coercion with a double. A NULL branch keeps each column's own type.
dfMADAX = dfMADAX.withColumn('REMIPOUR', F.when(F.col('REMIPOUR')=='0', F.lit(None)).otherwise(F.col('REMIPOUR')))
dfMADAX = dfMADAX.withColumn('CODPROM', F.when(F.col('CODPROM')=='0', F.lit(None)).otherwise(F.col('CODPROM')))

In [14]:
#dfMADAX.toPandas().head(10)

In [15]:
# Add year / month / day columns extracted from DATVENT; they are the partition keys
# used when the frame is written out.
sale_date = F.col("DATVENT")
df_with_year_and_month_and_day = (
    dfMADAX
    .withColumn("year", F.year(sale_date))
    .withColumn("month", F.month(sale_date))
    .withColumn("day", F.dayofmonth(sale_date))
)
# Left disabled by the author:
#spark_session.conf.set("spark.sql.sources.partitionOverwriteMode","dynamic") 

In [16]:
# Persist the enriched frame as parquet, partitioned by year/month/day.
# NOTE(review): mode("overwrite") with the dynamic partition-overwrite setting left
# disabled presumably replaces everything already under this path, not just the
# partitions of the extracted window - confirm this is intended.
madax_raw_path = "hdfs:///user/e587247/data/raw/madax/"
(
    df_with_year_and_month_and_day.write
    .mode("overwrite")
    .partitionBy("year", "month", "day")
    .parquet(madax_raw_path)
)