<img src="media/logo_psa.jpg" width="300">

<h1><center>Constructing MADAX U SAMARA</center></h1>

## Imports

In [1]:
%load_ext autoreload
%autoreload 2
import os
import datetime
import numpy as np
import pandas as pd
from functools import reduce
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from pyspark.sql import functions as F
import pyspark.sql.types as StringType

from distribution_cost.configuration import spark_config
from distribution_cost.configuration.app import AppConfig
from distribution_cost.configuration.data import DataConfig
from distribution_cost.infra import oracle
from distribution_cost.domain import kpis

/gpfs/user/e587246/dco00/conf/application.yml
/gpfs/user/e587246/dco00


## Connection to Exadata (Optional)

In [2]:
# Database uri
app_config = AppConfig()

db_uri = app_config.db_uri_jdbc
db_uri_cx_oracle = app_config.db_uri_cx_oracle

# Data Config
data_config = DataConfig()

data_config.vhls_perimeter

sites = data_config.vhls_perimeter["sites"]
start_date = data_config.vhls_perimeter["start_date"]
end_date = data_config.vhls_perimeter["end_date"]
genr_door = data_config.vhls_perimeter["genr_door"]

## Creating Spark Session

In [3]:
# Create spark session
spark_context, spark_session = spark_config.get_spark(app_name="app-distribution-cost",
                                                      executors=3, executor_cores=4, executor_mem='8g',
                                                      dynamic_allocation=True, max_executors=8)

# spark_session.conf.set("spark.sql.crossJoin.enabled", "true")
# spark_session.conf.set("spark.sql.sources.partitionOverwriteMode","dynamic")

## Loading the tables from HDFS

In [4]:
# Read MADAX
df_madax = spark_session.read.load("/user/e587247/data/raw/madaxv30c")
# Read SAMARA
df_samara = spark_session.read.load("/user/e587247/data/raw/samarav30c1")
# READ OPV
df_opv = spark_session.read.load("/user/e587247/data/raw/opvv30c")
# READ TAX RATE (FROM ORACLE DB)?
df_taxRate = spark_session.read.option("fetchsize", 10000).jdbc(db_uri, table='SMKT001_REF_TAX')

## Parameters 

In [5]:
# Renaming VIN column in madax, samara and opv and dropping year, month and day
df_samara=df_samara.withColumnRenamed('CODE_VIN','VIN').drop("year", "month", "day")#.withColumn('COUNTRY',F.lit('France')).drop('day')
df_madax=df_madax.withColumnRenamed('NOVIN','VIN').drop("year", "month", "day")#.withColumn('COUNTRY',F.lit('France')).drop('day')
df_opv=df_opv.withColumnRenamed('CDC_NUMVIN', 'VIN').drop("year", "month", "day")#.withColumn('COUNTRY', F.lit('France')).drop('day')

# Make the outer join between MADAX and SAMARA on VIN and Country
df_mx_outer_sm = df_madax.join(df_samara,on=['COUNTRY','VIN'] , how='outer')

# Make the left join between OPV and MXSM on VIN and Country
df_mx_outer_sm_opv = df_mx_outer_sm.join(df_opv, on=['COUNTRY','VIN'], how='left_outer')

# Make the left join between tax rate and MXSM on VIN and Country
df_mx_outer_sm_opv = df_mx_outer_sm_opv.join(df_taxRate, on=['COUNTRY'], how='left_outer')

In [6]:
# Merge common columns between MADAX and SAMARA
df_mx_outer_sm_opv= df_mx_outer_sm_opv.withColumn('BRAND', F.when(F.col('MARQUE').isNull(),F.col('SINQTMRQ_2__CODE')).otherwise(F.col('MARQUE')))
df_mx_outer_sm_opv= df_mx_outer_sm_opv.withColumn('DATE_SALE', F.when(F.col('DATVENT').isNull(),F.col('DT_VD')).otherwise(F.col('DATVENT')))
df_mx_outer_sm_opv= df_mx_outer_sm_opv.withColumn('DATE_ORDER', F.when(F.col('DATCCLT').isNull(),F.col('DATE_COMANDE')).otherwise(F.col('DATCCLT')))
df_mx_outer_sm_opv= df_mx_outer_sm_opv.withColumn('LIB_TYPUTIL', F.when(F.col('TYPUTIL').isNull(),F.col('TYP_UTIL_VD')).otherwise(F.col('TYPUTIL')))
df_mx_outer_sm_opv=df_mx_outer_sm_opv.drop('MARQUE','DATVENT','DATCCLT','TYPUTIL','TYP_UTIL_VD','DATE_COMANDE','DT_VD','SINQTMRQ_2__CODE','LIB_ZDS')

In [7]:
# Date formatting (dd-MM-yyyy)
list_dates = ['DATMEC','DATE_SALE','DATPROD','DATMRES','DATMAD','DATEXPC','DATARCR','DATE_ORDER',"DATDEM","DATIMMAT","DT_FACT"]
for d in list_dates:
    df_mx_outer_sm_opv = df_mx_outer_sm_opv.withColumn(d, F.to_date(F.col(d),'yyyy-MM-dd'))

# Convert variables to float
list_amounts = ['TOTAL_REMISE','NCL_VO_VALON','NCL_VO_IMPAYUDAREC','TOTAL_REMISE_PRE'] 
for a in list_amounts:
    df_mx_outer_sm_opv = df_mx_outer_sm_opv.withColumn(a, F.regexp_replace(F.col(a),'\,','.'))
    df_mx_outer_sm_opv = df_mx_outer_sm_opv.withColumn(a, F.regexp_replace(F.col(a),'[^0-9\-\.]','').cast("float"))

In [8]:
# Variable calculations
df_mx_outer_sm_opv=df_mx_outer_sm_opv.withColumn('TOT_ADV_AFT_TAX', F.abs(F.col('TOTAL_REMISE')) + F.col('NCL_VO_IMPAYUDAREC'))
df_mx_outer_sm_opv= df_mx_outer_sm_opv.withColumn('TOT_ADV_PRE_TAX', F.col('TOT_ADV_AFT_TAX')/(1+F.col('TAX_RATE').cast('float')))
df_mx_outer_sm_opv= df_mx_outer_sm_opv.withColumn('TRADE_IN_AID_PRE',  F.col('NCL_VO_IMPAYUDAREC')/(1+F.col('TAX_RATE').cast('float')))

df_mx_outer_sm_opv= df_mx_outer_sm_opv.withColumn('DISC_AFT', F.when((F.abs(F.col('TOTAL_REMISE_PRE')) > 0) & (F.col('NCL_VO_IMPAYUDAREC').isNull()) & (F.col('NCL_VO_VALON').isNull()),1).otherwise(F.when(((F.abs(F.col('TOTAL_REMISE_PRE')) > 0) & (F.col('NCL_VO_IMPAYUDAREC') == 0) & (F.col('NCL_VO_VALON') == 0)),1).otherwise(0)))
df_mx_outer_sm_opv= df_mx_outer_sm_opv.withColumn('DISC_PRE', F.when(((F.abs(F.col('TOTAL_REMISE_PRE')) <= 0) | (F.col('NCL_VO_IMPAYUDAREC')<= 0) | (F.col('NCL_VO_VALON')<= 0)),0).otherwise(1))
df_mx_outer_sm_opv= df_mx_outer_sm_opv.withColumn('VIN_DM',F.col('DISC_AFT')+F.col('DISC_PRE'))

df_mx_outer_sm_opv= df_mx_outer_sm_opv.withColumn('VEH_AGE',  F.datediff(F.col('DATE_ORDER'),F.col('DATPROD')))
df_mx_outer_sm_opv= df_mx_outer_sm_opv.withColumn('STOCK_AGE',F.when(F.col('VEH_AGE')> 0, F.col('VEH_AGE')).otherwise(F.lit(None)))
df_mx_outer_sm_opv= df_mx_outer_sm_opv.withColumn('DELIVERY_TIME', F.datediff(F.col('DATE_ORDER'),F.col('DATE_SALE'))*(-1))

In [9]:
# Select columns from MX-SM-OPV (only COD_PDV from OPV)
df_mx_outer_sm_opv=df_mx_outer_sm_opv.drop('CRE_MARCA', 'DATE_COMANDE', 'NCL_PD_SUBTOTAL2', 'FINITION', 'ACC_PRE', 'NCL_VO_IMPAYUDAREC', 'FRAIS_ANEXXES', 'TOTAL_REMISE_PRE', 'TOTAL_REMISE', 'PRIX_FINAL', 'tarif+options_PRE', 'tarif+OPTTIONS', 'NCL_VO_PRIMACONVERSION', 'ACC', 'CONTRAT_SERVICE', 'BONUS_MALUS', 'NCL_VO_VALON', 'TRANSFORMATIONS', 'TAXE_PARAFISCALE', 'day','TAX_RATE')

In [10]:
# Cast all float values into string (for power bi format ??)
for c in df_mx_outer_sm_opv.columns:
    df_mx_outer_sm_opv = df_mx_outer_sm_opv.withColumn(c, F.col(c).cast('string'))

In [11]:
df_mx_outer_sm_opv.drop_duplicates()

DataFrame[COUNTRY: string, VIN: string, VPVU: string, DATMEC: string, INDDEMO: string, CO2MIXTE: string, HABEXTT: string, HABEXTC: string, FAMILLE: string, LIBFAMI: string, COD_MOTOR: string, DATDEM: string, DATIMMAT: string, CODOP1: string, CODOP2: string, CODOP3: string, CODOP4: string, CODOP5: string, CODPROM: string, CODPROM2: string, REMIPOUR: string, DATMAD: string, DATEXPC: string, DATARCR: string, LIBCOMBIPACK: string, CODCPRO: string, LIBELLE: string, DATPROD: string, DATMRES: string, CODOP1_LIBELLE: string, CODOP2_LIBELLE: string, CODOP3_LIBELLE: string, CODOP4_LIBELLE: string, CODOP5_LIBELLE: string, CODE_CLI_2: string, CODE_PAYS_IMPLANT: string, CODE_VER: string, LIB_VER: string, DT_FACT: string, DT_COMM_CLI_FIN_VD: string, DATIMM: string, CODE_CMP: string, LIB_CMP: string, CODE_CND: string, LIB_CND: string, CODE_CLI: string, LIB_CLI: string, CODE_SEG: string, LIB_SEG: string, CODE_SFA: string, LIB_SFA: string, CODE_FAM: string, LIB_FAM: string, CODE_CMA: string, LIB_CMA: s

### Writing the result in Oracle DB (Exadata)

In [12]:
column_values_ctry = {'France':'France','Belgique':'Belgium','Italie':'Italy','Espagne':'Spain','Grande':'Great Britain','Gde-Bretagne':'Great Britain','Allemagne':'Germany','Pologne':'Poland','Autriche':'Austria','Pays Bas':'Netherlands','Pays-bas':'Netherlands','Pays':'Netherlands','Portugal':'Portugal'}
df_mx_outer_sm_opv=df_mx_outer_sm_opv.replace(to_replace=column_values_ctry, subset=['COUNTRY'])

In [13]:
df_mx_outer_sm_opv.columns

['COUNTRY',
 'VIN',
 'VPVU',
 'DATMEC',
 'INDDEMO',
 'CO2MIXTE',
 'HABEXTT',
 'HABEXTC',
 'FAMILLE',
 'LIBFAMI',
 'COD_MOTOR',
 'DATDEM',
 'DATIMMAT',
 'CODOP1',
 'CODOP2',
 'CODOP3',
 'CODOP4',
 'CODOP5',
 'CODPROM',
 'CODPROM2',
 'REMIPOUR',
 'DATMAD',
 'DATEXPC',
 'DATARCR',
 'LIBCOMBIPACK',
 'CODCPRO',
 'LIBELLE',
 'DATPROD',
 'DATMRES',
 'CODOP1_LIBELLE',
 'CODOP2_LIBELLE',
 'CODOP3_LIBELLE',
 'CODOP4_LIBELLE',
 'CODOP5_LIBELLE',
 'CODE_CLI_2',
 'CODE_PAYS_IMPLANT',
 'CODE_VER',
 'LIB_VER',
 'DT_FACT',
 'DT_COMM_CLI_FIN_VD',
 'DATIMM',
 'CODE_CMP',
 'LIB_CMP',
 'CODE_CND',
 'LIB_CND',
 'CODE_CLI',
 'LIB_CLI',
 'CODE_SEG',
 'LIB_SEG',
 'CODE_SFA',
 'LIB_SFA',
 'CODE_FAM',
 'LIB_FAM',
 'CODE_CMA',
 'LIB_CMA',
 'CODE_CMI',
 'LIB_CMI',
 'TYPE_FLOTTE_VD',
 'TYPE_OPE_ESSOR',
 'CODE_PROFESSION_VD',
 'CODE_PROMO',
 'CODE_PROMO2',
 'ANNEE_MOIS',
 'VOLUME_AJ',
 'PRIX_VENTE',
 'MACOM_CONSO',
 'MACOM_CONSO_VERSION',
 'MACOM_ENTITE',
 'MACOM_ENTITE_AJ',
 'MACOM_ENTITE_OPTION',
 'PRIX_VENTE_AJ'

In [None]:
df_mx_outer_sm_opv.write.option("truncate", "true").jdbc(url=db_uri, table="SMKT003_MXUSM", mode="overwrite")

In [1]:
spark_session.stop()

NameError: name 'spark_session' is not defined