<img src="media/logo_psa.jpg" width="300">

<h1><center>Constructing SAMARA Data</center></h1>

## Imports

In [1]:
%load_ext autoreload
%autoreload 2
import os
import datetime
import numpy as np
import pandas as pd
from functools import reduce
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from pyspark.sql import functions as F
from pyspark.sql import Window

from distribution_cost.configuration import spark_config
from distribution_cost.configuration.app import AppConfig
from distribution_cost.configuration.data import DataConfig
from distribution_cost.infra import oracle
from distribution_cost.domain import kpis

/gpfs/user/e587247/dco00/conf/application.yml
/gpfs/user/e587247/dco00


## Exadata connection settings and data perimeter (Optional)

In [2]:
# Application settings: JDBC and cx_Oracle connection strings taken from AppConfig.
app_config = AppConfig()

db_uri = app_config.db_uri_jdbc
db_uri_cx_oracle = app_config.db_uri_cx_oracle

# Data settings: the perimeter entry is indexed by field name below.
data_config = DataConfig()

data_config.vhls_perimeter

# Pull the four perimeter fields out in one pass, in the order they are named.
sites, start_date, end_date, genr_door = (
    data_config.vhls_perimeter[field]
    for field in ("sites", "start_date", "end_date", "genr_door")
)

## Creating Spark Session

In [3]:
# Obtain the Spark context and session from the project helper.
spark_launch_options = {
    "app_name": "app-distribution-cost",
    "executors": 3,
    "executor_cores": 5,
    "executor_mem": '10g',
    "dynamic_allocation": True,
    "max_executors": 8,
}
spark_context, spark_session = spark_config.get_spark(**spark_launch_options)

# Permit Cartesian-product joins in Spark SQL.
spark_session.conf.set("spark.sql.crossJoin.enabled", "true")
# Left disabled on purpose:
# spark_session.conf.set("spark.sql.sources.partitionOverwriteMode","dynamic")

## Parameters 

In [13]:
# 31 month-start timestamps beginning on 1 August 2017; each pair of consecutive
# entries is used as one extraction window.
rangeMonths = pd.date_range(
    start=pd.Timestamp("2017-08-01"),
    periods=31,
    freq='MS',
)

## Loading the tables from HDFS

In [14]:
# HDFS folder that holds one sub-folder per standardized SINC0 table.
SINC0_STANDARDIZED_ROOT = "/user/brc10/data/standardized/sinc0"


def load_sinc0_table(table_name):
    """Read one standardized SINC0 table from HDFS.

    Parameters
    ----------
    table_name : str
        Name of the table sub-folder under ``SINC0_STANDARDIZED_ROOT``
        (for example ``"sinqtvin"``).

    Returns
    -------
    The DataFrame returned by ``spark_session.read.load`` for that folder.
    """
    return spark_session.read.load("{}/{}/".format(SINC0_STANDARDIZED_ROOT, table_name))


# One DataFrame per source table; the variable names are reused when the
# temporary views are registered.
df_sinqtvin = load_sinc0_table("sinqtvin")
df_sinqtcli = load_sinc0_table("sinqtcli")
df_sinqtver = load_sinc0_table("sinqtver")
df_sinqtfv4 = load_sinc0_table("sinqtfv4")
df_sinqtcmp = load_sinc0_table("sinqtcmp")
df_sinqtcnd = load_sinc0_table("sinqtcnd")
df_sinqtseg = load_sinc0_table("sinqtseg")
df_sinqtzds = load_sinc0_table("sinqtzds")
df_sinqtsfa = load_sinc0_table("sinqtsfa")
df_sinqtfam = load_sinc0_table("sinqtfam")
df_sinqtrub = load_sinc0_table("sinqtrub")
df_sinqtopc = load_sinc0_table("sinqtopc")
df_sinqtcma = load_sinc0_table("sinqtcma")
df_sinqtcmi = load_sinc0_table("sinqtcmi")
df_sinqtcyr = load_sinc0_table("sinqtcyr")
df_sinqtmrq = load_sinc0_table("sinqtmrq")
df_sinqtbas = load_sinc0_table("sinqtbas")

In [15]:
# Expose every loaded table to Spark SQL under the same name as its Python variable.
sinc0_views = [
    ("df_sinqtvin", df_sinqtvin),
    ("df_sinqtver", df_sinqtver),
    ("df_sinqtfv4", df_sinqtfv4),
    ("df_sinqtcmp", df_sinqtcmp),
    ("df_sinqtcnd", df_sinqtcnd),
    ("df_sinqtcli", df_sinqtcli),
    ("df_sinqtseg", df_sinqtseg),
    ("df_sinqtzds", df_sinqtzds),
    ("df_sinqtsfa", df_sinqtsfa),
    ("df_sinqtfam", df_sinqtfam),
    ("df_sinqtrub", df_sinqtrub),
    ("df_sinqtopc", df_sinqtopc),
    ("df_sinqtcma", df_sinqtcma),
    ("df_sinqtcmi", df_sinqtcmi),
    ("df_sinqtcyr", df_sinqtcyr),
    ("df_sinqtmrq", df_sinqtmrq),
    ("df_sinqtbas", df_sinqtbas),
]
for view_name, table_frame in sinc0_views:
    table_frame.createOrReplaceTempView(view_name)

## SQL Query that constructs SAMARA

In [None]:
# Spark SQL template that builds the SAMARA extract from the registered temp views.
#
# Placeholders: {0} and {1} are the lower and upper bounds of the DT_VD filter.
# Both are parsed with TO_DATE(..., 'dd/MM/yyyy'), so they must be supplied as
# 'dd/MM/yyyy' strings. BETWEEN is inclusive on both ends.
#
# Notes for maintainers:
# * The label columns (*__LIB, SINQTOPC_LIB) read LIB_FR directly. The former
#   locale switch was hard-wired to 'fr_FR', so LIB_FR was the only reachable branch.
# * Every non-aggregated SELECT column has to appear in the grouping list; this
#   includes SINQTCMA.CODE and SINQTCMI.CODE.
# * SINQTFAM_2 (second alias of the family table, used to reach the commercial
#   brand SINQTMRQ_2) is joined explicitly instead of through a comma join.
# * Measures left out on purpose: VOLUME_AJ, MACOM_CONSO_VERSION, MACOM_CONSO_OPTION,
#   MACOM_ENTITE, MACOM_ENTITE_AJ, MACOM_ENTITE_VERSION, MACOM_ENTITE_OPTION.
querySAMARA = """
SELECT
    SINQTVIN.CODE SINQTVIN__CODE,
    SINQTCLI_2.CODE SINQTCLI_2__CODE,
    SINQTCLI_2.CODE_PAYS_IMPLANT,
    SINQTVER.CODE SINQTVER__CODE,
    SINQTVER.LIB_FR SINQTVER__LIB,
    Table__54.DT_FACT,
    Table__54.DT_VD,
    Table__54.DT_COMM_CLI_FIN_VD,
    Table__54.DATIMM,
    SINQTCMP.LIB_FR SINQTCMP__LIB,
    SINQTCND.LIB_FR SINQTCND__LIB,
    SINQTCLI.LIB_FR SINQTCLI__LIB,
    SINQTSEG.LIB_FR SINQTSEG__LIB,
    SINQTSFA.LIB_FR SINQTSFA__LIB,
    SINQTFAM.LIB_FR SINQTFAM__LIB,
    SINQTRUB.CODE SINQTRUB__CODE,
    SINQTRUB.LIB_FR SINQTRUB__LIB,
    SINQTOPC.CODE SINQTOPC__CODE,
    SINQTOPC.LIB_FR SINQTOPC_LIB,
    SINQTCMA.CODE SINQTCMA__CODE,
    SINQTCMA.LIB_FR SINQTCMA__LIB,
    SINQTCMI.CODE SINQTCMI__CODE,
    SINQTCMI.LIB_FR SINQTCMI__LIB,
    Table__54.TYP_UTIL_VD,
    Table__54.CODE_PROMO,
    Table__54.CODE_PROMO2,
    sum(Table__54.PRIX_VENTE) PRIX_VENTE,
    sum(Table__54.PRIX_VENTE_AJ) PRIX_VENTE_AJ,
    sum(Table__54.PV_OPTIONS) PV_OPTIONS,
    sum(Table__54.PV_VERSION) PV_VERSION,
    sum(Table__54.MACOM_CONSO) MACOM_CONSO,
    sum(Table__54.MACOM_CONSO_AJ) MACOM_CONSO_AJ,
    sum(Table__54.RBCV_AJ) RBCV_AJ,
    sum(Table__54.MCX_VARIABLES) MCX_VARIABLES,
    SINQTMRQ_2.CODE SINQTMRQ_2__CODE
FROM
    df_sinqtfv4 Table__54
JOIN df_sinqtcmp SINQTCMP ON Table__54.ID_ZDS=SINQTCMP.ID_ZDS and Table__54.ID_CMP=SINQTCMP.ID
JOIN df_sinqtcli SINQTCLI ON Table__54.ID_ZDS=SINQTCLI.ID_ZDS and Table__54.ID_SCD=SINQTCLI.ID_SCD and Table__54.ID_CLI=SINQTCLI.ID
JOIN df_sinqtcmi SINQTCMI ON Table__54.ID_ZDS=SINQTCMI.ID_ZDS and Table__54.ID_CMI=SINQTCMI.ID
JOIN df_sinqtbas SINQTBAS ON Table__54.ID_BAS=SINQTBAS.ID
JOIN df_sinqtzds SINQTZDS ON Table__54.ID_ZDS=SINQTZDS.ID
LEFT OUTER JOIN df_sinqtrub SINQTRUB ON Table__54.ID_RUB=SINQTRUB.ID
JOIN df_sinqtfam SINQTFAM ON Table__54.ID_ZDS=SINQTFAM.ID_ZDS and Table__54.ID_FAM=SINQTFAM.ID
JOIN df_sinqtsfa SINQTSFA ON Table__54.ID_ZDS=SINQTSFA.ID_ZDS and Table__54.ID_SFA=SINQTSFA.ID
JOIN df_sinqtver SINQTVER ON Table__54.ID_ZDS=SINQTVER.ID_ZDS and Table__54.ID_VER=SINQTVER.ID
LEFT OUTER JOIN df_sinqtseg SINQTSEG ON Table__54.ID_ZDS=SINQTSEG.ID_ZDS and Table__54.ID_SEG=SINQTSEG.ID
JOIN df_sinqtcnd SINQTCND ON Table__54.ID_CND=SINQTCND.ID
JOIN df_sinqtcma SINQTCMA ON Table__54.ID_CMA=SINQTCMA.ID
LEFT OUTER JOIN df_sinqtopc SINQTOPC ON Table__54.ID_ZDS=SINQTOPC.ID_ZDS and Table__54.ID_OPC=SINQTOPC.ID
JOIN df_sinqtvin SINQTVIN ON Table__54.ID_ZDS=SINQTVIN.ID_ZDS and Table__54.ID_VIN=SINQTVIN.ID
JOIN df_sinqtcli SINQTCLI_2 ON Table__54.ID_ZDS=SINQTCLI_2.ID_ZDS and Table__54.ID_SCD=SINQTCLI_2.ID_SCD and Table__54.ID_CLI_LIV=SINQTCLI_2.ID
JOIN df_sinqtfam SINQTFAM_2 ON Table__54.ID_ZDS=SINQTFAM_2.ID_ZDS and Table__54.ID_FAM=SINQTFAM_2.ID
JOIN df_sinqtmrq SINQTMRQ_2 ON SINQTFAM_2.ID_MRQ_COM=SINQTMRQ_2.ID

WHERE   ( Table__54.DT_VD BETWEEN TO_DATE('{0}', 'dd/MM/yyyy') AND TO_DATE('{1}', 'dd/MM/yyyy'))
    AND SINQTBAS.CODE = 'LA'
    AND SINQTZDS.CODE IN ( 'CPBE','PPBE','CPFR','PPFR','CPIT', 'PPIT','CPES', 'PPES','CPGB','PPGB','CPNL', 'PPNL','CPPL', 'PPPL','CPAT','PPAT', 'CPPT','PPPT','CPDE', 'PPDE' )
GROUP BY
    SINQTVIN.CODE,
    SINQTCLI_2.CODE,
    SINQTCLI_2.CODE_PAYS_IMPLANT,
    SINQTVER.CODE,
    SINQTVER.LIB_FR,
    Table__54.DT_FACT,
    Table__54.DT_VD,
    Table__54.DT_COMM_CLI_FIN_VD,
    Table__54.DATIMM,
    SINQTCMP.LIB_FR,
    SINQTCND.LIB_FR,
    SINQTCLI.LIB_FR,
    SINQTSEG.LIB_FR,
    SINQTSFA.LIB_FR,
    SINQTFAM.LIB_FR,
    SINQTRUB.CODE,
    SINQTRUB.LIB_FR,
    SINQTOPC.CODE,
    SINQTOPC.LIB_FR,
    SINQTCMA.CODE,
    SINQTCMA.LIB_FR,
    SINQTCMI.CODE,
    SINQTCMI.LIB_FR,
    Table__54.TYP_UTIL_VD,
    Table__54.CODE_PROMO,
    Table__54.CODE_PROMO2,
    SINQTMRQ_2.CODE

ORDER BY Table__54.DT_VD
"""

In [17]:
# Display names for the two-letter codes copied into COUNTRY.
column_values_ctry = {'FR':'France','BE':'Belgium','IT':'Italy','ES':'Spain','GB':'Great Britain','DE':'Germany','PL':'Poland','AT':'Austria','NL':'Netherlands','PT':'Portugal'}

# English labels substituted for the LIB_RUB values listed as keys.
column_values_rub = {'PROVISION SUR PERTES FUTURES':'PSA - Buy Backs',
                     'PRIMES VENTES AUX SOCIETES':'PSA - B2B promotions',
                     'PRIME QUALITE':'PSA - Bonus for quality',
                     'PRIMES A LA PERFORMANCE RESEAU':'PSA - Bonus for dealer performance',
                     'PROMOTIONS CLIENT FINAL':'PSA - B2C promotions'}

# Measures summed per VIN below. Only columns that querySAMARA actually returns
# are listed: MACOM_CONSO_OPTION and MACOM_ENTITE_VERSION are commented out in
# the query, so referencing them raises an AnalysisException.
measure_columns = ['PRIX_VENTE', 'PRIX_VENTE_AJ', 'PV_VERSION',
                   'MACOM_CONSO_AJ', 'RBCV_AJ', 'PV_OPTIONS']

for i in range(len(rangeMonths) - 1):
    # The query parses its bounds with TO_DATE(..., 'dd/MM/yyyy'); a pandas
    # Timestamp formatted as-is ('yyyy-MM-dd HH:mm:ss') does not match that pattern.
    # NOTE(review): BETWEEN is inclusive, so the first day of the following month
    # belongs to two consecutive windows.
    window_start = rangeMonths[i].strftime('%d/%m/%Y')
    window_end = rangeMonths[i + 1].strftime('%d/%m/%Y')

    dfSAMARA = (
        spark_session.sql(querySAMARA.format(window_start, window_end))
        .withColumn("year", F.year(F.col("DT_VD")))
        .withColumn("month", F.month(F.col("DT_VD")))
        .withColumn("day", F.dayofmonth(F.col("DT_VD")))
        # withColumn needs a Column expression; the query exposes the country
        # code as CODE_PAYS_IMPLANT.
        .withColumn("COUNTRY", F.col("CODE_PAYS_IMPLANT"))
        .replace(to_replace=column_values_ctry, subset=['COUNTRY'])
    )

    # VIN x promotion rows. COUNTRY is selected once only: selecting it twice
    # yields two identically named columns that cannot be referenced afterwards.
    dfVINXPROMO = (
        dfSAMARA
        .select("SINQTVIN__CODE", "COUNTRY", "SINQTOPC_LIB", "SINQTRUB__CODE",
                "SINQTRUB__LIB", "SINQTOPC__CODE", "MCX_VARIABLES",
                "year", "month", "day")
        .drop_duplicates()
        .withColumnRenamed("SINQTVIN__CODE", "CODE_VIN")
        .withColumnRenamed("SINQTOPC_LIB", "LIB_OPC")
        .withColumnRenamed("SINQTRUB__CODE", "CODE_RUB")
        .withColumnRenamed("SINQTRUB__LIB", "LIB_RUB")
        .withColumnRenamed("SINQTOPC__CODE", "CODE_OPC")
        .replace(to_replace=column_values_rub, subset=['LIB_RUB'])
        .filter(F.col('MCX_VARIABLES') != 0)
        # Suffix the operation label with the promotion category derived from LIB_RUB.
        .withColumn('LIB_OPC', F.concat(F.col('LIB_OPC'), F.lit(' - '), F.when(F.col('LIB_RUB').isin('PSA - Bonus for dealer performance','PSA - Bonus for quality'), 'Network Remuneration').otherwise('Client Promotions')))
    )

    # VIN-level frame: same rows without the promotion-specific columns.
    dfSAMARA_VIN = dfSAMARA.drop('SINQTRUB__CODE', 'SINQTRUB__LIB', 'SINQTOPC__CODE', 'SINQTOPC_LIB', 'MCX_VARIABLES')
    dfSAMARA_VIN.createOrReplaceTempView("dfSAMARA_VIN")

# NOTE(review): every pass of the loop rebinds dfSAMARA, dfVINXPROMO and
# dfSAMARA_VIN, so from here on they describe the last monthly window only.
# Persist inside the loop if every month is needed.

# One row per VIN holding the summed measures.
vin_totals = dfSAMARA_VIN.groupBy('SINQTVIN__CODE').agg(
    *[F.sum(F.col(measure)).alias(measure) for measure in measure_columns]
)

# Descriptive attributes per VIN. All measures are removed before de-duplicating;
# MACOM_CONSO is removed as well because it is not part of the output and would
# otherwise keep rows apart that differ only by that value.
vin_attributes = dfSAMARA_VIN.drop(*(measure_columns + ['MACOM_CONSO'])).drop_duplicates()

dfSAMARA_VIN = vin_attributes.join(vin_totals, on='SINQTVIN__CODE')

# Rename the query aliases to their output names.
renamed_columns = {
    'SINQTVIN__CODE': 'CODE_VIN',
    'SINQTCLI_2__CODE': 'CODE_CLI_2',
    'SINQTVER__LIB': 'LIB_VER',
    'SINQTCMP__LIB': 'LIB_CMP',
    'SINQTCND__LIB': 'LIB_CND',
    'SINQTCLI__LIB': 'LIB_CLI',
    'SINQTSEG__LIB': 'LIB_SEG',
    'SINQTSFA__LIB': 'LIB_SFA',
    'SINQTFAM__LIB': 'LIB_FAM',
    'SINQTCMA__LIB': 'LIB_CMA',
    'SINQTCMI__LIB': 'LIB_CMI',
}
for source_name, target_name in renamed_columns.items():
    dfSAMARA_VIN = dfSAMARA_VIN.withColumnRenamed(source_name, target_name)

# Replace measures equal to 0 with NULL.
for measure in measure_columns:
    dfSAMARA_VIN = dfSAMARA_VIN.withColumn(
        measure,
        F.when(F.col(measure) == 0, F.lit(None)).otherwise(F.col(measure)),
    )

# Final column order. LIB_ZDS is not listed because the query does not return
# SINQTZDS__LIB (it is commented out there).
dfSAMARA_VIN = dfSAMARA_VIN.select(
    'CODE_VIN',
    'CODE_CLI_2',
    'LIB_VER',
    'DT_FACT',
    'DT_VD',
    'DATIMM',
    'LIB_CMP',
    'LIB_CND',
    'LIB_CLI',
    'LIB_SEG',
    'LIB_SFA',
    'LIB_FAM',
    'LIB_CMA',
    'LIB_CMI',
    'TYP_UTIL_VD',
    'CODE_PROMO',
    'CODE_PROMO2',
    'PRIX_VENTE',
    'SINQTMRQ_2__CODE',
    'PRIX_VENTE_AJ',
    'PV_VERSION',
    'MACOM_CONSO_AJ',
    'RBCV_AJ',
    'PV_OPTIONS',
    'COUNTRY',
    'year',
    'month',
    'day',
)
               
    # dfSAMARA_VIN\
     #   .write\
     #   .mode("overwrite")\
      #  .partitionBy("COUNTRY", "year", "month")\
      #  .parquet("hdfs:///user/e587247/data/raw/samarav30c/")
   # dfSAMARA_VIN.drop('year', 'month').write.jdbc(url=db_uri, table="SMKT009_SAMARAv2502", mode="overwrite")

### Writing the result to HDFS, partitioned by country, year and month

In [10]:
# Write the VIN x promotion frame as parquet, one directory per COUNTRY/year/month.
# Repartitioning on the partition columns sends all rows of a given
# (COUNTRY, year, month) to the same task, so each directory still receives a
# single file while the write runs in parallel. coalesce(1) pushed the whole
# dataset through one task instead.
(
    dfVINXPROMO
    .repartition("COUNTRY", "year", "month")
    .write
    .mode("overwrite")
    .partitionBy("COUNTRY", "year", "month")
    .parquet("hdfs:///user/e587247/data/refined/vinpromov30c/")
)

KeyboardInterrupt: 

In [None]:
# Stop the Spark session once the notebook is done with it.
spark_session.stop()