# Customer Insights data prep

#### Account prep

##### Logistics postal address entity prep

In [None]:
# Builds the "logisticspostaladdressentity" temp view in three steps:
#   1. project logisticslocation      -> "logisticslocationview"
#   2. project logisticspostaladdress -> "logisticspostaladdressview"
#   3. inner-join both projections on LOCATIONRECID = LOCATION

# Step 1: location columns, exposed under upper-case aliases.
location_query = """SELECT locationid                       AS LOCATIONID,
                            description                         AS DESCRIPTION,
                            ispostaladdress                     AS ISPOSTALADDRESS,
                            recid                               AS LOCATIONRECID
                    FROM logisticslocation
                    """
locationdf = spark.sql(location_query)
locationdf.createOrReplaceTempView("logisticslocationview")

# Step 2: postal address columns; LOCATION is the key matched against
# LOCATIONRECID in step 3.
postal_address_query = """SELECT address             AS ADDRESS,
                            city                                  AS CITY,
                            countryregionid                       AS COUNTRYREGIONID,
                            county                                AS COUNTY,
                            districtname                          AS DISTRICTNAME,
                            isprivate                             AS ISPRIVATE,
                            latitude                              AS LATITUDE,
                            longitude                             AS LONGITUDE,
                            postbox                               AS POSTBOX,
                            state                                 AS STATE,
                            street                                AS STREET,
                            streetnumber                          AS STREETNUMBER,
                            timezone                              AS TIMEZONE,
                            validfrom                             AS VALIDFROM,
                            validto                               AS VALIDTO,
                            zipcode                               AS ZIPCODE,
                            privateforparty                       AS PRIVATEFORPARTY,
                            location                              AS LOCATION
                    FROM logisticspostaladdress
                    """
logisticspostaladdressdf = spark.sql(postal_address_query)
logisticspostaladdressdf.createOrReplaceTempView("logisticspostaladdressview")

# Step 3: only locations that have a postal address survive the inner join.
# SELECT * is unambiguous here because the two views share no column names.
postal_address_entity_query = """select * 
                                                from logisticslocationview T1
                                                inner join logisticspostaladdressview T2 on T1.LOCATIONRECID = T2.LOCATION
                                            """
logisticspostaladdressentitydf = spark.sql(postal_address_entity_query)
logisticspostaladdressentitydf.createOrReplaceTempView("logisticspostaladdressentity")



##### Dir party entity prep

In [None]:
# Builds the "dirpartybaseentity" temp view: one row per dirpartytable record,
# enriched with its primary e-mail, primary phone and currently valid person
# name (all three are LEFT joins, so parties without them are kept).

# Party columns, renamed to the upper-case aliases used by the joins below.
dirpartydf=spark.sql("""SELECT 
                        partynumber             AS PARTYNUMBER,
                        NAME                    AS NAME,
                        namealias               AS NAMEALIAS,
                        knownas                 AS KNOWNAS,
                        addressbooknames        AS ADDRESSBOOKS,
                        languageid              AS LANGUAGEID,
                        instancerelationtype    AS INSTANCERELATIONTYPE,
                        dataareaid              AS DATAAREA,
                        primaryaddresslocation  AS PRIMARYADDRESSLOCATION,
                        recid                   AS PARTYRECORDID,
                        primarycontactphone     AS PRIMARYCONTACTPHONEREF,
                        primarycontactemail     AS PRIMARYCONTACTEMAILREF
                    FROM dirpartytable
                    """)

dirpartydf.createOrReplaceTempView("dirpartytableview")

# "email" and "phone" are two projections of the same source table
# (logisticselectronicaddress) with different column aliases, so both can be
# joined to the party row without column-name clashes.
emaildf=spark.sql("""SELECT 
                        recid                  AS PRIMARYCONTACTEMAILRECORDID,
                        locator                AS PRIMARYCONTACTEMAIL,
                        description            AS PRIMARYCONTACTEMAILDESCRIPTION,
                        isinstantmessage       AS PRIMARYCONTACTEMAILISIM,
                        electronicaddressroles AS PRIMARYCONTACTEMAILPURPOSE,
                        isprimary              AS ISPRIMARY
                    FROM logisticselectronicaddress
                    """)

emaildf.createOrReplaceTempView("email")

phonedf=spark.sql("""SELECT 
                        recid                  AS PRIMARYCONTACTPHONERECORDID,
                        locator                AS PRIMARYCONTACTPHONE,
                        description            AS PRIMARYCONTACTPHONEDESCRIPTION,
                        locatorextension       AS PRIMARYCONTACTPHONEEXTENSION,
                        ismobilephone          AS PRIMARYCONTACTPHONEISMOBILE,
                        electronicaddressroles AS PRIMARYCONTACTPHONEPURPOSE
                    FROM logisticselectronicaddress
                    """)

phonedf.createOrReplaceTempView("phone")

# Currently valid name per person.
# The previous "ORDER BY ... LIMIT 1" was applied to the whole table, so the
# view held a single row in total and every other party lost its name columns
# in the LEFT JOIN below. row_number() picks the most recent valid name *per
# person* instead (recid breaks ties deterministically), which also guarantees
# at most one name row per party so the join cannot fan out.
dirpersonnamedf=spark.sql("""SELECT
                                PERSONFIRSTNAME,
                                PERSONMIDDLENAME,
                                PERSONLASTNAMEPREFIX,
                                PERSONLASTNAME,
                                PERSONNAMEVALIDTO,
                                PERSONNAMEVALIDFROM,
                                DIRPERSONNAMERECID,
                                PERSON
                    FROM (
                        SELECT
                                firstname       AS PERSONFIRSTNAME,
                                middlename      AS PERSONMIDDLENAME,
                                lastnameprefix  AS PERSONLASTNAMEPREFIX,
                                lastname        AS PERSONLASTNAME,
                                validto         AS PERSONNAMEVALIDTO,
                                validfrom       AS PERSONNAMEVALIDFROM,
                                recid           AS DIRPERSONNAMERECID,
                                person          AS PERSON,
                                row_number() OVER (PARTITION BY T1.person
                                                   ORDER BY T1.validfrom DESC, T1.recid DESC) AS NAMERANK
                        FROM dirpersonname T1
                        WHERE (T1.validfrom <= current_timestamp() AND T1.validto >= current_timestamp())
                    ) RANKED
                    WHERE NAMERANK = 1
                    """)

dirpersonnamedf.createOrReplaceTempView("dirpersonnameview")

# Final entity. PARTYTYPE translates the numeric INSTANCERELATIONTYPE code
# into a readable label; codes that are not listed map to ''.
dirpartybaseentitydf=spark.sql("""SELECT *,
                                    CASE T1.INSTANCERELATIONTYPE
                                        WHEN 13271 THEN 'Person'
                                        WHEN 2077 THEN 'Organization'
                                        WHEN 6926 THEN 'Team'
                                        WHEN 8363 THEN 'OperatingUnit'
                                        WHEN 9027 THEN 'LegalEntity'
                                        ELSE ''
                                    END                      AS PARTYTYPE
                    FROM dirpartytableview T1
                    LEFT OUTER JOIN email T2 ON( T1.PRIMARYCONTACTEMAILREF = T2.PRIMARYCONTACTEMAILRECORDID )
                    LEFT OUTER JOIN phone T3 ON( T1.PRIMARYCONTACTPHONEREF = T3.PRIMARYCONTACTPHONERECORDID )
                    LEFT OUTER JOIN dirpersonnameview T4 ON(T1.PARTYRECORDID = T4.PERSON)
                    """)

dirpartybaseentitydf.createOrReplaceTempView("dirpartybaseentity")


##### Customer entity prep

In [None]:
# Builds the "customerentity" temp view: every custtable row that has a
# matching party in "dirpartybaseentity", plus (optionally) the party's
# primary postal address when that address is valid at query time.

# Customer projection. Note that the `party` column is exposed under the
# alias PARTYNUMBER and is matched against PARTYRECORDID in the join below.
customer_query = """SELECT accountnum                     as ACCOUNTNUM,
                            custgroup                           as CUSTGROUP,
                            party                               as PARTYNUMBER,
                            recid                               as RECID,
                            dataareaid                          as DATAAREA
                    FROM custtable
                    """
customerdf = spark.sql(customer_query)
customerdf.createOrReplaceTempView("custtableview")

# ID is "<DATAAREA>-<ACCOUNTNUM>". The address validity window lives in the
# LEFT JOIN's ON clause, so customers without a currently valid address are
# still returned (with NULL address columns).
customer_entity_query = """SELECT    T1.ACCOUNTNUM,
                                        T1.RECID,
                                        T1.CUSTGROUP,
                                        T1.DATAAREA, 
                                        concat(T1.DATAAREA,'-',T1.ACCOUNTNUM) AS ID,                                       
                                        T2.NAME,
                                        T2.NAMEALIAS,
                                        T2.KNOWNAS,
                                        T2.PARTYTYPE,
                                        T2.LANGUAGEID,
                                        T2.PARTYNUMBER,
                                        T2.PRIMARYCONTACTEMAIL,
                                        T2.PRIMARYCONTACTPHONE,
                                        T2.PRIMARYCONTACTEMAILREF,
                                        T2.PRIMARYCONTACTPHONEREF,
                                        T2.PERSONFIRSTNAME,
                                        T2.PERSONMIDDLENAME,
                                        T2.PERSONLASTNAME,
                                        T2.PARTYRECORDID,
                                        T2.PERSON,
                                        T3.ADDRESS,
                                        T3.CITY,
                                        T3.COUNTRYREGIONID,
                                        T3.COUNTY,
                                        T3.DISTRICTNAME,
                                        T3.POSTBOX,
                                        T3.STATE,
                                        T3.STREET,
                                        T3.STREETNUMBER,
                                        T3.ZIPCODE,
                                        T3.LOCATIONRECID,
                                        T3.LOCATION
                                FROM custtableview T1
                                INNER JOIN dirpartybaseentity T2 ON (T1.PARTYNUMBER = T2.PARTYRECORDID)
                                LEFT OUTER JOIN logisticspostaladdressentity T3 ON (T2.PRIMARYADDRESSLOCATION = T3.LOCATIONRECID AND ( T3.VALIDFROM <= current_timestamp() AND T3.VALIDTO >= current_timestamp()))
                                """
customerentitydf = spark.sql(customer_entity_query)
customerentitydf.createOrReplaceTempView("customerentity")

#### Transactions prep

In [None]:
# Builds the "transactionentity" temp view: one row per sales line, carrying
# its order header columns and (optionally) the retail channel id.

# The three temp views registered in this cell deliberately keep the same
# names as the source tables they read ("salestable", "salesline",
# "retailchanneltable"). An unqualified name resolves to a temp view before a
# catalog table, so on a re-run "FROM salestable" would hit the view created
# by the previous run -- which no longer has dataareaid / recid /
# retailchannelid -- and analysis would fail. Dropping any stale views first
# makes the cell safe to re-run; dropTempView simply returns False when the
# view does not exist.
# NOTE(review): assumes the source tables are catalog tables, not temp views
# registered under these names elsewhere in the session -- confirm.
for _stale_view in ("salestable", "salesline", "retailchanneltable"):
    spark.catalog.dropTempView(_stale_view)

# Order header projection.
salestabledf=spark.sql("""SELECT 
                        custaccount                         AS CUSTACCOUNT,
                        salesid                             AS SALESID,
                        retailchanneltable                  AS RETAILCHANNELTABLE,
                        dataareaid                          AS DATAAREA
                    FROM salestable
                    """)

salestabledf.createOrReplaceTempView("salestable")

# Order line projection. EVENTTYPE is a constant tag added to every line; it
# is written with single quotes so it stays a string literal even when
# double-quoted identifiers are enabled.
saleslinedf=spark.sql("""SELECT 
                        itemid                      AS ITEMID,
                        recid                       AS RECID,
                        lineamount                  AS LINEAMOUNT,
                        linedisc                    AS LINEDISC,
                        costprice                   AS COSTPRICE,
                        currencycode                AS CURRENCYCODE,
                        linenum                     AS LINENUM,
                        priceunit                   AS PRICEUNIT,
                        salesid                     AS SALESID,
                        salesprice                  AS SALESPRICE,
                        salesqty                    AS SALESQTY,
                        salesunit                   AS SALESUNIT,
                        salesstatus                 AS SALESSTATUS,
                        dataareaid                  AS DATAAREA,
                        createddatetime             AS CREATEDDATETIME,
                        'Purchase'                  AS EVENTTYPE,
                        name                        AS NAME
                    FROM salesline
                    """)

saleslinedf.createOrReplaceTempView("salesline")

# Retail channel lookup (record id -> channel id).
retailchanneltabledf=spark.sql("""SELECT 
                                    retailchannelid AS CHANNELID,
                                    recid           AS CHANNELRECID
                                    FROM retailchanneltable
                                """)

retailchanneltabledf.createOrReplaceTempView("retailchanneltable")


# Final entity. ID is "<DATAAREA>-<CUSTACCOUNT>". The channel is a LEFT join,
# so orders without a channel are kept; the INNER join on salesline drops
# order headers that have no lines.
transactionentitydf=spark.sql("""SELECT 
                        T1.CUSTACCOUNT                 AS CUSTACCOUNT,
                        T1.SALESID                     AS SALESID,
                        T1.DATAAREA                    AS DATAAREA,
                        concat(T1.DATAAREA,'-',T1.CUSTACCOUNT) AS ID,  
                        T2.ITEMID                      AS ITEMID,
                        T2.LINEAMOUNT                  AS LINEAMOUNT,
                        T2.LINEDISC                    AS LINEDISC,
                        T2.COSTPRICE                   AS COSTPRICE,
                        T2.CURRENCYCODE                AS CURRENCYCODE,
                        T2.LINENUM                     AS LINENUM,
                        T2.PRICEUNIT                   AS PRICEUNIT,
                        T2.SALESPRICE                  AS SALESPRICE,
                        T2.SALESQTY                    AS SALESQTY,
                        T2.SALESUNIT                   AS SALESUNIT,
                        T2.SALESSTATUS                 AS SALESSTATUS, 
                        T2.CREATEDDATETIME             AS CREATEDDATETIME,
                        T2.EVENTTYPE                   AS EVENTTYPE,
                        T2.NAME                        AS NAME,
                        T2.RECID                       AS RECID,
                        T4.CHANNELID                   AS CHANNELID
                        FROM salestable T1 
                        LEFT OUTER JOIN retailchanneltable T4 on T1.RETAILCHANNELTABLE=T4.CHANNELRECID
                        INNER JOIN salesline T2 on (T1.SALESID = T2.SALESID)
                    """)

transactionentitydf.createOrReplaceTempView("transactionentity")


#### Save changes to data lake

In [None]:
# Export the B2C customers (PARTYTYPE = 'Person') from "customerentity",
# stamped with the query-time timestamp, as parquet partitioned by DATAAREA.
# mode("overwrite") replaces whatever was previously stored at the target path.
b2c_customers_query = """SELECT T1.*,current_timestamp() AS TIMESTAMP
                                FROM customerentity T1
                                where PARTYTYPE='Person'
                                """
customersb2clakedf = spark.sql(b2c_customers_query)

b2c_writer = (
    customersb2clakedf.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("DATAAREA")
)
b2c_writer.save("Tables//customerinsightdata//customersb2clake")