<img src="media/logo_psa.jpg" width="300">

<h1><center>Examples of app00 use</center></h1>

#### Imports

In [20]:
%load_ext autoreload
%autoreload 2
import os
import datetime
import pandas as pd
# Widen pandas' display limits so that previews of wide/long frames are not truncated
pd.options.display.max_rows = 500
pd.options.display.max_columns = 100

from pyspark.sql import functions as F

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


**WARNING: Make sure to replace 'app_template' in the imports with the package name you chose.**

In [7]:
from distribution_cost.configuration import spark_config
from distribution_cost.configuration.app import AppConfig
from distribution_cost.configuration.data import DataConfig

from distribution_cost.infra import oracle
from distribution_cost.domain import kpis

/gpfs/user/e587247/dco00/conf/application.yml
/gpfs/user/e587247/dco00


#### Configs

In [8]:
# Application configuration: exposes the database connection URIs
app_config = AppConfig()

# One URI per connector: jdbc (used with Spark) and cx_Oracle (used with sqlalchemy)
db_uri, db_uri_cx_oracle = app_config.db_uri_jdbc, app_config.db_uri_cx_oracle

In [9]:
# Data Config (DataConfig is project code; its loading logic is not shown in this notebook)
data_config = DataConfig()

# Display the vehicle perimeter: sites, start/end dates and generic door
# that are used to filter the data in the cells below
data_config.vhls_perimeter

{'sites': ['PY', 'MU'],
 'start_date': '15/01/20',
 'end_date': '17/01/20',
 'genr_door': 'EMON'}

In [10]:
# Unpack the perimeter once into the variables used by the HDFS and Oracle examples
perimeter = data_config.vhls_perimeter

sites = perimeter["sites"]
start_date = perimeter["start_date"]
end_date = perimeter["end_date"]
genr_door = perimeter["genr_door"]

#### Spark session

In [11]:
# Build the Spark context and session through the project helper
# (the keyword names suggest 2 initial executors, growing up to 8 with
# dynamic allocation — confirm in spark_config.get_spark)
spark_context, spark_session = spark_config.get_spark(
    app_name="app-template",
    executors=2,
    executor_cores=4,
    executor_mem='4g',
    dynamic_allocation=True,
    max_executors=8,
)

## 1. HDFS

### 1.1. Read

In [12]:
# Read data from hdfs: the year=2020/month=01 partition directory of the
# refined manf001_vehc_prdc_flow dataset
dirpath_prod_flow = '/user/brc05/data/refined/manf001_vehc_prdc_flow/year=2020/month=01'

# Load the parquet files under that directory as a Spark DataFrame
df_flow = spark_session.read.parquet(dirpath_prod_flow)

In [13]:
# Restrict the flow to the configured perimeter.
# The perimeter dates are 'dd/mm/yy' strings; parse them once into datetimes.
perimeter_start = datetime.datetime.strptime(start_date, '%d/%m/%y')
perimeter_end = datetime.datetime.strptime(end_date, '%d/%m/%y')

# Keep the configured sites and door, with pass_date in [perimeter_start, perimeter_end)
df_flow = (
    df_flow
    .filter(df_flow['site_code'].isin(sites))
    .filter(df_flow['pass_date'] >= perimeter_start)
    .filter(df_flow['pass_date'] < perimeter_end)
    .filter(df_flow['genr_door'] == genr_door)
)

In [14]:
# Compute number of vins per site on the filtered HDFS data, using "site_code" as the site column
# (kpis is a project module; see its source for the exact aggregation)
kpis.compute_n_vins_per_site(df_flow, "site_code")

[Row(site_code='PY', count=790), Row(site_code='MU', count=700)]

### 1.2. Write

In [10]:
# Derive the output directory from the DATA environment variable:
# drop its first two '/'-separated components and re-root the remainder at '/'
data_path_parts = os.environ['DATA'].split('/')
output_dirpath = '/' + '/'.join(data_path_parts[2:])
output_dirpath

'/user/u542310/brc14/app00/data'

In [11]:
# Keep only the PY site, derive year/month/day columns from pass_date and write
# the result to hdfs as parquet, partitioned by those three columns.
# NOTE(review): mode "overwrite" targets output_dirpath itself — confirm that
# replacing the existing content of that directory is intended.
(
    df_flow
    .filter(F.col("site_code") == "PY")
    .withColumn("year", F.year(F.col("pass_date")))
    .withColumn("month", F.month(F.col("pass_date")))
    .withColumn("day", F.dayofmonth(F.col("pass_date")))
    .coalesce(1)  # collapse to a single Spark partition before writing
    .write
    .partitionBy("year", "month", "day")
    .parquet(output_dirpath, mode="overwrite")
)

## 2. Exadata

In [16]:
# SQL query based on the perimeter.
# The statement is wrapped in parentheses so it can be passed as the `table`
# argument of the jdbc reader further down.
query_template = (
    "(SELECT * FROM BRC05.MANF001_VEHC_PRDC_FLOW "
    "WHERE SITE_CODE IN ({}) "
    "AND PASS_DATE >= to_date('{}', 'dd/mm/yy') "
    "AND PASS_DATE < to_date('{}', 'dd/mm/yy') "
    "AND GENR_DOOR = '{}')"
)

# Render the site list as a comma-separated sequence of quoted SQL literals
quoted_sites = ', '.join(f"'{site}'" for site in sites)

query = query_template.format(quoted_sites, start_date, end_date, genr_door)

query

"(SELECT * FROM BRC05.MANF001_VEHC_PRDC_FLOW WHERE SITE_CODE IN ('PY', 'MU') AND PASS_DATE >= to_date('15/01/20', 'dd/mm/yy') AND PASS_DATE < to_date('17/01/20', 'dd/mm/yy') AND GENR_DOOR = 'EMON')"

### 2.1. jdbc connector

#### 2.1.1. Read

In [21]:
# Read the perimeter query from Oracle through Spark's jdbc connector,
# fetching 10000 rows per round trip
df_flow = (
    spark_session.read
    .option("fetchsize", 10000)
    .jdbc(url=db_uri, table=query)
)

In [22]:
# Check the type of the object returned by the jdbc reader (a Spark DataFrame, per the output below)
type(df_flow)

pyspark.sql.dataframe.DataFrame

NB: *fetchsize* is the number of rows to load per network call (default Oracle row fetch size value is 10)

In [18]:
# Compute number of vins per site on the data read through jdbc;
# the counts match the HDFS result shown in section 1.1
kpis.compute_n_vins_per_site(df_flow, "site_code")

[Row(site_code='PY', count=790), Row(site_code='MU', count=700)]

#### 2.1.2. Write

In [15]:
# Write the PY rows to the Oracle table "table_test" through the jdbc connector,
# replacing the table if it already exists
py_flow = df_flow.filter(F.col("site_code") == "PY")
py_flow.write.jdbc(url=db_uri, table="table_test", mode="overwrite")

NB: *mode* can be `overwrite`, `append`, `error` (Spark's default) or `ignore`

### 2.2. sqlalchemy connector

#### 2.2.1. Read

In [19]:
# Read data from Oracle using sqlalchemy connector.
# The helper receives the cx_Oracle URI and the same perimeter query as the jdbc read;
# the result is used as a pandas DataFrame below (.head(), .loc).
df_flow_pd = oracle.read_df_from_query(db_uri_cx_oracle, query)
# Preview the first rows
df_flow_pd.head()

Unnamed: 0,site_code,site_labl,vin,faml_grp_labl,nof,fabr_ordr_type,fabr_ordr_type_labl,flow_pont_pass,flow_pont_pass_labl,genr_door,genr_door_labl,foll_door,foll_door_labl,pass_date,insr_date
0,PY,Poissy,VF3CCHMP0LW003459,208,9P59AEE0,CO,Commerce,EMON01,Entrie Montage L1,EMON,,EMON,,2020-01-16 06:47:24.706,2020-01-17
1,PY,Poissy,VF3CRYHYPLW003334,208,9P59A2A8,CO,Commerce,EMON01,Entrie Montage L1,EMON,,EMON,,2020-01-16 06:52:08.178,2020-01-17
2,PY,Poissy,VF3CRYHYPLW003335,208,9P59A2A9,CO,Commerce,EMON01,Entrie Montage L1,EMON,,EMON,,2020-01-16 06:58:26.571,2020-01-17
3,PY,Poissy,VR1UCYHYJLW003555,t-D34,9P54A1AR,CO,Commerce,EMON01,Entrie Montage L1,EMON,,EMON,,2020-01-16 06:03:06.890,2020-01-17
4,PY,Poissy,VR1UCYHYJLW003661,t-D34,9P54A1NP,CO,Commerce,EMON01,Entrie Montage L1,EMON,,EMON,,2020-01-16 06:02:11.930,2020-01-17


In [17]:
# Compute number of vins per site, this time on the pandas DataFrame
# (the same helper is called with a Spark DataFrame in the sections above)
kpis.compute_n_vins_per_site(df_flow_pd, "site_code")

site_code
MU    700
PY    790
dtype: int64

#### 2.2.2. Write

In [18]:
# Write the PY rows to the Oracle table "table_test" through the sqlalchemy connector,
# replacing the table if it already exists
is_py = df_flow_pd["site_code"] == "PY"
oracle.df_to_oracle(df_flow_pd.loc[is_py], "table_test", db_uri_cx_oracle, if_exists="replace")

In [19]:
# Verification: read back the table that was just written.
# It is looked up in the schema named BRC_<upper-cased $USER>.
user_schema = f"BRC_{os.environ['USER'].upper()}"
simple_query = f"(SELECT * from {user_schema}.table_test)"
df_test = oracle.read_df_from_query(db_uri_cx_oracle, simple_query)

# Only the PY site is expected in the per-site counts
kpis.compute_n_vins_per_site(df_test, "site_code")

site_code
PY    790
dtype: int64

## Appendix

In [20]:
# Create query using query_builder module
# NOTE(review): consider moving this import to the imports cell at the top of the notebook.
from distribution_cost.infra import query_builder

# Expressions of the SELECT clause (table-qualified; YEAR is derived from DATE_ECOM)
COLUMNS_TO_SELECT = [
    "to_char(P254G6_VEH_DC.DATE_ECOM,'YYYY') AS YEAR",
    "DC_PIECES_RECHANGE.CODE_PR",
    "P254G6_VEH_DC.CODE_FAMILLE",
    "P254G6_VEH_DC.CODE_SILHOUETTE",
    "P254G6_VEH_DC.CODE_FINITION",
    "P254G6_VEH_DC.CODE_MOTEUR",
    "P254G6_VEH_DC.DATE_ECOM",
    "P254G6_VEH_DC.CODE_BOITE_DE_VITESSE",
]

# Reference date, hard-coded for the example (not the actual current date)
TODAY = datetime.date(2018, 12, 17)
FROM_TABLE = "BRC07.P254G6_VEH_DC"
# Joined table -> join column (rendered as a LEFT JOIN on VIN in the output below)
JOIN_TABLES = {"BRC07.DC_PIECES_RECHANGE": "VIN"}
# Values for the WHERE clause: the reference date as 'dd/mm/yy' and the
# CODE_VERSION list that is combined with `cond`
SAMPLING = {"DATE_ECOM": TODAY.strftime("%d/%m/%y"),
            "CODE_VERSION": "('#')"}
# Number of days subtracted from the reference date in the generated query
nb_days = 1

# Stored under its own name: rebinding `query` here would overwrite the perimeter
# query used by the read cells of section 2, so re-running them after this cell
# would silently hit a different table.
download_query = query_builder.build_download_query(
    from_table=FROM_TABLE,
    columns_to_select=COLUMNS_TO_SELECT,
    join_tables=JOIN_TABLES,
    nb_days=nb_days,
    date_col="DATE_ECOM",
    cond='NOT IN',
    sampling=SAMPLING,
)

download_query

"(SELECT to_char(P254G6_VEH_DC.DATE_ECOM,'YYYY') AS YEAR, DC_PIECES_RECHANGE.CODE_PR, P254G6_VEH_DC.CODE_FAMILLE, P254G6_VEH_DC.CODE_SILHOUETTE, P254G6_VEH_DC.CODE_FINITION, P254G6_VEH_DC.CODE_MOTEUR, P254G6_VEH_DC.DATE_ECOM, P254G6_VEH_DC.CODE_BOITE_DE_VITESSE FROM BRC07.P254G6_VEH_DC LEFT JOIN BRC07.DC_PIECES_RECHANGE ON BRC07.DC_PIECES_RECHANGE.VIN = BRC07.P254G6_VEH_DC.VIN  WHERE CODE_VERSION NOT IN ('#') AND DATE_ECOM >= TO_DATE('17/12/18', 'dd/mm/yy') - 1)"