In [1]:
import pandas as pd
from pyspark.sql import functions as F
import calendar
from datetime import date
from dateutil.relativedelta import relativedelta
from distribution_cost.configuration import spark_config
from distribution_cost.configuration.app import AppConfig

/gpfs/user/e587246/dco00/conf/application.yml
/gpfs/user/e587246/dco00


In [2]:
# Database uri
app_config = AppConfig()
db_uri = app_config.db_uri_jdbc
db_uri

'jdbc:oracle:thin:BRC03_VMECA/8sUFYtvK@//pyox2k01:1521/BRCEX_PP2'

In [3]:
# Create spark session
spark_context, spark_session = spark_config.get_spark(app_name="app-distribution-cost",
                                                      executors=4, executor_cores=4, executor_mem='4g',
                                                      dynamic_allocation=True, max_executors=8)


In [4]:
data_date=pd.date_range(start='2017-08-01', end='2020-02-01')

In [5]:
df_date = pd.DataFrame(data_date, columns = ['Date']) 
df_date

Unnamed: 0,Date
0,2017-08-01
1,2017-08-02
2,2017-08-03
3,2017-08-04
4,2017-08-05
...,...
910,2020-01-28
911,2020-01-29
912,2020-01-30
913,2020-01-31


In [6]:
def add0 (n):
  if len(str(n)) < 2:
    return '0'+str(n)
  else:
    return str(n)
  
df_date['Day'] = pd.DatetimeIndex(df_date['Date']).day
df_date['Month'] = pd.DatetimeIndex(df_date['Date']).month
df_date['Year'] = pd.DatetimeIndex(df_date['Date']).year
df_date['MonthYearNumber'] =pd.DatetimeIndex(df_date['Date']).month.astype(str) + '/' + df_date['Year'].apply(lambda x: str(x)[-2:])
df_date['MonthNameShort'] = df_date['Month'].apply(lambda x: calendar.month_abbr[x])
df_date['MonthNameLong'] = df_date['Month'].apply(lambda x: calendar.month_name[x])
df_date['MonthYearShort'] = df_date['MonthNameShort'].astype(str) + '-' + df_date['Year'].astype(str)
df_date['MonthYearLong'] = df_date['MonthNameLong'].astype(str) + ' ' + df_date['Year'].astype(str)
df_date['Quarter'] = df_date['Month'].apply(lambda m: 'Q1' if m<=3 else ('Q2' if 3<m<=6 else ('Q3' if 6<m<=9 else 'Q4')))
df_date['Year quarter'] = df_date['Year'].astype(str)+' '+ df_date['Quarter'].astype(str)
df_date['Day'] = df_date['Day'].apply(add0)
df_date['Month'] = df_date['Month'].apply(add0)
df_date['DateAsInteger'] = df_date['Year'].astype(str) + df_date['Month'].astype(str) + df_date['Day'].astype(str)
df_date['MonthShortAsInterger'] = df_date['Year'].astype(str) + df_date['Month'].astype(str)


df_date

Unnamed: 0,Date,Day,Month,Year,MonthYearNumber,MonthNameShort,MonthNameLong,MonthYearShort,MonthYearLong,Quarter,Year quarter,DateAsInteger,MonthShortAsInterger
0,2017-08-01,01,08,2017,8/17,Aug,August,Aug-2017,August 2017,Q3,2017 Q3,20170801,201708
1,2017-08-02,02,08,2017,8/17,Aug,August,Aug-2017,August 2017,Q3,2017 Q3,20170802,201708
2,2017-08-03,03,08,2017,8/17,Aug,August,Aug-2017,August 2017,Q3,2017 Q3,20170803,201708
3,2017-08-04,04,08,2017,8/17,Aug,August,Aug-2017,August 2017,Q3,2017 Q3,20170804,201708
4,2017-08-05,05,08,2017,8/17,Aug,August,Aug-2017,August 2017,Q3,2017 Q3,20170805,201708
...,...,...,...,...,...,...,...,...,...,...,...,...,...
910,2020-01-28,28,01,2020,1/20,Jan,January,Jan-2020,January 2020,Q1,2020 Q1,20200128,202001
911,2020-01-29,29,01,2020,1/20,Jan,January,Jan-2020,January 2020,Q1,2020 Q1,20200129,202001
912,2020-01-30,30,01,2020,1/20,Jan,January,Jan-2020,January 2020,Q1,2020 Q1,20200130,202001
913,2020-01-31,31,01,2020,1/20,Jan,January,Jan-2020,January 2020,Q1,2020 Q1,20200131,202001


In [7]:
df_date_spark = spark_session.createDataFrame(df_date)
df_date_spark.write.jdbc(url=db_uri, table="SMKT007_REF_DATE", mode="overwrite")

In [8]:
df_date_spark = df_date_spark.select(*(F.col("`" + c+ "`").alias(c.replace(' ', '_')) for c in df_date_spark.columns))

In [10]:
df_date_spark.write.mode("overwrite").partitionBy("Year","Month","Day").parquet("hdfs:///user/brc03/vmeca/data/refined/date/")