# TSS_EVENT Project - Text

# Initialize Widgets

In [0]:
# Optional reset: uncomment and run to remove every existing widget from this
# notebook before the widgets are (re)created in the next cell.
# dbutils.widgets.removeAll()

In [0]:
# Text widgets that parameterize this notebook: (name, default value, label).
# The labels carry a numeric prefix, presumably to fix their on-screen order
# (confirm against the Databricks widget panel).
_WIDGET_SPECS = (
    ("input_table", "default.tss_event_2019_02_27_12_13_46_01500_02999", " 1. Input Table"),
    ("file_name", "tss_event_2019_02_27_12_13_46_01500_02999", " 2. File Name"),
    ("output_schema_name", "greenbrier_projects", " 3. Output Schema Name"),
    ("output_table_name", "movement", " 4. Output Table Name"),
)

for _widget_name, _widget_default, _widget_label in _WIDGET_SPECS:
    dbutils.widgets.text(_widget_name, _widget_default, _widget_label)

# Run Common Functions Notebook
The `Common Functions` notebook provides the helper functions and library imports used in the rest of this notebook.

In [0]:
%run "./Common Functions"

# Define Notebook variables

In [0]:
# Read the current widget values into notebook-level constants.
INPUT_TABLE, FILE_NAME, OUTPUT_SCHEMA_NAME, OUTPUT_TABLE_NAME = (
    dbutils.widgets.get("input_table"),
    dbutils.widgets.get("file_name"),
    dbutils.widgets.get("output_schema_name"),
    dbutils.widgets.get("output_table_name"),
)

# Echo the effective parameters so each run records what it was executed with.
print(
    f"INPUT_TABLE             = {INPUT_TABLE}",
    f"FILE_NAME               = {FILE_NAME}",
    f"OUTPUT_SCHEMA_NAME      = {OUTPUT_SCHEMA_NAME}",
    f"OUTPUT_TABLE_NAME       = {OUTPUT_TABLE_NAME}",
    sep="\n",
)


In [0]:
# Load the table named by the `input_table` widget as a Spark DataFrame.
# Equivalent SQL form, kept for reference:
#   data_raw = spark.sql(f"select * from {INPUT_TABLE}")
data_raw = spark.table(INPUT_TABLE)

# Print the row count, then render the raw rows.
raw_row_count = data_raw.count()
print(raw_row_count)
display(data_raw)

# Extract all columns to form a dataframe

Use the `reformat_text_data` function from the `Common Functions` notebook to split the raw text into columns.

In [0]:
# Column layout passed to `reformat_text_data`: column name -> pair of integers.
# NOTE(review): the pairs look like 1-based, inclusive (start, end) character
# positions in a 108-character fixed-width record; confirm against the helper
# in `Common Functions`.
schema = {
    "equipment_initial": (1, 4),
    "equipment_number": (5, 14),
    "event_code": (15, 16),
    "event_code_alpha": (17, 20),
    "cancel_date": (21, 30),
    "event_date": (31, 48),
    "fsac": (49, 54),
    "load_empty": (55, 55),
    "train_id": (56, 65),
    "company_abbreviation": (66, 69),
    "carrier_abbreviation": (70, 73),
    "waybill_id": (74, 99),
    "waybill_version": (100, 102),
    "railroad_event_code": (103, 104),
    "railroad_event_state_code": (105, 106),
    "joint_service_code": (107, 107),
    "end_of_record_constant": (108, 108),
}

# Build the columnar DataFrame from the raw rows and show it.
df_reformed = reformat_text_data(data_raw, schema)
display(df_reformed)

# Transformations

## Remove leading and trailing spaces from all columns except `train_id`

Use the `remove_leading_trailing_spaces` function from the `Common Functions` notebook.

In [0]:
# Apply the shared trimming helper; 'train_id' is passed as the second argument
# (per the section heading, the column excluded from trimming).
df_trim = remove_leading_trailing_spaces(df_reformed, 'train_id')

# Inspect the last ten rows of the result.
trimmed_tail = df_trim.tail(10)
display(trimmed_tail)

## Convert datatypes

In [0]:
# `col`, `to_date`, `substring`, `IntegerType` and `DateType` are not imported
# here; presumably they come into scope via the `Common Functions` notebook.
# Explicit imports, kept for reference:
#   from pyspark.sql.functions import to_date, substring
#   from pyspark.sql.types import DateType, IntegerType

# Projection for the typed DataFrame, in output-column order. Plain strings
# pass the column through unchanged; the others are cast or parsed.
typed_columns = [
    'equipment_initial',
    col('equipment_number').cast(IntegerType()),
    'event_code',
    'event_code_alpha',
    col('cancel_date').cast(DateType()),
    # Only the first 10 characters of event_date are parsed as a date.
    to_date(substring('event_date', 1, 10)).alias('event_date'),
    col('fsac').cast(IntegerType()),
    'load_empty',
    'train_id',
    'company_abbreviation',
    'carrier_abbreviation',
    'waybill_id',
    'waybill_version',
    'railroad_event_code',
    'railroad_event_state_code',
    'joint_service_code',
    'end_of_record_constant',
]

df_final = df_trim.select(*typed_columns)
display(df_final)

# Checks

## Check for nulls in `equipment_initial`,`equipment_number`,`event_code_alpha`,`event_date`

In [0]:
# Columns that must not contain nulls.
col_names = ('equipment_initial','equipment_number','event_code_alpha','event_date')

# The loop variable must not be named `col`: that would rebind the notebook-wide
# `col` column function used by the datatype-conversion cell to a string, so
# re-running that cell afterwards would fail with "'str' object is not callable".
# NOTE(review): the check runs on df_trim, i.e. before the casts that produce
# df_final; nulls introduced by those casts are not covered here. Confirm this
# is intended.
for column_name in col_names:
    assert_no_nulls(df_trim, column_name)