# Data Engineering Use Cases

In [31]:
import pandas as pd
import time, datetime

## Bulk Insert and add curation columns



In [32]:
# Record the wall-clock start of the bulk-insert section; a later cell
# subtracts this value from time.time() to report the elapsed seconds.
bulk_insert_start_time = time.time()

In [33]:
# Read the full-load extract from parquet and preview its first five rows.
full_load = pd.read_parquet(path='../helpers/dummy_example_creator/full_load.parquet')
print(full_load.head(n=5))

  product_id product_name  price extraction_timestamp    op
0      00001       Heater    250  2022-01-01 01:01:01  None
1      00002   Thermostat    400  2022-01-01 01:01:01  None
2      00003   Television    600  2022-01-01 01:01:01  None
3      00004      Blender    100  2022-01-01 01:01:01  None
4      00005  USB charger     50  2022-01-01 01:01:01  None


In [34]:
# Far-future sentinel marking a record as still open; the same value is
# reused further down when the update rows get their curation columns.
end_datetime = datetime.datetime(year=2250, month=1, day=1)

# Attach the curation columns in a single step: each row is valid from its
# extraction timestamp until the sentinel and is flagged as current.
full_load = full_load.assign(
    start_datetime=full_load['extraction_timestamp'],
    end_datetime=end_datetime,
    is_current=True,
)

print(full_load.head())
# Persist the curated full load.
full_load.to_parquet(path='bulk_insert.parquet')

  product_id product_name  price extraction_timestamp    op  \
0      00001       Heater    250  2022-01-01 01:01:01  None   
1      00002   Thermostat    400  2022-01-01 01:01:01  None   
2      00003   Television    600  2022-01-01 01:01:01  None   
3      00004      Blender    100  2022-01-01 01:01:01  None   
4      00005  USB charger     50  2022-01-01 01:01:01  None   

       start_datetime end_datetime  is_current  
0 2022-01-01 01:01:01   2250-01-01        True  
1 2022-01-01 01:01:01   2250-01-01        True  
2 2022-01-01 01:01:01   2250-01-01        True  
3 2022-01-01 01:01:01   2250-01-01        True  
4 2022-01-01 01:01:01   2250-01-01        True  


In [35]:
# Elapsed wall-clock seconds since the bulk-insert timer was started.
bulk_insert_end_time = time.time()
bulk_insert_process_time = bulk_insert_end_time - bulk_insert_start_time
print(bulk_insert_process_time)

0.11295485496520996


## Slowly Changing Dimension Type 2

The updates are created by replacing one column with the same value to simplify the testing.
Soft deletes are not taken into account because they follow a very similar process from a performance perspective.

1. Read updates
2. Join full load with updates on primary key
3. Set `end_datetime` to the `extraction_timestamp` of the updated records 
4. Close the existing records
5. Add curation columns to updates
6. Append updated data to existing data

In [36]:
# Record the wall-clock start of the SCD Type 2 section; a later cell
# subtracts this value from time.time() to report the elapsed seconds.
scd2_start_time = time.time()

In [37]:
# Read the update extract from parquet and preview its first five rows.
updates = pd.read_parquet(path='../helpers/dummy_example_creator/updates.parquet')
print(updates.head(n=5))

  product_id product_name  price extraction_timestamp op
0      00001       Heater   1000           2023-01-01  U
1      00002   Thermostat   1000           2023-01-01  U
2      00003   Television   1000           2023-01-01  U
3      00004      Blender   1000           2023-01-01  U
4      00005  USB charger   1000           2023-01-01  U


In [38]:
# Close the existing records that have an incoming update.
#
# A left join keeps every full-load row. The default inner join would drop
# rows with no matching update, so unchanged records would never reach the
# final output.
# NOTE(review): assumes `updates` holds at most one row per product_id;
# several updates for one key would duplicate the existing row - confirm.
df = pd.merge(
    full_load,
    updates[['product_id', 'extraction_timestamp']],
    on='product_id',
    how='left',
    suffixes=(None, '_y'),
)

# Rows without an update carry a missing timestamp from the right-hand side.
has_update = df['extraction_timestamp_y'].notna()

# Updated rows end at the update's extraction timestamp and stop being
# current; untouched rows keep their open-ended end_datetime and flag.
df['end_datetime'] = df['extraction_timestamp_y'].where(has_update, df['end_datetime'])
df['is_current'] = df['is_current'] & ~has_update

# Drop the helper column without in-place mutation.
df = df.drop(columns=['extraction_timestamp_y'])

print(df.head())


  product_id product_name  price extraction_timestamp    op  \
0      00001       Heater    250  2022-01-01 01:01:01  None   
1      00002   Thermostat    400  2022-01-01 01:01:01  None   
2      00003   Television    600  2022-01-01 01:01:01  None   
3      00004      Blender    100  2022-01-01 01:01:01  None   
4      00005  USB charger     50  2022-01-01 01:01:01  None   

       start_datetime end_datetime  is_current  
0 2022-01-01 01:01:01   2023-01-01       False  
1 2022-01-01 01:01:01   2023-01-01       False  
2 2022-01-01 01:01:01   2023-01-01       False  
3 2022-01-01 01:01:01   2023-01-01       False  
4 2022-01-01 01:01:01   2023-01-01       False  


In [41]:
# Give the update rows their curation columns: each one is valid from its own
# extraction timestamp until the far-future sentinel and is the current version.
updates = updates.assign(
    start_datetime=updates['extraction_timestamp'],
    end_datetime=end_datetime,
    is_current=True,
)

# Stack the closed records and the new versions under a fresh 0..n-1 index,
# preview the result and persist it.
output = pd.concat(objs=[df, updates], ignore_index=True)
print(output.head(n=10))
output.to_parquet(path='updated_data.parquet')

  product_id product_name  price extraction_timestamp    op  \
0      00001       Heater    250  2022-01-01 01:01:01  None   
1      00002   Thermostat    400  2022-01-01 01:01:01  None   
2      00003   Television    600  2022-01-01 01:01:01  None   
3      00004      Blender    100  2022-01-01 01:01:01  None   
4      00005  USB charger     50  2022-01-01 01:01:01  None   
5      00001       Heater   1000  2023-01-01 00:00:00     U   
6      00002   Thermostat   1000  2023-01-01 00:00:00     U   
7      00003   Television   1000  2023-01-01 00:00:00     U   
8      00004      Blender   1000  2023-01-01 00:00:00     U   
9      00005  USB charger   1000  2023-01-01 00:00:00     U   

       start_datetime end_datetime  is_current  
0 2022-01-01 01:01:01   2023-01-01       False  
1 2022-01-01 01:01:01   2023-01-01       False  
2 2022-01-01 01:01:01   2023-01-01       False  
3 2022-01-01 01:01:01   2023-01-01       False  
4 2022-01-01 01:01:01   2023-01-01       False  
5 2023-01-01

In [40]:
# Elapsed wall-clock seconds since the SCD Type 2 timer was started.
# NOTE(review): the saved execution counts show this cell as In [40] and the
# concat/write cell before it as In [41], so the printed figure presumably
# excludes that step - re-run the notebook top to bottom before comparing timings.
scd2_end_time = time.time()
scd2_process_time = scd2_end_time - scd2_start_time
print(scd2_process_time)

0.12783598899841309


## Dedupes

In [None]:
# TODO: implement the dedupes use case (this cell is still empty).

## Impute deleted records

In [None]:
# TODO: implement the impute-deleted-records use case (this cell is still empty).