# Data Engineering Use Cases

In [None]:
import pandas as pd
import time, datetime

## Bulk Insert and add curation columns



In [None]:
# Wall-clock timestamp (seconds since the epoch) taken before the bulk-insert steps run.
bulk_insert_start_time = time.time()

In [None]:
# Read the full-load extract from the dummy example creator and preview the first rows.
full_load_path = '../helpers/dummy_example_creator/full_load.parquet'
full_load = pd.read_parquet(full_load_path)
print(full_load.head())

In [None]:
# Far-future sentinel used as the end of validity for record versions that are still open.
end_datetime = datetime.datetime(year=2250, month=1, day=1)

# Curation columns for the initial load: each row starts at its extraction
# timestamp, has the sentinel end date and is flagged as the current version.
# Columns are set in place (no frame copy) so the measured timing is unaffected.
bulk_curation_columns = {
    'start_datetime': full_load['extraction_timestamp'],
    'end_datetime': end_datetime,
    'is_current': True,
}
for curation_name, curation_value in bulk_curation_columns.items():
    full_load[curation_name] = curation_value

print(full_load.head())
full_load.to_parquet('bulk_insert.parquet')

In [None]:
# Elapsed wall-clock seconds since the bulk-insert start timestamp was taken.
bulk_insert_end_time = time.time()
bulk_insert_process_time = bulk_insert_end_time - bulk_insert_start_time
print(bulk_insert_process_time)

## Slowly Changing Dimension Type 2

The updates are created by replacing one column with the same value to simplify the testing.
Soft deletes are not taken into account, since handling them is a very similar process from a performance perspective.

1. Read updates
2. Join full load with updates on primary key
3. Set `end_datetime` to the `extraction_timestamp` of the updated records 
4. Close the existing records
5. Add curation columns to updates
6. Append updated data to existing data

In [None]:
# Wall-clock timestamp (seconds since the epoch) taken before the SCD2 steps run.
scd2_start_time = time.time()

In [None]:
# Read the batch of updated records from the dummy example creator and preview it.
updates_path = '../helpers/dummy_example_creator/updates.parquet'
updates = pd.read_parquet(updates_path)
print(updates.head())

In [None]:
# Close the open version of every existing record that received an update.
#
# The join is a LEFT join on purpose: pd.merge defaults to an inner join, which
# silently drops every record of `full_load` that has no matching update, so
# those unchanged records would be missing from the final SCD2 output.
# `indicator=True` adds a `_merge` column telling matched rows ('both') apart
# from unmatched ones ('left_only').
df = pd.merge(
    full_load,
    updates[['product_id', 'extraction_timestamp']],
    on='product_id',
    how='left',
    suffixes=(None, "_y"),
    indicator=True,
)

# True only for rows whose product_id also appears in `updates`.
has_update = df['_merge'] == 'both'

# Matched rows end at the extraction timestamp of their update and stop being
# current; unmatched rows keep their existing end_datetime and is_current.
df['end_datetime'] = df['end_datetime'].mask(has_update, df['extraction_timestamp_y'])
df['is_current'] = df['is_current'] & ~has_update

# Drop the helper columns introduced by the join (new frame, no in-place mutation).
df = df.drop(columns=['extraction_timestamp_y', '_merge'])

print(df.head())


In [None]:
# The incoming rows become the new open versions: they start at their own
# extraction timestamp, end at the far-future sentinel and are flagged current.
update_curation_columns = {
    'start_datetime': updates['extraction_timestamp'],
    'end_datetime': end_datetime,
    'is_current': True,
}
for update_column, update_value in update_curation_columns.items():
    updates[update_column] = update_value

# Stack the closed records and the new versions into one frame with a fresh index.
scd2_frames = [df, updates]
output = pd.concat(scd2_frames, ignore_index=True)
print(output.head(10))
output.to_parquet('updated_data.parquet')

In [None]:
# Elapsed wall-clock seconds since the SCD2 start timestamp was taken.
scd2_end_time = time.time()
scd2_process_time = scd2_end_time - scd2_start_time
print(scd2_process_time)

## Dedupes

In [None]:
# TODO

## Impute deleted records

In [None]:
# TODO