# Data ETL
ETL (Extract, Transform, Load) is the process of gathering raw data from various sources, cleaning and consolidating it, and then loading it into a single centralized destination:
- Extract data from various sources (databases, files, APIs, etc.)
- Transform the data by cleaning, formatting, or combining it as needed
- Load the processed data into a destination, such as a database, data warehouse, or analytics tool

ETL is a foundational step in data engineering and analytics pipelines, ensuring data is ready for analysis or further processing.

## Initialize

In [1]:
import os
import sys

import gdown
import numpy as np
import pandas as pd
# import polars as pl
import rich
from ydata_profiling import ProfileReport

In [2]:
# Tag embedded in the file names of the profiling reports and parquet files written by this notebook.
version_tag = "dev"

In [None]:
# One-time download of the raw CSV files from Google Drive into ../.local.
# Files that are already present locally are skipped, so re-running the notebook
# does not download them again.
# https://drive.google.com/file/d/1NuMdJQb3OfAM-Jg007OxPlUrbaTOeF9k/view?usp=sharing
# https://drive.google.com/file/d/1c96cnf1hea--SUr_tQxzxvEkUG30aJpC/view?usp=sharing
model_file_id = "1NuMdJQb3OfAM-Jg007OxPlUrbaTOeF9k"
inference_file_id = "1c96cnf1hea--SUr_tQxzxvEkUG30aJpC"

# Make sure the target directory exists before anything is written into it.
os.makedirs('../.local', exist_ok=True)

for file_id, local_path in (
    (model_file_id, '../.local/synthetic_auto_policies_model_data_lecture_1_illustration.csv'),
    (inference_file_id, '../.local/synthetic_auto_policies_inference_data_lecture_1_illustration.csv'),
):
    if os.path.exists(local_path):
        print(f"Already downloaded, skipping: {local_path}")
        continue
    gdown.download(
        f"https://drive.google.com/uc?id={file_id}",
        local_path,
        quiet=False)

## Extract Data

In [None]:
# Extract: read the model dataset from the local CSV, then report its
# dimensions and preview the first five rows.
model_data_csv = '../.local/synthetic_auto_policies_model_data_lecture_1_illustration.csv'
model_data = pd.read_csv(model_data_csv)
print(model_data.shape)
model_data.head(5)

(11250, 24)


Unnamed: 0,id,fold,sample,veh_value,exposure,veh_body,veh_age,gender,area,agecat,...,marital_status,e_bill,time_of_week_driven,time_driven,trm_len,credit_score,low_education_ind,clm,numclaims,claimcst0
0,4492,3,1|bld,5.36,0.312685,PANVN,2,F,C,3,...,S,1,weekday,12am - 6 am,12,650.314851,0.0,0,0,0.0
1,14969,3,1|bld,6.18,0.153159,SUV,3,M,C,5,...,M,1,weekday,6pm - 12am,12,643.189575,0.0,0,0,0.0
2,14275,4,2|val,7.28,0.530446,UTE,4,F,B,6,...,M,1,weekday,6am - 12pm,12,641.311518,0.0,0,0,0.0
3,12850,3,1|bld,4.19,0.714919,STNWG,2,F,A,3,...,M,1,weekday,6am - 12pm,12,648.784436,0.0,0,0,0.0
4,3264,1,1|bld,5.06,0.138961,STNWG,3,M,E,3,...,S,1,weekend,12pm - 6pm,6,639.907492,0.0,0,0,0.0


In [None]:
# Extract: read the inference dataset from the local CSV, then report its
# dimensions and preview the first five rows.
inference_data_csv = '../.local/synthetic_auto_policies_inference_data_lecture_1_illustration.csv'
inference_data = pd.read_csv(inference_data_csv)
print(inference_data.shape)
inference_data.head(5)

(15000, 19)


Unnamed: 0,id,veh_value,exposure,veh_body,veh_age,gender,area,agecat,engine_type,max_power,driving_history_score,veh_color,marital_status,e_bill,time_of_week_driven,time_driven,trm_len,credit_score,low_education_ind
0,8295,6.36,0.777085,SUV,3,M,A,2,hybrid,128,81,silver,M,1,weekend,6am - 12pm,12,644.721808,0
1,17625,2.56,0.528369,STNWG,1,F,A,1,dissel,178,94,white,M,1,weekday,12pm - 6pm,12,634.306196,0
2,3802,6.18,0.384591,HDTOP,4,F,D,3,petrol,270,77,yellow,S,1,weekday,6pm - 12am,12,649.245139,0
3,12865,6.49,0.116378,SUV,3,M,D,4,petrol,120,59,gray,M,1,weekday,6am - 12pm,12,647.594655,0
4,6495,6.46,0.688417,SEDAN,2,F,C,4,petrol,94,81,green,M,1,weekday,6am - 12pm,12,657.5505,0


## Transform Data
### Visualize data

In [6]:
# Folder that receives the profiling reports and parquet outputs of this notebook;
# create it (including any missing parent folders) if it is not there yet.
output_folder = "../.local/analysis_pipeline/data_etl"
os.makedirs(name=output_folder, exist_ok=True)

In [7]:
# Build the profiling report object for the model data.
# ProfileReport is imported in the import cell at the top of the notebook.
model_data_profile = ProfileReport(model_data, title="Model Data Profiling Report")

In [8]:
# Write the profiling report to an HTML file whose name carries the version tag.
# To view it inline instead, call: model_data_profile.to_notebook_iframe()
model_profile_path = f"{output_folder}/model_data_profile_{version_tag}.html"
model_data_profile.to_file(model_profile_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 24/24 [00:00<00:00, 50.20it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Clean and Consolidate data

In [9]:
# Show column names, non-null counts and dtypes to check for anomalies.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in rich.print() would only add a stray "None" line.
model_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11250 entries, 0 to 11249
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     11250 non-null  int64  
 1   fold                   11250 non-null  int64  
 2   sample                 11250 non-null  object 
 3   veh_value              11250 non-null  float64
 4   exposure               11250 non-null  float64
 5   veh_body               11250 non-null  object 
 6   veh_age                11250 non-null  int64  
 7   gender                 11250 non-null  object 
 8   area                   11250 non-null  object 
 9   agecat                 11250 non-null  int64  
 10  engine_type            11250 non-null  object 
 11  max_power              11250 non-null  int64  
 12  driving_history_score  11250 non-null  float64
 13  veh_color              11250 non-null  object 
 14  marital_status         11250 non-null  object 
 15  e_

In [10]:
# Outlier control for the heavily right-tailed 'veh_value': cap it at its
# 99th percentile (NaNs ignored, cap rounded to 3 decimals). The cap is kept in
# veh_value_cap so the same threshold can be reused for the inference data.
veh_value_p99 = np.nanpercentile(model_data['veh_value'], 99)
veh_value_cap = round(veh_value_p99, 3)
print(f"veh_value cap at 99th percentile: {veh_value_cap}")
model_data['veh_value'] = model_data['veh_value'].clip(upper=veh_value_cap)

veh_value cap at 99th percentile: 7.2


In [11]:
# Inspect how many rows fall into each 'veh_body' category
body_type_counts = model_data['veh_body'].value_counts()
rich.print(body_type_counts)

# Collapse the 'MCARA', 'CONVT', 'BUS' and 'RDSTR' body types into a single 'Other' level
grouped_body_types = ['MCARA', 'CONVT', 'BUS', 'RDSTR']
is_grouped_body = model_data['veh_body'].isin(grouped_body_types)
model_data.loc[is_grouped_body, 'veh_body'] = 'Other'

In [12]:
# Cap 'credit_score' at a fixed upper bound of 722
# NOTE(review): the rationale for 722 is not shown here — presumably read off the profiling report; confirm.
credit_score_cap = 722
model_data['credit_score'] = model_data['credit_score'].clip(upper=credit_score_cap)

In [13]:
# Assume single vehicle policy and create a vehicle count variable
model_data['veh_cnt'] = 1

# Tag every row with the data segment label "1|model" to mark it as model data
model_data['data_segment'] = "1|model"

### Apply the same steps to the inference data

In [14]:
# Make the parent directory (assumed to be the repository root) importable so the
# project-local DataETL helper can be loaded. The membership check keeps sys.path
# free of duplicate entries when this cell is re-run.
repo_root = os.path.abspath("..")
if repo_root not in sys.path:
    sys.path.append(repo_root)
from analysis_tool_chest.data_etl import DataETL

output_folder = "../.local/analysis_pipeline/data_etl"
os.makedirs(output_folder, exist_ok=True)

# Profile the inference data, then transform it with the veh_value cap derived
# from the model data so both datasets share the same threshold.
# NOTE(review): only the veh_value cap is passed explicitly; confirm that
# DataETL.transform also applies the credit_score cap (722) and the veh_body
# 'Other' grouping that were applied to model_data.
etl = DataETL(inference_data)
etl.profile_analysis(output_folder, file_name=f"inference_data_profile_{version_tag}.html", title="Inference Data Profiling Report")
processed_inference_data = etl.transform(cap_dict={'veh_value': veh_value_cap})

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 19/19 [00:00<00:00, 84.80it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

veh_value capped at 7.2


## Load Data to Disk

In [15]:
# Persist both processed datasets as version-tagged parquet files (row index omitted).
output_folder = "../.local/analysis_pipeline/data_etl"
datasets_to_save = {
    "model_data": model_data,
    "inference_data": processed_inference_data,
}
for dataset_name, dataset in datasets_to_save.items():
    dataset.to_parquet(f"{output_folder}/{dataset_name}_{version_tag}.parquet", index=False)