# Data ETL
ETL (Extract, Transform, Load) is the process of gathering raw data from various sources, cleaning and consolidating it, and then loading it into a single centralized destination:
- Extract data from various sources (databases, files, APIs, etc.)
- Transform the data by cleaning, formatting, or combining it as needed
- Load the processed data into a destination, such as a database, data warehouse, or analytics tool

ETL is a foundational step in data engineering and analytics pipelines, ensuring data is ready for analysis or further processing.

## Initialize

In [1]:
import pandas as pd
import numpy as np
# import polars as pl
import gdown
import os
import rich

In [2]:
# Tag embedded in the names of the files this notebook writes (profiling reports and parquet exports).
version_tag = "dev"

In [3]:
# ## One-time data download (uncomment to run)
# model_file_id = "15DamSAHtEUsLn2qwnwmJFgD7Djk1MizJ"
# data_folder = "c:/teaching/fall2025/data_science_bootcamp_lecture_1"
# gdown.download(
#     f"https://drive.google.com/uc?id={model_file_id}",
#     f'{data_folder}/.local/auto_policies_2017.csv', 
#     quiet=False)
# inference_file_id = "1ZppcSp8WMinV3iUdNUDapfDg3bMEqMat"
# gdown.download(
#     f"https://drive.google.com/uc?id={inference_file_id}",
#     f'{data_folder}/.local/auto_potential_customers_2018.csv', 
#     quiet=False)

## Extract Data

In [4]:
# Root folder that contains the `.local` data directory. The default is the
# original author's machine path; set the BOOTCAMP_DATA_FOLDER environment
# variable to run the notebook elsewhere without editing this cell.
data_folder = os.environ.get(
    "BOOTCAMP_DATA_FOLDER",
    "c:/teaching/fall2025/data_science_bootcamp_lecture_1",
)
# Earlier iterations read auto_policies_2017.csv or synthetic_model_data.csv from
# the same folder; the current modeling input is auto_policies_model_data.csv.
model_data = pd.read_csv(f'{data_folder}/.local/auto_policies_model_data.csv')
print( model_data.shape )
model_data.head(5)

(22619, 22)


Unnamed: 0,id,veh_value,exposure,veh_body,veh_age,gender,area,agecat,engine_type,max_power,...,marital_status,e_bill,time_of_week_driven,time_driven,trm_len,credit_score,high_education_ind,clm,numclaims,claimcst0
0,1,0.77,0.444504,SEDAN,4,M,D,3,petrol,147,...,S,1,weekday,6pm - 12am,6,640.448137,1.0,0,0,0.0
1,2,4.45,0.562183,STNWG,1,M,A,3,petrol,158,...,S,1,weekday,6am - 12pm,12,683.749691,0.0,0,0,0.0
2,3,4.9,0.465244,STNWG,1,F,A,3,petrol,159,...,M,1,weekday,6pm - 12am,6,653.656117,1.0,0,0,0.0
3,4,0.48,0.271039,PANVN,4,M,A,4,petrol,80,...,S,1,weekday,12pm - 6pm,12,642.574671,0.0,0,0,0.0
4,5,0.85,0.141624,SEDAN,4,F,A,5,petrol,126,...,S,0,weekday,6am - 12pm,6,647.175035,0.0,0,0,0.0


In [5]:
# Read the inference dataset from the same `.local` data directory.
# (Previous inputs: auto_potential_customers_2018.csv, synthetic_inference_data.csv.)
inference_csv_path = f'{data_folder}/.local/auto_policies_inference_data.csv'
inference_data = pd.read_csv(inference_csv_path)
print(inference_data.shape)
inference_data.head(n=5)

(22620, 19)


Unnamed: 0,id,veh_value,exposure,veh_body,veh_age,gender,area,agecat,engine_type,max_power,driving_history_score,veh_color,marital_status,e_bill,time_of_week_driven,time_driven,trm_len,credit_score,high_education_ind
0,1,3.4,0.076279,STNWG,2,M,B,4,petrol,174,83,black,S,1,weekday,6pm - 12am,6,648.247594,0
1,2,2.55,0.093443,STNWG,2,F,A,3,petrol,181,65,yellow,M,0,weekday,12am - 6 am,12,637.752677,0
2,3,3.04,0.157762,STNWG,2,F,E,4,petrol,136,64,white,S,1,weekday,12pm - 6pm,12,661.483786,0
3,4,2.05,0.560735,MIBUS,4,M,C,6,dissel,164,82,gray,M,1,weekday,6am - 12pm,12,647.846365,0
4,5,1.93,0.258275,HBACK,2,M,C,4,dissel,89,48,black,S,0,weekday,6am - 12pm,12,640.25755,0


## Transform Data
### Visualize data

In [6]:
# Folder that receives the artifacts written by this notebook; created on demand.
output_folder = "../.local/analysis_pipeline/data_etl"
# exist_ok=True keeps this cell re-runnable once the folder already exists.
os.makedirs(output_folder, exist_ok=True)

In [7]:
from ydata_profiling import ProfileReport

# Build the profiling report object for the modeling data; it is written to an
# HTML file in the following cell.
model_data_profile = ProfileReport(model_data, title="Model Data Profiling Report")

In [8]:
# Export the profiling report as an HTML file tagged with the run version.
# For an inline view instead, call model_data_profile.to_notebook_iframe().
model_profile_path = f"{output_folder}/model_data_profile_{version_tag}.html"
model_data_profile.to_file(model_profile_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 22/22 [00:00<00:00, 52.65it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Clean and Consolidate data

In [9]:
# Show column names, non-null counts and dtypes to spot anomalies.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in rich.print() would only add a stray "None" line.
model_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22619 entries, 0 to 22618
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     22619 non-null  int64  
 1   veh_value              22619 non-null  float64
 2   exposure               22619 non-null  float64
 3   veh_body               22619 non-null  object 
 4   veh_age                22619 non-null  int64  
 5   gender                 22619 non-null  object 
 6   area                   22619 non-null  object 
 7   agecat                 22619 non-null  int64  
 8   engine_type            22619 non-null  object 
 9   max_power              22619 non-null  int64  
 10  driving_history_score  22619 non-null  float64
 11  veh_color              22619 non-null  object 
 12  marital_status         22619 non-null  object 
 13  e_bill                 22619 non-null  int64  
 14  time_of_week_driven    22619 non-null  object 
 15  ti

In [10]:
# Limit the right tail of 'veh_value': clip at its 99th percentile
# (NaN-aware, rounded to 3 decimals) for outlier control.
veh_value_p99 = np.nanpercentile(model_data['veh_value'], 99)
veh_value_cap = round(veh_value_p99, 3)
print(f"veh_value cap at 99th percentile: {veh_value_cap}")

capped_veh_value = model_data['veh_value'].clip(upper=veh_value_cap)
model_data['veh_value'] = capped_veh_value

veh_value cap at 99th percentile: 6.09


In [11]:
# Inspect how many rows fall into each 'veh_body' category
body_counts = model_data['veh_body'].value_counts()
rich.print(body_counts)

# Collapse the 'MCARA', 'CONVT', 'BUS' and 'RDSTR' body types into a single 'Other' level
bodies_to_merge = ['MCARA', 'CONVT', 'BUS', 'RDSTR']
merge_mask = model_data['veh_body'].isin(bodies_to_merge)
model_data.loc[merge_mask, 'veh_body'] = 'Other'

In [12]:
# Cap 'credit_score' at a fixed upper bound of 722
# NOTE(review): the rationale for 722 is not recorded in this notebook — confirm the source of this threshold.
credit_score_cap = 722
model_data['credit_score'] = model_data['credit_score'].clip(upper=credit_score_cap)

In [13]:
# Assume single vehicle policy and create a vehicle count variable
model_data['veh_cnt'] = 1

# Label every row with the data segment it belongs to ("1|model" for the modeling data)
model_data['data_segment'] = "1|model"

### Apply the same steps to the inference data

In [14]:
import sys

# Make the parent directory importable so the local `analysis_tool_chest` package
# resolves; the membership check avoids duplicate entries when the cell is re-run.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)
from analysis_tool_chest.data_etl import DataETL

output_folder = "../.local/analysis_pipeline/data_etl"
os.makedirs(output_folder, exist_ok=True)
etl = DataETL(inference_data)
etl.profile_analysis(output_folder, file_name=f"inference_data_profile_{version_tag}.html", title="Inference Data Profiling Report")
# Reuse the caps applied to the modeling data so both datasets receive the same
# treatment. The 'credit_score' cap was previously left out of cap_dict, which
# left inference credit scores uncapped while the modeling data was capped.
# NOTE(review): assumes DataETL.transform applies every column listed in cap_dict — confirm.
processed_inference_data = etl.transform(
    cap_dict={'veh_value': veh_value_cap, 'credit_score': credit_score_cap}
)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 19/19 [00:00<00:00, 115.75it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

veh_value capped at 6.09


## Load Data to Disk

In [15]:
# Persist both prepared datasets as parquet files (row index dropped),
# with the run version embedded in each file name.
output_folder = "../.local/analysis_pipeline/data_etl"
model_parquet_path = f"{output_folder}/model_data_{version_tag}.parquet"
inference_parquet_path = f"{output_folder}/inference_data_{version_tag}.parquet"
model_data.to_parquet(model_parquet_path, index=False)
processed_inference_data.to_parquet(inference_parquet_path, index=False)