# Data ETL
ETL (Extract, Transform, Load): gather raw data from various sources, clean and consolidate it, and then load it into a single centralized destination.
- Extract data from various sources (databases, files, APIs, etc.)
- Transform the data by cleaning, formatting, or combining it as needed
- Load the processed data into a destination, such as a database, data warehouse, or analytics tool

ETL is a foundational step in data engineering and analytics pipelines, ensuring data is ready for analysis or further processing.

## Initialize

In [1]:
import pandas as pd
import numpy as np
# import polars as pl
import gdown
import os
import rich

In [2]:
# Tag embedded in the names of the files this notebook writes
# (profiling reports and parquet outputs).
version_tag = "dev"

In [None]:
## Uncomment the code in this cell the first time you use this repo: one-time data download from Google Drive
# # https://drive.google.com/file/d/1MiD9s007n_XrSVpxDpLyeTcBjpKLaQi3/view?usp=sharing
# # https://drive.google.com/file/d/1da-n6cmkb-L5TTtuq-vQLutDN8dmiExi/view?usp=sharing
# model_file_id = "1da-n6cmkb-L5TTtuq-vQLutDN8dmiExi"
# data_folder = "c:/teaching/fall2025/data_science_bootcamp_lecture_1"
# gdown.download(
#     f"https://drive.google.com/uc?id={model_file_id}",
#     f'{data_folder}/.local/synthetic_auto_policies_model_data_copy.csv', 
#     quiet=False)
# inference_file_id = "1MiD9s007n_XrSVpxDpLyeTcBjpKLaQi3"
# gdown.download(
#     f"https://drive.google.com/uc?id={inference_file_id}",
#     f'{data_folder}/.local/synthetic_auto_policies_inference_data_copy.csv', 
#     quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1da-n6cmkb-L5TTtuq-vQLutDN8dmiExi
To: c:\teaching\fall2025\data_science_bootcamp_lecture_1\.local\synthetic_auto_policies_model_data_copy.csv
100%|██████████| 1.85M/1.85M [00:00<00:00, 6.20MB/s]
Downloading...
From: https://drive.google.com/uc?id=1MiD9s007n_XrSVpxDpLyeTcBjpKLaQi3
To: c:\teaching\fall2025\data_science_bootcamp_lecture_1\.local\synthetic_auto_policies_inference_data_copy.csv
100%|██████████| 1.45M/1.45M [00:00<00:00, 5.59MB/s]


'c:/teaching/fall2025/data_science_bootcamp_lecture_1/.local/synthetic_auto_policies_inference_data_copy.csv'

## Extract Data

In [8]:
# Read the modeling dataset from the local data folder, report its
# dimensions, and preview the first five rows.
data_folder = "c:/teaching/fall2025/data_science_bootcamp_lecture_1"
model_data_csv = f'{data_folder}/.local/synthetic_auto_policies_model_data_copy.csv'
model_data = pd.read_csv(model_data_csv)
print(model_data.shape)
model_data.head(5)

(15000, 24)


Unnamed: 0,id,fold,sample,veh_value,exposure,veh_body,veh_age,gender,area,agecat,...,marital_status,e_bill,time_of_week_driven,time_driven,trm_len,credit_score,high_education_ind,clm,numclaims,claimcst0
0,1,2,1|bld,1.77,0.362191,SEDAN,2,M,B,1,...,S,0,weekday,12pm - 6pm,6,646.516469,0.0,1,1,202.319818
1,2,3,1|bld,1.9,0.632068,STNWG,4,M,A,2,...,M,0,weekend,6am - 12pm,12,635.400369,0.0,1,1,360.017223
2,3,1,1|bld,1.67,0.36746,HBACK,2,M,C,2,...,S,0,weekday,6am - 12pm,12,646.463131,0.0,1,1,202.114407
3,4,2,1|bld,2.78,0.802184,STNWG,3,F,B,4,...,S,1,weekday,6pm - 12am,12,645.598794,0.0,1,1,400.684549
4,5,2,1|bld,0.89,0.485009,HBACK,3,F,C,1,...,M,0,weekday,6am - 12pm,12,657.348612,0.0,1,1,254.295393


In [10]:
# Read the inference dataset from the same data folder, report its
# dimensions, and preview the first five rows.
inference_data_csv = f'{data_folder}/.local/synthetic_auto_policies_inference_data_copy.csv'
inference_data = pd.read_csv(inference_data_csv)
print(inference_data.shape)
inference_data.head(5)

(15000, 19)


Unnamed: 0,id,veh_value,exposure,veh_body,veh_age,gender,area,agecat,engine_type,max_power,driving_history_score,veh_color,marital_status,e_bill,time_of_week_driven,time_driven,trm_len,credit_score,high_education_ind
0,8295,1.21,0.777085,SEDAN,3,F,A,2,hybrid,128,73,silver,M,1,weekend,6am - 12pm,12,644.721808,0
1,17625,5.01,0.528369,STNWG,1,M,A,1,dissel,178,60,white,M,1,weekday,12pm - 6pm,12,634.306196,0
2,3802,1.39,0.384591,HDTOP,4,M,D,3,petrol,270,77,yellow,S,1,weekday,6pm - 12am,12,649.245139,0
3,12865,1.08,0.116378,SEDAN,3,F,D,4,petrol,120,95,gray,M,1,weekday,6am - 12pm,12,647.594655,0
4,6495,1.11,0.688417,HBACK,2,M,C,4,petrol,94,73,green,M,1,weekday,6am - 12pm,12,657.5505,0


## Transform Data
### Visualize data

In [22]:
# Folder that receives this notebook's outputs; create it (including any
# missing parent folders) and do nothing if it already exists.
output_folder = "../.local/analysis_pipeline/data_etl"
os.makedirs(output_folder, exist_ok=True)

In [23]:
from ydata_profiling import ProfileReport

# Build the profiling report object for the modeling data.
model_data_profile = ProfileReport(
    model_data,
    title="Model Data Profiling Report",
)

In [24]:
# model_data_profile.to_notebook_iframe()
# Write the profiling report to an HTML file whose name carries the version tag.
model_profile_html = f"{output_folder}/model_data_profile_{version_tag}.html"
model_data_profile.to_file(model_profile_html)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 24/24 [00:00<00:00, 50.22it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Clean and Consolidate data

In [25]:
## Show column information (non-null counts and dtypes) to spot anomalies
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in rich.print() would print an
# extra "None" after the summary.
model_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     15000 non-null  int64  
 1   fold                   15000 non-null  int64  
 2   sample                 15000 non-null  object 
 3   veh_value              15000 non-null  float64
 4   exposure               15000 non-null  float64
 5   veh_body               15000 non-null  object 
 6   veh_age                15000 non-null  int64  
 7   gender                 15000 non-null  object 
 8   area                   15000 non-null  object 
 9   agecat                 15000 non-null  int64  
 10  engine_type            15000 non-null  object 
 11  max_power              15000 non-null  int64  
 12  driving_history_score  15000 non-null  float64
 13  veh_color              15000 non-null  object 
 14  marital_status         15000 non-null  object 
 15  e_

In [26]:
# Outlier control for the heavy right tail of 'veh_value': take the 99th
# percentile (NaNs ignored), round it to 3 decimals, and clip anything above it.
veh_value_p99 = np.nanpercentile(model_data['veh_value'], 99)
veh_value_cap = round(veh_value_p99, 3)
print(f"veh_value cap at 99th percentile: {veh_value_cap}")
model_data['veh_value'] = model_data['veh_value'].clip(upper=veh_value_cap)

veh_value cap at 99th percentile: 6.5


In [27]:
# Inspect how often each 'veh_body' category occurs
veh_body_counts = model_data['veh_body'].value_counts()
rich.print(veh_body_counts)
# Collapse the 'MCARA', 'CONVT', 'BUS', and 'RDSTR' body types into a single 'Other' level
grouped_veh_bodies = ['MCARA', 'CONVT', 'BUS', 'RDSTR']
in_other_group = model_data['veh_body'].isin(grouped_veh_bodies)
model_data.loc[in_other_group, 'veh_body'] = 'Other'

In [28]:
# Cap 'credit_score' at 722 (upper bound only; lower values are left unchanged).
# NOTE(review): 722 is hard-coded and its derivation is not shown in this
# notebook — confirm where the threshold comes from.
credit_score_cap = 722
model_data['credit_score'] = model_data['credit_score'].clip(upper=credit_score_cap)

In [29]:
# Assume single vehicle policy and create a vehicle count variable
model_data['veh_cnt'] = 1

# Tag every row with the data segment label "1|model"
# (a segment marker, not a policy year).
model_data['data_segment'] = "1|model"

### Apply the above steps to the inference data

In [30]:
import sys
# Put the parent directory on the import path so the project-local
# analysis_tool_chest package can be found; this must run before the import below.
sys.path.append(os.path.abspath(".."))
from analysis_tool_chest.data_etl import DataETL

output_folder = "../.local/analysis_pipeline/data_etl"
os.makedirs(output_folder, exist_ok=True)
etl = DataETL(inference_data)
# Presumably writes a profiling report for the inference data into output_folder
# (the file name carries the version tag) — confirm against DataETL.profile_analysis.
etl.profile_analysis(output_folder, file_name=f"inference_data_profile_{version_tag}.html", title="Inference Data Profiling Report")
# Pass the cap computed on the model data so the inference data uses the same 'veh_value' cap.
# NOTE(review): only the 'veh_value' cap is passed here. The model data also gets a
# 'credit_score' cap, a 'veh_body' regrouping, and 'veh_cnt'/'data_segment' columns —
# confirm that DataETL.transform applies the equivalent steps internally.
processed_inference_data = etl.transform(cap_dict={'veh_value': veh_value_cap})

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 19/19 [00:00<00:00, 88.09it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

veh_value capped at 6.5


## Load Data to Disk

In [31]:
# Persist both processed datasets as parquet files (index omitted); the file
# names carry the version tag.
output_folder = "../.local/analysis_pipeline/data_etl"
model_data_parquet = f"{output_folder}/model_data_{version_tag}.parquet"
inference_data_parquet = f"{output_folder}/inference_data_{version_tag}.parquet"
model_data.to_parquet(model_data_parquet, index=False)
processed_inference_data.to_parquet(inference_data_parquet, index=False)