In [1]:
import os
from datetime import datetime

import pandas as pd
from ydata_profiling import ProfileReport

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the source workbook.
# NOTE(review): this is an absolute path on the author's machine; anyone else
# running the notebook has to point it at their own copy of the file.
DATA_PATH = r"D:\repos\thai_traffic_deaths\data\traffic_death.xlsx"

# Load the workbook using pandas' default Excel reader settings.
df = pd.read_excel(DATA_PATH)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,id,DEAD_YEAR(Budha),DEAD_YEAR,Age,Sex,BirthYear,NationalityId,Tumbol,District,Province,...,DeadDate,DateRec,TimeRec,AccSubDist,AccDist,AccProv,AccLat,Acclong,ICD-10,Vehicle
0,8635072,2555,2012,18,1.0,,99.0,,,,...,2012-12-16,NaT,,,,สุรินทร์,,,V284,รถจักรยานยนต์
1,8635074,2555,2012,18,1.0,,99.0,,,,...,2012-05-27,NaT,,,,สุรินทร์,,,V892,ไม่ระบุพาหนะ
2,8635084,2555,2012,21,1.0,,99.0,,,,...,2012-05-01,NaT,,,,สุรินทร์,,,V892,ไม่ระบุพาหนะ
3,8640055,2555,2012,58,1.0,,99.0,,,,...,2012-02-03,NaT,,,,สุรินทร์,,,V892,ไม่ระบุพาหนะ
4,8634703,2555,2012,4,1.0,,99.0,,,,...,2012-12-11,NaT,,,,ศรีสะเกษ,,,V846,ไม่ระบุพาหนะ


In [3]:
# Summarise each column's dtype and non-null count for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215389 entries, 0 to 215388
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   id                215389 non-null  int64         
 1   DEAD_YEAR(Budha)  215389 non-null  int64         
 2   DEAD_YEAR         215389 non-null  int64         
 3   Age               215389 non-null  object        
 4   Sex               211286 non-null  float64       
 5   BirthYear         0 non-null       float64       
 6   NationalityId     119233 non-null  float64       
 7   Tumbol            0 non-null       float64       
 8   District          0 non-null       float64       
 9   Province          0 non-null       float64       
 10  RiskHelmet        0 non-null       float64       
 11  RiskSafetyBelt    0 non-null       float64       
 12  DeadDate          215389 non-null  datetime64[ns]
 13  DateRec           102745 non-null  datetime64[ns]
 14  Time

In [4]:
# Print the value distribution (NaN included) of every column that has at most
# 25 distinct values, skipping columns that are entirely null.
for col in df.columns:
    column_values = df[col]
    if column_values.nunique() > 25 or column_values.isna().all():
        continue
    print("\n")
    print(column_values.value_counts(dropna=False))



DEAD_YEAR(Budha)
2554    21996
2559    21745
2560    21607
2555    21603
2556    21221
2557    20790
2558    19960
2561    19931
2562    19904
2563    17831
2564     8801
Name: count, dtype: int64


DEAD_YEAR
2011    21996
2016    21745
2017    21607
2012    21603
2013    21221
2014    20790
2015    19960
2018    19931
2019    19904
2020    17831
2021     8801
Name: count, dtype: int64


Sex
1.0    165296
2.0     44902
NaN      4103
0.0      1085
3.0         3
Name: count, dtype: int64


NationalityId
99.0     119108
NaN       96156
198.0        22
44.0         21
71.0         12
48.0         11
36.0         11
199.0         8
208.0         5
72.0          5
56.0          5
98.0          5
57.0          3
82.0          3
96.0          2
9.0           2
38.0          2
46.0          1
25.0          1
30.0          1
45.0          1
49.0          1
126.0         1
259.0         1
258.0         1
Name: count, dtype: int64


Vehicle
ไม่ระบุพาหนะ              104039
รถจักรยานยนต์         

In [5]:
# Correct data types for better profiling.
#
# "id" is pinned to an explicit 64-bit integer: the bare "int" alias resolves
# to the platform C long under NumPy < 2.0 (32-bit on Windows), which would
# make the resulting dtype depend on the machine running the notebook.
# The remaining columns are cast to "category" so they are handled as
# categorical values rather than as numbers or free text.
corrected_dtype: dict = {
    "id": "int64",
    "DEAD_YEAR(Budha)": "category",
    "DEAD_YEAR": "category",
    "Age": "category",
    "Sex": "category",
    "NationalityId": "category",
    "AccSubDist": "category",
    "AccDist": "category",
    "AccProv": "category",
    "ICD-10": "category",
    "Vehicle": "category",
}

# astype() returns a new frame; columns not listed in the mapping keep their dtype.
df = df.astype(corrected_dtype)

In [6]:
# Re-inspect the column dtypes after the astype() conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215389 entries, 0 to 215388
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   id                215389 non-null  int64         
 1   DEAD_YEAR(Budha)  215389 non-null  category      
 2   DEAD_YEAR         215389 non-null  category      
 3   Age               215389 non-null  category      
 4   Sex               211286 non-null  category      
 5   BirthYear         0 non-null       float64       
 6   NationalityId     119233 non-null  category      
 7   Tumbol            0 non-null       float64       
 8   District          0 non-null       float64       
 9   Province          0 non-null       float64       
 10  RiskHelmet        0 non-null       float64       
 11  RiskSafetyBelt    0 non-null       float64       
 12  DeadDate          215389 non-null  datetime64[ns]
 13  DateRec           102745 non-null  datetime64[ns]
 14  Time

In [7]:
# Output file configuration.
SAVE_NAME = "traffic_death_eda_report"
# Date stamp (YYYY-MM-DD) of the run, appended to the report file name.
SAVE_DATE = datetime.now().strftime("%Y-%m-%d")
SAVE_DIR = r"output"
OUTPUT_PATH = f"{SAVE_DIR}/{SAVE_NAME}_{SAVE_DATE}.html"

# Make sure the output directory exists. The directory is derived from
# SAVE_DIR so the two cannot drift apart, and exist_ok=True makes the call
# safe to re-run without a separate existence check.
os.makedirs(SAVE_DIR, exist_ok=True)

In [8]:
# Build the profiling report for the frame, with the explorative option enabled.
report = ProfileReport(
    df,
    explorative=True,
    title="Thai Traffic Deaths Data Profiling Report",
)

# Write the rendered report to the configured HTML output path.
report.to_file(OUTPUT_PATH)

100%|██████████| 22/22 [00:00<00:00, 35.03it/s]0<00:00, 28.67it/s, Describe variable: Vehicle]      
Summarize dataset: 100%|██████████| 41/41 [00:04<00:00, 10.15it/s, Completed]                 
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.60s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.12it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 125.08it/s]
