## 1. Load libraries and set the path once and for all

In [1]:
import os, sys
import numpy as np
import pandas as pd
from pathlib import Path

# The project root sits two directories above the notebook's working directory.
working_dir = os.getcwd()
project_root = str(Path(working_dir).parent.parent)

# Put the project root first on sys.path so the project's packages can be imported.
sys.path.insert(0, project_root)

print(f"Current working directory: {working_dir}")
print(f"Project root added to path: {project_root}")
print(f"Updated sys.path[0]: {sys.path[0]}")

Current working directory: C:\Users\pfaha\PROJECTS\brfss-diabetes-surveys\brfss_diabetes_surveys_ml\notebooks
Project root added to path: C:\Users\pfaha\PROJECTS\brfss-diabetes-surveys
Updated sys.path[0]: C:\Users\pfaha\PROJECTS\brfss-diabetes-surveys


In [2]:
from brfss_diabetes_surveys_etl.src.main.configs.sinkers.data_sinker import SINK_CONFIGS
# Location of the LLCP2024 data file, assembled from the ETL sink configuration.
# NOTE(review): no separator is inserted before SINK_CONFIGS['format'], so this
# presumably expects the value to carry its own leading dot (e.g. ".parquet") — confirm.
DATA_PATH = f"{SINK_CONFIGS['path']}/LLCP2024{SINK_CONFIGS['format']}"

## 2. Get data

In [3]:
# Read the LLCP2024 parquet file into `df`.
# Fail fast when the file is absent: merely printing a message would leave `df`
# undefined (or stale from an earlier run), and the following cells would then
# break with a confusing error far from the real cause.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Data file not found: {DATA_PATH}")

print("File exists")
df = pd.read_parquet(DATA_PATH)

File exists


In [4]:
df.head()

Unnamed: 0,CHECKUP1,DIABETE4,DISPCODE,DRNKANY6,DROCDY4_,GENHLTH,LASTDEN4,MEDCOST1,MENTHLTH,PERSDOC3,...,_PHYS14D,_RFBING6,_RFBMI5,_RFDRHV9,_RFHLTH,_RFSMOK3,_SEX,_SMOKER3,_STATE,_TOTINDA
0,1.0,3.0,1100.0,2.0,0.0,3.0,1.0,2.0,88.0,2.0,...,2.0,1.0,1.0,1.0,1.0,1.0,2.0,4.0,1.0,1.0
1,1.0,3.0,1100.0,2.0,0.0,1.0,1.0,2.0,88.0,1.0,...,1.0,1.0,2.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0
2,4.0,3.0,1100.0,1.0,100.0,2.0,4.0,1.0,88.0,3.0,...,3.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
3,1.0,3.0,1100.0,2.0,0.0,1.0,1.0,2.0,88.0,1.0,...,1.0,1.0,2.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0
4,1.0,3.0,1100.0,2.0,0.0,3.0,1.0,2.0,88.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,2.0


## 3. Perform some EDA

In [5]:
# Work on a copy so the frame loaded from disk (`df`) is not modified by the recoding cells.
df1 = df.copy()

In [6]:
df1.dtypes

CHECKUP1    float64
DIABETE4    float64
DISPCODE    float64
DRNKANY6    float64
DROCDY4_    float64
GENHLTH     float64
LASTDEN4    float64
MEDCOST1    float64
MENTHLTH    float64
PERSDOC3    float64
PHYSHLTH    float64
PRIMINS2    float64
RMVTETH4    float64
_AGE65YR    float64
_AGE80      float64
_AGEG5YR    float64
_AGE_G      float64
_ASTHMS1    float64
_CASTHM1    float64
_CHLDCNT    float64
_CURECI3    float64
_DENVST3    float64
_DRNKWK3    float64
_EDUCAG     float64
_EXTETH3    float64
_HCVU654    float64
_HLTHPL2    float64
_INCOMG1    float64
_LCSAGE     float64
_LTASTH1    float64
_MENT14D    float64
_PHYS14D    float64
_RFBING6    float64
_RFBMI5     float64
_RFDRHV9    float64
_RFHLTH     float64
_RFSMOK3    float64
_SEX        float64
_SMOKER3    float64
_STATE      float64
_TOTINDA    float64
dtype: object

### 3.1. DRNKANY6: Adults who reported having had at least one drink of alcohol in the past 30 days

In [7]:
df1['DRNKANY6'].value_counts(dropna=False)

DRNKANY6
1.0    210607
2.0    203059
9.0     40882
7.0      2877
Name: count, dtype: int64

In [8]:
# DRNKANY6 codes that receive a label: 1 = yes, 2 = no.
# Any other value matches no condition and becomes None (missing).
label_by_code = {1.0: "yes", 2.0: "no"}
conditions = [df1['DRNKANY6'] == code for code in label_by_code]
choices = list(label_by_code.values())

# Add the labelled column, then retire the raw coded column.
df1 = df1.assign(
    did_you_have_at_least_1drink_in_last_30days=np.select(conditions, choices, default=None)
).drop(columns=['DRNKANY6'])

In [9]:
df1['did_you_have_at_least_1drink_in_last_30days'].value_counts(dropna=False)

did_you_have_at_least_1drink_in_last_30days
yes    210607
no     203059
NaN     43759
Name: count, dtype: int64

### 3.2. DROCDY4_: Drink-occasions-per-day

In [10]:
df1['DROCDY4_'].value_counts(dropna=False) # Numerical value, but has a better meaning if converted into categories

DROCDY4_
0.0      203059
900.0     43759
3.0       34419
7.0       25370
14.0      19943
100.0     18967
29.0      15897
10.0      14522
43.0      11524
13.0      10978
17.0      10500
33.0       6950
67.0       5826
57.0       5674
71.0       5488
50.0       5457
20.0       3686
27.0       3120
23.0       2735
83.0       2711
86.0       1761
40.0       1634
93.0        829
47.0        590
97.0        294
30.0        275
53.0        214
90.0        209
60.0        208
80.0        192
70.0        189
87.0        167
73.0        161
77.0         68
37.0         38
63.0         10
NaN           1
Name: count, dtype: int64

In [11]:
# Collapse DROCDY4_ into two labels: exactly 0 -> "no", 1 through 899 (inclusive) -> "yes".
# Anything else (including NaN) matches neither condition and becomes None.
# NOTE(review): the observed non-missing values top out at 100.0 for a "per day"
# measure, which suggests the variable may be scaled; if so, ">= 1.0" means
# "any drinking occasion" rather than "at least one per day" — confirm with the codebook.
drink_occasions = df1['DROCDY4_']
conditions = [
    drink_occasions.eq(0.0),
    drink_occasions.between(1.0, 899.0),  # both bounds inclusive
]
choices = ["no", "yes"]

df1 = df1.assign(
    at_least_1drink_occasion_per_day=np.select(conditions, choices, default=None)
).drop(columns=['DROCDY4_'])

In [12]:
df1["at_least_1drink_occasion_per_day"].value_counts(dropna=False)

at_least_1drink_occasion_per_day
yes    210606
no     203059
NaN     43760
Name: count, dtype: int64

### 3.3. _AGE65YR, _AGE80, _AGEG5YR, _AGE_G : Age imputed, Age groups and Age imputed groups

**_AGE65YR**

In [13]:
df1['_AGE65YR'].value_counts(dropna=False)

_AGE65YR
1.0    278272
2.0    170844
3.0      8309
Name: count, dtype: int64

In [14]:
# _AGE65YR codes that receive a label: 1 = 18-64, 2 = 65-99.
# Any other value matches no condition and becomes None (missing).
label_by_code = {1.0: "18-64", 2.0: "65-99"}
conditions = [df1['_AGE65YR'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    age_in_2groups=np.select(conditions, choices, default=None)
).drop(columns=['_AGE65YR'])

In [15]:
df1["age_in_2groups"].value_counts(dropna=False)

age_in_2groups
18-64    278272
65-99    170844
NaN        8309
Name: count, dtype: int64

**_AGE80**

In [16]:
df1['_AGE80'].value_counts(dropna=False)

_AGE80
80.0    41677
70.0    10481
65.0    10382
67.0     9803
68.0     9476
        ...  
21.0     4289
22.0     4196
20.0     4158
19.0     4009
18.0     3772
Name: count, Length: 63, dtype: int64

In [17]:
df1['_AGE80'].describe()

count    457425.000000
mean         55.077014
std          18.126777
min          18.000000
25%          40.000000
50%          58.000000
75%          70.000000
max          80.000000
Name: _AGE80, dtype: float64

In [18]:
# Store the age as a nullable integer and give the column a readable name, in one chain.
df1 = df1.astype({'_AGE80': 'Int64'}).rename(columns={'_AGE80': 'imputed_age'})

**_AGEG5YR**

In [19]:
df1['_AGEG5YR'].value_counts(dropna=False)

_AGEG5YR
10.0    47670
11.0    44735
9.0     43376
13.0    41663
12.0    36776
8.0     34923
7.0     31690
5.0     30893
1.0     29686
6.0     28964
4.0     28802
3.0     26235
2.0     23703
14.0     8309
Name: count, dtype: int64

In [20]:
# _AGEG5YR codes 1-13 map to five-year age bands; any other value becomes None.
# NOTE(review): the target column is called "age_in_14groups" although only 13
# codes are labelled (code 14 is left as missing) — confirm the intended name.
label_by_code = {
    1.0: "18-24",
    2.0: "25-29",
    3.0: "30-34",
    4.0: "35-39",
    5.0: "40-44",
    6.0: "45-49",
    7.0: "50-54",
    8.0: "55-59",
    9.0: "60-64",
    10.0: "65-69",
    11.0: "70-74",
    12.0: "75-79",
    13.0: "80+",
}
conditions = [df1['_AGEG5YR'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    age_in_14groups=np.select(conditions, choices, default=None)
).drop(columns=['_AGEG5YR'])

In [21]:
df1["age_in_14groups"].value_counts(dropna=False)

age_in_14groups
65-69    47670
70-74    44735
60-64    43376
80+      41663
75-79    36776
55-59    34923
50-54    31690
40-44    30893
18-24    29686
45-49    28964
35-39    28802
30-34    26235
25-29    23703
NaN       8309
Name: count, dtype: int64

**_AGE_G**

In [22]:
df1['_AGE_G'].value_counts(dropna=False)

_AGE_G
6.0    172636
5.0     80256
4.0     64445
3.0     60410
2.0     49990
1.0     29688
Name: count, dtype: int64

In [23]:
# _AGE_G codes 1-6 map to six age bands; any other value becomes None.
label_by_code = {
    1.0: "18-24",
    2.0: "25-34",
    3.0: "35-44",
    4.0: "45-54",
    5.0: "55-64",
    6.0: "65+",
}
conditions = [df1['_AGE_G'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    imputed_age_in_6groups=np.select(conditions, choices, default=None)
).drop(columns=['_AGE_G'])

In [24]:
df1["imputed_age_in_6groups"].value_counts(dropna=False)

imputed_age_in_6groups
65+      172636
55-64     80256
45-54     64445
35-44     60410
25-34     49990
18-24     29688
Name: count, dtype: int64

#### 3.4. _ASTHMS1: Asthma status

In [25]:
df1["_ASTHMS1"].value_counts(dropna=False)

_ASTHMS1
3.0    383537
1.0     49686
2.0     20265
9.0      3937
Name: count, dtype: int64

In [26]:
# _ASTHMS1 codes: 1 = has asthma now, 2 = had it before but not anymore, 3 = never had it.
# Any other value matches no condition and becomes None (missing).
label_by_code = {1.0: "currently", 2.0: "formerly", 3.0: "never"}
conditions = [df1['_ASTHMS1'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    do_you_have_asthma=np.select(conditions, choices, default=None)
).drop(columns=['_ASTHMS1'])

In [27]:
df1["do_you_have_asthma"].value_counts(dropna=False)

do_you_have_asthma
never        383537
currently     49686
formerly      20265
NaN            3937
Name: count, dtype: int64

#### 3.5. _CASTHM1: Adults who have been told they currently have asthma

In [28]:
df1["_CASTHM1"].value_counts(dropna=False)

_CASTHM1
1.0    403802
2.0     49686
9.0      3937
Name: count, dtype: int64

In [29]:
# _CASTHM1 codes: 1 = no, 2 = yes. Any other value becomes None (missing).
label_by_code = {1.0: "no", 2.0: "yes"}
conditions = [df1['_CASTHM1'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    have_you_currently_been_told_to_have_asthma=np.select(conditions, choices, default=None)
).drop(columns=['_CASTHM1'])

In [30]:
df1["have_you_currently_been_told_to_have_asthma"].value_counts(dropna=False)

have_you_currently_been_told_to_have_asthma
no     403802
yes     49686
NaN      3937
Name: count, dtype: int64

#### 3.6. _CHLDCNT: Number of children in household

In [31]:
df1["_CHLDCNT"].value_counts(dropna=False)

_CHLDCNT
1.0    336077
2.0     48200
3.0     38833
4.0     16114
9.0      9375
5.0      5670
6.0      3155
NaN         1
Name: count, dtype: int64

In [32]:
# _CHLDCNT codes are offset by one from the child count: code 1 = no children,
# code 2 = one child, ... code 6 = five or more. Any other value becomes None.
label_by_code = {
    1.0: "0",
    2.0: "1",
    3.0: "2",
    4.0: "3",
    5.0: "4",
    6.0: "5+",
}
conditions = [df1['_CHLDCNT'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    children_count_categories=np.select(conditions, choices, default=None)
).drop(columns=['_CHLDCNT'])

In [33]:
df1["children_count_categories"].value_counts(dropna=False)

children_count_categories
0      336077
1       48200
2       38833
3       16114
NaN      9376
4        5670
5+       3155
Name: count, dtype: int64

#### 3.7. _CURECI3: Adults who are current e-cigarette users

In [34]:
df1["_CURECI3"].value_counts(dropna=False)

_CURECI3
1.0    400184
9.0     32735
2.0     24506
Name: count, dtype: int64

In [35]:
# _CURECI3 codes: 1 = no, 2 = yes. Any other value becomes None (missing).
label_by_code = {1.0: "no", 2.0: "yes"}
conditions = [df1['_CURECI3'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    are_you_currently_an_ecigars_user=np.select(conditions, choices, default=None)
).drop(columns=['_CURECI3'])

In [36]:
df1["are_you_currently_an_ecigars_user"].value_counts(dropna=False)

are_you_currently_an_ecigars_user
no     400184
NaN     32735
yes     24506
Name: count, dtype: int64

#### 3.8. _DENVST3: Adults who have visited a dentist, dental hygienist or dental clinic within the past year

In [37]:
df1["_DENVST3"].value_counts(dropna=False)

_DENVST3
1.0    311034
2.0    140686
9.0      5705
Name: count, dtype: int64

In [38]:
# _DENVST3 codes: 1 = yes, 2 = no. Any other value becomes None (missing).
label_by_code = {1.0: "yes", 2.0: "no"}
conditions = [df1['_DENVST3'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    have_you_ever_visited_for_dental_problems_within_last_year=np.select(conditions, choices, default=None)
).drop(columns=['_DENVST3'])

In [39]:
df1["have_you_ever_visited_for_dental_problems_within_last_year"].value_counts(dropna=False)

have_you_ever_visited_for_dental_problems_within_last_year
yes    311034
no     140686
NaN      5705
Name: count, dtype: int64

#### 3.9. _DRNKWK3: Calculated total number of alcoholic beverages consumed per week

In [40]:
df1["_DRNKWK3"].value_counts(dropna=False) # Numerical column

_DRNKWK3
0.0        203469
99900.0     46680
23.0        22646
47.0        19471
93.0        12669
            ...  
15750.0         1
8423.0          1
5717.0          1
9147.0          1
26250.0         1
Name: count, Length: 283, dtype: int64

In [41]:
# 99900.0 is treated as "no usable value": blank it out, then keep the rest as
# nullable integers so the gaps survive the cast.
# NOTE(review): totals as large as 15750 and 26250 appear in the counts; confirm
# whether _DRNKWK3 carries implied decimals before reading it as a drink count.
weekly_drinks = df1["_DRNKWK3"]
df1["_DRNKWK3"] = weekly_drinks.mask(weekly_drinks == 99900.0).astype('Int64')
df1["_DRNKWK3"].value_counts(dropna=False)

_DRNKWK3
0        203469
<NA>      46680
23        22646
47        19471
93        12669
          ...  
15750         1
8423          1
5717          1
9147          1
26250         1
Name: count, Length: 283, dtype: Int64

In [42]:
# Give the cleaned weekly alcohol column a descriptive name.
df1 = df1.rename(columns={'_DRNKWK3': 'weekly_number_of_alcoholic_drinks'})

#### 3.10. _EDUCAG: Level of education completed

In [43]:
df1["_EDUCAG"].value_counts(dropna=False)

_EDUCAG
4.0    191411
3.0    120686
2.0    115826
1.0     27139
9.0      2363
Name: count, dtype: int64

In [44]:
# _EDUCAG codes 1-4 map to the four education levels below; any other value becomes None.
label_by_code = {
    1.0: "< high school",
    2.0: "high school",
    3.0: "< college / technical school",
    4.0: "college / technical school",
}
conditions = [df1['_EDUCAG'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    education_level_completed=np.select(conditions, choices, default=None)
).drop(columns=['_EDUCAG'])

In [45]:
df1["education_level_completed"].value_counts(dropna=False)

education_level_completed
college / technical school      191411
< college / technical school    120686
high school                     115826
< high school                    27139
NaN                               2363
Name: count, dtype: int64

#### 3.11. _EXTETH3: Adults aged 18+ who have had permanent teeth extracted

In [46]:
df1["_EXTETH3"].value_counts(dropna=False)

_EXTETH3
1.0    241087
2.0    206883
9.0      9455
Name: count, dtype: int64

In [47]:
# _EXTETH3 codes: 1 = no, 2 = yes. Any other value becomes None (missing).
label_by_code = {1.0: "no", 2.0: "yes"}
conditions = [df1['_EXTETH3'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    have_you_had_permanent_teeth_extracted=np.select(conditions, choices, default=None)
).drop(columns=['_EXTETH3'])

In [48]:
df1["have_you_had_permanent_teeth_extracted"].value_counts(dropna=False)

have_you_had_permanent_teeth_extracted
no     241087
yes    206883
NaN      9455
Name: count, dtype: int64

#### 3.12. _HCVU654: Respondents aged 18-64 who have any form of health insurance

In [49]:
df1["_HCVU654"].value_counts(dropna=False)

_HCVU654
1.0    242573
9.0    190842
2.0     24010
Name: count, dtype: int64

In [50]:
# _HCVU654 codes: 1 = yes, 2 = no. Any other value becomes None (missing).
label_by_code = {1.0: "yes", 2.0: "no"}
conditions = [df1['_HCVU654'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    have_you_any_health_insurance=np.select(conditions, choices, default=None)
).drop(columns=['_HCVU654'])

In [51]:
df1["have_you_any_health_insurance"].value_counts(dropna=False)

have_you_any_health_insurance
yes    242573
NaN    190842
no      24010
Name: count, dtype: int64

#### 3.13. _HLTHPL2: Adults who had some form of health insurance

In [52]:
df1["_HLTHPL2"].value_counts(dropna=False)

_HLTHPL2
1.0    413487
2.0     25397
9.0     18541
Name: count, dtype: int64

In [53]:
# _HLTHPL2 codes: 1 = yes, 2 = no. Any other value becomes None (missing).
label_by_code = {1.0: "yes", 2.0: "no"}
conditions = [df1['_HLTHPL2'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    had_you_any_health_insurance=np.select(conditions, choices, default=None)
).drop(columns=['_HLTHPL2'])

In [54]:
df1["had_you_any_health_insurance"].value_counts(dropna=False)

had_you_any_health_insurance
yes    413487
no      25397
NaN     18541
Name: count, dtype: int64

#### 3.14. _INCOMG1: Income categories

In [55]:
df1["_INCOMG1"].value_counts(dropna=False)

_INCOMG1
5.0    112325
9.0     87361
6.0     82405
4.0     49670
3.0     41688
2.0     33493
7.0     29684
1.0     20799
Name: count, dtype: int64

In [56]:
# _INCOMG1 codes 1-7 map to income brackets (lower bound included, upper bound
# excluded); any other value becomes None.
label_by_code = {
    1.0: '< 15k',
    2.0: '[15k, 25k[',
    3.0: '[25k, 35k[',
    4.0: '[35k, 50k[',
    5.0: '[50k, 100k[',
    6.0: '[100k, 200k[',
    7.0: '>= 200k',
}
conditions = [df1['_INCOMG1'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    income_categories=np.select(conditions, choices, default=None)
).drop(columns=['_INCOMG1'])

In [57]:
df1["income_categories"].value_counts(dropna=False)

income_categories
[50k, 100k[     112325
NaN              87361
[100k, 200k[     82405
[35k, 50k[       49670
[25k, 35k[       41688
[15k, 25k[       33493
>= 200k          29684
< 15k            20799
Name: count, dtype: int64

#### 3.15. _LCSAGE: Lung Cancer Screening Age Groups

In [58]:
df1["_LCSAGE"].value_counts(dropna=False)

_LCSAGE
2.0    251999
1.0    169833
3.0     35593
Name: count, dtype: int64

In [59]:
# _LCSAGE codes: 1 = 18-49, 2 = 50-80, 3 = 81+. Any other value becomes None.
label_by_code = {1.0: "18-49", 2.0: "50-80", 3.0: "81+"}
conditions = [df1['_LCSAGE'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    age_imputed_in_3groups=np.select(conditions, choices, default=None)
).drop(columns=['_LCSAGE'])

In [60]:
df1["age_imputed_in_3groups"].value_counts(dropna=False)

age_imputed_in_3groups
50-80    251999
18-49    169833
81+       35593
Name: count, dtype: int64

#### 3.16. _LTASTH1: Adults who have ever been told they have asthma currently or formerly

In [61]:
df1["_LTASTH1"].value_counts(dropna=False)

_LTASTH1
1.0    383537
2.0     72027
9.0      1861
Name: count, dtype: int64

In [62]:
# _LTASTH1 codes: 1 = no, 2 = yes. Any other value becomes None (missing).
label_by_code = {1.0: "no", 2.0: "yes"}
conditions = [df1['_LTASTH1'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    have_you_formely_or_currently_been_told_to_have_asthma=np.select(conditions, choices, default=None)
).drop(columns=['_LTASTH1'])

In [63]:
df1["have_you_formely_or_currently_been_told_to_have_asthma"].value_counts(dropna=False)

have_you_formely_or_currently_been_told_to_have_asthma
no     383537
yes     72027
NaN      1861
Name: count, dtype: int64

#### 3.17. _MENT14D: 3 level not good mental health status (0 days, 1-13 days, 14-30 days)

In [64]:
df1["_MENT14D"].value_counts(dropna=False)

_MENT14D
1.0    269677
2.0    117286
3.0     62306
9.0      8156
Name: count, dtype: int64

In [65]:
# _MENT14D codes: 1 = 0 days, 2 = 1-13 days, 3 = 14+ days of poor mental health.
# Any other value becomes None (missing).
label_by_code = {1.0: "0", 2.0: "1-13", 3.0: "14+"}
conditions = [df1['_MENT14D'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    bad_mental_health_days_categories=np.select(conditions, choices, default=None)
).drop(columns=['_MENT14D'])

In [66]:
df1["bad_mental_health_days_categories"].value_counts(dropna=False)

bad_mental_health_days_categories
0       269677
1-13    117286
14+      62306
NaN       8156
Name: count, dtype: int64

#### 3.18. _PHYS14D: 3 level not good physical health status (0 days, 1-13 days, 14-30 days)

In [67]:
df1["_PHYS14D"].value_counts(dropna=False)

_PHYS14D
1.0    267342
2.0    115196
3.0     63820
9.0     11067
Name: count, dtype: int64

In [68]:
# _PHYS14D codes: 1 = 0 days, 2 = 1-13 days, 3 = 14+ days of poor physical health.
# Any other value becomes None (missing).
label_by_code = {1.0: "0", 2.0: "1-13", 3.0: "14+"}
conditions = [df1['_PHYS14D'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    bad_physical_health_days_categories=np.select(conditions, choices, default=None)
).drop(columns=['_PHYS14D'])

In [69]:
df1["bad_physical_health_days_categories"].value_counts(dropna=False)

bad_physical_health_days_categories
0       267342
1-13    115196
14+      63820
NaN      11067
Name: count, dtype: int64

#### 3.19. _RFBING6: Binge drinkers (males having five or more drinks on one occasion, females having four or more drinks on one occasion)

In [70]:
df1["_RFBING6"].value_counts(dropna=False)

_RFBING6
1.0    355043
2.0     54984
9.0     47398
Name: count, dtype: int64

In [71]:
# _RFBING6 codes: 1 = no, 2 = yes. Any other value becomes None (missing).
label_by_code = {1.0: "no", 2.0: "yes"}
conditions = [df1['_RFBING6'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    are_you_a_binge_drinker=np.select(conditions, choices, default=None)
).drop(columns=['_RFBING6'])

In [72]:
df1["are_you_a_binge_drinker"].value_counts(dropna=False)

are_you_a_binge_drinker
no     355043
yes     54984
NaN     47398
Name: count, dtype: int64

#### 3.20. _RFBMI5: Adults who have a body mass index greater than 25.00 (Overweight or Obese)

In [73]:
df1["_RFBMI5"].value_counts(dropna=False)

_RFBMI5
2.0    286063
1.0    128345
9.0     43017
Name: count, dtype: int64

In [74]:
# _RFBMI5 codes: 1 = no, 2 = yes. Any other value becomes None (missing).
label_by_code = {1.0: "no", 2.0: "yes"}
conditions = [df1['_RFBMI5'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    are_you_overweight_or_obese=np.select(conditions, choices, default=None)
).drop(columns=['_RFBMI5'])

In [75]:
df1["are_you_overweight_or_obese"].value_counts(dropna=False)

are_you_overweight_or_obese
yes    286063
no     128345
NaN     43017
Name: count, dtype: int64

#### 3.21. _RFDRHV9: Heavy drinkers (adult men having more than 14 drinks per week and adult women having more than 7 drinks per week)

In [76]:
df1["_RFDRHV9"].value_counts(dropna=False)

_RFDRHV9
1.0    386585
9.0     46680
2.0     24160
Name: count, dtype: int64

In [77]:
# _RFDRHV9 codes: 1 = no, 2 = yes. Any other value becomes None (missing).
label_by_code = {1.0: "no", 2.0: "yes"}
conditions = [df1['_RFDRHV9'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    are_you_a_heavy_drinker=np.select(conditions, choices, default=None)
).drop(columns=['_RFDRHV9'])

In [78]:
df1["are_you_a_heavy_drinker"].value_counts(dropna=False)

are_you_a_heavy_drinker
no     386585
NaN     46680
yes     24160
Name: count, dtype: int64

#### 3.22. _RFHLTH: Adults with good or better health

In [79]:
df1["_RFHLTH"].value_counts(dropna=False)

_RFHLTH
1.0    366004
2.0     90111
9.0      1310
Name: count, dtype: int64

In [80]:
# _RFHLTH codes: 1 = good or better health, 2 = fair or poor health.
# Any other value becomes None (missing).
label_by_code = {1.0: "good / better", 2.0: "poor / fair"}
conditions = [df1['_RFHLTH'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    health_status=np.select(conditions, choices, default=None)
).drop(columns=['_RFHLTH'])

In [81]:
df1["health_status"].value_counts(dropna=False)

health_status
good / better    366004
poor / fair       90111
NaN                1310
Name: count, dtype: int64

#### 3.23. _RFSMOK3: Adults who are current smokers

In [82]:
df1["_RFSMOK3"].value_counts(dropna=False)

_RFSMOK3
1.0    378327
2.0     47092
9.0     32006
Name: count, dtype: int64

In [83]:
# _RFSMOK3 codes: 1 = no, 2 = yes. Any other value becomes None (missing).
label_by_code = {1.0: "no", 2.0: "yes"}
conditions = [df1['_RFSMOK3'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    are_you_a_current_smoker=np.select(conditions, choices, default=None)
).drop(columns=['_RFSMOK3'])

In [84]:
df1["are_you_a_current_smoker"].value_counts(dropna=False)

are_you_a_current_smoker
no     378327
yes     47092
NaN     32006
Name: count, dtype: int64

#### 3.24. _SEX: Calculated sex variable

In [85]:
df1["_SEX"].value_counts(dropna=False)

_SEX
2.0    240015
1.0    217410
Name: count, dtype: int64

In [86]:
# _SEX codes: 1 = male, 2 = female. Any other value becomes None (missing).
label_by_code = {1.0: "man", 2.0: "woman"}
conditions = [df1['_SEX'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    sex=np.select(conditions, choices, default=None)
).drop(columns=['_SEX'])

In [87]:
df1["sex"].value_counts(dropna=False)

sex
woman    240015
man      217410
Name: count, dtype: int64

#### 3.25. _SMOKER3: Four-level smoker status (Everyday smoker, Someday smoker, Former smoker, Non-smoker)

In [88]:
df1["_SMOKER3"].value_counts(dropna=False)

_SMOKER3
4.0    258764
3.0    119563
1.0     33117
9.0     32006
2.0     13975
Name: count, dtype: int64

In [89]:
# _SMOKER3 codes 1-4 map to the four smoker statuses below; any other value becomes None.
label_by_code = {
    1.0: "everyday smoker",
    2.0: "someday smoker",
    3.0: "former smoker",
    4.0: "non-smoker",
}
conditions = [df1['_SMOKER3'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    smoker_categories=np.select(conditions, choices, default=None)
).drop(columns=['_SMOKER3'])

In [90]:
df1["smoker_categories"].value_counts(dropna=False)

smoker_categories
non-smoker         258764
former smoker      119563
everyday smoker     33117
NaN                 32006
someday smoker      13975
Name: count, dtype: int64

#### 3.26. _TOTINDA: Adults who reported doing physical activity or exercise during the past 30 days other than their regular job

In [91]:
df1["_TOTINDA"].value_counts(dropna=False)

_TOTINDA
1.0    349838
2.0    106272
9.0      1315
Name: count, dtype: int64

In [92]:
# _TOTINDA codes: 1 = yes, 2 = no. Any other value becomes None (missing).
label_by_code = {1.0: "yes", 2.0: "no"}
conditions = [df1['_TOTINDA'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    have_you_exercised_during_last_30days=np.select(conditions, choices, default=None)
).drop(columns=['_TOTINDA'])

In [93]:
df1["have_you_exercised_during_last_30days"].value_counts(dropna=False)

have_you_exercised_during_last_30days
yes    349838
no     106272
NaN      1315
Name: count, dtype: int64

#### 3.27. DIABETE4: (Ever told) (you had) diabetes?

In [94]:
df1["DIABETE4"].value_counts(dropna=False)

DIABETE4
3.0    375901
1.0     65791
4.0     11305
2.0      3395
7.0       797
9.0       232
NaN         4
Name: count, dtype: int64

In [95]:
# Only three DIABETE4 codes are labelled: 1 = diabetes, 3 = no diabetes, 4 = prediabetes.
# Every other value (including NaN) matches no condition and becomes None.
label_by_code = {1.0: "diabetes", 3.0: "no diabetes", 4.0: "prediabetes"}
conditions = [df1['DIABETE4'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    diabetic_status=np.select(conditions, choices, default=None)
).drop(columns=['DIABETE4'])

In [96]:
df1["diabetic_status"].value_counts(dropna=False)

diabetic_status
no diabetes    375901
diabetes        65791
prediabetes     11305
NaN              4428
Name: count, dtype: int64

**Exclude people who didn't give their diabetic status or have an ambiguous diabetic status**

In [97]:
# Keep only rows with a labelled diabetic status; rows whose status is missing are removed.
df1 = df1.dropna(subset=['diabetic_status'])

#### 3.28. CHECKUP1: About how long has it been since you last visited a doctor for a routine checkup?

In [98]:
df1["CHECKUP1"].value_counts(dropna=False)

CHECKUP1
1.0    366392
2.0     39214
3.0     21220
4.0     18229
7.0      4547
8.0      2747
9.0       648
Name: count, dtype: int64

In [99]:
# CHECKUP1 codes 1-4 give the time since the last routine checkup and 8 means never.
# Any other value becomes None (missing).
label_by_code = {
    1.0: "< 1yr",
    2.0: "[1, 2yrs[",
    3.0: "[2, 5yrs[",
    4.0: "5yrs+",
    8.0: "never",
}
conditions = [df1['CHECKUP1'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    how_long_since_your_last_doctor_visit=np.select(conditions, choices, default=None)
).drop(columns=['CHECKUP1'])

In [100]:
df1["how_long_since_your_last_doctor_visit"].value_counts(dropna=False)

how_long_since_your_last_doctor_visit
< 1yr        366392
[1, 2yrs[     39214
[2, 5yrs[     21220
5yrs+         18229
NaN            5195
never          2747
Name: count, dtype: int64

#### 3.29. MEDCOST1: Was there a time in the past 12 months when you needed to see a doctor but could not because you could not afford it?

In [101]:
df1["MEDCOST1"].value_counts(dropna=False)

MEDCOST1
2.0    408770
1.0     42641
7.0      1199
9.0       383
NaN         4
Name: count, dtype: int64

In [102]:
# MEDCOST1 codes: 1 = yes, 2 = no. Any other value becomes None (missing).
# NOTE(review): the column name says "last_30days" while the section heading
# describes the past 12 months — confirm the wording before relying on the name.
label_by_code = {1.0: "yes", 2.0: "no"}
conditions = [df1['MEDCOST1'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    could_you_not_afford_a_doctor_once_in_last_30days=np.select(conditions, choices, default=None)
).drop(columns=['MEDCOST1'])

In [103]:
df1["could_you_not_afford_a_doctor_once_in_last_30days"].value_counts(dropna=False)

could_you_not_afford_a_doctor_once_in_last_30days
no     408770
yes     42641
NaN      1586
Name: count, dtype: int64

#### 3.30. PERSDOC3: Do you have one person or a group of doctors that you think of as your personal health care provider?

In [104]:
df1["PERSDOC3"].value_counts(dropna=False)

PERSDOC3
1.0    240148
2.0    152931
3.0     55434
7.0      3382
9.0      1101
NaN         1
Name: count, dtype: int64

In [105]:
# PERSDOC3 codes are labelled by provider count: code 1 -> "1", code 2 -> "1+", code 3 -> "0".
# Any other value becomes None (missing).
label_by_code = {1.0: "1", 2.0: "1+", 3.0: "0"}
conditions = [df1['PERSDOC3'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    num_of_personal_health_care_providers=np.select(conditions, choices, default=None)
).drop(columns=['PERSDOC3'])

In [106]:
df1["num_of_personal_health_care_providers"].value_counts(dropna=False)

num_of_personal_health_care_providers
1      240148
1+     152931
0       55434
NaN      4484
Name: count, dtype: int64

#### 3.31. PRIMINS2: What is the current primary source of your health care coverage?

In [107]:
df1["PRIMINS2"].value_counts(dropna=False)

PRIMINS2
1.0     153751
3.0     144560
2.0      38254
5.0      32270
88.0     24994
7.0      16834
77.0     12359
9.0      12174
10.0     10005
99.0      5895
8.0       1382
4.0        380
6.0        138
NaN          1
Name: count, dtype: int64

In [108]:
# PRIMINS2 codes 1-10 name the primary coverage source and 88 means no current coverage.
# Any other value becomes None (missing).
label_by_code = {
    1.0: "union / employer plan subscription",
    2.0: "private plan subscription",
    3.0: "medicare",
    4.0: "medigap",
    5.0: "medicaid",
    6.0: "children's health insurance program",
    7.0: "military health care",
    8.0: "indian health service",
    9.0: "state-sponsored health plan",
    10.0: "other government program",
    88.0: "no current coverage",
}
conditions = [df1['PRIMINS2'] == code for code in label_by_code]
choices = list(label_by_code.values())

df1 = df1.assign(
    current_main_healthcare_coverage=np.select(conditions, choices, default=None)
).drop(columns=['PRIMINS2'])

In [109]:
df1["current_main_healthcare_coverage"].value_counts(dropna=False)

current_main_healthcare_coverage
union / employer plan subscription     153751
medicare                               144560
private plan subscription               38254
medicaid                                32270
no current coverage                     24994
NaN                                     18255
military health care                    16834
state-sponsored health plan             12174
other government program                10005
indian health service                    1382
medigap                                   380
children's health insurance program       138
Name: count, dtype: int64

#### 3.32. GENHLTH: Would you say that in general your health is excellent, very good, good, fair, or poor?

In [110]:
# Frequency of each raw GENHLTH code (NaN included) before recoding.
df1["GENHLTH"].value_counts(dropna=False)

GENHLTH
3.0    154487
2.0    144443
4.0     67161
1.0     63699
5.0     21965
7.0       892
9.0       348
NaN         2
Name: count, dtype: int64

In [111]:
# Label for each GENHLTH code that is kept.
health_label_by_code = {
    1.0: "excellent",
    2.0: "very good",
    3.0: "good",
    4.0: "fair",
    5.0: "poor",
}

# One boolean mask per code, in the same order as the labels.
conditions = [df1["GENHLTH"] == code for code in health_label_by_code]
choices = list(health_label_by_code.values())

# Codes without a label (7, 9 and NaN in the counts shown earlier) take the
# default and end up missing in the new column.
df1["general_health_status"] = np.select(conditions, choices, default=None)
df1 = df1.drop(columns=["GENHLTH"])

In [112]:
# Check the recoded column: frequency of each health-status label, NaN included.
df1["general_health_status"].value_counts(dropna=False)

general_health_status
good         154487
very good    144443
fair          67161
excellent     63699
poor          21965
NaN            1242
Name: count, dtype: int64

#### 3.33. MENTHLTH: Now thinking about your mental health, which includes stress, depression, and problems with emotions, for how many days during the past 30 days was your mental health not good?

In [113]:
# Frequency of each raw MENTHLTH value (NaN included) before recoding.
df1["MENTHLTH"].value_counts(dropna=False)

MENTHLTH
88.0    267642
30.0     26911
2.0      24903
5.0      21114
10.0     16261
3.0      15778
1.0      15257
15.0     15102
20.0      9579
4.0       8273
7.0       8185
77.0      5501
25.0      3305
14.0      2800
99.0      2466
6.0       2336
8.0       1850
12.0      1326
28.0       920
21.0       544
29.0       484
18.0       343
9.0        331
16.0       296
17.0       287
27.0       233
22.0       185
13.0       185
11.0       144
23.0       131
24.0       131
26.0       123
19.0        70
NaN          1
Name: count, dtype: int64

In [114]:
# Values of 77 and above (77, 88 and 99 in the counts shown earlier) are not
# kept as day counts: they become missing, and the remaining values are stored
# as nullable integers.
# A vectorised `mask` replaces the per-row `apply(lambda ...)`; NaN compares
# False against 77 and therefore stays missing, exactly as before.
# NOTE(review): 88 is by far the most frequent raw value and, per the BRFSS
# codebook, appears to mean "None" (zero days) rather than a non-response.
# Confirm; if so, map 88 to 0 here and in DataCleaner together so the
# assert_frame_equal parity check at the end of the notebook still holds.
mental_days = df1['MENTHLTH']
df1['MENTHLTH'] = mental_days.mask(mental_days >= 77).astype('Int64')
df1 = df1.rename(columns={"MENTHLTH": "how_many_days_in_last_month_did_you_have_mental_health_issues"})

In [115]:
# Summary statistics of the recoded day counts (describe() skips missing values).
# The minimum is 1 because every value >= 77, including 88, was set to missing above.
df1["how_many_days_in_last_month_did_you_have_mental_health_issues"].describe()

count     177387.0
mean       11.0379
std      10.077812
min            1.0
25%            3.0
50%            7.0
75%           15.0
max           30.0
Name: how_many_days_in_last_month_did_you_have_mental_health_issues, dtype: Float64

#### 3.34. PHYSHLTH: Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good?

In [116]:
# Frequency of each raw PHYSHLTH value (NaN included) before recoding.
df1["PHYSHLTH"].value_counts(dropna=False)

PHYSHLTH
88.0    265087
30.0     35983
2.0      27596
1.0      18333
3.0      16752
5.0      15973
10.0     10991
15.0      9701
7.0       9270
4.0       8931
77.0      8619
20.0      6415
14.0      4616
25.0      2468
6.0       2458
99.0      2221
8.0       1833
12.0      1110
21.0       960
28.0       879
9.0        413
29.0       405
18.0       338
16.0       297
27.0       239
22.0       203
17.0       195
13.0       151
26.0       137
11.0       132
24.0       118
23.0       116
19.0        54
NaN          3
Name: count, dtype: int64

In [117]:
# Values of 77 and above (77, 88 and 99 in the counts shown earlier) are not
# kept as day counts: they become missing, and the remaining values are stored
# as nullable integers.
# A vectorised `mask` replaces the per-row `apply(lambda ...)`; NaN compares
# False against 77 and therefore stays missing, exactly as before.
# NOTE(review): 88 is by far the most frequent raw value and, per the BRFSS
# codebook, appears to mean "None" (zero days) rather than a non-response.
# Confirm; if so, map 88 to 0 here and in DataCleaner together so the
# assert_frame_equal parity check at the end of the notebook still holds.
physical_days = df1['PHYSHLTH']
df1['PHYSHLTH'] = physical_days.mask(physical_days >= 77).astype('Int64')
df1 = df1.rename(columns={"PHYSHLTH": "how_many_days_in_last_month_that_did_you_have_physical_health_issues"})

In [118]:
# Summary statistics of the recoded day counts (describe() skips missing values).
# The minimum is 1 because every value >= 77, including 88, was set to missing above.
df1["how_many_days_in_last_month_that_did_you_have_physical_health_issues"].describe()

count     177067.0
mean     11.494734
std      10.944552
min            1.0
25%            2.0
50%            6.0
75%           20.0
max           30.0
Name: how_many_days_in_last_month_that_did_you_have_physical_health_issues, dtype: Float64

#### 3.35. LASTDEN4: Including all types of dentists, such as orthodontists, oral surgeons, and all other dental specialists, as well as dental hygienists, how long has it been since you last visited a dentist or a dental clinic for any reason?

In [119]:
# Frequency of each raw LASTDEN4 code (NaN included) before recoding.
df1["LASTDEN4"].value_counts(dropna=False)

LASTDEN4
1.0    308259
2.0     47025
4.0     45521
3.0     42620
7.0      4818
8.0      4017
9.0       737
Name: count, dtype: int64

In [120]:
# Label for each LASTDEN4 code that is kept; intervals are written
# left-closed / right-open, e.g. "[1yr, 2yrs[".
dentist_visit_label_by_code = {
    1.0: "< 1yr",
    2.0: "[1yr, 2yrs[",
    3.0: "[2yrs, 5yrs[",
    4.0: "5yrs+",
    8.0: "no visit",
}

# One boolean mask per code, in the same order as the labels.
conditions = [df1["LASTDEN4"] == code for code in dentist_visit_label_by_code]
choices = list(dentist_visit_label_by_code.values())

# Codes without a label (7 and 9 in the counts shown earlier) take the default
# and end up missing in the new column.
df1["how_long_since_your_last_dentist_visit"] = np.select(conditions, choices, default=None)
df1 = df1.drop(columns=["LASTDEN4"])

In [121]:
# Check the recoded column: frequency of each dentist-visit label, NaN included.
df1["how_long_since_your_last_dentist_visit"].value_counts(dropna=False)

how_long_since_your_last_dentist_visit
< 1yr           308259
[1yr, 2yrs[      47025
5yrs+            45521
[2yrs, 5yrs[     42620
NaN               5555
no visit          4017
Name: count, dtype: int64

#### 3.36. RMVTETH4: Not including teeth lost for injury or orthodontics, how many of your permanent teeth have been removed because of tooth decay or gum disease?

In [122]:
# Frequency of each raw RMVTETH4 code (NaN included) before recoding.
df1["RMVTETH4"].value_counts(dropna=False)

RMVTETH4
8.0    238677
1.0    132439
2.0     46289
3.0     26351
7.0      8028
9.0      1210
NaN         3
Name: count, dtype: int64

In [123]:
# Label for each RMVTETH4 code that is kept.
removed_teeth_label_by_code = {
    1.0: "1-5",
    2.0: "6+",
    3.0: "all",
}

# One boolean mask per code, in the same order as the labels.
conditions = [df1["RMVTETH4"] == code for code in removed_teeth_label_by_code]
choices = list(removed_teeth_label_by_code.values())

# Codes without a label (7, 8, 9 and NaN in the counts shown earlier) take the
# default and end up missing in the new column.
# NOTE(review): code 8 is the most frequent raw value; per the BRFSS codebook it
# appears to mean "None" (no teeth removed) -- confirm whether it should get its
# own label instead of being treated as missing.
df1["num_of_removed_teeth"] = np.select(conditions, choices, default=None)
df1 = df1.drop(columns=["RMVTETH4"])

In [124]:
# Check the recoded column: frequency of each label, NaN included.
# NaN is the largest group here because raw code 8 was left unmapped above.
df1["num_of_removed_teeth"].value_counts(dropna=False)

num_of_removed_teeth
NaN    247918
1-5    132439
6+      46289
all     26351
Name: count, dtype: int64

#### 3.37. DISPCODE: Was the interview fully completed or not?

In [125]:
# Frequency of each raw DISPCODE value (NaN included) before recoding.
df1["DISPCODE"].value_counts(dropna=False)

DISPCODE
1100.0    368492
1200.0     84505
Name: count, dtype: int64

In [126]:
# DISPCODE -> whether the interview was fully completed.
completion_label_by_code = {
    1100.0: "yes",
    1200.0: "no",
}

# One boolean mask per code, in the same order as the labels.
conditions = [df1["DISPCODE"] == code for code in completion_label_by_code]
choices = list(completion_label_by_code.values())

# Any other value would take the default (None); the counts shown earlier
# contain only 1100 and 1200.
df1["have_you_totally_completed_the_interview"] = np.select(conditions, choices, default=None)
df1 = df1.drop(columns=["DISPCODE"])

In [127]:
# Check the recoded column: frequency of "yes" / "no", NaN included.
df1["have_you_totally_completed_the_interview"].value_counts(dropna=False)

have_you_totally_completed_the_interview
yes    368492
no      84505
Name: count, dtype: int64

#### 3.38. _STATE: State FIPS Code

In [128]:
# Number of rows per raw _STATE FIPS code (NaN included) before mapping to names.
df1["_STATE"].value_counts(dropna=False)

_STATE
36.0    43447
53.0    25518
27.0    15039
24.0    14303
12.0    12979
55.0    12942
18.0    12896
49.0    12598
31.0    12545
48.0    12169
23.0    12030
17.0    11852
26.0    11304
20.0    10785
8.0     10775
39.0    10066
25.0     9396
45.0     9380
19.0     8649
34.0     8584
4.0      8506
6.0      8288
21.0     7382
29.0     7263
13.0     7195
9.0      7139
40.0     6946
33.0     6925
51.0     6804
30.0     6751
15.0     6558
50.0     6469
41.0     6100
38.0     5814
46.0     5812
54.0     5750
2.0      5433
44.0     5407
5.0      5286
1.0      5044
56.0     4567
22.0     4495
10.0     4322
16.0     4316
72.0     4293
37.0     4290
42.0     3488
35.0     3353
11.0     3172
28.0     2961
32.0     2642
66.0     1634
78.0     1335
Name: count, dtype: int64

In [129]:
# FIPS code -> state / territory name. The keys are ints while _STATE holds
# floats; the lookup still matches because equal ints and floats compare and
# hash alike (1 == 1.0).
state_name_by_fips = {
    1: "Alabama",
    2: "Alaska",
    4: "Arizona",
    5: "Arkansas",
    6: "California",
    8: "Colorado",
    9: "Connecticut",
    10: "Delaware",
    11: "District of Columbia",
    12: "Florida",
    13: "Georgia",
    15: "Hawaii",
    16: "Idaho",
    17: "Illinois",
    18: "Indiana",
    19: "Iowa",
    20: "Kansas",
    21: "Kentucky",
    22: "Louisiana",
    23: "Maine",
    24: "Maryland",
    25: "Massachusetts",
    26: "Michigan",
    27: "Minnesota",
    28: "Mississippi",
    29: "Missouri",
    30: "Montana",
    31: "Nebraska",
    32: "Nevada",
    33: "New Hampshire",
    34: "New Jersey",
    35: "New Mexico",
    36: "New York",
    37: "North Carolina",
    38: "North Dakota",
    39: "Ohio",
    40: "Oklahoma",
    41: "Oregon",
    42: "Pennsylvania",
    44: "Rhode Island",
    45: "South Carolina",
    46: "South Dakota",
    48: "Texas",
    49: "Utah",
    50: "Vermont",
    51: "Virginia",
    53: "Washington",
    54: "West Virginia",
    55: "Wisconsin",
    56: "Wyoming",
    66: "Guam",
    72: "Puerto Rico",
    78: "Virgin Islands",
}

# Translate each code to its name (codes absent from the table become NaN),
# then discard the raw code column.
df1["state"] = df1["_STATE"].map(state_name_by_fips)
df1 = df1.drop(columns=["_STATE"])

In [130]:
# Check the mapped column: number of rows per state / territory name, NaN included.
df1["state"].value_counts(dropna=False)

state
New York                43447
Washington              25518
Minnesota               15039
Maryland                14303
Florida                 12979
Wisconsin               12942
Indiana                 12896
Utah                    12598
Nebraska                12545
Texas                   12169
Maine                   12030
Illinois                11852
Michigan                11304
Kansas                  10785
Colorado                10775
Ohio                    10066
Massachusetts            9396
South Carolina           9380
Iowa                     8649
New Jersey               8584
Arizona                  8506
California               8288
Kentucky                 7382
Missouri                 7263
Georgia                  7195
Connecticut              7139
Oklahoma                 6946
New Hampshire            6925
Virginia                 6804
Montana                  6751
Hawaii                   6558
Vermont                  6469
Oregon                   6100
Nort

In [131]:
# Inspect the dtype of every column after the recoding steps.
df1.dtypes

how_many_days_in_last_month_did_you_have_mental_health_issues           Int64
how_many_days_in_last_month_that_did_you_have_physical_health_issues    Int64
imputed_age                                                             Int64
weekly_number_of_alcoholic_drinks                                       Int64
did_you_have_at_least_1drink_in_last_30days                               str
at_least_1drink_occasion_per_day                                          str
age_in_2groups                                                            str
age_in_14groups                                                           str
imputed_age_in_6groups                                                    str
do_you_have_asthma                                                        str
have_you_currently_been_told_to_have_asthma                               str
children_count_categories                                                 str
are_you_currently_an_ecigars_user                               

In [132]:
# Drop rows that are identical across every column, keeping the first occurrence
# (pandas defaults); the index is not reset, so labels of removed rows leave gaps.
# NOTE(review): no respondent identifier is visible among these columns, so rows
# that match exactly may be distinct respondents -- confirm this drop is intended.
df1 = df1.drop_duplicates()

In [133]:
# Reorder the columns by sorted name (Python string ordering) so the frame
# layout is deterministic.
ordered_cols = sorted(df1.columns)
df1 = df1.loc[:, ordered_cols]

In [134]:
# (rows, columns) of the cleaned frame after de-duplication and column ordering.
df1.shape

(452992, 41)

## 4. EDA Code Structure
- Basic units:
  - DataLoader (loads the data)
  - DataCleaner (performs the transformations on the loaded data)

In [135]:
# Parity check: run the packaged DataLoader -> DataCleaner pipeline on the same
# file and verify that it reproduces the frame built step by step above (df1).

from brfss_diabetes_surveys_ml.src.main.scripts.loaders.data_loader import DataLoader
from brfss_diabetes_surveys_ml.src.main.scripts.cleaners.data_cleaner import DataCleaner

loader = DataLoader(data_path=DATA_PATH)
loader.read_data()  # the loaded frame is read back from loader.df below
# NOTE: this rebinds `df`, which until now held the raw frame loaded in section 2.
df = loader.df.copy()
cleaner = DataCleaner(df=df)
cleaner.clean_data()  # the cleaned frame is read back from cleaner.df below
df = cleaner.df.copy()

# Raises AssertionError if the pipeline output differs from the notebook's df1
# (compared with assert_frame_equal's default settings).
pd.testing.assert_frame_equal(df, df1)

[32m2026-02-20 21:05:39.585[0m | [1mINFO    [0m | [36mbrfss_diabetes_surveys_ml.src.main.scripts.loaders.data_loader[0m:[36m__init__[0m:[36m12[0m - [1mInitializing DataLoader with data_path='C:/Users/pfaha/PROJECTS/brfss-diabetes-surveys/brfss_diabetes_surveys_etl/data/clean/LLCP2024.parquet'[0m
[32m2026-02-20 21:05:39.587[0m | [1mINFO    [0m | [36mbrfss_diabetes_surveys_ml.src.main.scripts.loaders.data_loader[0m:[36m__init__[0m:[36m20[0m - [1mDataLoader initialized successfully[0m
[32m2026-02-20 21:05:39.589[0m | [1mINFO    [0m | [36mbrfss_diabetes_surveys_ml.src.main.scripts.loaders.data_loader[0m:[36mread_data[0m:[36m23[0m - [1mReading PARQUET data from: C:/Users/pfaha/PROJECTS/brfss-diabetes-surveys/brfss_diabetes_surveys_etl/data/clean/LLCP2024.parquet[0m
[32m2026-02-20 21:05:39.663[0m | [1mINFO    [0m | [36mbrfss_diabetes_surveys_ml.src.main.scripts.loaders.data_loader[0m:[36mread_data[0m:[36m33[0m - [1mData loaded successfully: 4574