In [3]:
import pandas as pd
import os

# Input folder holding the Excel workbooks and output folder for the CSV exports.
data_path = "../data"
output_path = "../cleaned"
os.makedirs(output_path, exist_ok=True)

# Workbook file name -> name of the CSV it is exported to.
files = {
    "coverage-data.xlsx": "coverage_data.csv",
    "incidence-rate-data.xlsx": "incidence_rate.csv",
    "reported-cases-data.xlsx": "reported_cases.csv",
    "vaccine-introduction-data.xlsx": "vaccine_intro.csv",
    "vaccine-schedule-data.xlsx": "vaccine_schedule.csv",
}

# Read each workbook (first sheet, via openpyxl) and write it back out as CSV
# without the DataFrame index.
for xlsx_file in files:
    csv_file = files[xlsx_file]
    file_path = os.path.join(data_path, xlsx_file)
    df = pd.read_excel(file_path, engine="openpyxl")
    df.to_csv(os.path.join(output_path, csv_file), index=False)
    print(f"✅ Converted {xlsx_file} → {csv_file}")

print("\n All Excel files converted to CSV successfully!")


✅ Converted coverage-data.xlsx → coverage_data.csv
✅ Converted incidence-rate-data.xlsx → incidence_rate.csv
✅ Converted reported-cases-data.xlsx → reported_cases.csv
✅ Converted vaccine-introduction-data.xlsx → vaccine_intro.csv
✅ Converted vaccine-schedule-data.xlsx → vaccine_schedule.csv

 All Excel files converted to CSV successfully!


In [49]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# Visualization settings
# Use seaborn's "whitegrid" style for plots drawn after this cell.
sns.set(style="whitegrid")
# Printed DataFrames: no limit on the number of columns shown, and a
# display width of 200 characters.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 200)


## Coverage Data 

In [50]:

# ----------------------------------------------
# STEP 2: Load Dataset
# ----------------------------------------------
data_path = "../cleaned"
coverage_csv = os.path.join(data_path, "coverage_data.csv")
df_coverage = pd.read_csv(coverage_csv)

print("✅ Dataset Loaded Successfully!")
print(df_coverage.shape)
print(df_coverage.head())

# ----------------------------------------------
# STEP 3: Handle Missing Values
# ----------------------------------------------

# 3.1 Per-column count of missing values before any fix
print("\nMissing Values Before Handling:\n", df_coverage.isna().sum())

# 3.2 Fix NAME for the two World Bank codes
# Rows carrying these codes get a placeholder NAME instead of being dropped.
wb_placeholder_names = {
    "WB_LONG_NA": "World Bank Long-term Aggregate",
    "WB_SHORT_NA": "World Bank Short-term Aggregate",
}
for wb_code, wb_name in wb_placeholder_names.items():
    df_coverage.loc[df_coverage['CODE'] == wb_code, "NAME"] = wb_name

# A row without CODE cannot be attributed to any entity, so it is removed.
df_coverage = df_coverage.dropna(subset=['CODE'])

# 3.3 Impute the numeric columns
# Each gap is replaced by the median of the same column within the row's ANTIGEN group.
for numeric_col in ('COVERAGE', 'DOSES', 'TARGET_NUMBER'):
    df_coverage[numeric_col] = df_coverage.groupby('ANTIGEN')[numeric_col].transform(
        lambda values: values.fillna(values.median())
    )

# Groups with no COVERAGE value at all have no median; those rows are dropped.
df_coverage = df_coverage.dropna(subset=['COVERAGE'])

print("\nMissing Values After Handling:\n", df_coverage.isna().sum())

# ----------------------------------------------
# STEP 4: Normalize Units (if required)
# ----------------------------------------------
# COVERAGE is a percentage: force it into the closed interval [0, 100].
df_coverage['COVERAGE'] = df_coverage['COVERAGE'].clip(lower=0, upper=100)

# ----------------------------------------------
# STEP 5: Consistency Checks
# ----------------------------------------------

# 5.1 Report fully duplicated rows, then keep only the first occurrence of each
duplicates = df_coverage[df_coverage.duplicated()]
print(f"\nTotal Duplicate Rows: {duplicates.shape[0]}")
df_coverage = df_coverage.drop_duplicates()

# 5.2 Report and remove rows with a negative DOSES or TARGET_NUMBER
has_negative_count = (df_coverage['DOSES'] < 0) | (df_coverage['TARGET_NUMBER'] < 0)
negatives = df_coverage[has_negative_count]
print(f"Rows with Negative Values: {negatives.shape[0]}")
df_coverage = df_coverage[~has_negative_count]

# 5.3 Where DOSES exceeds TARGET_NUMBER, cap DOSES at the row's TARGET_NUMBER
doses_over_target = df_coverage['DOSES'] > df_coverage['TARGET_NUMBER']
df_coverage.loc[doses_over_target, 'DOSES'] = df_coverage['TARGET_NUMBER']

# 5.4 Cap COVERAGE at 100
coverage_over_100 = df_coverage['COVERAGE'] > 100
df_coverage.loc[coverage_over_100, 'COVERAGE'] = 100

# 5.5 Keep only rows whose YEAR lies in 1980..2025 (both ends included);
# rows with a missing YEAR fail the test and are removed as well.
df_coverage = df_coverage[df_coverage['YEAR'].between(1980, 2025)]

# ----------------------------------------------
# STEP 6: Feature Engineering (FE)
# ----------------------------------------------

# 6.1 Coverage Category - three-level label (High / Medium / Low)
def categorize_coverage(value):
    """Label a coverage percentage.

    Returns "High" for values of at least 90, "Medium" for values of at
    least 70, and "Low" for everything else (including values that compare
    false against both thresholds, such as NaN).
    """
    for floor, label in ((90, "High"), (70, "Medium")):
        if value >= floor:
            return label
    return "Low"

# Label every row with its coverage band.
coverage_status = df_coverage['COVERAGE'].apply(categorize_coverage)
df_coverage['COVERAGE_STATUS'] = coverage_status


# 6.2 Dose gap: how far DOSES falls short of TARGET_NUMBER
df_coverage['DOSE_GAP'] = df_coverage['TARGET_NUMBER'].sub(df_coverage['DOSES'])

# 6.3 Vaccination efficiency: DOSES as a percentage of TARGET_NUMBER.
# Rows whose TARGET_NUMBER is not positive get 0 instead of a division result.
target_is_positive = df_coverage['TARGET_NUMBER'] > 0
doses_pct_of_target = (df_coverage['DOSES'] / df_coverage['TARGET_NUMBER']) * 100
df_coverage['VACCINE_EFFICIENCY'] = np.where(target_is_positive, doses_pct_of_target, 0)

# 6.4 Recency flag: 'Recent' for YEAR 2018 or later, otherwise 'Old'
df_coverage['RECENT_DATA'] = df_coverage['YEAR'].apply(
    lambda yr: 'Recent' if yr >= 2018 else 'Old'
)

# ----------------------------------------------
# STEP 7: Final Sanity Check
# ----------------------------------------------
print("\n✅ Final Dataset Overview:\n")
# info() writes its report itself and returns None, which print() then shows.
print(df_coverage.info())
print(df_coverage.describe())

# ----------------------------------------------
# STEP 8: Save Cleaned Dataset
# ----------------------------------------------
cleaned_path = "../cleaned"
os.makedirs(cleaned_path, exist_ok=True)

coverage_cleaned_csv = os.path.join(cleaned_path, "coverage_data_cleaned.csv")
df_coverage.to_csv(coverage_cleaned_csv, index=False)
print("\n Data Cleaning & Feature Engineering Completed Successfully!")


✅ Dataset Loaded Successfully!
(399859, 11)
       GROUP CODE   NAME    YEAR  ANTIGEN                                ANTIGEN_DESCRIPTION COVERAGE_CATEGORY COVERAGE_CATEGORY_DESCRIPTION  TARGET_NUMBER   DOSES  COVERAGE
0  COUNTRIES  ABW  Aruba  2023.0      BCG                                                BCG             ADMIN       Administrative coverage            NaN     NaN       NaN
1  COUNTRIES  ABW  Aruba  2023.0      BCG                                                BCG          OFFICIAL             Official coverage            NaN     NaN       NaN
2  COUNTRIES  ABW  Aruba  2023.0  DIPHCV4  Diphtheria-containing vaccine, 4th dose (1st b...             ADMIN       Administrative coverage         1044.0   945.0     90.52
3  COUNTRIES  ABW  Aruba  2023.0  DIPHCV4  Diphtheria-containing vaccine, 4th dose (1st b...          OFFICIAL             Official coverage            NaN     NaN     90.52
4  COUNTRIES  ABW  Aruba  2023.0  DIPHCV5  Diphtheria-containing vaccine, 5th dose (2n

## Incidence Rate

In [53]:

# ---------------------------------------------------------
# STEP 2: Load Dataset
# ---------------------------------------------------------
data_path = "../cleaned"
incidence_csv = os.path.join(data_path, "incidence_rate.csv")
df_incidence = pd.read_csv(incidence_csv)

print("✅ Incidence Dataset Loaded!")
print(df_incidence.shape)
print(df_incidence.head())

# ---------------------------------------------------------
# STEP 3: Handle Missing Values
# ---------------------------------------------------------

# 3.1 Per-column count of missing values before any fix
print("\nMissing Values Before Handling:\n", df_incidence.isna().sum())

# 3.2 Handle Missing CODE and NAME

# CODE is the row identifier: rows without it are removed.
df_incidence = df_incidence.dropna(subset=['CODE'])

# CODE -> NAME lookup taken from the first named row seen for each CODE.
first_named_rows = (
    df_incidence.dropna(subset=['NAME'])
                .drop_duplicates(subset=['CODE'], keep='first')
)
code_to_name = first_named_rows.set_index('CODE')['NAME'].to_dict()

# Missing NAME values are looked up through their CODE.
names_from_code = df_incidence['CODE'].map(code_to_name)
df_incidence['NAME'] = df_incidence['NAME'].fillna(names_from_code)

# The two World Bank codes always get a fixed placeholder NAME (if present).
wb_map = {
    'WB_LONG_NA': 'World Bank Long-term Aggregate',
    'WB_SHORT_NA': 'World Bank Short-term Aggregate'
}
is_wb_row = df_incidence['CODE'].isin(list(wb_map))
df_incidence.loc[is_wb_row, 'NAME'] = df_incidence.loc[is_wb_row, 'CODE'].map(wb_map)

# Anything still unnamed becomes "Unknown".
df_incidence['NAME'] = df_incidence['NAME'].fillna('Unknown')

# 3.3 A missing disease description falls back to the disease code itself
df_incidence['DISEASE_DESCRIPTION'] = df_incidence['DISEASE_DESCRIPTION'].fillna(
    df_incidence['DISEASE']
)

# 3.4 Normalize the DENOMINATOR text (trim surrounding whitespace, lower-case)
denominator_text = df_incidence['DENOMINATOR'].str.strip()
df_incidence['DENOMINATOR'] = denominator_text.str.lower()

# Show the distinct denominator phrases that remain.
print(df_incidence['DENOMINATOR'].unique())

import re

def extract_number(text):
    """Return the first digit run in ``text`` as an int, or NaN when there is none.

    ``text`` is converted with ``str()`` first. The run starts at a digit and
    may continue with digits and commas; commas are removed before conversion.
    """
    found = re.search(r"(\d[\d,]*)", str(text))
    if not found:
        return np.nan
    digits_only = found.group(1).replace(",", "")
    return int(digits_only)

# Numeric part of each denominator phrase (e.g. "per 10,000 live births" -> 10000);
# NaN where the phrase is missing or contains no digits.
df_incidence['DENOMINATOR_VALUE'] = df_incidence['DENOMINATOR'].apply(extract_number)

print(df_incidence[['DENOMINATOR', 'DENOMINATOR_VALUE']].head())

# Fill missing denominator descriptions with "per total population" by default
df_incidence['DENOMINATOR'] = df_incidence['DENOMINATOR'].fillna("per total population")


# 3.5 Handle Missing Incidence Rate
# Only applies when the frame carries a CASES column: a missing INCIDENCE_RATE is
# then derived from CASES wherever the numeric denominator is positive.
# The positivity test must use DENOMINATOR_VALUE: DENOMINATOR holds text, and
# comparing text with 0 raises TypeError.
# NOTE(review): DENOMINATOR_VALUE is the number parsed from the "per N ..." phrase,
# so CASES / DENOMINATOR_VALUE * 100000 may not be the intended rate formula —
# confirm before relying on this branch.
if 'CASES' in df_incidence.columns:
    rate_is_derivable = (
        (df_incidence['DENOMINATOR_VALUE'] > 0) & df_incidence['INCIDENCE_RATE'].isna()
    )
    df_incidence['INCIDENCE_RATE'] = np.where(
        rate_is_derivable,
        (df_incidence['CASES'] / df_incidence['DENOMINATOR_VALUE']) * 100000,
        df_incidence['INCIDENCE_RATE']
    )
# Fill remaining missing values with median per disease
df_incidence['INCIDENCE_RATE'] = df_incidence.groupby('DISEASE')['INCIDENCE_RATE'].transform(
    lambda x: x.fillna(x.median())
)

print("\nMissing Values After Handling:\n", df_incidence.isna().sum())

# ---------------------------------------------------------
# STEP 4: Normalize Units
# ---------------------------------------------------------

# Negative incidence rates are reset to 0.
rate_is_negative = df_incidence['INCIDENCE_RATE'] < 0
df_incidence.loc[rate_is_negative, 'INCIDENCE_RATE'] = 0

# Negative numeric denominators are treated as missing.
denominator_is_negative = df_incidence['DENOMINATOR_VALUE'] < 0
df_incidence.loc[denominator_is_negative, 'DENOMINATOR_VALUE'] = np.nan

# Missing numeric denominators take the median of their DISEASE group.
df_incidence['DENOMINATOR_VALUE'] = df_incidence.groupby('DISEASE')['DENOMINATOR_VALUE'].transform(
    lambda values: values.fillna(values.median())
)


# ---------------------------------------------------------
# STEP 5: Consistency Checks
# ---------------------------------------------------------

# 5.1 Report the number of fully duplicated rows, then drop them
duplicate_row_count = df_incidence.duplicated().sum()
print(f"\nTotal Duplicate Rows: {duplicate_row_count}")
df_incidence = df_incidence.drop_duplicates()

# 5.2 Keep only rows with YEAR in 1980..2025 (both ends included)
df_incidence = df_incidence[df_incidence['YEAR'].between(1980, 2025)]

# 5.3 Disease codes: upper-case, then trim surrounding whitespace
disease_upper = df_incidence['DISEASE'].str.upper()
df_incidence['DISEASE'] = disease_upper.str.strip()

# ---------------------------------------------------------
# STEP 6: Feature Engineering (FE)
# ---------------------------------------------------------

# 6.1 Create "Incidence Severity" based on thresholds
def categorize_incidence(rate):
    """Label an incidence rate by severity.

    Returns:
        "Unknown"  when ``rate`` is missing (NaN / None),
        "No Cases" when ``rate`` equals 0,
        "Low"      when ``rate`` is at most 10,
        "Moderate" when ``rate`` is at most 50,
        "High"     otherwise.
    """
    # A missing rate fails every comparison below and would otherwise fall
    # through to "High", so it is labelled explicitly.
    if pd.isna(rate):
        return "Unknown"
    if rate == 0:
        return "No Cases"
    elif rate <= 10:
        return "Low"
    elif rate <= 50:
        return "Moderate"
    else:
        return "High"

# Label every row with its severity band.
incidence_severity = df_incidence['INCIDENCE_RATE'].apply(categorize_incidence)
df_incidence['INCIDENCE_SEVERITY'] = incidence_severity

# 6.2 WHO Region (optional, not implemented here)
# If the data has no WHO region column, it can be merged in from another table later.

# 6.3 Recency flag: 'Recent' for YEAR 2018 or later, otherwise 'Old'
df_incidence['RECENT_DATA'] = df_incidence['YEAR'].apply(
    lambda yr: 'Recent' if yr >= 2018 else 'Old'
)

# ---------------------------------------------------------
# STEP 7: Final Sanity Check
# ---------------------------------------------------------
print("\n✅ Final Dataset Overview:\n")
# info() writes its report itself and returns None, which print() then shows.
print(df_incidence.info())
print(df_incidence.describe())

# ---------------------------------------------------------
# STEP 8: Save Cleaned Dataset
# ---------------------------------------------------------
cleaned_path = "../cleaned"
os.makedirs(cleaned_path, exist_ok=True)

incidence_cleaned_csv = os.path.join(cleaned_path, "incidence_rate_cleaned.csv")
df_incidence.to_csv(incidence_cleaned_csv, index=False)
print("\n Incidence Rate Data Cleaning & Feature Engineering Completed Successfully!")


✅ Incidence Dataset Loaded!
(84946, 8)
       GROUP CODE   NAME    YEAR          DISEASE             DISEASE_DESCRIPTION                     DENOMINATOR  INCIDENCE_RATE
0  COUNTRIES  ABW  Aruba  2023.0              CRS     Congenital rubella syndrome          per 10,000 live births             0.0
1  COUNTRIES  ABW  Aruba  2023.0       DIPHTHERIA                      Diphtheria  per 1,000,000 total population             0.0
2  COUNTRIES  ABW  Aruba  2023.0  INVASIVE_MENING  Invasive meningococcal disease  per 1,000,000 total population             9.3
3  COUNTRIES  ABW  Aruba  2023.0          MEASLES                         Measles  per 1,000,000 total population             NaN
4  COUNTRIES  ABW  Aruba  2023.0            MUMPS                           Mumps  per 1,000,000 total population             0.0

Missing Values Before Handling:
 GROUP                      0
CODE                       1
NAME                       1
YEAR                       1
DISEASE                    1
DI

## Reported Cases

In [54]:

# ---------------------------------------------------------
# STEP 2: Load Dataset
# ---------------------------------------------------------
data_path = "../cleaned"   # Use "../data" if original file is there
cases_csv = os.path.join(data_path, "reported_cases.csv")
df_cases = pd.read_csv(cases_csv)

print("✅ Reported Cases Dataset Loaded!")
print(df_cases.shape)
print(df_cases.head())

# ---------------------------------------------------------
# STEP 3: Handle Missing Values
# ---------------------------------------------------------

# 3.1 Per-column count of missing values before any fix
print("\nMissing Values Before Handling:\n", df_cases.isna().sum())

# 3.2 Handle missing CODE and NAME
# CODE is the row identifier: rows without it are removed.
df_cases = df_cases.dropna(subset=['CODE'])

# CODE -> NAME lookup taken from the first named row seen for each CODE.
first_named_rows = (
    df_cases.dropna(subset=['NAME'])
            .drop_duplicates(subset=['CODE'], keep='first')
)
code_to_name = first_named_rows.set_index('CODE')['NAME'].to_dict()

# Missing NAME values are looked up through their CODE.
names_from_code = df_cases['CODE'].map(code_to_name)
df_cases['NAME'] = df_cases['NAME'].fillna(names_from_code)

# The two World Bank codes always get a fixed placeholder NAME (if present).
wb_map = {
    'WB_LONG_NA': 'World Bank Long-term Aggregate',
    'WB_SHORT_NA': 'World Bank Short-term Aggregate'
}
is_wb_row = df_cases['CODE'].isin(list(wb_map))
df_cases.loc[is_wb_row, 'NAME'] = df_cases.loc[is_wb_row, 'CODE'].map(wb_map)

# Anything still unnamed becomes "Unknown".
df_cases['NAME'] = df_cases['NAME'].fillna('Unknown')

# 3.3 A missing disease description falls back to the disease code itself
df_cases['DISEASE_DESCRIPTION'] = df_cases['DISEASE_DESCRIPTION'].fillna(df_cases['DISEASE'])

# 3.4 Handle Missing CASES
# Non-numeric entries become NaN, then every NaN is replaced by 0
# (the notebook's assumption: a missing count means no reported cases).
df_cases['CASES'] = pd.to_numeric(df_cases['CASES'], errors='coerce').fillna(0)

# ---------------------------------------------------------
# STEP 4: Normalize Units
# ---------------------------------------------------------

# Negative case counts are reset to 0.
cases_is_negative = df_cases['CASES'] < 0
df_cases.loc[cases_is_negative, 'CASES'] = 0

# ---------------------------------------------------------
# STEP 5: Consistency Checks
# ---------------------------------------------------------

# 5.1 Report the number of fully duplicated rows, then drop them
duplicate_row_count = df_cases.duplicated().sum()
print(f"\nTotal Duplicate Rows: {duplicate_row_count}")
df_cases = df_cases.drop_duplicates()

# 5.2 Keep only rows with YEAR in 1980..2025 (both ends included)
df_cases = df_cases[df_cases['YEAR'].between(1980, 2025)]

# 5.3 Disease codes: upper-case, then trim surrounding whitespace
disease_upper = df_cases['DISEASE'].str.upper()
df_cases['DISEASE'] = disease_upper.str.strip()

# ---------------------------------------------------------
# STEP 6: Feature Engineering (FE)
# ---------------------------------------------------------

# 6.1 Categorize case severity
def categorize_cases(cases):
    """Label a reported case count by severity.

    Exactly 0 is "No Cases"; otherwise the first matching upper bound wins:
    at most 100 -> "Low", at most 1000 -> "Moderate", at most 10000 -> "High".
    Everything else is "Severe".
    """
    if cases == 0:
        return "No Cases"
    for upper_bound, label in ((100, "Low"), (1000, "Moderate"), (10000, "High")):
        if cases <= upper_bound:
            return label
    return "Severe"

# Label every row with its severity band.
case_severity = df_cases['CASES'].apply(categorize_cases)
df_cases['CASE_SEVERITY'] = case_severity

# 6.2 Recency flag: 'Recent' for YEAR 2018 or later, otherwise 'Old'
df_cases['RECENT_DATA'] = df_cases['YEAR'].apply(
    lambda yr: 'Recent' if yr >= 2018 else 'Old'
)

# ---------------------------------------------------------
# STEP 7: Final Sanity Check
# ---------------------------------------------------------
print("\n✅ Final Dataset Overview:\n")
# info() writes its report itself and returns None, which print() then shows.
print(df_cases.info())
print(df_cases.describe())

# ---------------------------------------------------------
# STEP 8: Save Cleaned Dataset
# ---------------------------------------------------------
cleaned_path = "../cleaned"
os.makedirs(cleaned_path, exist_ok=True)

cases_cleaned_csv = os.path.join(cleaned_path, "reported_cases_cleaned.csv")
df_cases.to_csv(cases_cleaned_csv, index=False)
print("\n🎯 Reported Cases Data Cleaning & Feature Engineering Completed Successfully!")


✅ Reported Cases Dataset Loaded!
(84870, 7)
       GROUP CODE   NAME    YEAR          DISEASE             DISEASE_DESCRIPTION  CASES
0  COUNTRIES  ABW  Aruba  2023.0              CRS     Congenital rubella syndrome    0.0
1  COUNTRIES  ABW  Aruba  2023.0       DIPHTHERIA                      Diphtheria    0.0
2  COUNTRIES  ABW  Aruba  2023.0  INVASIVE_MENING  Invasive meningococcal disease    1.0
3  COUNTRIES  ABW  Aruba  2023.0          MEASLES                         Measles    NaN
4  COUNTRIES  ABW  Aruba  2023.0            MUMPS                           Mumps    0.0

Missing Values Before Handling:
 GROUP                      0
CODE                       1
NAME                       1
YEAR                       1
DISEASE                    1
DISEASE_DESCRIPTION        1
CASES                  19400
dtype: int64

Total Duplicate Rows: 0

✅ Final Dataset Overview:

<class 'pandas.core.frame.DataFrame'>
Index: 84869 entries, 0 to 84868
Data columns (total 9 columns):
 #   Column     

## Vaccine Introduction

In [61]:

# ---------------------------------------------------------
# STEP 2: Load Dataset
# ---------------------------------------------------------
data_path = "../cleaned"
df_intro = pd.read_csv(os.path.join(data_path, "vaccine_intro.csv"))

print("✅ Vaccine Introduction Dataset Loaded!")
print(df_intro.shape)
print(df_intro.head())

# ---------------------------------------------------------
# STEP 3: Check Missing Values BEFORE Handling
# ---------------------------------------------------------
print("\n🔍 Missing Values BEFORE Handling:\n", df_intro.isna().sum())

# ---------------------------------------------------------
# STEP 4: Handle Missing Values
# ---------------------------------------------------------

# 4.1 Handle missing ISO_3_Code → Drop rows where it's missing since it's a primary identifier
df_intro = df_intro.dropna(subset=['ISO_3_CODE'])

# Also drop rows where every column except ISO_3_CODE is empty. Such a row holds
# no record at all; keeping it would let the fills below turn it into an
# "Unknown" country with the median year and INTRO = "No".
# (The guard avoids calling dropna with an empty subset.)
non_key_columns = [col for col in df_intro.columns if col != 'ISO_3_CODE']
if non_key_columns:
    df_intro = df_intro.dropna(subset=non_key_columns, how='all')

# 4.2 Handle missing Country Name → Map using ISO code if possible
code_to_name = (
    df_intro.dropna(subset=['COUNTRYNAME'])
            .drop_duplicates(subset=['ISO_3_CODE'], keep='first')
            .set_index('ISO_3_CODE')['COUNTRYNAME']
            .to_dict()
)
df_intro['COUNTRYNAME'] = df_intro['COUNTRYNAME'].fillna(df_intro['ISO_3_CODE'].map(code_to_name))
df_intro['COUNTRYNAME'] = df_intro['COUNTRYNAME'].fillna('Unknown')

# 4.3 Handle missing WHO Region → Fill missing values with "Unknown"
df_intro['WHO_REGION'] = df_intro['WHO_REGION'].fillna('Unknown')

# 4.4 Handle missing Year → Fill with median year, then store as integers
df_intro['YEAR'] = df_intro['YEAR'].fillna(df_intro['YEAR'].median()).astype(int)

# 4.5 Handle missing Description → Fill with "Unknown Vaccine"
df_intro['DESCRIPTION'] = df_intro['DESCRIPTION'].fillna('Unknown Vaccine')

# 4.6 Handle missing Intro column → Fill with "No" (not introduced)
df_intro['INTRO'] = df_intro['INTRO'].fillna('No')

# ✅ Missing values check AFTER handling
print("\n✅ Missing Values AFTER Handling:\n", df_intro.isna().sum())

# ---------------------------------------------------------
# STEP 5: Data Normalization
# ---------------------------------------------------------

# 5.1 WHO region text: trim surrounding whitespace, then title-case
region_trimmed = df_intro['WHO_REGION'].str.strip()
df_intro['WHO_REGION'] = region_trimmed.str.title()

# 5.2 INTRO flag: cast to text, trim, capitalize, then map the listed
# abbreviations/spellings onto 'Yes' / 'No'
intro_text = df_intro['INTRO'].astype(str).str.strip().str.capitalize()
df_intro['INTRO'] = intro_text.replace({'Y': 'Yes', 'N': 'No', 'yes': 'Yes', 'no': 'No'})

# ---------------------------------------------------------
# STEP 6: Consistency Checks
# ---------------------------------------------------------

# 6.1 Report the number of fully duplicated rows, then drop them
print(f"\n🧹 Total Duplicate Rows: {df_intro.duplicated().sum()}")
df_intro = df_intro.drop_duplicates()

# 6.2 Keep only rows with YEAR in 1980..2025 (both ends included)
df_intro = df_intro[df_intro['YEAR'].between(1980, 2025)]

# ---------------------------------------------------------
# STEP 7: Feature Engineering (FE)
# ---------------------------------------------------------

# 7.1 'Yes' when the row's YEAR is 2018 or later, otherwise 'No'
df_intro['Recently_Introduced'] = df_intro['YEAR'].apply(
    lambda yr: 'Yes' if yr >= 2018 else 'No'
)

# 7.2 Number of INTRO == 'Yes' rows per COUNTRYNAME, repeated on each of that
# country's rows (for Power BI insights).
# NOTE(review): the count runs over every row of the country, so if the frame
# holds several years per vaccine the same vaccine is counted once per year — confirm.
yes_rows_per_country = df_intro.groupby('COUNTRYNAME')['INTRO'].transform(
    lambda flags: (flags == 'Yes').sum()
)
df_intro['Total_Vaccines_Introduced'] = yes_rows_per_country

# 7.3 Readable status derived from the INTRO flag
df_intro['Vaccine_Status'] = df_intro['INTRO'].apply(
    lambda flag: 'Introduced' if flag == 'Yes' else 'Not Introduced'
)

# ---------------------------------------------------------
# STEP 8: Final Dataset Overview
# ---------------------------------------------------------
print("\n📊 Final Dataset Info:\n")
# info() writes its report itself and returns None, which print() then shows.
print(df_intro.info())
print("\n🔎 Final Missing Values Check:\n", df_intro.isna().sum())

# ---------------------------------------------------------
# STEP 9: Save Cleaned Dataset
# ---------------------------------------------------------
cleaned_path = "../cleaned"
os.makedirs(cleaned_path, exist_ok=True)

intro_cleaned_csv = os.path.join(cleaned_path, "vaccine_intro_cleaned.csv")
df_intro.to_csv(intro_cleaned_csv, index=False)
print("\n✅ Vaccine Introduction Data Cleaning & Feature Engineering Completed Successfully!")


✅ Vaccine Introduction Dataset Loaded!
(138321, 6)
  ISO_3_CODE  COUNTRYNAME WHO_REGION    YEAR                                  DESCRIPTION INTRO
0        AFG  Afghanistan       EMRO  2023.0             aP (acellular pertussis) vaccine    No
1        AFG  Afghanistan       EMRO  2023.0                          Hepatitis A vaccine    No
2        AFG  Afghanistan       EMRO  2023.0                          Hepatitis B vaccine   Yes
3        AFG  Afghanistan       EMRO  2023.0                              HepB birth dose   Yes
4        AFG  Afghanistan       EMRO  2023.0  Hib (Haemophilus influenzae type B) vaccine   Yes

🔍 Missing Values BEFORE Handling:
 ISO_3_CODE     0
COUNTRYNAME    1
WHO_REGION     1
YEAR           1
DESCRIPTION    1
INTRO          1
dtype: int64

✅ Missing Values AFTER Handling:
 ISO_3_CODE     0
COUNTRYNAME    0
WHO_REGION     0
YEAR           0
DESCRIPTION    0
INTRO          0
dtype: int64

🧹 Total Duplicate Rows: 0

📊 Final Dataset Info:

<class 'pandas.core.f

## Vaccine Schedule

In [75]:
import pandas as pd
import numpy as np
import os

# ---------------------------------------------------------
# STEP 1 — Load the dataset
# ---------------------------------------------------------
data_path = "../cleaned"    # change if your dataset is in a different folder
schedule_csv = os.path.join(data_path, "vaccine_schedule.csv")
df_schedule = pd.read_csv(schedule_csv)

print("✅ Vaccine Schedule Dataset Loaded!")
print(df_schedule.shape)
print(df_schedule.head())

# ---------------------------------------------------------
# STEP 2 — Normalize column names
# ---------------------------------------------------------
# Trim, lower-case, and replace inner spaces with underscores
# (e.g. a header "ISO_3_CODE" becomes "iso_3_code").
trimmed_names = df_schedule.columns.str.strip()
lowered_names = trimmed_names.str.lower()
df_schedule.columns = lowered_names.str.replace(" ", "_")
print("\nNormalized Columns:", df_schedule.columns)

# ---------------------------------------------------------
# STEP 3 — Check Missing Values Before Handling
# ---------------------------------------------------------
print("\nMissing Values Before Handling:\n", df_schedule.isna().sum())

# ---------------------------------------------------------
# STEP 4 — Handle Missing Values (NO DROPS)
# ---------------------------------------------------------

# 4.1 Missing ISO code → "UNKNOWN"
df_schedule['iso_3_code'] = df_schedule['iso_3_code'].fillna("UNKNOWN")

# 4.2 Missing country name → look it up through the ISO code, else "Unknown Country"
known_country_pairs = (
    df_schedule[['iso_3_code', 'countryname']]
    .dropna()
    .drop_duplicates()
)
code_to_name = known_country_pairs.set_index('iso_3_code')['countryname'].to_dict()
country_from_code = df_schedule['iso_3_code'].map(code_to_name)
df_schedule['countryname'] = df_schedule['countryname'].fillna(country_from_code)
df_schedule['countryname'] = df_schedule['countryname'].fillna("Unknown Country")

# 4.3 Missing WHO region → "Unknown Region"
df_schedule['who_region'] = df_schedule['who_region'].fillna("Unknown Region")

# 4.4 Missing vaccine description → look it up through the vaccine code, else "Unknown Vaccine"
known_vaccine_pairs = (
    df_schedule[['vaccinecode', 'vaccine_description']]
    .dropna()
    .drop_duplicates()
)
code_to_vaccine = known_vaccine_pairs.set_index('vaccinecode')['vaccine_description'].to_dict()
description_from_code = df_schedule['vaccinecode'].map(code_to_vaccine)
df_schedule['vaccine_description'] = df_schedule['vaccine_description'].fillna(description_from_code)
df_schedule['vaccine_description'] = df_schedule['vaccine_description'].fillna("Unknown Vaccine")

# 4.5 Missing schedule rounds → "Not Specified"
df_schedule['schedulerounds'] = df_schedule['schedulerounds'].fillna("Not Specified")

# 4.6 Missing target population code → "Not Specified"
df_schedule['targetpop'] = df_schedule['targetpop'].fillna("Not Specified")

# 4.7 Missing target population description → "General Population"
df_schedule['targetpop_description'] = df_schedule['targetpop_description'].fillna("General Population")

# 4.8 Missing geoarea / age administered / source comment → "Unknown"
# Only names that exist after the STEP 2 normalization are filled. With the
# header layout shown in this notebook's output (AGEADMINISTERED, SOURCECOMMENT
# written without a separator) only 'geoarea' matches; the other two names
# are skipped here.
for col in ('geoarea', 'age_administered', 'source_comment'):
    if col not in df_schedule.columns:
        continue
    df_schedule[col] = df_schedule[col].fillna("Unknown")

# ---------------------------------------------------------
# STEP 5 — Consistency Checks (Fix, Do Not Drop)
# ---------------------------------------------------------

# 5.1 Target population codes: cast to text and upper-case
targetpop_text = df_schedule['targetpop'].astype(str)
df_schedule['targetpop'] = targetpop_text.str.upper()


# 5.2 Years outside 1980..2030 are replaced by the median year; missing
# years are left alone by the first pass and filled with the median afterwards.
median_year = df_schedule['year'].median()
df_schedule['year'] = df_schedule['year'].apply(
    lambda yr: yr if pd.isna(yr) or (1980 <= yr <= 2030) else median_year
)
df_schedule['year'] = df_schedule['year'].fillna(median_year)


# 5.3 Missing age-administered → "Not Specified"
df_schedule['ageadministered'] = df_schedule['ageadministered'].fillna("Not Specified")

# 5.4 Missing source comment → "Not Provided"
df_schedule['sourcecomment'] = df_schedule['sourcecomment'].fillna("Not Provided")

# ---------------------------------------------------------
# STEP 6 — Check Missing Values After Handling
# ---------------------------------------------------------
print("\nMissing Values After Handling:\n", df_schedule.isna().sum())

# ---------------------------------------------------------
# STEP 7 — Save Cleaned Dataset
# ---------------------------------------------------------
cleaned_path = "../cleaned"
os.makedirs(cleaned_path, exist_ok=True)

schedule_cleaned_csv = os.path.join(cleaned_path, "vaccine_schedule_cleaned.csv")
df_schedule.to_csv(schedule_cleaned_csv, index=False)

print("\n✅ Vaccine Schedule Data Cleaning Completed & Saved Successfully!")


✅ Vaccine Schedule Dataset Loaded!
(8053, 12)
  ISO_3_CODE COUNTRYNAME WHO_REGION    YEAR VACCINECODE               VACCINE_DESCRIPTION  SCHEDULEROUNDS  TARGETPOP TARGETPOP_DESCRIPTION   GEOAREA AGEADMINISTERED SOURCECOMMENT
0        ABW       Aruba       AMRO  2023.0  DTAPHIBIPV  DTaP-Hib-IPV (acellular) vaccine             1.0        NaN       General/routine  NATIONAL              M2           NaN
1        ABW       Aruba       AMRO  2023.0  DTAPHIBIPV  DTaP-Hib-IPV (acellular) vaccine             2.0        NaN       General/routine  NATIONAL              M4           NaN
2        ABW       Aruba       AMRO  2023.0  DTAPHIBIPV  DTaP-Hib-IPV (acellular) vaccine             3.0        NaN       General/routine  NATIONAL              M6           NaN
3        ABW       Aruba       AMRO  2023.0  DTAPHIBIPV  DTaP-Hib-IPV (acellular) vaccine             4.0    B_2YL_W       General/routine  NATIONAL             M15           NaN
4        ABW       Aruba       AMRO  2023.0     DTAPIPV    