Notebook: Exploration of CSV files

In [None]:
"""Exploration notebook for data analysis.

This notebook contains data exploration steps for disaster analysis.
"""

import hashlib
import sys
from pathlib import Path

import pandas as pd

# Put the parent directory on the import path *before* importing from the
# project package: an entry appended after the import cannot help that import.
# The membership check keeps re-running this cell from adding duplicates.
module_path = Path("..").resolve()
if str(module_path) not in sys.path:
    sys.path.append(str(module_path))

from src.data_consolidation.dictionary import STANDARD_COLUMNS  # noqa: E402

In [46]:
def read_dat(dat_file: str, dat_dir: str = "../data_prep/") -> pd.DataFrame:
    """Read a CSV file from the prepared-data directory.

    Args:
        dat_file: Name of the CSV file, relative to ``dat_dir``.
        dat_dir: Directory that holds the CSV files. Defaults to
            ``../data_prep/``, resolved against the current working directory.

    Returns:
        The contents of the CSV file as a DataFrame.
    """
    dat_path = Path(dat_dir).resolve() / dat_file
    return pd.read_csv(dat_path)

In [47]:
# Load each prepared source extract; the names on the left line up one-to-one
# with the file names on the right, in the same order.
(
    glide_prep_df,
    gdacs_prep_df,
    emdat_prep_df,
    disaster_charter_df,
    cerf_df,
    idmc_df,
    ifrc_df,
) = map(
    read_dat,
    (
        "glide_prep.csv",
        "gdacs_prep.csv",
        "emdat_prep.csv",
        "disaster_charter_prep.csv",
        "cerf_prep.csv",
        "idmc_prep.csv",
        "ifrc_prep.csv",
    ),
)

In [48]:
# Collect the per-source frames in one list (order is preserved) so the
# following cells can process them uniformly.
pre_dfs = [glide_prep_df, gdacs_prep_df, emdat_prep_df, disaster_charter_df]
pre_dfs += [cerf_df, idmc_df, ifrc_df]

In [49]:
# Align every source frame to the shared schema. ``reindex`` returns a new
# frame holding exactly STANDARD_COLUMNS, in that order; a column the source
# lacks is added and filled with missing values. The frames loaded earlier are
# not modified.
for i, df in enumerate(pre_dfs):
    pre_dfs[i] = df.reindex(columns=STANDARD_COLUMNS)

# Stack all sources into one frame, then parse the dates; values that cannot
# be parsed become NaT because of errors="coerce".
all_data = pd.concat(pre_dfs, ignore_index=True)
all_data["Date"] = pd.to_datetime(all_data["Date"], errors="coerce")

# Columns that identify a group of related records in the cells below.
group_key = ["Event_Type", "Country"]


In [51]:
def consolidate_group(
    group: pd.DataFrame,
    key_columns: "list[str] | None" = None,
) -> dict:
    """Collapse a group of rows into a single consolidated record.

    Args:
        group: Rows to merge. Must contain a ``Source_Event_IDs`` column.
        key_columns: Names of the grouping columns. When omitted, the
            notebook-level ``group_key`` list is used.

    Returns:
        A dict with:

        * ``Event_ID``: sorted unique non-null ``Source_Event_IDs`` as strings
          (replaced by the group's own ``Event_ID`` values if it has such a
          column);
        * ``Disaster_Impact_ID``: ``"DI_"`` followed by the SHA-256 hex digest
          of those source IDs joined with ``"|"``;
        * one entry per remaining column holding the sorted unique non-null
          values as strings. Non-key columns with no non-null value map to
          ``None``; key columns map to an empty list in that case.
    """
    # Fall back to the notebook global so existing one-argument calls still work.
    keys = group_key if key_columns is None else key_columns

    event_ids = sorted(set(group["Source_Event_IDs"].dropna().astype(str)))
    digest = hashlib.sha256("|".join(event_ids).encode("utf-8")).hexdigest()
    consolidated_row = {
        "Event_ID": event_ids,
        "Disaster_Impact_ID": "DI_" + digest,
    }

    for column in group.columns:
        # The impact ID is derived above; never take it from the input rows.
        if column == "Disaster_Impact_ID":
            continue

        present = group[column].dropna()

        if column in keys or column == "Event_ID":
            consolidated_row[column] = sorted(set(present.astype(str)))
            continue

        values = present.tolist()
        if not values:
            consolidated_row[column] = None
            continue

        # Cells that each hold a list are flattened before de-duplication.
        if all(isinstance(val, list) for val in values):
            values = [item for sublist in values for item in sublist]
        consolidated_row[column] = sorted(set(map(str, values)))

    return consolidated_row