In [32]:
import pandas as pd
import os
import ast
# Directory holding one "<source>_standardised.csv" file per upstream source.
data_path = '/home/evangelos/src/disaster-impact/data_mid/data_standardised/'


# One DataFrame per source, keyed by source label. The label doubles as the
# file-name stem, and dict insertion order follows the tuple below.
standardised_dfs = {
    source: pd.read_csv(os.path.join(data_path, f"{source}_standardised.csv"))
    for source in (
        "glide",
        "gdacs",
        "disaster_charter",
        "emdat",
        "idmc",
        "cerf",
        "ifrc",
    )
}

# for name, df in standardised_dfs.items():
#     print(f"Dataset: {name}, Shape: {df.shape}")
#     if not df.empty:
#         display(df.iloc[0])
#     else:
#         print("The DataFrame is empty.")


def prefix_event_ids(value, prefix):
    """Prefix one event identifier, or every identifier in a list, with ``prefix``.

    Parameters
    ----------
    value : scalar, str, list, or missing
        A single ID, a Python list of IDs, a string that is the repr of a
        list (e.g. ``"['a', 'b']"``), or a missing value (``None`` / ``NaN``).
    prefix : str
        Label placed in front of each ID, joined with an underscore.

    Returns
    -------
    None, str, or list of str
        ``None`` for a missing scalar; a list of ``"{prefix}_{id}"`` strings
        for list input or for a string that parses as a list; otherwise a
        single ``"{prefix}_{value}"`` string.
    """
    # Lists must be handled before the missing-value check: pd.isna() on a
    # list returns an array, and using that array in an ``if`` raises
    # ValueError as soon as the list has more than one element.
    if isinstance(value, list):
        return [f"{prefix}_{item}" for item in value]

    if pd.isna(value):
        return None

    if isinstance(value, str) and value.startswith('[') and value.endswith(']'):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Bracketed text that is not a valid Python literal: treat the raw
            # string as a single identifier instead of failing.
            return f"{prefix}_{value}"
        if not isinstance(parsed, list):
            parsed = [parsed]
        return [f"{prefix}_{item}" for item in parsed]

    return f"{prefix}_{value}"

# Prefix every Event_ID with the key of the source frame it belongs to.
# NOTE: this writes back into the frames held in standardised_dfs, so running
# the cell a second time without reloading prefixes the IDs again.
for name, df in standardised_dfs.items():
    if "Event_ID" not in df.columns:
        continue
    df["Event_ID"] = df["Event_ID"].apply(prefix_event_ids, args=(name,))


In [33]:
# Stack all per-source frames row-wise; ignore_index=True discards the
# per-source indexes and assigns a fresh 0..n-1 index.
all_data = pd.concat(objs=list(standardised_dfs.values()), ignore_index=True)

  all_data = pd.concat(standardised_dfs.values(), ignore_index=True)


In [36]:
import hashlib

# Columns whose combined value identifies one consolidated event.
group_key = ['Event_Type', 'Country', 'Date']

def consolidate_group(group):
    """Collapse all rows of one group into a single record (a dict).

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to merge. Must contain an ``Event_ID`` column.

    Returns
    -------
    dict
        ``Event_ID``: sorted unique non-null IDs as strings.
        ``Disaster_Impact_ID``: ``"DI_"`` followed by the MD5 hex digest of
        those IDs joined with ``"|"``.
        Every other column of ``group``: a sorted list of the unique non-null
        values rendered as strings (list-valued cells are flattened first when
        every non-null cell is a list), or ``None`` when the column has no
        non-null value. An incoming ``Disaster_Impact_ID`` column is ignored.
    """
    def _unique_strings(series):
        # Sorted, de-duplicated string form of the non-null entries.
        return sorted(set(series.dropna().astype(str).tolist()))

    source_ids = _unique_strings(group['Event_ID'])
    digest = hashlib.md5("|".join(source_ids).encode("utf-8")).hexdigest()
    merged = {
        "Event_ID": source_ids,
        "Disaster_Impact_ID": "DI_" + digest,
    }

    for col in group.columns:
        if col == "Disaster_Impact_ID":
            # Already derived above; never taken from the input rows.
            continue

        if col == "Event_ID" or col in group_key:
            merged[col] = _unique_strings(group[col])
            continue

        present = group[col].dropna().tolist()
        if not present:
            merged[col] = None
            continue

        # Flatten one level only when every cell is itself a list.
        if all(isinstance(entry, list) for entry in present):
            present = [item for entry in present for item in entry]
        merged[col] = sorted(set(map(str, present)))

    return merged



In [37]:
# One consolidated record per distinct combination of the group_key columns.
# NOTE: groupby() uses dropna=True by default, so rows with a missing value in
# any key column do not appear in any group and are absent from the result.
unified_rows = [consolidate_group(group) for _, group in all_data.groupby(group_key)]

unified_df = pd.DataFrame(unified_rows)

# Put the two identifier columns first and keep the rest in their existing order.
cols = ['Disaster_Impact_ID', 'Event_ID']
cols = cols + [c for c in unified_df.columns if c not in ('Disaster_Impact_ID', 'Event_ID')]
unified_df = unified_df.loc[:, cols]

print(f"Unified DataFrame shape: {unified_df.shape}")
display(unified_df.head())

Unified DataFrame shape: (66886, 29)


Unnamed: 0,Disaster_Impact_ID,Event_ID,Source_Event_IDs,Event_Name,Event_Type,Country,Country_Code,Location,Latitude,Longitude,...,Financial_Loss,Alert_Level,Source,Comments,External_Links,AID_Contribution,Admin_Units,External_IDs,Approval_Date,Disbursement_Date
0,DI_38b6c39da0f00ce35fb11b327947c987,[glide_e0c9f247a7808cc69f6652e56d051d99],[['AC-2000-000005-CHN']],"[[""['Tech. Disaster']""]]",[['AC']],"[[""China, People's Republic""]]","[[""['CHN']""]]",[['Fuging']],[[19.0785907]],[[-98.2598043]],...,,[[]],[['[]']],"[[""['(Road)']""]]",[[]],,[[]],[[]],,
1,DI_55a537b19ff1fb6709ace74673fe1687,[glide_b58780e7c552092bf2392bc2d1b0b8be],[['AC-2000-000006-CHN']],"[[""['Tech. Disaster']""]]",[['AC']],"[[""China, People's Republic""]]","[[""['CHN']""]]",[['Xiangtan (Hunan Province)']],[[19.0785907]],[[-98.2598043]],...,,[[]],[['[]']],"[[""['{Hotel} (Misc:Fire)']""]]",[[]],,[[]],[[]],,
2,DI_3349612a14a4481e3b7f3c16dec96300,[glide_649016f3fb106db945bfcc081e4c3bab],[['AC-2000-000034-CHN']],"[[""['Tech. Disaster']""]]",[['AC']],"[[""China, People's Republic""]]","[[""['CHN']""]]",[['Jiangsu Province']],[[32.061707]],[[118.763232]],...,,[[]],[['[]']],"[[""['{Coal Mine} (Ind:Collapse)']""]]",[[]],,[[]],[[]],,
3,DI_f95438b2942c1a1ded4297e3f5dcddbb,[glide_8f1649c0fc7a534bf0d99a381cd57524],[['AC-2000-000093-CHN']],"[[""['Tech. Disaster']""]]",[['AC']],"[[""China, People's Republic""]]","[[""['CHN']""]]",[['Guigang (Guangxi Province)']],[[23.111531]],[[109.598927]],...,,[[]],[['[]']],"[[""['(Misc:Fire)']""]]",[[]],,[[]],[[]],,
4,DI_2fc9b60a9b7cc8ce4e7fc276d7b3eb5a,[glide_cf14cab0de23931d096296308952c1d1],[['AC-2000-000026-CHN']],"[[""['Tech. Disaster']""]]",[['AC']],"[[""China, People's Republic""]]","[[""['CHN']""]]",[['Shunde']],[[19.0785907]],[[-98.2598043]],...,,[[]],[['[]']],"[[""['(Road)']""]]",[[]],,[[]],[[]],,


In [38]:
# Create the output directory if needed, then write the unified table as CSV
# without the DataFrame index.
output_dir = '/home/evangelos/src/disaster-impact/data_out/data_unified/'
os.makedirs(output_dir, exist_ok=True)

unified_df.to_csv(os.path.join(output_dir, 'unified_data.csv'), index=False)

In [39]:
# Work on an independent copy so later analysis steps leave unified_df untouched.
analysis_df = unified_df.copy(deep=True)
display(analysis_df.head())


Unnamed: 0,Disaster_Impact_ID,Event_ID,Source_Event_IDs,Event_Name,Event_Type,Country,Country_Code,Location,Latitude,Longitude,...,Financial_Loss,Alert_Level,Source,Comments,External_Links,AID_Contribution,Admin_Units,External_IDs,Approval_Date,Disbursement_Date
0,DI_38b6c39da0f00ce35fb11b327947c987,[glide_e0c9f247a7808cc69f6652e56d051d99],[['AC-2000-000005-CHN']],"[[""['Tech. Disaster']""]]",[['AC']],"[[""China, People's Republic""]]","[[""['CHN']""]]",[['Fuging']],[[19.0785907]],[[-98.2598043]],...,,[[]],[['[]']],"[[""['(Road)']""]]",[[]],,[[]],[[]],,
1,DI_55a537b19ff1fb6709ace74673fe1687,[glide_b58780e7c552092bf2392bc2d1b0b8be],[['AC-2000-000006-CHN']],"[[""['Tech. Disaster']""]]",[['AC']],"[[""China, People's Republic""]]","[[""['CHN']""]]",[['Xiangtan (Hunan Province)']],[[19.0785907]],[[-98.2598043]],...,,[[]],[['[]']],"[[""['{Hotel} (Misc:Fire)']""]]",[[]],,[[]],[[]],,
2,DI_3349612a14a4481e3b7f3c16dec96300,[glide_649016f3fb106db945bfcc081e4c3bab],[['AC-2000-000034-CHN']],"[[""['Tech. Disaster']""]]",[['AC']],"[[""China, People's Republic""]]","[[""['CHN']""]]",[['Jiangsu Province']],[[32.061707]],[[118.763232]],...,,[[]],[['[]']],"[[""['{Coal Mine} (Ind:Collapse)']""]]",[[]],,[[]],[[]],,
3,DI_f95438b2942c1a1ded4297e3f5dcddbb,[glide_8f1649c0fc7a534bf0d99a381cd57524],[['AC-2000-000093-CHN']],"[[""['Tech. Disaster']""]]",[['AC']],"[[""China, People's Republic""]]","[[""['CHN']""]]",[['Guigang (Guangxi Province)']],[[23.111531]],[[109.598927]],...,,[[]],[['[]']],"[[""['(Misc:Fire)']""]]",[[]],,[[]],[[]],,
4,DI_2fc9b60a9b7cc8ce4e7fc276d7b3eb5a,[glide_cf14cab0de23931d096296308952c1d1],[['AC-2000-000026-CHN']],"[[""['Tech. Disaster']""]]",[['AC']],"[[""China, People's Republic""]]","[[""['CHN']""]]",[['Shunde']],[[19.0785907]],[[-98.2598043]],...,,[[]],[['[]']],"[[""['(Road)']""]]",[[]],,[[]],[[]],,
