In [1]:
import pandas as pd
import os

data_path = '/home/evangelos/src/disaster-impact/data_mid/data_standardised/'


standardised_dfs = {
    "glide": pd.read_csv(os.path.join(data_path, 'glide_standardised.csv')),
    "gdacs": pd.read_csv(os.path.join(data_path, 'gdacs_standardised.csv')),
    "disaster_charter": pd.read_csv(os.path.join(data_path, 'disaster_charter_standardised.csv')),
    "emdat": pd.read_csv(os.path.join(data_path, 'emdat_standardised.csv')),
    "idmc": pd.read_csv(os.path.join(data_path, 'idmc_standardised.csv')),
    "cerf": pd.read_csv(os.path.join(data_path, 'cerf_standardised.csv')),
    "ifrc": pd.read_csv(os.path.join(data_path, 'ifrc_standardised.csv'))
}

for name, df in standardised_dfs.items():
    print(f"Dataset: {name}, Shape: {df.shape}")
    if not df.empty:
        display(df.iloc[0])
    else:
        print("The DataFrame is empty.")


Dataset: glide, Shape: (7457, 29)


Disaster_Impact_ID                                  NaN
Event_ID               e0c9f247a7808cc69f6652e56d051d99
Source_Event_IDs                 ['AC-2000-000005-CHN']
Event_Name                       ["['Tech. Disaster']"]
Event_Type                                       ['AC']
Country                    ["China, People's Republic"]
Country_Code                                ["['CHN']"]
Location                                     ['Fuging']
Latitude                                   [19.0785907]
Longitude                                 [-98.2598043]
Date                                         2000-01-01
Year                                               2000
Month                                                 1
Day                                                   1
Time                                                NaN
Severity                                         ['[]']
Population_Affected                                 NaN
Fatalities                                      

Dataset: gdacs, Shape: (27458, 29)


Disaster_Impact_ID                                          NaN
Event_ID                       a86a21880e7fb6e3dfd40c2aef67cb2a
Source_Event_IDs                                      [1016960]
Event_Name             ['["Drought in Cote d\'Ivoire, Ghana"]']
Event_Type                                               ['DR']
Country                                ["Cote d'Ivoire, Ghana"]
Country_Code                                   ["['CIV, GHA']"]
Location                      ['["Cote d\'Ivoire", \'Ghana\']']
Latitude                                                 [7.51]
Longitude                                                [-3.5]
Date                                                 2024-04-21
Year                                                       2024
Month                                                       NaN
Day                                                         NaN
Time                                                        NaN
Severity                                

Dataset: disaster_charter, Shape: (381, 29)


Disaster_Impact_ID                                                   NaN
Event_ID                                33e783b3a36c2a747caa09ee87818580
Source_Event_IDs                                                 ['608']
Event_Name                                 ["['Cyclone Fani in India']"]
Event_Type                                              ['Cyclone Fani']
Country                                                        ['India']
Country_Code                                                          []
Location                                                             NaN
Latitude                                                             NaN
Longitude                                                            NaN
Date                                                          2019-05-03
Year                                                                2019
Month                                                                May
Day                                                

Dataset: emdat, Shape: (10105, 29)


Disaster_Impact_ID                                  NaN
Event_ID               0b04a77c968e39023ebbeea416f0d06b
Source_Event_IDs                      ['2014-0461-NER']
Event_Name                                       ['[]']
Event_Type                          ['Animal incident']
Country                                       ['Niger']
Country_Code                                ["['NER']"]
Location                                ['Near Niamey']
Latitude                                             []
Longitude                                            []
Date                                         2014-11-17
Year                                               2014
Month                                              11.0
Day                                                17.0
Time                                                NaN
Severity                                         ['[]']
Population_Affected                                 5.0
Fatalities                                      

Dataset: idmc, Shape: (17805, 29)


Disaster_Impact_ID                                                   NaN
Event_ID                                73e3f57e5c39712d8a915c8ce0d7a1a7
Source_Event_IDs                                                 [12181]
Event_Name                                     ["['CIV-2018 Conflict']"]
Event_Type                                                  ['Conflict']
Country                                                ["Côte d'Ivoire"]
Country_Code                                                 ["['CIV']"]
Location                                                   ['Biankouma']
Latitude                                                      [7.738348]
Longitude                                                    [-7.612799]
Date                                                          2018-05-14
Year                                                                2018
Month                                                                NaN
Day                                                

Dataset: cerf, Shape: (0, 29)
The DataFrame is empty.
Dataset: ifrc, Shape: (4084, 29)


Disaster_Impact_ID                                  NaN
Event_ID               9601bfceec3a265c0b35ae932b0fad91
Source_Event_IDs                                 [2347]
Event_Name                           ["['Poison gas']"]
Event_Type                     ['Biological Emergency']
Country                                    ['Cameroon']
Country_Code                                ["['CMR']"]
Location                                            NaN
Latitude                                            NaN
Longitude                                           NaN
Date                                         1986-08-01
Year                                                NaN
Month                                               NaN
Day                                                 NaN
Time                                                NaN
Severity                                 ["['Yellow']"]
Population_Affected                                 NaN
Fatalities                                      

In [2]:
all_data = pd.concat(standardised_dfs.values(), ignore_index=True)

  all_data = pd.concat(standardised_dfs.values(), ignore_index=True)


In [27]:
import hashlib

group_key = ['Event_Type', 'Country', 'Date']

def consolidate_group(group):
    consolidated_row = {}
    event_ids = sorted(set(group['Event_ID'].dropna().astype(str).tolist()))
    consolidated_row["Event_ID"] = event_ids
    unique_str = "|".join(event_ids)
    disaster_impact_id = "DI_" + hashlib.md5(unique_str.encode("utf-8")).hexdigest()
    consolidated_row["Disaster_Impact_ID"] = disaster_impact_id
    
    for column in group.columns:
        if column in group_key or column == "Event_ID":
            consolidated_row[column] = sorted(set(group[column].dropna().astype(str).tolist()))
        else:
            values = group[column].dropna().tolist()
            if values:
                if all(isinstance(val, list) for val in values):
                    flat_values = [item for sublist in values for item in sublist]
                    consolidated_row[column] = sorted(set(map(str, flat_values)))
                else:
                    consolidated_row[column] = sorted(set(map(str, values)))
            else:
                consolidated_row[column] = None
    
    return consolidated_row


In [28]:
unified_rows = []
for _, group in all_data.groupby(group_key):
    unified_rows.append(consolidate_group(group))

unified_df = pd.DataFrame(unified_rows)

print(f"Unified DataFrame shape: {unified_df.shape}")
display(unified_df.iloc[28412])


Unified DataFrame shape: (66886, 29)


Event_ID               [00af0e3dabd7e9d76971e104aac4bf6b, 5733a456a3a...
Disaster_Impact_ID                                                  None
Source_Event_IDs       [['EQ-2004-000093-JPN'], ['[7889]', '[7856]', ...
Event_Name             [["['Earthquake in Japan']"], ["['Earthquake']"]]
Event_Type                                                      [['EQ']]
Country                                                      [['Japan']]
Country_Code                                               [["['JPN']"]]
Location               [['Near South Coast of Western Honshu'], ['["[...
Latitude               [['[33.05]', '[33.24]', '[33.35]', '[33.12]', ...
Longitude              [['[137.16]', '[136.79]', '[137.02]', '[136.87...
Date                                                        [2004-09-05]
Year                   [2004, [2004, 2004, 2004, 2004, 2004, 2004, 20...
Month                                                                [9]
Day                                                

In [23]:
os.makedirs('/home/evangelos/src/disaster-impact/data_out/data_unified/', exist_ok=True)

unified_df.to_csv('/home/evangelos/src/disaster-impact/data_out/data_unified/unified_data.csv', index=False)

In [29]:
analysis_df = unified_df.copy()
display(analysis_df.head())


Unnamed: 0,Event_ID,Disaster_Impact_ID,Source_Event_IDs,Event_Name,Event_Type,Country,Country_Code,Location,Latitude,Longitude,...,Financial_Loss,Alert_Level,Source,Comments,External_Links,AID_Contribution,Admin_Units,External_IDs,Approval_Date,Disbursement_Date
0,[e0c9f247a7808cc69f6652e56d051d99],,[['AC-2000-000005-CHN']],"[[""['Tech. Disaster']""]]",[['AC']],"[[""China, People's Republic""]]","[[""['CHN']""]]",[['Fuging']],[[19.0785907]],[[-98.2598043]],...,,[[]],[['[]']],"[[""['(Road)']""]]",[[]],,[[]],[[]],,
1,[b58780e7c552092bf2392bc2d1b0b8be],,[['AC-2000-000006-CHN']],"[[""['Tech. Disaster']""]]",[['AC']],"[[""China, People's Republic""]]","[[""['CHN']""]]",[['Xiangtan (Hunan Province)']],[[19.0785907]],[[-98.2598043]],...,,[[]],[['[]']],"[[""['{Hotel} (Misc:Fire)']""]]",[[]],,[[]],[[]],,
2,[649016f3fb106db945bfcc081e4c3bab],,[['AC-2000-000034-CHN']],"[[""['Tech. Disaster']""]]",[['AC']],"[[""China, People's Republic""]]","[[""['CHN']""]]",[['Jiangsu Province']],[[32.061707]],[[118.763232]],...,,[[]],[['[]']],"[[""['{Coal Mine} (Ind:Collapse)']""]]",[[]],,[[]],[[]],,
3,[8f1649c0fc7a534bf0d99a381cd57524],,[['AC-2000-000093-CHN']],"[[""['Tech. Disaster']""]]",[['AC']],"[[""China, People's Republic""]]","[[""['CHN']""]]",[['Guigang (Guangxi Province)']],[[23.111531]],[[109.598927]],...,,[[]],[['[]']],"[[""['(Misc:Fire)']""]]",[[]],,[[]],[[]],,
4,[cf14cab0de23931d096296308952c1d1],,[['AC-2000-000026-CHN']],"[[""['Tech. Disaster']""]]",[['AC']],"[[""China, People's Republic""]]","[[""['CHN']""]]",[['Shunde']],[[19.0785907]],[[-98.2598043]],...,,[[]],[['[]']],"[[""['(Road)']""]]",[[]],,[[]],[[]],,
