In [1]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from tqdm import trange
import math

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [2]:
disaster_info = pd.read_excel('./data/emdat_public_2023_05_31_query_uid-ldmOwE.xlsx', skiprows=6)

  warn("Workbook contains no default style, apply openpyxl's default")


In [3]:
# Data cleaning.
disaster_info['Start Date'] = pd.to_datetime(dict(year=disaster_info['Start Year'], month=disaster_info['Start Month'], day=disaster_info['Start Day']))
disaster_info['End Date'] = pd.to_datetime(dict(year=disaster_info['End Year'], month=disaster_info['End Month'], day=disaster_info['End Day']))
disaster_info = disaster_info.drop(columns=['Seq', 'OFDA Response', 'Local Time', 'Start Year', 'Start Month', 'Start Day', 'End Year', 'End Month', 'End Day', "Reconstruction Costs ('000 US$)", "Insured Damages ('000 US$)", "Total Damages ('000 US$)", 'CPI', 'Adm Level', 'Admin1 Code', 'Admin2 Code'])

In [4]:
# Omit all natural disasters that took place on dates we do not have news articles on. 
disaster_info = disaster_info[disaster_info['Start Date'] < '2020-04-02'].copy()
disaster_info.reset_index(drop=True, inplace=True)
disaster_info

Unnamed: 0,Dis No,Year,Glide,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,Country,ISO,Region,Continent,Location,Origin,Associated Dis,Associated Dis2,Appeal,Declaration,AID Contribution ('000 US$),Dis Mag Value,Dis Mag Scale,Latitude,Longitude,River Basin,Total Deaths,No Injured,No Affected,No Homeless,Total Affected,"Reconstruction Costs, Adjusted ('000 US$)","Insured Damages, Adjusted ('000 US$)","Total Damages, Adjusted ('000 US$)",Geo Locations,Start Date,End Date
0,2016-0175-BGD,2016,TC-2016-000052,Natural,Meteorological,Storm,Tropical cyclone,,Cyclone Roanu,Bangladesh,BGD,Southern Asia,Asia,"Barisal; Noakhali, Lakshmipur, Chandpur (Chitt...",,Flood,"Slide (land, mud, snow, rock)",,,,,Kph,,,,28.0,,1083855.0,119700.0,1203555.0,,,731615.0,"Barguna, Bhola, Chandpur, Cox's Bazar, Lakshmi...",2016-05-21,2016-05-21
1,2016-0117-ECU,2016,EQ-2016-000035,Natural,Geophysical,Earthquake,Ground movement,,,Ecuador,ECU,South America,Americas,"Guayaquil district (Guayas province), Muisne d...",,,"Slide (land, mud, snow, rock)",,Yes,44992.0,8.0,Richter,0.382,-79.920,,672.0,6274.0,383090.0,,389364.0,4077535.0,682841.0,2438717.0,"Los Rios, Santa Elena, Santo Domingo de los Ts...",2016-04-16,2016-04-16
2,2016-0095-CHN,2016,,Natural,Hydrological,Flood,Flash flood,,,China,CHN,Eastern Asia,Asia,"Jiangxi Sheng, Hunan Sheng, Guangdong Sheng, G...",Monsoonal rain,"Slide (land, mud, snow, rock)",Hail,,,,383224.0,Km2,24.704,113.991,,5.0,,216000.0,3300.0,219300.0,,,207291.0,"Guangdong Sheng, Guangxi Zhuangzu Zizhiqu, Gui...",2016-03-19,2016-03-22
3,2016-0137-CHN,2016,,Natural,Hydrological,Flood,Riverine flood,,,China,CHN,Eastern Asia,Asia,"Anhui, Hubei, Fujian, Zhejiang, Jiangxi, Hunan...",Heavy rains,"Slide (land, mud, snow, rock)",Storm,,,,,Km2,,,Yangtze river,20.0,,48000.0,,48000.0,,,118278.0,"Anhui Sheng, Fujian Sheng, Guangdong Sheng, Gu...",2016-04-20,2016-04-28
4,2016-0041-FJI,2016,TC-2016-000014,Natural,Meteorological,Storm,Tropical cyclone,,Cyclone Winston,Fiji,FJI,Melanesia,Oceania,"Savusavu (Cakaudrove district, Northern provin...",,Flood,Surge,Yes,Yes,41862.0,325.0,Kph,,,,45.0,144.0,540414.0,,540558.0,,60968.0,731615.0,"Ba, Bua, Cakaudrove, Lau, Lomaiviti, Macuata, ...",2016-02-20,2016-02-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1431,2020-0050-ZAF,2020,,Natural,Hydrological,Flood,Flash flood,,,South Africa,ZAF,Southern Africa,Africa,Johannesburg and Gauteng Province,Heavy rains,,,,,,,Km2,,,,3.0,,200.0,,200.0,,,,Gauteng (Adm1).,2020-02-07,2020-02-10
1432,2020-0123-COD,2020,,Natural,Hydrological,Flood,,,,Congo (the Democratic Republic of the),COD,Middle Africa,Africa,"Territories of Kailo, Punia Kasongo (Maniema P...",Heavy rains,,,,,,,Km2,,,Congo river,,,30000.0,,30000.0,,,,Maniema (Adm1).,2020-03-17,2020-04-01
1433,2020-0128-COD,2020,,Natural,Hydrological,Flood,,,,Congo (the Democratic Republic of the),COD,Middle Africa,Africa,Bukama Territory (south Haut-Lomami Province).,Torrential rains,,,,,,,Km2,,,,,,73000.0,,73000.0,,,,Haut-Lomami (Adm2).,2020-03-14,2020-03-18
1434,2020-0110-ZMB,2020,,Natural,Hydrological,Flood,,,,Zambia,ZMB,Eastern Africa,Africa,"Samfya, Mushindamo, Nakonde districts (Luapula...",Heavy rains,,,,,,,Km2,,,,,,700000.0,,700000.0,,,,"Nakonde, Samfya, Solwezi (Adm2).",2020-03-20,2020-03-26


In [5]:
# Create a list of the locations (keywords) involved in the natural disaster.
disaster_info['Location List'] = disaster_info['Location'].apply(lambda x: x.replace(' (', '_').replace('), ', '_').replace(', ', '_').replace(')', '').replace('; ', '_').split('_') if type(x) == str else [])

In [6]:
max_total_deaths = disaster_info.loc[disaster_info['Total Deaths'].idxmax()]['Total Deaths']
min_total_deaths = disaster_info.loc[disaster_info['Total Deaths'].idxmin()]['Total Deaths']
max_total_damages_adjusted = disaster_info.loc[disaster_info["Total Damages, Adjusted ('000 US$)"].idxmax()]["Total Damages, Adjusted ('000 US$)"]
min_total_damages_adjusted = disaster_info.loc[disaster_info["Total Damages, Adjusted ('000 US$)"].idxmin()]["Total Damages, Adjusted ('000 US$)"]

print(f'Maximum Total Deaths: {max_total_deaths}')
print(f'Minimum Total Deaths: {min_total_deaths}')
print(f'Maximum Total Damages Adjusted: {max_total_damages_adjusted}')
print(f'Minimum Total Damages Adjusted: {min_total_damages_adjusted}')

Maximum Total Deaths: 4140.0
Minimum Total Deaths: 1.0
Maximum Total Damages Adjusted: 113423011.0
Minimum Total Damages Adjusted: 42.0


In [56]:
disaster_info['Year'].value_counts().sort_index()

NameError: name 'disaster_info' is not defined

In [7]:
news_data = pd.read_csv('./data/all-the-news-2-1.csv') 

In [8]:
news_data['Date'] = pd.to_datetime(dict(year=news_data['year'], month=news_data['month'], day=news_data['day']))
news_data['Time'] = news_data.date.str[-8:]

In [None]:
news_data['year'].value_counts().sort_index()

In [9]:
news_data.drop(columns=['date', 'year', 'month', 'day'], inplace=True)
news_data.rename(columns={'author': 'Author', 'title':'Title', 'url':'Url', 'section':'Section', 'publication':'Publication', 'article': 'Article'}, inplace=True)

In [10]:
news_data['Article'].isna().sum()

104713

In [11]:
news_data = news_data[news_data['Article'].notna()].copy()

In [12]:
news_data['Article'] = news_data['Article'].apply(lambda x: x.replace("\'", "").replace("\xa0", " ").replace("  ", " ").replace("\n", " ") if type(x) == str else float("nan"))

In [13]:
disaster_info['mentions_1'] = 0
disaster_info['mentions_2'] = 0
disaster_info['mentions_3'] = 0

## Data Merging

In [14]:
pre_margin, post_margin = 1, 7

In [15]:
conv_dict = conv_dict = {'Cyclone Roanu': [['cyclone', 'roanu']],
             'Cyclone Winston': [['cyclone', 'winston']], 
             'Fort McMurray wildfires': [['fort', 'wildfires'],['mcmurray', 'wildfires']], 
             'Typhoon Butchoy (Nepartak)': [['typhoon','butchoy'],['typhoon', 'nepartak']], 
             'Tropical cyclone Zena': [['cyclone', 'Zena']], 
             'Yarloop busfires': [['yarloop','busfires']], 
             "Tropical storm 'Earl'": [['storm','earl']], 
             'Cholera': [['cholera']], 
             'Hurricane Newton': [['hurricane', 'newton']], 
             'Typhoon Megi': [['typhoon', 'megi']], 
             "Hurricane 'Matthew'": [['hurricane', 'matthew']], 
             'Typhoon Ferdie (Meranti)': [['typhoon', 'ferdie'], ['typhoon', 'meranti']], 
             'Tropical storm Otto': [['storm', 'otto']], 
             'Storm Jonas (Snowzilla)': [['storm','jonas'], ['storm', 'snowzilla']], 
             'Cyclone Ulla': [['cyclone', 'ulla']], 
             'Sand Fire': [['sand', 'fire']], 
             'Clayton Fire': [['clayton', 'fire']], 
             'Storm Mirinae': [['storm', 'mirinae']], 
             'Dengue': [['dengue']], 
             'Yellow fever': [['yellow', 'fever']], 
             'Typhoon Karen (Sarika)': [['typhoon', 'karen'], ['typhoon', 'sarika']], 
             'Typhoon Lawin (Haima)': [['typhoon' ,'lawin'], ['typhoon', 'haima']], 
             'Tropical storm Dianmu': [['storm','dianmu']], 
             'Typhoon Carina (Nida)': [['typhoon','carina'], ['typhoon','nada']], 
             "Tropical depression'16/Hurricane 'Nate'": [['depression', 'nate'], ['hurricane', 'nate']], 
             "Hurricane 'Maria'": [['hurricane','Maria']], 
             'Sir Ivan fire': [['fire','Ivan']], 
             'Gustafsen, Wildwood, Elephant Hill Fires': [['fire','Gustafson'], ['fire','wildwood'], ['fire','elephant', 'hill']], 
             'Debbie': [['Debbie']], 
             "Typhoon 'Hato'": [['typhoon','hate']], 
             "Hurricane 'Irma'": [['hurricane','irma']], 
             'Cyclone Mora': [['cyclone','mora']], 
             'Diphteria': [['diphtheria']], 
             'Tropical Storm Talas': [['storm','talas']], 
             'Typhoon Nesat & Haitang': [['typhoon','nesat'], ['typhoon','haitang']], 
             "Tropical storm 'Pakhar'/'Jolina'": [['storm','pakhar'], ['storm','jolina']], 
             "'Zeus'": [['zeus']], 'Eleanor': [['eleanor']], 
             "Tropical storm 'Alberto'": [['storm','alberto']], 
             'Volcan de Fuego': [['volcan','fuego']], 
             "Typhoon 'Maria' (Gardo)": [['typhoon', 'maria'], ['typhoon', 'gardo']], 
             "Typhoon 'Bebinca'": [['typhoon', 'bebinca']], 
             'Tropical Storm Yagi': [['storm', 'yagi']], 
             'Tropical storm Rumbia': [['storm', 'rumbia']], 
             "Cyclone 'Sagar'": [['cyclone', 'sagar']], 
             "Cyclone 'Josie'": [['cyclone', 'josie']], 
             "'Friederike' (David)": [['friederike'], ['david']], 
             'Influenza': [['influenza']], 
             'Hepatitis E': [['hepatitis e']], 
             "Tropical storm 'Ewiniar'": [['storm', 'ewiniar']], 
             "Tropical storm 'Ampil' (Inday)": [['storm', 'ampil'], ['storm', 'inday']], 
             "Typhoon 'Soulik'": [['typhoon', 'soulik']], 
             'Typhoon Mangkut (Ompong)': [['typhoon', 'mangkut'], ['typhoon', 'ompong']], 
             'Hurricane Michael': [['hurricane', 'michael']], 
             "Hurricane 'Beryl'": [['hurricane', 'beryl']], 
             "Tropical cyclone 'Keni'": [['cyclone', 'keni']], 
             "Storm 'Adrian'": [['storm','adrian']], 
             "Cyclone 'Kenneth'": [['cyclone', 'kenneth']], 
             "Cyclone 'Fani'": [['cyclone', 'fani']], 
             "Cyclone 'Wutip'": [['cyclone', 'wutip']], 
             'Typhoon Chaba (Chyba?)': [['typhoon','chaba'], ['typhoon', 'chyba']], 
             'Tropical storm Mindulle': [['storm', 'mindulle']], 
             'Tropical storm Marce (Tokage)': [['storm','marce'],['storm','tokage']], 
             "Cyclone 'Vardah'": [['cyclone','vardah']], 
             'Typhoon Lionrock': [['typhoon','lionrock']], 
             'Typhoon Malakas': [['typhoon','malakas']], 
             'Typhoon Helen (Megi)': [['typhoon','helen']], 
             'Measles': [['measles']], 'Lassa fever': [['lassa', 'fever']], 
             "Typhoon 'Nina' (Nock-Ten)": [['typhoon','nina'], ['typhoon','nock-ten']], 
             "Typhoon 'Lan'/'Paolo'": [['typhoon','lan'], ['typhoon','paolo']], 
             "Typhoon 'Talim'": [['typhoon','talim']], 
             "Cyclone 'Ockhi'": [['cyclone','ockhi']], 
             'Beatriz': [['beatriz']], 
             "Hurricane 'Franklin'": [['hurricane','franklin']], 
             "Typhoon 'Damrey' / 'Ramil'": [['typhoon','damrey'], ['typhoon','ramil']], 
             "Storm 'Tembin' (Vinta)": [['storm','tembin'],['storm','vinta']], 
             "Tropical storm 'Kai-Tak' (Urduja)": [['storm','kai-tak'], ['storm','urduja']], 
             "Cyclone 'Cempaka'": [['cyclone','cempaka']], 
             'Hurricane Katia': [['hurricane','katia']], 
             "Typhoon 'Doksuri'": [['typhoon','doksuri']], 
             "Cyclone 'Enawo'": [['cyclone','enawo']], 
             "Tropical storm 'Lidia'": [['storm','lidia']], 
             "Cyclone 'Dineo'": [['cyclone','dineo']], 
             'Mt. Agung': [['mt. agung'],['mount agung']], 
             "Typhoon 'Noru'": [['typhoon','noru']], 
             'Marburg': [['marburg']], 
             'Chikungunya': [['chikungunya']], 
             'Plague': [['plague']], 
             'Meningitis': [['meningitis']], 
             'Hepatitis E Virus': [['hepatitis e']], 
             'Tropical depression 02W (Crising)': [['depression','crising']], 
             'Influenza A': [['influenza a']], 
             "Cyclone 'Ava'": [['cyclone','ava']], 
             "Tropical storm 'Basyang' (Sanba)": [['storm','basyang'], ['storm','sanba']], 
             "Tropical depression 'Usman'": [['depression','usman']], 
             "Tropical storm 'Titli'": [['storm','titli']], 
             "Typhoon 'Yutu' (Rosita)": [['typhoon', 'yutu'], ['typhoon', 'rosita']], 
             'Anak Krakatoa': [['anak'], ['krakatoa']], 
             'Mount Etna': [['mount', 'etna'], ['etna']], 
             'Typhoon Jebi': [['typhoon', 'jebi']], 
             'Typhoon Trami': [['typhoon', 'trami']], 
             "Tyhoon 'Son Tinh'": [['typhoon', 'son', 'tinh']], 
             "Tropical storm 'Eliakim'": [['storm', 'eliakim']], 
             "Tropical storm 'Vicente'": [['storm', 'vicente']], 
             "Tropical storm 'Berguitta'": [['storm', 'berguitta']], 
             "Cyclone 'Mekunu'": [['cyclone', 'mekunu']], 
             "Cyclone 'Luban'": [['cyclone', 'luban']], 
             "Cyclone 'Gaja'": [['cyclone', 'gaja']], 
             "Tropical storm 'Phethai'": [['storm', 'pethai']], 
             "Typhoon 'Kong-Rey'": [['typhoon', 'kong-rey']], 
             'Hurricane Willa': [['hurricane', 'willa']], 
             "Tropical cyclone 'Penny'": [['cyclone', 'penny']], 
             'Mt. Mayon': [['mt.', 'mayon'], ['mount', 'mayon']], 
             'Agaton (01W)': [['agaton']], 
             "Storm 'Leslie'": [['storm', 'leslie']], 
             "Storm 'Norma'": [['storm', 'norma']], 
             "Cyclone 'Idai'": [['cyclone', 'idai']], 
             'Acute Encephalitis Syndrome (AES)': [['acute', 'encephalitis', 'syndrome']], 
             "Tropical depression 'Amang' (01W)": [['depression', 'amang']], 
             'Mount Ulawun and Manam': [['mount', 'ulawun', 'manam'], ['mount', 'ulawun'], ['mount', 'manam']], 
             'Hurricane Hermine': [['hurricane', 'hermine']], 
             'Tropical storm Aere': [['storm', 'aere']], 
             'Hurricane Harvey': [['hurricane', 'harvey']], 
             'Acute Watery Diarrhoea': [['acute', 'watery', 'diarrhea'], ['acute', 'watery', 'diarrhoea']], 
             'Wall Fire, Alamo Fire, Whittier Fire': [['wall', 'fire'], ['Alamo', 'fire'], ['Whittier', 'fire']], 
             'Tubbs, Atlas, Nuns Fires': [['tubes', 'atlas', 'nuns', 'fires'], ['tubes', 'nuns', 'fires'], ['nuns', 'fires'], ['tubes', 'fires']], 
             "'Thomas'": [['thomas']], 
             'Mt Monaro': [['mt', 'monaro'], ['mount', 'monaro']], 
             "Tropical cyclone 'Donna'": [['cyclone', 'donna']], 
             'Knysna fire': [['knysna', 'fire']], 'Kilauea': [['kilauea']], 
             'Ambryn': [['ambryn']], 
             'Carr and Mendocino Complex fires': [['carr', 'mendocino', 'fires']], 
             'Cyclone Gita': [['cyclone', 'gita']], 'Hurricane Florence': [['hurricane', 'florence']], 
             'Camp Fire': [['camp', 'fire']], 'Woolsey Fire': [['woolsey', 'fire']], 
             "Tropical depression 'Toraji'": [['depression', 'toraji']], 
             'Ebola': [['ebola']], 
             'Typhoid fever': [['typhoid', 'fever']], 
             'Currowan': [['currowan']], 
             'Tropical cyclone Lekima (Hanna)': [['cyclone', 'lekima'], ['cyclone', 'hanna']], 
             'FL-2019-000131': [['fl-2019-000131']], 
             "Tropical cylone 'Pabuk'": [['cyclone', 'pabuk']], 
             "Tropical cyclone 'Bulbul'": [['cyclone', 'bulbul']], 
             "Tropical cyclone 'Dorian'": [['cyclone', 'dorian']], 
             'Ubinas': [['ubinas']], 
             "Tropical cyclone 'Lingling'": [['cyclone', 'lingling']], 
             "Tropical cyclone 'Mitag'": [['cyclone', 'mitag']], 
             "Cyclone 'Belna'": [['cyclone', 'belna']], 
             "Storm 'Ciara' (Sabine)": [['storm', 'ciara'], ['storm', 'sabine']], 
             "Storm 'Amelie'": [['storm', 'amelie']], 
             "Tropical cylone 'Hagibis'": [['cyclone', 'hagibis']], 
             "Tropical cyclone 'Lorena'": [['cyclone', 'lorena']], 
             "Tropical storm 'Fernand'": [['storm', 'fernand']], 
             'Elsa, Fabien': [['elsa'], ['fabien']], 
             "Tropical cyclone 'Sarai'": [['cyclone', 'sarai']], 
             "Tropical storm 'Tapah'": [['storm', 'tapah']], 
             "Typhoon 'Faxai'": [['typhoon', 'faxai']], 
             "Tropical storm 'Narda'": [['storm', 'narda']], 
             'Whakaari/White Island volcano': [['whakaari', 'volcano'], ['white', 'island', 'volcano']], 
             "Cyclone 'Herold'": [['cyclone', 'herold']], 
             "'Gloria'": [['gloria']], 
             "Cyclone 'Tino'": [['cyclone', 'tino']], 
             "Tropical cyclone 'Kalmaegi' (Ramon)": [['cyclone', 'kalmaegi'], ['cyclone', 'ramon']], 
             "Tropical cyclone 'Phanfone' (Ursula)": [['cyclone', 'phanfone'], ['cyclone', 'ursula']], 
             "Tropical Storm 'Pawan'": [['storm', 'pawan']], 
             "Typhoon 'Matmo'": [['typhoon', 'matmo']], 
             "Tropical storm ' Imelda'": [['storm', 'imelda']], 
             "Tropical cyclone 'Nakri'": [['cyclone', 'nakri']], 
             "Tropical cyclone 'Danas'": [['cyclone', 'danas']], 
             "Tropical cyclone 'Podul'": [['cyclone', 'podul']], 
             "Tropical cyclone 'Kammuri' (Tisoy)": [['cyclone', 'kammuri'], ['cyclone', 'tisoy']], 
             "Tropical cylone 'Barry'": [['cyclone', 'barry']], 
             'Saddleridge fire & Sandalwood fire': [['saddleridge', 'fire'], ['sandalwood', 'fire']], 
             'Kincade Fire': [['kincade', 'fire']], 'Taal volcano': [['taal', 'volcano']]}

In [16]:
from rdflib import Namespace, Graph, URIRef, Literal

# Define your custom namespace
DIS = Namespace("Disaster_namespace")

# Create a new RDF graph
g = Graph()
g.bind("dis", DIS)

for i in trange(len(disaster_info)):
    start_date, end_date = disaster_info.iloc[i]['Start Date'], disaster_info.iloc[i]['End Date']
    start_date_adj = start_date - timedelta(days=pre_margin)
    end_date_adj = end_date + timedelta(days=post_margin)
    candidate_articles = news_data[(news_data['Date']  < end_date_adj) & (news_data['Date']  > start_date_adj)]
    disaster_types = [disaster_info.iloc[i]['Disaster Type'], disaster_info.iloc[i]['Disaster Subtype'], disaster_info.iloc[i]['Disaster Subsubtype']]
    
    disaster_uri = URIRef(disaster_info.iloc[i]['Dis No'])
    
    
    if not pd.isna(disaster_info.iloc[i]['Year']):
        g.add((disaster_uri, DIS.year, Literal(disaster_info.iloc[i]['Year'])))
    if not pd.isna(disaster_info.iloc[i]['Glide']):
        g.add((disaster_uri, DIS.glide, Literal(disaster_info.iloc[i]['Glide'])))
    if not pd.isna(disaster_info.iloc[i]['Disaster Group']):
        g.add((disaster_uri, DIS.disasterGroup, Literal(disaster_info.iloc[i]['Disaster Group'])))
    if not pd.isna(disaster_info.iloc[i]['Disaster Subgroup']):
        g.add((disaster_uri, DIS.disasterSubgroup, Literal(disaster_info.iloc[i]['Disaster Subgroup'])))
    if not pd.isna(disaster_info.iloc[i]['Disaster Type']):
        g.add((disaster_uri, DIS.disasterType, Literal(disaster_info.iloc[i]['Disaster Type'])))
    if not pd.isna(disaster_info.iloc[i]['Disaster Subtype']):
        g.add((disaster_uri, DIS.disasterSubtype, Literal(disaster_info.iloc[i]['Disaster Subtype'])))
    if not pd.isna(disaster_info.iloc[i]['Disaster Subsubtype']):
        g.add((disaster_uri, DIS.disasterSubsubtype, Literal(disaster_info.iloc[i]['Disaster Subsubtype'])))
    if not pd.isna(disaster_info.iloc[i]['Event Name']):
        g.add((disaster_uri, DIS.eventName, Literal(disaster_info.iloc[i]['Event Name'])))
    if not pd.isna(disaster_info.iloc[i]['Country']):
        g.add((disaster_uri, DIS.country, Literal(disaster_info.iloc[i]['Country'])))
    if not pd.isna(disaster_info.iloc[i]['ISO']):
        g.add((disaster_uri, DIS.iso, Literal(disaster_info.iloc[i]['ISO'])))
    if not pd.isna(disaster_info.iloc[i]['Region']):
        g.add((disaster_uri, DIS.region, Literal(disaster_info.iloc[i]['Region'])))
    if not pd.isna(disaster_info.iloc[i]['Continent']):
        g.add((disaster_uri, DIS.continent, Literal(disaster_info.iloc[i]['Continent'])))
    if not pd.isna(disaster_info.iloc[i]['Location']):
        g.add((disaster_uri, DIS.location, Literal(disaster_info.iloc[i]['Location'])))
    if not pd.isna(disaster_info.iloc[i]['Origin']):
        g.add((disaster_uri, DIS.origin, Literal(disaster_info.iloc[i]['Origin'])))
    if not pd.isna(disaster_info.iloc[i]['Total Deaths']):
        g.add((disaster_uri, DIS.totalDeaths, Literal(disaster_info.iloc[i]['Total Deaths'])))
    if not pd.isna(disaster_info.iloc[i]['No Injured']):
        g.add((disaster_uri, DIS.noInjured, Literal(disaster_info.iloc[i]['No Injured'])))
    if not pd.isna(disaster_info.iloc[i]['No Affected']):
        g.add((disaster_uri, DIS.noAffected, Literal(disaster_info.iloc[i]['No Affected'])))
    if not pd.isna(disaster_info.iloc[i]['No Homeless']):
        g.add((disaster_uri, DIS.noHomeless, Literal(disaster_info.iloc[i]['No Homeless'])))
    if not pd.isna(disaster_info.iloc[i]['Total Affected']):
        g.add((disaster_uri, DIS.totalAffected, Literal(disaster_info.iloc[i]['Total Affected'])))
    if not pd.isna(disaster_info.iloc[i]["Reconstruction Costs, Adjusted ('000 US$)"]):
        g.add((disaster_uri, DIS.reconstructionCosts, Literal(disaster_info.iloc[i]["Reconstruction Costs, Adjusted ('000 US$)"])))
    if not pd.isna(disaster_info.iloc[i]["Insured Damages, Adjusted ('000 US$)"]):
        g.add((disaster_uri, DIS.insuredDamages, Literal(disaster_info.iloc[i]["Insured Damages, Adjusted ('000 US$)"])))
    if not pd.isna(disaster_info.iloc[i]["Total Damages, Adjusted ('000 US$)"]):
        g.add((disaster_uri, DIS.totalDamages, Literal(disaster_info.iloc[i]["Total Damages, Adjusted ('000 US$)"])))
    if not pd.isna(disaster_info.iloc[i]['Start Date']):
        g.add((disaster_uri, DIS.startDate, Literal(disaster_info.iloc[i]['Start Date'])))
    if not pd.isna(disaster_info.iloc[i]['End Date']):
        g.add((disaster_uri, DIS.endDate, Literal(disaster_info.iloc[i]['End Date'])))
    if not pd.isna(disaster_info.iloc[i]['Geo Locations']):
        g.add((disaster_uri, DIS.geoLocations, Literal(disaster_info.iloc[i]['Geo Locations'])))

    
    count_1 = 0
    count_2 = 0
    count_3 = 0
    for j, candidate in candidate_articles.iterrows():
        contains = False
        # _________________________________Mentions 1____________________________________________________
        # If event name is mentioned in an article it assumed to talk about the subject.
        if type(disaster_info.iloc[i]['Event Name']) == str:
            kws_1 = conv_dict[disaster_info.iloc[i]['Event Name']]
            for kws in kws_1:
                if all(x in candidate.Article.lower() for x in kws):
                    count_1 += 1
                    contains = True
                    
                    article_uri = URIRef(candidate.Url)
                    if not pd.isna(candidate['Author']):
                        g.add((article_uri, DIS.author, Literal(candidate['Author'])))
                    if not pd.isna(candidate['Title']):
                        g.add((article_uri, DIS.title, Literal(candidate['Title'])))
                    if not pd.isna(candidate['Article']):
                        g.add((article_uri, DIS.article, Literal(candidate['Article'])))
                    if not pd.isna(candidate['Section']):
                        g.add((article_uri, DIS.section, Literal(candidate['Section'])))
                    if not pd.isna(candidate['Publication']):
                        g.add((article_uri, DIS.publication, Literal(candidate['Publication'])))
                    if not pd.isna(candidate['Date']):
                        g.add((article_uri, DIS.date, Literal(candidate['Date'])))
                    if not pd.isna(candidate['Time']):
                        g.add((article_uri, DIS.time, Literal(candidate['Time'])))  
                    g.add((disaster_uri, DIS.mention_1, article_uri))
                    
                    break
            if contains:
                continue
        
        # _________________________________Mentions 2____________________________________________________
        # If any of the locations in the locations list is mentioned together with the type of disaster or 
        # subtype or disaster, the article is counted towards mentions 2.
        if any(location.lower() in candidate.Article.lower() for location in disaster_info.iloc[i]['Location List'] if type(location) == str) and any(dis_type.lower() in candidate.Article.lower() for dis_type in disaster_types if type(dis_type) == str):
            count_2 += 1
            article_uri = URIRef(candidate.Url)
            if not pd.isna(candidate['Author']):
                g.add((article_uri, DIS.author, Literal(candidate['Author'])))
            if not pd.isna(candidate['Title']):
                g.add((article_uri, DIS.title, Literal(candidate['Title'])))
            if not pd.isna(candidate['Article']):
                g.add((article_uri, DIS.article, Literal(candidate['Article'])))
            if not pd.isna(candidate['Section']):
                g.add((article_uri, DIS.section, Literal(candidate['Section'])))
            if not pd.isna(candidate['Publication']):
                g.add((article_uri, DIS.publication, Literal(candidate['Publication'])))
            if not pd.isna(candidate['Date']):
                g.add((article_uri, DIS.date, Literal(candidate['Date'])))
            if not pd.isna(candidate['Time']):
                g.add((article_uri, DIS.time, Literal(candidate['Time'])))
            g.add((disaster_uri, DIS.mention_2, article_uri))
            
            
            continue
    
    
        # _________________________________Mentions 3____________________________________________________
        # If the country is mentioned together with the type of disaster or 
        # subtype or disaster, the article is counted towards mentions 3.
        if type(disaster_info.iloc[i]['Country'].lower()) == str:
            if disaster_info.iloc[i]['Country'].lower() in candidate.Article.lower() and any(dis_type.lower() in candidate.Article.lower() for dis_type in disaster_types if type(dis_type) == str):
                count_3 += 1
                article_uri = URIRef(candidate.Url)
                if not pd.isna(candidate['Author']):
                    g.add((article_uri, DIS.author, Literal(candidate['Author'])))
                if not pd.isna(candidate['Title']):
                    g.add((article_uri, DIS.title, Literal(candidate['Title'])))
                if not pd.isna(candidate['Article']):
                    g.add((article_uri, DIS.article, Literal(candidate['Article'])))
                if not pd.isna(candidate['Section']):
                    g.add((article_uri, DIS.section, Literal(candidate['Section'])))
                if not pd.isna(candidate['Publication']):
                    g.add((article_uri, DIS.publication, Literal(candidate['Publication'])))
                if not pd.isna(candidate['Date']):
                    g.add((article_uri, DIS.date, Literal(candidate['Date'])))
                if not pd.isna(candidate['Time']):
                    g.add((article_uri, DIS.time, Literal(candidate['Time'])))
                g.add((disaster_uri, DIS.mention_3, article_uri))
    
    disaster_info.loc[i, 'mentions_1'] = count_1
    disaster_info.loc[i, 'mentions_2'] = count_2
    disaster_info.loc[i, 'mentions_3'] = count_3

100%|████████████████████████████████████| 1436/1436 [10:59:57<00:00, 27.58s/it]


In [19]:
disaster_info.to_csv('./results/disaster_results.csv')

In [20]:
g

<Graph identifier=Nd1e906937adb45d1ad7a94f3687b1cd8 (<class 'rdflib.graph.Graph'>)>

In [21]:
g.serialize(destination="./results/disaster_news_graph.ttl")

<Graph identifier=Nd1e906937adb45d1ad7a94f3687b1cd8 (<class 'rdflib.graph.Graph'>)>

In [23]:
g.serialize(destination="./results/disaster_news_graph.xml", format="xml")

ValueError: Can't split 'Disaster_namespaceinsuredDamages'

In [24]:
disaster_info

Unnamed: 0,Dis No,Year,Glide,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,Disaster Subsubtype,Event Name,Country,ISO,Region,Continent,Location,Origin,Associated Dis,Associated Dis2,Appeal,Declaration,AID Contribution ('000 US$),Dis Mag Value,Dis Mag Scale,Latitude,Longitude,River Basin,Total Deaths,No Injured,No Affected,No Homeless,Total Affected,"Reconstruction Costs, Adjusted ('000 US$)","Insured Damages, Adjusted ('000 US$)","Total Damages, Adjusted ('000 US$)",Geo Locations,Start Date,End Date,Location List,mentions_1,mentions_2,mentions_3
0,2016-0175-BGD,2016,TC-2016-000052,Natural,Meteorological,Storm,Tropical cyclone,,Cyclone Roanu,Bangladesh,BGD,Southern Asia,Asia,"Barisal; Noakhali, Lakshmipur, Chandpur (Chitt...",,Flood,"Slide (land, mud, snow, rock)",,,,,Kph,,,,28.0,,1083855.0,119700.0,1203555.0,,,731615.0,"Barguna, Bhola, Chandpur, Cox's Bazar, Lakshmi...",2016-05-21,2016-05-21,"[Barisal, Noakhali, Lakshmipur, Chandpur, Chit...",4,0,0
1,2016-0117-ECU,2016,EQ-2016-000035,Natural,Geophysical,Earthquake,Ground movement,,,Ecuador,ECU,South America,Americas,"Guayaquil district (Guayas province), Muisne d...",,,"Slide (land, mud, snow, rock)",,Yes,44992.0,8.0,Richter,0.382,-79.920,,672.0,6274.0,383090.0,,389364.0,4077535.0,682841.0,2438717.0,"Los Rios, Santa Elena, Santo Domingo de los Ts...",2016-04-16,2016-04-16,"[Guayaquil district, Guayas province, Muisne d...",0,5,71
2,2016-0095-CHN,2016,,Natural,Hydrological,Flood,Flash flood,,,China,CHN,Eastern Asia,Asia,"Jiangxi Sheng, Hunan Sheng, Guangdong Sheng, G...",Monsoonal rain,"Slide (land, mud, snow, rock)",Hail,,,,383224.0,Km2,24.704,113.991,,5.0,,216000.0,3300.0,219300.0,,,207291.0,"Guangdong Sheng, Guangxi Zhuangzu Zizhiqu, Gui...",2016-03-19,2016-03-22,"[Jiangxi Sheng, Hunan Sheng, Guangdong Sheng, ...",0,0,21
3,2016-0137-CHN,2016,,Natural,Hydrological,Flood,Riverine flood,,,China,CHN,Eastern Asia,Asia,"Anhui, Hubei, Fujian, Zhejiang, Jiangxi, Hunan...",Heavy rains,"Slide (land, mud, snow, rock)",Storm,,,,,Km2,,,Yangtze river,20.0,,48000.0,,48000.0,,,118278.0,"Anhui Sheng, Fujian Sheng, Guangdong Sheng, Gu...",2016-04-20,2016-04-28,"[Anhui, Hubei, Fujian, Zhejiang, Jiangxi, Huna...",0,0,49
4,2016-0041-FJI,2016,TC-2016-000014,Natural,Meteorological,Storm,Tropical cyclone,,Cyclone Winston,Fiji,FJI,Melanesia,Oceania,"Savusavu (Cakaudrove district, Northern provin...",,Flood,Surge,Yes,Yes,41862.0,325.0,Kph,,,,45.0,144.0,540414.0,,540558.0,,60968.0,731615.0,"Ba, Bua, Cakaudrove, Lau, Lomaiviti, Macuata, ...",2016-02-20,2016-02-21,"[Savusavu, Cakaudrove district, Northern provi...",23,24,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1431,2020-0050-ZAF,2020,,Natural,Hydrological,Flood,Flash flood,,,South Africa,ZAF,Southern Africa,Africa,Johannesburg and Gauteng Province,Heavy rains,,,,,,,Km2,,,,3.0,,200.0,,200.0,,,,Gauteng (Adm1).,2020-02-07,2020-02-10,[Johannesburg and Gauteng Province],0,0,8
1432,2020-0123-COD,2020,,Natural,Hydrological,Flood,,,,Congo (the Democratic Republic of the),COD,Middle Africa,Africa,"Territories of Kailo, Punia Kasongo (Maniema P...",Heavy rains,,,,,,,Km2,,,Congo river,,,30000.0,,30000.0,,,,Maniema (Adm1).,2020-03-17,2020-04-01,"[Territories of Kailo, Punia Kasongo, Maniema ...",0,0,0
1433,2020-0128-COD,2020,,Natural,Hydrological,Flood,,,,Congo (the Democratic Republic of the),COD,Middle Africa,Africa,Bukama Territory (south Haut-Lomami Province).,Torrential rains,,,,,,,Km2,,,,,,73000.0,,73000.0,,,,Haut-Lomami (Adm2).,2020-03-14,2020-03-18,"[Bukama Territory, south Haut-Lomami Province.]",0,0,0
1434,2020-0110-ZMB,2020,,Natural,Hydrological,Flood,,,,Zambia,ZMB,Eastern Africa,Africa,"Samfya, Mushindamo, Nakonde districts (Luapula...",Heavy rains,,,,,,,Km2,,,,,,700000.0,,700000.0,,,,"Nakonde, Samfya, Solwezi (Adm2).",2020-03-20,2020-03-26,"[Samfya, Mushindamo, Nakonde districts, Luapul...",0,0,1


In [2]:
from rdflib import Namespace, Graph
from rdflib.plugins.sparql import prepareQuery

g = Graph()
g.parse("./results/disaster_news_graph.ttl", format="turtle")

<Graph identifier=N27ab4880eec24a3b91e849dda1b1aa98 (<class 'rdflib.graph.Graph'>)>

In [None]:
# Define your custom namespace
DIS = Namespace("<http://www.w3.org/2001/XMLSchema#>")

query = prepareQuery("""
    SELECT DISTINCT ?eventName
    WHERE {
        ?disaster DIS:eventName ?eventName.
    }
    """,
    initNs={"DIS": "Disaster_namespace"})

# Execute the query on the graph
results = g.query(query)
print(len(results))

In [None]:
# Define your custom namespace
DIS = Namespace("Disaster_namespace")

query = prepareQuery("""
    SELECT DISTINCT ?disaster ?eventName ?disasterType ?location ((COUNT(?mentions1) + COUNT(?mention2) + COUNT(?mentions3)) AS ?combinedMentions)
    WHERE {?disaster DIS:mentions_1 ?mentions1 ;
      ?disaster DIS:mentions_2 ?mention2 ;
      ?disaster DIS:mentions_3 ?mentions3 ;
      ?disaster DIS:eventName ?eventName ;
      ?disaster DIS:disasterType ?disasterType ;
      ?disaster DIS:location ?location .
    }
    GROUP BY ?disaster
    ORDER BY DESC(?combinedMentions)
    # LIMIT 5
    """,
    initNs={"DIS" : DIS})

# Execute the query on the graph
results = g.query(query)
print(len(results))

In [30]:
# Iterate over the query results and print them
for row in results:
    eventName = row["eventName"]
    disasterType = row["disasterType"]
    location = row["location"]
    combinedMentions = row["combinedMentions"]
    print("Event Name:", eventName)
    print("Disaster Type:", disasterType)
    print("Location:", location)
    print("Combined Mentions:", combinedMentions)
    print("---")