### Get event occurrence date for each category

In [2]:
from bs4 import BeautifulSoup
from urllib import request
import re
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta
import numpy as np
import pickle

### Earthquakes

In [3]:
# Step 1: collect the article names of individual earthquakes from the decade
# summary page. Only table cells that contain an "<i>see</i> <a ...>" link are used.
disaster_lis = []  # earthquake article names (may contain duplicates)
url = "https://en.wikipedia.org/wiki/List_of_earthquakes_2011-2020"  # summary page
with request.urlopen(url) as html:  # close the HTTP response as soon as it is parsed
    soup = BeautifulSoup(html, "html.parser")
td_tag = soup.find_all("td")  # every table cell on the summary page
pattern1 = r'<i>see</i> <a (.*)</a>'
pattern2 = r'href="\/wiki\/([^ ]+)"'

for tag in td_tag:
    result1 = re.findall(pattern1, str(tag))
    if result1 != []:
        result2 = re.findall(pattern2, result1[0])
        if result2 != []:  # a "see" link without a /wiki/ href has no article to visit
            disaster_lis.append(result2[0])


# Step 2: visit every article once and read the event date from its infobox.
pattern = r'<td class="infobox-data">(\d*-\d*-\d*) \d*:\d*:\d*</td>'
disaster_dict = {}  # article name -> event occurrence date ('YYYY-MM-DD')
not_get = []        # articles with no usable date, or a date outside the window
count = 0

# Study window is half-open: from_date <= event date < to_date.
from_date = datetime.datetime(2015, 7, 1)
to_date = datetime.datetime(2020, 7, 1)

# De-duplicate once (first-seen order kept) instead of on every loop iteration.
unique_disasters = list(dict.fromkeys(disaster_lis))

for disaster in unique_disasters:

    url = "https://en.wikipedia.org/wiki/{0}".format(disaster)
    with request.urlopen(url) as html:
        soup = BeautifulSoup(html, "html.parser")
    info = soup.find(class_="infobox-data")  # only the first infobox-data element is inspected
    result = re.findall(pattern, str(info))

    event_date = None
    if result != []:
        try:
            event_date = datetime.datetime.strptime(result[0], '%Y-%m-%d')
        except ValueError:
            # The regex accepts any digit groups (e.g. month 00); treat those as "no date".
            event_date = None

    if event_date is not None and from_date <= event_date < to_date:
        disaster_dict[disaster] = event_date.strftime('%Y-%m-%d')
    else:
        not_get.append(disaster)

    count += 1
    print(f"\r{count*100/len(unique_disasters)}%", end='')

dt_now = datetime.datetime.now()
print(f"get time:{dt_now}")
print(f"number of URL data: {len(unique_disasters)}data")
print(f"can get date:{len(disaster_dict)}data")

100.0%get time:2022-03-23 14:00:12.500964
number of URL data: 150data
can get date:74data


In [4]:
# Persist the earthquake mapping (article name -> event date) as a pickle file.
with open("NameURL/disaster.pkl", mode="wb") as pkl_file:
    pickle.dump(disaster_dict, pkl_file)

### Terrorist incidents

In [15]:
# Build tero_dict: Wikipedia article name -> event date ('YYYY-MM-DD') for incidents
# listed on the yearly "List of terrorist incidents" pages.
years = [2015, 2016, 2017, 2018, 2019, 2020]  # year list
tero_dict = {}  # dictionary (article name: event occurrence date)
page = 0        # index of the table used from the list pandas returns for each page

pattern = r'href="\/wiki\/([^ ]+)"'
# Study window is half-open: from_date <= event date < to_date.
from_date = datetime.datetime(2015, 7, 1)
to_date = datetime.datetime(2020, 7, 1)

for year in years:

    url = "https://en.wikipedia.org/wiki/List_of_terrorist_incidents_in_" + str(year)  # summary page
    data = pd.read_html(url, header=0)
    incidents = data[page]

    tero_name_lis = incidents["Article"].values.tolist()  # article titles from the table
    tero_lis = []  # article name per table row, np.nan when no link was found
    with request.urlopen(url) as html:  # close the HTTP response as soon as it is parsed
        soup = BeautifulSoup(html, "html.parser", from_encoding='utf-8')

    for name in tero_name_lis:

        # Look up elements whose title attribute equals the table text and take
        # the first /wiki/ href found among them.
        title_tero = soup.find_all(title=name)
        result = re.findall(pattern, str(title_tero))

        if result != []:
            tero_lis.append(result[0])
        else:
            tero_lis.append(np.nan)

    incidents["wikiURL"] = tero_lis  # newly added to the data frame

    # Keep only rows whose Date is a single day; ranges such as "Dec 19-23" or
    # "3 and 4 May" are dropped. Rows with a missing Date are dropped as well,
    # because they cannot be parsed below.
    dates = incidents["Date"]
    single_day = dates.notna() & ~dates.str.contains('-|–|and', regex=True, na=False)
    df1 = incidents.loc[single_day, ["Date", "wikiURL"]]

    # Get date information
    date_lis = []
    for date in df1["Date"].values.tolist():

        # The notation differs depending on the year, so pick the format accordingly.
        if year == 2016 or year == 2017:
            date_format = "%b %d"
        elif year == 2015:
            # 2015 mixes both orders: a leading digit means day first ("5 Jan"),
            # otherwise the month comes first ("Jan 5").
            if re.match('[0-9]', date):
                date_format = "%d %b"
            else:
                date_format = "%b %d"
        else:
            date_format = "%d %B"

        # Parse together with the year. Without a year strptime assumes 1900,
        # which is not a leap year, so "Feb 29" (2016, 2020) would raise ValueError.
        date_new_time = datetime.datetime.strptime(f"{date} {year}", date_format + " %Y")
        date_lis.append(date_new_time.strftime('%Y-%m-%d'))

    # Get a dictionary (article name: event occurrence date) for the study window
    for URL, date in zip(df1["wikiURL"].values.tolist(), date_lis):

        if not isinstance(URL, str):
            # np.nan placeholder: no article was resolved for this row, so there
            # is nothing meaningful to use as a dictionary key.
            continue

        event_date = datetime.datetime.strptime(date, '%Y-%m-%d')  # event occurrence date

        if from_date <= event_date < to_date:
            tero_dict[URL] = event_date.strftime('%Y-%m-%d')

dt_now = datetime.datetime.now()
print(f"get time:{dt_now}")
print(f"can get date:{len(tero_dict)}data")

In [None]:
# Persist the terrorist-incident mapping (article name -> event date) as a pickle file.
with open("NameURL/tero.pkl", mode="wb") as pkl_file:
    pickle.dump(tero_dict, pkl_file)

### Notable deaths

In [None]:
# Collect the article names of people listed on the monthly "Deaths in <month> <year>" pages.
years = [2015, 2016, 2017, 2018, 2019, 2020]
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

pattern = r'^<li><a href="\/wiki\/[^ ]+"'
# Text the pattern matches in front of the article name; its length replaces the
# former magic slice index 19.
link_prefix = '<li><a href="/wiki/'
death_lis = []  # list of all people (article names, may contain duplicates)
for year in years:

    for month in months:
        url = "https://en.wikipedia.org/wiki/Deaths_in_{0}_{1}".format(month, str(year))
        with request.urlopen(url) as html:  # close the HTTP response as soon as it is parsed
            soup = BeautifulSoup(html, "html.parser")
        li_tag = soup.find_all("li")  # every list item on the page

        for item in li_tag:
            # Only list items that start with a /wiki/ link are candidates.
            result = re.findall(pattern, str(item))
            if result != []:
                # Strip the leading markup and the closing quote to get the article name.
                name = result[0][len(link_prefix):-1]
                # Skip links to other "Deaths in ..." pages and to category pages.
                if "Deaths_in" not in name and "Category" not in name:
                    death_lis.append(name)

In [None]:
# Get the death date from each person's Wikipedia page and store it in a dictionary
# (article name -> 'YYYY-MM-DD'), keeping only dates inside the study window.
death_dict = {}
pattern = r'<span style="display:none">\((\d*-\d*-\d*)\)</span>'
kara_lis = []      # not used in this cell
zerozero_lis = []  # not used in this cell
count = 0

# Study window is half-open: from_date <= event date < to_date.
from_date = datetime.datetime(2015, 7, 1)
to_date = datetime.datetime(2020, 7, 1)

# Visit each article only once (first-seen order kept); duplicates would just be
# re-downloaded and overwrite the same key.
unique_deaths = list(dict.fromkeys(death_lis))

for death in unique_deaths:

    url = "https://en.wikipedia.org/wiki/{0}".format(death)
    with request.urlopen(url) as html:  # close the HTTP response as soon as it is parsed
        soup = BeautifulSoup(html, "html.parser")
    infos = soup.find_all(class_="infobox-data")  # get all infobox-data tags

    for info in infos:
        result = re.findall(pattern, str(info))

        # Skip cells without a hidden date and dates whose day part is "00".
        if result == [] or result[0][-2:] == "00":
            continue

        try:
            event_date = datetime.datetime.strptime(result[0], '%Y-%m-%d')  # event occurrence date
        except ValueError:
            # The regex accepts any digit groups (e.g. month 00); ignore unparsable dates.
            continue

        # Any hidden date in the infobox is tested; only one inside the window is kept.
        if from_date <= event_date < to_date:
            death_dict[death] = event_date.strftime('%Y-%m-%d')

    count += 1
    print(f"\r{count*100/len(unique_deaths)}%", end='')

# Take the timestamp here; otherwise the value left over from an earlier cell is printed.
dt_now = datetime.datetime.now()
print(f"get time:{dt_now}")
print(f"can get date:{len(death_dict)}data")

In [None]:
# Persist the notable-death mapping (article name -> death date) as a pickle file.
with open("NameURL/death.pkl", mode="wb") as pkl_file:
    pickle.dump(death_dict, pkl_file)

### Aviation accidents

In [None]:
# Build aviation_dict: article name -> event date ('YYYY-MM-DD') for pages listed in
# the yearly "Aviation accidents and incidents in <year>" categories.
years = [2015, 2016, 2017, 2018, 2019, 2020]

aviation_dict = {}
pattern = r'^<li><a href="\/wiki\/[^ ]+"'
date_pattern = r'^<span class="bday dtstart published updated">([0-9]+-[0-9]+-[0-9]+)</span>'
# Text the link pattern matches in front of the article name; its length replaces
# the former magic slice index 19.
link_prefix = '<li><a href="/wiki/'
# Study window is half-open: from_date <= event date < to_date.
from_date = datetime.datetime(2015, 7, 1)
to_date = datetime.datetime(2020, 7, 1)

for year in years:
    url = "https://en.wikipedia.org/wiki/Category:Aviation_accidents_and_incidents_in_" + str(year)  # category page
    # NOTE(review): only this single category URL is read per year; confirm the
    # listing never continues on a further page.
    with request.urlopen(url) as html:  # close the HTTP response as soon as it is parsed
        soup = BeautifulSoup(html, "html.parser")
    li_tag = soup.find_all("li")  # every list item on the category page
    aviation_lis = []

    for item in li_tag:
        result = re.findall(pattern, str(item))
        if result != []:
            # Strip the leading markup and the closing quote to get the article name.
            name = result[0][len(link_prefix):-1]
            if "Category" not in name:  # skip sub-category links
                aviation_lis.append(name)

    for aviation in aviation_lis:
        url_each = "https://en.wikipedia.org/wiki/" + aviation  # individual article page
        with request.urlopen(url_each) as html:
            soup = BeautifulSoup(html, "html.parser")

        # Get the event occurrence date from the first matching date span.
        span = soup.find('span', class_='bday dtstart published updated')
        result = re.findall(date_pattern, str(span))
        if result == []:
            continue

        try:
            event_date = datetime.datetime.strptime(result[0], '%Y-%m-%d')  # event occurrence date
        except ValueError:
            # The regex accepts any digit groups; ignore values that are not real dates.
            continue

        if from_date <= event_date < to_date:
            aviation_dict[aviation] = event_date.strftime('%Y-%m-%d')

dt_now = datetime.datetime.now()
print(f"get time:{dt_now}")
print(f"can get date:{len(aviation_dict)}data")

In [None]:
# pickle is also imported in the notebook's first cell; the import here keeps
# this cell runnable on its own.
import pickle

# Persist the aviation mapping (article name -> event date) as a pickle file.
with open("NameURL/aviation.pkl", mode="wb") as pkl_file:
    pickle.dump(aviation_dict, pkl_file)

### Mass murder

In [None]:
# Build murder_dict: article name -> event date ('YYYY-MM-DD') for pages listed in
# the yearly "Mass murder in <year>" categories.
years = [2015, 2016, 2017, 2018, 2019, 2020]
murder_dict = {}
pattern = r'^<li><a href="\/wiki\/[^ ]+"'
date_pattern = r'^<span class="bday dtstart published updated">([0-9]+-[0-9]+-[0-9]+)</span>'
# Text the link pattern matches in front of the article name; its length replaces
# the former magic slice index 19.
link_prefix = '<li><a href="/wiki/'
# Study window is half-open: from_date <= event date < to_date.
from_date = datetime.datetime(2015, 7, 1)
to_date = datetime.datetime(2020, 7, 1)

for year in years:
    url = "https://en.wikipedia.org/wiki/Category:Mass_murder_in_" + str(year)  # category page
    # NOTE(review): only this single category URL is read per year; confirm the
    # listing never continues on a further page.
    with request.urlopen(url) as html:  # close the HTTP response as soon as it is parsed
        soup = BeautifulSoup(html, "html.parser")
    li_tag = soup.find_all("li")  # every list item on the category page
    murder_lis = []

    for item in li_tag:
        result = re.findall(pattern, str(item))
        if result != []:
            # Strip the leading markup and the closing quote to get the article name.
            name = result[0][len(link_prefix):-1]
            if "Category" not in name:  # skip sub-category links
                murder_lis.append(name)

    for murder in murder_lis:
        url_each = "https://en.wikipedia.org/wiki/" + murder  # individual article page
        with request.urlopen(url_each) as html:
            soup = BeautifulSoup(html, "html.parser")

        # Get the event occurrence date from the first matching date span.
        span = soup.find('span', class_='bday dtstart published updated')
        result = re.findall(date_pattern, str(span))
        if result == []:
            continue

        try:
            event_date = datetime.datetime.strptime(result[0], '%Y-%m-%d')  # event occurrence date
        except ValueError:
            # The regex accepts any digit groups; ignore values that are not real dates.
            continue

        if from_date <= event_date < to_date:
            murder_dict[murder] = event_date.strftime('%Y-%m-%d')

dt_now = datetime.datetime.now()
print(f"get time:{dt_now}")
print(f"can get date:{len(murder_dict)}data")

In [None]:
# pickle is also imported in the notebook's first cell; the import here keeps
# this cell runnable on its own.
import pickle

# Persist the mass-murder mapping (article name -> event date) as a pickle file.
with open("NameURL/murder.pkl", mode="wb") as pkl_file:
    pickle.dump(murder_dict, pkl_file)