In [87]:
import os
import re
import time
import json
import logging
import calendar
import datetime
import functools

import feedparser
from pytz import utc
from dateutil.parser import parse
import os
import re
import time
import json
import logging
import calendar
import datetime
import functools

import feedparser
from pytz import utc
from dateutil.parser import parse

try:
    from urllib2 import urlopen
    from urllib2 import HTTPError
except ImportError:
    from urllib.request import urlopen
    from urllib.error import HTTPError


# In[3]:

FEED_URL = 'http://gdacs.org/rss.aspx?profile=ARCHIVE&from={}'
DISPLACED_RE = re.compile('.*(\d+) displaced')
DEPTH_RE = re.compile('.*Depth:(\d+\.\d+)km')


def dtconv(struct_time):
    dt = datetime.datetime(*struct_time[:7], tzinfo=utc)
    return calendar.timegm(dt.utctimetuple())


def dt_to_date(s):
    dt = parse(s)
    dt = dt.replace(hour=0, minute=0, second=0)
    return calendar.timegm(dt.utctimetuple())


def tsconv(s):
    dt = parse(s)
    return calendar.timegm(dt.utctimetuple())


def get_map(entry):
    try:
        return [l['href'] for l in entry.links if l['rel'] == 'enclosure'][0]
    except IndexError:
        return None


def url_to_filename(url):
    return url.split('/')[-1]


def fetch_asset(url, outdir):
    print "fetching asset from %s" %url
    logging.debug('Fetchings asset from %s', url)
    filename = url_to_filename(url)
    outpath = os.path.join(outdir, filename)
    try:
        with open(outpath, 'w') as f:
            f.write(urlopen(url).read())
    except HTTPError:
        return ''
    return filename


def get_assets(data, outdir):
    for k, url in data.items():
        if url:
            data[k] = fetch_asset(url, outdir)


def gdacs_data(fn):
    @functools.wraps(fn)
    def wrapper(entry, outdir):
        logging.debug('Formatting entry %s', entry.gdacs_eventid)
        data = {
            'id': entry.gdacs_eventid,
            'type': entry.gdacs_eventtype,
            'name': entry.gdacs_eventname,
            'location': entry.gdacs_country or None,
            'alert_level': entry.gdacs_alertlevel.lower(),
            'updated': dtconv(entry.updated_parsed),
            'summary': entry.summary,
            'maps': {
                'thumb': get_map(entry),
                'details': entry.gdacs_mapimage or None,
            }
        }
        get_assets(data['maps'], outdir)
        data['info'] = fn(entry)
        return data
    return wrapper


@gdacs_data
def format_earthquake(entry):
    try:
        depth = float(DEPTH_RE.match(entry.summary).group(1))
    except (AttributeError, ValueError, TypeError):
        depth = None
    return {
        'severity': entry.gdacs_severity['value'],
        'affected_population': entry.gdacs_population['value'],
        'time': tsconv(entry.gdacs_fromdate),
    }


@gdacs_data
def format_flood(entry):
    return {
        'severity': entry.gdacs_severity['value'],
        'affected_population': entry.gdacs_population['value'],
        'duration': {
            'from': dt_to_date(entry.gdacs_fromdate),
            'to': dt_to_date(entry.gdacs_todate),
        },
    }


@gdacs_data
def format_tcyclone(entry):
    try:
        displaced = int(DISPLACED_RE.match(entry.summary).group(1))
    except (AttributeError, ValueError, TypeError):
        displaced = 0
    return {
        'severity': '{} {}'.format(entry.gdacs_severity['value'],
                                   entry.gdacs_severity['unit']),
        'affected_population': displaced,
        'deaths': entry.gdacs_population['value'],
        'duration': {
            'from': dt_to_date(entry.gdacs_fromdate),
            'to': dt_to_date(entry.gdacs_todate),
        },
    }


FORMATTERS = {
    'EQ': format_earthquake,
    'FL': format_flood,
    'TC': format_tcyclone,
}


def get_feed_url():
    dt = datetime.datetime.utcnow() - datetime.timedelta(days=30)
    return FEED_URL.format(dt.strftime('%Y-%m-%d'))


def get_feed():
    logging.debug('Obtaining feed')
    url = get_feed_url()
    data = feedparser.parse(url)
    return data.entries


def formatted_entries(entries, outdir):
    logging.debug('Formatting entries')
    return [FORMATTERS[e.gdacs_eventtype](e, outdir) for e in entries]


def write_json(outfile, entries):
    with open(outfile, 'w') as f:
        json.dump(entries, f, indent=2)


def main():
    import argparse
    parser = argparse.ArgumentParser(description='Convert GDACS data to JSON')
    parser.add_argument('--output', '-o', metavar='PATH', default='gdacs.json',
                        help='Output JSON file path')
    parser.add_argument('--verbose', '-V', action='store_true', help='output '
                        'debug messages')
    args = parser.parse_args()

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    entries = get_feed()
    outdir = os.path.dirname(args.output)
    write_json(args.output, formatted_entries(entries, outdir))


if __name__ == '__main__':
    main()



usage: ipykernel_launcher.py [-h] [--output PATH] [--verbose]
ipykernel_launcher.py: error: unrecognized arguments: -f /Users/michael/Library/Jupyter/runtime/kernel-71ec0206-6f32-49b4-8793-9d8c9e3e7508.json


SystemExit: 2

In [210]:
who = 'http://www.who.int/feeds/entity/hac/en/rss.xml'
gdacs_eq_24h = 'http://www.gdacs.org/xml/rss_eq_24h.xml'
gdacs_eq_gt55_48h = 'http://www.gdacs.org/xml/rss_eq_48h_med.xml'
gdacs_all_24h = 'http://www.gdacs.org/xml/rss_24h.xml'
gdacs_all_7d = 'http://www.gdacs.org/xml/rss_7d.xml'
gdacs_ts_7d = 'http://www.gdacs.org/xml/rss_tc_7d.xml'
gdacs_ts_3m = 'http://www.gdacs.org/xml/rss_tc_3m.xml'
gdacs_fl_7d = 'http://www.gdacs.org/xml/rss_fl_7d.xml'
gdacs_fl_3m = 'http://www.gdacs.org/xml/rss_fl_3m.xml'
reliefweb = 'https://reliefweb.int/disasters/rss.xml'
reliefweb_map = 'https://reliefweb.int/maps/rss.xml'
dost_pagasa_sl = 'https://www1.pagasa.dost.gov.ph/prsdcodes/rss/prsd/slforecast.xml'

In [247]:
outfol = '/Users/michael/Desktop/out/'
fn = fetch_asset(gdacs_all_24h, outfol)
feed = outfol + fn
print feed

fetching asset from http://www.gdacs.org/xml/rss_24h.xml
/Users/michael/Desktop/out/rss_24h.xml


In [248]:
import xml.dom.minidom
import xml.etree.ElementTree as ET
xml = xml.dom.minidom.parse(feed) # or xml.dom.minidom.parseString(xml_string)
xml_string = xml.toprettyxml()
print xml_string

<?xml version="1.0" ?>
<rss version="2.0" xmlns:asgard="http://asgard.jrc.it" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:gdacs="http://www.gdacs.org" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:georss="http://www.georss.org/georss" xmlns:glide="http://glidenumber.net">
	
  
	<channel>
		
    
		<title>GDACS RSS information</title>
		
    
		<link>http://www.gdacs.org/</link>
		
    
		<description>Near real-time alerts about natural disaster with a potential humanitarian impact</description>
		
    
		<managingEditor>stefano.paris@ext.ec.europa.eu</managingEditor>
		
    
		<!--Records number 19-->
		
    
		<!--Start 9/14/2018 4:16:27 PM-->
		
    
		<!--=Start 9/14/2018 4:16:27 PM-->
		
    
		<webMaster>stefano.paris@ext.ec.europa.eu</webMaster>
		
    
		<pubDate>Fri, 14 Sep 2018 14:16:27 GMT</pubDate>
		
    
		<atom:link href="http://www.gdacs.org/xml/rss_24h.xml" rel="self" type="application/rss+xml"/>
		
    
		<ite

In [249]:
tree = ET.parse(feed)
root = tree.getroot()
print root

<Element 'rss' at 0x1088bafd0>


In [250]:
for element in root[0].iter():
    print element.tag, element.attrib

channel {}
title {}
link {}
description {}
managingEditor {}
webMaster {}
pubDate {}
{http://www.w3.org/2005/Atom}link {'href': 'http://www.gdacs.org/xml/rss_24h.xml', 'type': 'application/rss+xml', 'rel': 'self'}
item {}
title {}
description {}
enclosure {'url': 'http://dma.gdacs.org/saved/gdacs/eq/eq1228272_1.png', 'length': '1', 'type': 'image/png'}
{http://www.gdacs.org}temporary {}
link {}
pubDate {}
{http://www.gdacs.org}fromdate {}
{http://www.gdacs.org}todate {}
{http://www.gdacs.org}year {}
{http://purl.org/dc/elements/1.1/}subject {}
guid {'isPermaLink': 'false'}
{http://www.w3.org/2003/01/geo/wgs84_pos#}Point {}
{http://www.w3.org/2003/01/geo/wgs84_pos#}lat {}
{http://www.w3.org/2003/01/geo/wgs84_pos#}long {}
{http://www.gdacs.org}bbox {}
{http://www.georss.org/georss}point {}
{http://www.gdacs.org}cap {}
{http://www.gdacs.org}version {}
{http://www.gdacs.org}eventtype {}
{http://www.gdacs.org}alertlevel {}
{http://www.gdacs.org}alertscore {}
{http://www.gdacs.org}episodeale

{http://www.gdacs.org}title {}
{http://www.gdacs.org}description {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'http://webcritech.jrc.ec.europa.eu/ModellingCyclone/cyclonesurgeVM/1000499_NOAA/28/all_inpData.xml', 'source': 'JRC', 'version': '0', 'type': 'rss', 'id': 'cyclone_timeline_HWRF'}
{http://www.gdacs.org}title {}
{http://www.gdacs.org}xslt {}
{http://www.gdacs.org}description {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'http://webcritech.jrc.ec.europa.eu/ModellingCyclone/cyclonesurgeVM/1000499_NOAA/28/all_inpData.xml', 'source': 'JRC', 'version': '0', 'type': 'rss', 'id': 'cyclone_timeline_GFS'}
{http://www.gdacs.org}title {}
{http://www.gdacs.org}xslt {}
{http://www.gdacs.org}description {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'http://dma.gdacs.org/saved/gdacs/tc/1000499/clouds_1000499_28_zoom.png', 'source': 'JRC', 'version': '0', 'type': 'image', 'id': 'overviewmap_zoom

{http://www.gdacs.org}title {}
{http://www.gdacs.org}description {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'http://earthquake.usgs.gov/earthquakes/eventpage/us2000heil#summary', 'source': 'NEIC', 'version': '0', 'type': 'html', 'id': 'neic_report'}
{http://www.gdacs.org}title {}
{http://www.gdacs.org}description {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'http://cnt.rm.ingv.it/', 'source': 'INGV', 'version': '0', 'type': 'link', 'id': 'geooffice'}
{http://www.gdacs.org}title {}
{http://www.gdacs.org}description {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'http://dma.gdacs.org/saved/gdacs/eq/eq1228252_1.png', 'source': 'JRC', 'version': '0', 'type': 'image', 'id': 'thumbnailmap_cached'}
{http://www.gdacs.org}title {}
{http://www.gdacs.org}description {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'http://dma.gdacs.org/saved/gdacs/eq/eq1228252_4.png'

{http://www.gdacs.org}acknowledgements {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'http://www.unitar.org/unosat/maps/PER', 'source': '', 'version': '0', 'type': 'html', 'id': 'UNOSATmaps_country'}
{http://www.gdacs.org}title {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'http://dma.gdacs.org/map?application=EARTHQUAKE&eventid=1156614&episodeid=1228227&coordinate=-74.5889,-7.7362', 'source': 'JRC', 'version': '0', 'type': 'map', 'id': 'interactive_event_map'}
{http://www.gdacs.org}title {}
{http://www.gdacs.org}acknowledgements {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'http://www.gdacs.org/gis/calculation/EQ1_WPS/-075\\eq_-07460_-00775.xml', 'source': 'JRC', 'version': '0', 'type': 'xml', 'id': 'impact_xml'}
{http://www.gdacs.org}title {}
{http://www.gdacs.org}xslt {}
{http://www.gdacs.org}description {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'h

{http://www.gdacs.org}title {}
{http://www.gdacs.org}description {}
{http://www.gdacs.org}acknowledgements {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'http://webcritech.jrc.ec.europa.eu/ModellingCyclone/cyclonesurgeVM/1000496_JTWC/final/P1_MAXHEIGHT_END.jpg', 'source': 'JRC', 'version': '0', 'type': 'image', 'id': 'storm_surge_maxheight'}
{http://www.gdacs.org}title {}
{http://www.gdacs.org}description {}
{http://www.gdacs.org}acknowledgements {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'http://webcritech.jrc.ec.europa.eu/ModellingCyclone/cyclonesurgeVM/1000496_JTWC/final/', 'source': 'JRC', 'version': '0', 'type': 'data', 'id': 'storm_surge_data'}
{http://www.gdacs.org}title {}
{http://www.gdacs.org}description {}
{http://www.gdacs.org}acknowledgements {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'http://webcritech.jrc.ec.europa.eu/ModellingCyclone/cyclonesurgeVM/1000496_JTWC/final

{http://www.gdacs.org}acknowledgements {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'http://webcritech.jrc.ec.europa.eu/ModellingCyclone/cyclonesurgeVM/1000502_JTWC/final/P1_MAXHEIGHT_END.jpg', 'source': 'JRC', 'version': '0', 'type': 'image', 'id': 'storm_surge_maxheight'}
{http://www.gdacs.org}title {}
{http://www.gdacs.org}description {}
{http://www.gdacs.org}acknowledgements {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'http://webcritech.jrc.ec.europa.eu/ModellingCyclone/cyclonesurgeVM/1000502_JTWC/final/', 'source': 'JRC', 'version': '0', 'type': 'data', 'id': 'storm_surge_data'}
{http://www.gdacs.org}title {}
{http://www.gdacs.org}description {}
{http://www.gdacs.org}acknowledgements {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'http://webcritech.jrc.ec.europa.eu/ModellingCyclone/cyclonesurgeVM/1000502_JTWC/final/locations.kmz', 'source': 'JRC', 'version': '0', 'type': 'kmz', 'id

{http://www.gdacs.org}title {}
{http://www.gdacs.org}acknowledgements {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}resource {'url': 'http://www.gdacs.org//datareport/resources/EQ/1156579/rss_1156579_1228174.xml', 'source': 'JRC', 'version': '0', 'type': 'rss', 'id': 'episode_rss'}
{http://www.gdacs.org}title {}
{http://www.gdacs.org}description {}
{http://www.gdacs.org}acknowledgements {}
{http://www.gdacs.org}accesslevel {}
{http://www.gdacs.org}identifiers {}
{http://www.gdacs.org}identifier {'src': 'JRC', 'id': 'eqcalculationpath'}
{http://www.gdacs.org}identifier {'src': 'JRC', 'id': 'gdpcapita'}
{http://www.gdacs.org}identifier {'src': 'NEIC', 'id': 'neicid'}


In [251]:
from xml.dom import minidom
doc = minidom.parse(feed)

for count in doc.getElementsByTagName("guid"):
    print count.toxml()

<guid isPermaLink="false">EQ1156643</guid>
<guid isPermaLink="false">EQ1156641</guid>
<guid isPermaLink="false">TC1000495</guid>
<guid isPermaLink="false">TC1000499</guid>
<guid isPermaLink="false">TC1000500</guid>
<guid isPermaLink="false">TC1000503</guid>
<guid isPermaLink="false">EQ1156629</guid>
<guid isPermaLink="false">TC1000498</guid>
<guid isPermaLink="false">EQ1156619</guid>
<guid isPermaLink="false">EQ1156614</guid>
<guid isPermaLink="false">EQ1156609</guid>
<guid isPermaLink="false">EQ1156605</guid>
<guid isPermaLink="false">TC1000496</guid>
<guid isPermaLink="false">EQ1156593</guid>
<guid isPermaLink="false">EQ1156591</guid>
<guid isPermaLink="false">TC1000502</guid>
<guid isPermaLink="false">EQ1156583</guid>
<guid isPermaLink="false">EQ1156580</guid>
<guid isPermaLink="false">EQ1156579</guid>


In [255]:
for pubDate in root.iter('pubDate'):
    print pubDate.tag, pubDate.attrib