# Extracting key fields from published PDFs

* Open [Appeals page from IFRC website](http://www.ifrc.org/en/publications-and-reports/appeals/)
* Select from and to fields in the filters. For example, if you want to analyse documents from 2016 to 2017, select **from** *2016* and **to** *2017*
* Make sure you select **DREF Operation** from the **type** filter
* Press **Search** and find the **RSS** link at the bottom of the page
* Click on **RSS** and paste the resulting URL into the code below

### Explanation



The code below downloads the documents listed in the RSS feed and stores them in the current working directory

In [1]:


# RSS feed of the appeals search results (the URL copied from the RSS link on
# the IFRC appeals page). No leading/trailing whitespace: the value is used
# verbatim as the request URL.
URL = "http://www.ifrc.org/Utils/Search/Rss.ashx?at=241&c=&co=&dt=1&f=2018&feed=appeals&re=&t=2018&ti=&zo="

 
import re
import urllib
import urllib.request
from urllib.error import URLError, HTTPError, ContentTooShortError
import requests



def download(url, user_agent ="wswp", num_retries =2, charset ="utf-8"):
    """Fetch *url* and return its body decoded to text.

    Args:
        url: Address to fetch.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: How many more attempts are allowed after a 5xx response.
        charset: Encoding used when the response does not declare one.

    Returns:
        The decoded response body, or ``None`` when the download failed.
    """
    print("Downloading: ", url)
    request = urllib.request.Request(url)
    request.add_header('User-agent', user_agent)
    try:
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(request) as resp:
            cs = resp.headers.get_content_charset() or charset
            html = resp.read().decode(cs)
    except (URLError, HTTPError, ContentTooShortError) as e:
        print("Download error ", e.reason)
        html = None
        # Only server-side (5xx) errors are worth retrying. All arguments are
        # forwarded by keyword so the retry budget really decreases and the
        # caller's user agent / charset are preserved.
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent=user_agent,
                            num_retries=num_retries - 1, charset=charset)
    return html


# The following function will get the default name of the appeal document 
def get_filename_from_cd(cd):
    """Extract the file name from a ``Content-Disposition`` header value.

    Handles both the quoted form (``filename="report.pdf"``) and the bare
    form (``filename=report.pdf``); for the bare form the value ends at the
    next ``;`` so trailing parameters are not glued onto the name.

    Args:
        cd: The header value, or ``None``/empty when the header is absent.

    Returns:
        The file name without surrounding quotes or whitespace, or ``None``
        when no usable ``filename=`` parameter is present.
    """
    if not cd:
        return None
    # Try the quoted alternative first so a ';' inside the quotes is kept.
    match = re.search(r'filename=\s*(?:"([^"]*)"|([^;]+))', cd, re.IGNORECASE)
    if match is None:
        return None
    quoted, bare = match.groups()
    fname = (quoted if quoted is not None else bare).strip()
    return fname or None

def crawl_sitemap(url):
    """Download every document linked from the RSS feed at *url*.

    Each ``<item>`` of the feed is expected to carry a ``<link>``; the linked
    document is fetched and written to the current directory under the file
    name announced in the response's ``Content-Disposition`` header. Items
    that cannot be downloaded or have no announced file name are reported and
    skipped so one bad entry does not stop the rest.

    Args:
        url: Address of the RSS feed.
    """
    import os  # local import: only needed here to sanitise the file name

    sitemap = download(url)
    if sitemap is None:
        print("Could not retrieve feed: ", url)
        return

    # DOTALL lets an <item> (or <link>) span several lines of the feed.
    for item in re.findall(r'<item>(.*?)</item>', sitemap, re.DOTALL):
        links = re.findall(r'<link>(.*?)</link>', item, re.DOTALL)
        if not links:
            continue
        link = links[0].strip()
        print(link)

        try:
            r = requests.get(link, allow_redirects=True)
        except requests.RequestException as e:
            print("Download error ", e)
            continue

        filename = get_filename_from_cd(r.headers.get('content-disposition'))
        # The name comes from the remote server: keep only its last path
        # component so the file is always written into the current directory.
        filename = os.path.basename(filename) if filename else None
        if not filename:
            print("Skipped, no file name in response headers: ", link)
            continue

        try:
            with open(filename, 'wb') as f:
                f.write(r.content)
        except OSError as e:
            print("Could not save ", filename, e)
 
# Run the download step for the feed configured in URL.
crawl_sitemap(URL)


Downloading:   http://www.ifrc.org/Utils/Search/Rss.ashx?at=241&c=&co=&dt=1&f=2018&feed=appeals&re=&t=2018&ti=&zo=
http://adore.ifrc.org/Download.aspx?FileId=182931
http://adore.ifrc.org/Download.aspx?FileId=182859
http://adore.ifrc.org/Download.aspx?FileId=182858
http://adore.ifrc.org/Download.aspx?FileId=182684
http://adore.ifrc.org/Download.aspx?FileId=182510
http://adore.ifrc.org/Download.aspx?FileId=182275
http://adore.ifrc.org/Download.aspx?FileId=181859
http://adore.ifrc.org/Download.aspx?FileId=181838
http://adore.ifrc.org/Download.aspx?FileId=181767
http://adore.ifrc.org/Download.aspx?FileId=181750
http://adore.ifrc.org/Download.aspx?FileId=181680
http://adore.ifrc.org/Download.aspx?FileId=181456


### Explanation

The code below opens each downloaded PDF, reads the header fields from its first page, extracts the summary table and writes the results to a CSV file (`EPoA.csv`)

In [2]:
import os
import PyPDF2
import re
import csv


# Placeholder stored in the CSV for any field that cannot be found in the PDF text.
NOT_AVAILABLE = "Not available"

# (CSV column prefix, label that introduces the sector's figures in the PDF text)
SECTORS = [
    ("shelter", "Shelter People targeted:"),
    ("livelihood", "Livelihoods and basic needs People targeted:"),
    ("health", "Health People targeted:"),
    ("wash", "Water, sanitation and hygiene People targeted:"),
    ("protection", "Protection, Gender and Inclusion People targeted:"),
    ("migration", "Migration People targeted:"),
    ("drr", "Disaster Risk Reduction People targeted:"),
]

# Suffixes of the four columns produced for every sector, in output order.
SECTOR_FIELDS = [
    "people_targeted",
    "male_people_targeted",
    "female_people_targeted",
    "requirements_CHF",
]

# Label that precedes the "Strategies for Implementation" budget figure.
SFI_LABEL = "Strategies for Implementation Requirements (CHF)"

# Full list of CSV columns: file name, first-page fields, sector fields, SFI budget.
COLUMNS = (
    ["item", "operation_num", "glide_num", "date_of_issue", "dref_allocated"]
    + [prefix + "_" + field for prefix, _ in SECTORS for field in SECTOR_FIELDS]
    + ["sfi_budget"]
)


def to_csv_line(fields):
    """Format *fields* as one CSV record (no trailing newline).

    The csv module quotes any value containing a comma or a quote character,
    so such a value (e.g. a file name) cannot shift the columns.
    """
    import io  # local import: the cell's import list does not include io

    buffer = io.StringIO()
    csv.writer(buffer, lineterminator="\n").writerow(fields)
    return buffer.getvalue().rstrip("\n")


def parse_first_page(text):
    """Return [operation_num, glide_num, date_of_issue, dref_allocated] from *text*.

    *text* is the extracted text of a report's first page. A field whose label
    is missing gets a placeholder instead of raising.
    """
    try:
        operation_num = text.split("DREF n°")[1].strip().split(" ", 1)[0].replace(",", "")
    except IndexError:
        operation_num = "Please look into this document"
    try:
        glide_num = text.split("Glide n°")[1].strip().split("Date")[0].strip().replace(",", "")
    except IndexError:
        glide_num = NOT_AVAILABLE
    try:
        date_of_issue = (
            text.split("Date of issue:")[1].strip()
            .split("Expected timeframe:")[0].strip().replace(",", "")
        )
    except IndexError:
        date_of_issue = NOT_AVAILABLE
    try:
        # The amount is the second space-separated token after "allocated:".
        dref_allocated = text.split("allocated:")[1].strip().split(" ", 2)[1].replace(",", "")
    except IndexError:
        dref_allocated = NOT_AVAILABLE
    return [operation_num, glide_num, date_of_issue, dref_allocated]


def parse_sector(section):
    """Return [people, male, female, requirements] parsed from one sector's text.

    *section* is the text that follows a sector label (commas already
    removed), or ``None`` when the label was not found in the document.
    """
    if section is None:
        return [NOT_AVAILABLE] * len(SECTOR_FIELDS)

    people_targeted = section.split("Male:")[0].strip()
    try:
        male = section.split("Male:")[1].split("Female:")[0].strip()
    except IndexError:
        male = NOT_AVAILABLE
    try:
        female = section.split("Female:")[1].split("Requirements (CHF)")[0].strip()
    except IndexError:
        female = NOT_AVAILABLE
    try:
        # Strip first so a space after the label does not yield an empty token.
        requirements = section.split("Requirements (CHF)")[1].strip().split(" ")[0]
    except IndexError:
        requirements = NOT_AVAILABLE
    return [people_targeted, male, female, requirements]


def parse_report_pages(page_texts):
    """Return the CSV values (all columns except ``item``) for one report.

    *page_texts* is the list of per-page text strings of the report. The
    first page supplies the header fields; every page is scanned for the
    sector labels and the SFI budget, and the first page on which a label
    appears wins.
    """
    row = parse_first_page(page_texts[0] if page_texts else "")

    sections = {}
    sfi_budget = None
    for text in page_texts:
        for prefix, label in SECTORS:
            if prefix in sections:
                continue
            pieces = text.split(label)
            if len(pieces) > 1:
                sections[prefix] = pieces[1].strip().replace(",", "")
        if sfi_budget is None:
            pieces = text.split(SFI_LABEL)
            if len(pieces) > 1:
                sfi_budget = pieces[1].strip().split(" ")[0].replace(",", "")

    for prefix, _ in SECTORS:
        row.extend(parse_sector(sections.get(prefix)))
    row.append(NOT_AVAILABLE if sfi_budget is None else sfi_budget)
    return row


def read_page_texts(pdf_path):
    """Return the extracted text of every page of *pdf_path*, newlines removed."""
    # NOTE(review): PdfFileReader / numPages / getPage / extractText are the
    # legacy PyPDF2 names; confirm they exist in the installed PyPDF2 version.
    with open(pdf_path, 'rb') as report_file:
        reader = PyPDF2.PdfFileReader(report_file)
        # Text must be extracted while the file is still open.
        return [
            reader.getPage(i).extractText().replace("\n", "")
            for i in range(reader.numPages)
        ]


# Names of everything in the current directory ("." means current directory).
content_list = os.listdir(".")

# CSV records as strings; the first record is the header line.
header = to_csv_line(COLUMNS)
textlist = [header]

# Parse every downloaded PDF into one CSV record.
for item in content_list:
    if "pdf" in item:
        textlist.append(to_csv_line([item] + parse_report_pages(read_page_texts(item))))

with open('EPoA.csv', 'w', encoding="utf-8") as f:
    f.writelines(line + "\n" for line in textlist)

