In [2]:
from urllib.parse import urljoin

import requests
from scraperutils import *
from datamodel import Fields
# Browser-like User-Agent header; the same dict is reused for the per-project requests further down.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'}

# Download the complaint registry page. requests applies no timeout unless one is passed,
# so an explicit value keeps a stalled connection from blocking this cell forever.
page = requests.get(
    "https://www.adb.org/site/accountability-mechanism/problem-solving-function/complaint-registry-year",
    headers=headers,
    timeout=30,
)
page

<Response [200]>

In [3]:
from bs4 import BeautifulSoup
# Parse the downloaded registry page with the 'html.parser' backend; the scraping cell below queries `soup` for the complaint tables.
soup = BeautifulSoup(page.content, 'html.parser')

In [4]:
# Function to get Project-Sector
def extract_sector_info(sector_data):
    """Collect the sector entries from the text of a 'Sector / Subsector' cell.

    The text is split on newlines. Every line that contains a '/' is kept:
    its '/' characters are removed and surrounding whitespace is stripped.
    Lines without a '/' are ignored.

    Args:
        sector_data: Raw cell text. May be None or "" when the project page
            has no 'Sector / Subsector' row (the caller looks it up with
            dict.get, which yields None for a missing key).

    Returns:
        The kept entries joined with ';', or "" when there are none.
    """
    # Missing cell: report "no sectors" instead of failing on None.split().
    if not sector_data:
        return ""

    return ";".join(
        line.replace("/", "").strip()
        for line in sector_data.split("\n")
        if "/" in line
    )

# Function to retrieve additional data from Project-Links
def update_additional_project_details(table, project_details):
    """Copy selected rows of a project data-sheet table into project_details.

    Only rows with exactly two <td> cells are considered; the first cell
    (stripped) is the label and the second cell's text is the value. The
    labels below are collected, and the project number, status, type and
    sector are then written into project_details under the matching
    Fields names.

    Args:
        table: Parsed table element exposing find_all(), or None when the
            project page contained no such table.
        project_details: Dict for the current project; updated in place.

    Returns:
        None. When table is None, project_details is left untouched.
    """
    # A failed table lookup hands us None; there is nothing to copy then.
    if table is None:
        return

    # 'Description' is collected like the other labels but is not copied
    # into project_details anywhere in this function.
    wanted_labels = (
        'Project Number',
        'Project Status',
        'Project Type / Modality of Assistance',
        'Sector / Subsector',
        'Description',
    )

    table_info = {}
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) != 2:
            continue
        label = cells[0].text.strip()
        if label in wanted_labels:
            table_info[label] = cells[1].text

    # Adding the Project-Number (None when the row is absent, same as the status below)
    project_details[Fields.PROJECT_NUMBER.name] = table_info.get('Project Number')

    # Adding the Project-Status
    project_details[Fields.COMPLAINT_STATUS.name] = table_info.get('Project Status')

    # Adding the Project-Type
    project_type = table_info.get('Project Type / Modality of Assistance')
    if project_type is not None:
        # The last newline-separated piece is dropped, as before.
        # NOTE(review): presumably the cell text ends with a newline, making that piece empty - confirm.
        project_details[Fields.PROJECT_TYPE.name] = project_type.split('\n')[:-1]

    # Adding the Project-Sector ("" when the row is absent or empty)
    sector_text = table_info.get('Sector / Subsector')
    project_details[Fields.SECTOR.name] = extract_sector_info(sector_text) if sector_text else ""

# Function to retrieve the Project-timeline
def update_project_lifecycle_details(lifecycle_info, project_details):
    """Fill the timeline fields of project_details from a list of <li> stages.

    For each <li> under lifecycle_info, the text is split on ',' and the
    second piece (between the first and second comma) is stored as the date
    for every keyword found in the text:

      * 'received'    -> FILING_DATE and REGISTRATION_START_DATE
      * 'registered'  -> REGISTRATION_END_DATE and ELIGIBILITY_START_DATE
      * 'eligibility' -> ELIGIBILITY_END_DATE
      * 'Assessment'  -> DISPUTE_RESOLUTION_START_DATE

    Keyword matching is case-sensitive. Stages without a comma carry no
    date piece and are skipped instead of raising IndexError.

    If at least one stage was listed and COMPLAINT_STATUS is 'Closed',
    DATE_CLOSED is set to the marker string 'Completed', and so is
    DISPUTE_RESOLUTION_END_DATE when a dispute-resolution start date exists.

    Args:
        lifecycle_info: Parsed element exposing find_all().
        project_details: Dict for the current project; updated in place.
    """
    lifecycle_stages = lifecycle_info.find_all('li')

    for stage in lifecycle_stages:
        stage_text = stage.text
        pieces = stage_text.split(",")
        if len(pieces) < 2:
            # No comma, so there is no date piece to read for this stage.
            continue

        # NOTE(review): a date that itself contains a comma is cut at that comma - confirm the page's date format.
        stage_date = pieces[1]

        if 'received' in stage_text:
            project_details[Fields.FILING_DATE.name] = stage_date
            project_details[Fields.REGISTRATION_START_DATE.name] = stage_date
        if 'registered' in stage_text:
            project_details[Fields.REGISTRATION_END_DATE.name] = stage_date
            project_details[Fields.ELIGIBILITY_START_DATE.name] = stage_date
        if 'eligibility' in stage_text:
            project_details[Fields.ELIGIBILITY_END_DATE.name] = stage_date
        if 'Assessment' in stage_text:
            project_details[Fields.DISPUTE_RESOLUTION_START_DATE.name] = stage_date

    # Evaluated once after the loop: the status never changes while iterating and the
    # start date is only ever added, so this matches re-checking after every stage.
    # The non-empty guard keeps the earlier behaviour of doing nothing when no stage is listed.
    if len(lifecycle_stages) > 0 and project_details.get(Fields.COMPLAINT_STATUS.name) == 'Closed':
        project_details[Fields.DATE_CLOSED.name] = 'Completed'

        if project_details.get(Fields.DISPUTE_RESOLUTION_START_DATE.name) is not None:
            project_details[Fields.DISPUTE_RESOLUTION_END_DATE.name] = 'Completed'
    

In [7]:
# Scrape every complaint row of the registry page, enrich it from the linked
# project page when there is one, and write all rows to a CSV.

ADB_BASE_URL = "https://www.adb.org/"
# requests applies no timeout unless one is passed; without it one stalled project page blocks the whole run.
REQUEST_TIMEOUT_SECONDS = 30

tables = soup.find_all('table', {'class':['table-striped table-secondary']})
project_output = []

for table in tables:
    rows = table.find_all('tr')
    # The first row of each table is skipped - presumably the header row.
    for row in rows[1:]:
        data = row.find_all('td')
        if len(data) < 4:
            # Columns 0-3 are all read below; a shorter row cannot be a complaint entry.
            continue

        project_details = dict()

        # Adding the Project-Year
        project_details[Fields.YEAR.name] = data[0].text.split('/')[1]

        # Adding the Project-Country and Project-Name.
        # Split on the first ':' only, so a project name that itself contains ':' is kept whole.
        title_text = data[1].text
        country, _, project_name = title_text.partition(':')
        project_details[Fields.COUNTRY.name] = country
        project_details[Fields.PROJECT_NAME.name] = project_name

        # Adding the Project-ID
        project_details[Fields.PROJECT_ID.name] = title_text

        # Adding the Project-Issues (last newline-separated piece dropped, as before)
        issues_list = data[2].ul
        project_details[Fields.ISSUES.name] = (
            issues_list.text.split('\n')[:-1] if issues_list is not None else []
        )

        # Adding the Project-IAM
        project_details[Fields.IAM.name] = "ADB-SPF"

        # Adding the Project-IAM-ID
        project_details[Fields.IAM_ID.name] = "25"

        # Adding the Project-ProjectLink
        project_link = data[1].find('a')
        if project_link is not None:
            href = project_link.get("href")
            project_details[Fields.HYPERLINK.name] = href

            # Getting each project details
            if href:
                # urljoin returns absolute links unchanged and resolves relative ones against
                # the site root, avoiding the doubled '/' that plain concatenation produced.
                project_url = urljoin(ADB_BASE_URL, href)
                i_page = requests.get(
                    project_url + "#project-pds",
                    headers=headers,
                    timeout=REQUEST_TIMEOUT_SECONDS,
                )
                soup1 = BeautifulSoup(i_page.content, 'html.parser')
                # Separate name so the outer loop variable `table` is not overwritten.
                pds_table = soup1.find('table', {'class': 'pds'})
                if pds_table is not None:
                    update_additional_project_details(pds_table, project_details)

        # Adding the Project-Complaint-Documents
        document_link = data[3].find('a')
        if document_link is not None:
            project_details[Fields.DOCUMENTS.name] = document_link.get("href")

        update_project_lifecycle_details(data[3], project_details)
        project_output.append(get_complete_project_row(project_details))

        print("Processing..Project:"+project_details[Fields.PROJECT_NAME.name])

write_csv("adb_spf_scraper", project_output)




Processing..Project: Flood Emergency Reconstruction and Resilience Project
Processing..Project: Peshawar Sustainable Bus Rapid Transit Corridor Project
Processing..Project: National Highway Network Development in Balochistan Project
Processing..Project: Ulaanbaatar Urban Services and Ger Areas Development Investment Program - Tranche 1
Processing..Project: Adjaristsqali Hydropower Project
Processing..Project: SASEC Second Bangladesh-India Electrical Grid Interconnection Project
Processing..Project: Sustainable Urban Development Investment Program - Tranche 1
Processing..Project: Akmola Electricity Distribution Network Modernization and Expansion Project
Processing..Project: Third Urban Governance and Infrastructure Improvement (Sector) Project – Additional Financing
Processing..Project: Clean Energy and Network Efficiency Improvement Project
Processing..Project: Sustainable Urban Development Investment Program – Tranche 1

Processing..Project: Batumi Bypass Road Project

Processing..Pr

datetime.datetime(2018, 6, 23, 17, 40, 24, 796783)