In [1]:
import requests
from scraperutils import *
from datamodel import Fields
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'}
page = requests.get("https://www.adb.org/site/accountability-mechanism/problem-solving-function/complaint-registry-year", headers=headers)
page

<Response [200]>

In [2]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

In [3]:
# Function to get Project-Sector 
def extract_sector_info(sector_data):
    sectors = []
    sectors_and_subsectors = sector_data.split("\n")
    for i in range(len(sectors_and_subsectors)):
        sectors_and_subsectors[i] = sectors_and_subsectors[i].replace("\n", "")
        if '/' in sectors_and_subsectors[i]:
            sectors.append(sectors_and_subsectors[i].replace("/", "").strip())
    return ";".join(sectors)

# Function to retrieve additional data from Project-Links
def update_additional_project_details(table, project_details):
    table_info = {}
    table_rows = table.find_all('tr')
    project_details_fields = ['Project Number', 'Project Status','Project Type / Modality of Assistance','Sector / Subsector', 'Description']
    for i in range(0,len(table_rows)):
        row = table_rows[i]
        data = row.find_all('td')
        if len(data) == 2 and data[0].text.strip() in project_details_fields:
            table_info[data[0].text.strip()] = data[1].text
    
    # Adding the Project-Number
    project_details[Fields.PROJECT_NUMBER.name] = table_info['Project Number']
    
    # Adding the Project-Status
    project_details[Fields.COMPLAINT_STATUS.name] = table_info.get('Project Status')
    
    # Adding the Project-Type
    if  table_info.get('Project Type / Modality of Assistance') is not None:
        project_details[Fields.PROJECT_TYPE.name] = table_info.get('Project Type / Modality of Assistance').split('\n')[:-1]
        
    # Adding the Project-Sector  
    project_details[Fields.SECTOR.name] = extract_sector_info(table_info.get('Sector / Subsector'))

# Function to retrive the Project-timeline
def update_project_lifecycle_details(lifecycle_info, project_details):
    lifecycle_stages = lifecycle_info.find_all('li')
    for stage in lifecycle_stages:
        if 'received' in stage.text:
            project_details[Fields.FILING_DATE.name] = stage.text.split(",")[1]
            project_details[Fields.REGISTRATION_START_DATE.name] = project_details[Fields.FILING_DATE.name] 
        if 'registered' in stage.text:
            project_details[Fields.REGISTRATION_END_DATE.name] = stage.text.split(",")[1]
            project_details[Fields.ELIGIBILITY_START_DATE.name] = project_details[Fields.REGISTRATION_END_DATE.name]
        if 'eligibility' in stage.text:
            project_details[Fields.ELIGIBILITY_END_DATE.name] = stage.text.split(",")[1]
        if 'Assessment' in stage.text:
            project_details[Fields.DISPUTE_RESOLUTION_START_DATE.name] = stage.text.split(",")[1]
        
        if project_details.get(Fields.COMPLAINT_STATUS.name) == 'Closed':
            project_details[Fields.DATE_CLOSED.name] = 'Completed'
            
            if project_details.get(Fields.DISPUTE_RESOLUTION_START_DATE.name) is not None:
                project_details[Fields.DISPUTE_RESOLUTION_END_DATE.name] = 'Completed'
    

In [4]:
tables = soup.find_all('table', {'class':['table-striped table-secondary']})
project_output = []

for table in tables:
    rows = table.find_all('tr')
    for i in range (1,len(rows)):
        project_details = dict()
        data = rows[i].find_all('td')
        
        # Adding the Project-Year        
        project_details[Fields.YEAR.name] = data[0].text.split('/')[1]
    
        # Adding the Project-Country
        project_details[Fields.COUNTRY.name] = data[1].text.split(':')[0]

        # Adding the Project-Name 
        project_details[Fields.PROJECT_NAME.name] = data[1].text.split(':')[1]
        
        # Adding the Project-ID 
        project_details[Fields.PROJECT_ID.name] = data[1].text
        
        # Adding the Project-Issues 
        project_details[Fields.ISSUES.name] = data[2].ul.text.split('\n')[:-1]
        
        # Adding the Project-ProjectLink 
        if (data[1].find('a') is not None):
            project_details[Fields.HYPERLINK.name] = data[1].find('a').get("href")
            i_page = None
            # Getting each project details
            if ("https://www.adb.org/" in project_details[Fields.HYPERLINK.name]):
                i_page = requests.get(project_details[Fields.HYPERLINK.name]+"#project-pds", headers=headers)
            else: 
                i_page = requests.get("https://www.adb.org/"+project_details[Fields.HYPERLINK.name]+"#project-pds", headers=headers)
                
            soup1 = BeautifulSoup(i_page.content, 'html.parser')
            table = soup1.find('table',{'class':'pds'})
            update_additional_project_details(table, project_details)
            
        # Adding the Project-Complaint-Documents 
        if (data[3].find('a') is not None):
            project_details[Fields.DOCUMENTS.name] = data[3].find('a').get("href")
        
        update_project_lifecycle_details(data[3], project_details)
        project_output.append(project_details)
        
        print(get_complete_project_row(project_details))

write_csv("adb_spf_scraper", project_output)
#print (project_output)



OrderedDict([('IAM', None), ('IAM_ID', None), ('YEAR', '2018'), ('COUNTRY', 'Pakistan'), ('PROJECT_NAME', ' Flood Emergency Reconstruction and Resilience Project'), ('PROJECT_ID', 'Pakistan: Flood Emergency Reconstruction and Resilience Project'), ('PROJECT_NUMBER', '49038-001'), ('RELATED_PROJECT_NUMBER', None), ('PROJECT_TYPE', ['LoanTechnical Assistance']), ('PROJECT_LOAN_AMOUNT', None), ('SECTOR', 'Agriculture, natural resources and rural development;Transport'), ('ISSUES', ['Substandard reconstruction of road']), ('FILERS', None), ('FILING_DATE', ' 11 June 2018'), ('ENVIRONMENTAL_CATEGORY', None), ('COMPLAINT_STATUS', 'Active'), ('DATE_CLOSED', None), ('HYPERLINK', '/projects/49038-001/main'), ('REGISTRATION_START_DATE', ' 11 June 2018'), ('REGISTRATION_END_DATE', ' 14 June 2018'), ('ELIGIBILITY_START_DATE', ' 14 June 2018'), ('ELIGIBILITY_END_DATE', None), ('DISPUTE_RESOLUTION_START_DATE', None), ('DISPUTE_RESOLUTION_END_DATE', None), ('COMPLIANCE_REVIEW_START_DATE', None), ('COM

OrderedDict([('IAM', None), ('IAM_ID', None), ('YEAR', '2014'), ('COUNTRY', 'Samoa'), ('PROJECT_NAME', ' TAs 8481 and 7387 (SAM)'), ('PROJECT_ID', 'Samoa: TAs 8481 and 7387 (SAM): Promoting Economic Use of Customary Land and Grant No. 0392 (SAM): Samoa Agribusiness Support Project'), ('PROJECT_NUMBER', '46436-002'), ('RELATED_PROJECT_NUMBER', None), ('PROJECT_TYPE', ['Grant']), ('PROJECT_LOAN_AMOUNT', None), ('SECTOR', 'Agriculture, natural resources and rural development'), ('ISSUES', ['Lack of Consultation', 'Use of customary land', 'Inadequate environment and social due diligence', 'Disclosure of documents']), ('FILERS', None), ('FILING_DATE', ' 9 Sep 2014'), ('ENVIRONMENTAL_CATEGORY', None), ('COMPLAINT_STATUS', 'Active'), ('DATE_CLOSED', None), ('HYPERLINK', '/projects/46436-002/main'), ('REGISTRATION_START_DATE', ' 9 Sep 2014'), ('REGISTRATION_END_DATE', ' 9 Sep 2014'), ('ELIGIBILITY_START_DATE', ' 9 Sep 2014'), ('ELIGIBILITY_END_DATE', None), ('DISPUTE_RESOLUTION_START_DATE', ' 

OrderedDict([('IAM', None), ('IAM_ID', None), ('YEAR', '2012'), ('COUNTRY', 'Cambodia'), ('PROJECT_NAME', ' Rehabilitation of the Railway in Cambodia Project'), ('PROJECT_ID', 'Cambodia: Rehabilitation of the Railway in Cambodia Project'), ('PROJECT_NUMBER', '37269-023'), ('RELATED_PROJECT_NUMBER', None), ('PROJECT_TYPE', ['GrantLoan']), ('PROJECT_LOAN_AMOUNT', None), ('SECTOR', 'Transport'), ('ISSUES', ['Resettlement']), ('FILERS', None), ('FILING_DATE', ' 13 Apr 2012\nEnglish\nKhmer\n'), ('ENVIRONMENTAL_CATEGORY', None), ('COMPLAINT_STATUS', 'Closed'), ('DATE_CLOSED', 'Completed'), ('HYPERLINK', '/projects/37269-023/main'), ('REGISTRATION_START_DATE', ' 13 Apr 2012\nEnglish\nKhmer\n'), ('REGISTRATION_END_DATE', ' 19 Apr 2012'), ('ELIGIBILITY_START_DATE', ' 19 Apr 2012'), ('ELIGIBILITY_END_DATE', None), ('DISPUTE_RESOLUTION_START_DATE', None), ('DISPUTE_RESOLUTION_END_DATE', None), ('COMPLIANCE_REVIEW_START_DATE', None), ('COMPLIANCE_REVIEW_END_DATE', None), ('MONITORING_START_DATE', 

OrderedDict([('IAM', None), ('IAM_ID', None), ('YEAR', '2010'), ('COUNTRY', 'Kyrgyz Republic'), ('PROJECT_NAME', ' Technical Assistance'), ('PROJECT_ID', 'Kyrgyz Republic: Technical Assistance: Regional Economic Integration in Central Asia – Stocktaking and Experience Sharing'), ('PROJECT_NUMBER', '43193-012'), ('RELATED_PROJECT_NUMBER', None), ('PROJECT_TYPE', ['Technical Assistance']), ('PROJECT_LOAN_AMOUNT', None), ('SECTOR', 'Public sector management'), ('ISSUES', ['Termination of Contract']), ('FILERS', None), ('FILING_DATE', ' 27 Sep 2010'), ('ENVIRONMENTAL_CATEGORY', None), ('COMPLAINT_STATUS', 'Closed'), ('DATE_CLOSED', 'Completed'), ('HYPERLINK', '/projects/43193-012/main'), ('REGISTRATION_START_DATE', ' 27 Sep 2010'), ('REGISTRATION_END_DATE', ' 29 Sep 2010'), ('ELIGIBILITY_START_DATE', ' 29 Sep 2010'), ('ELIGIBILITY_END_DATE', None), ('DISPUTE_RESOLUTION_START_DATE', None), ('DISPUTE_RESOLUTION_END_DATE', None), ('COMPLIANCE_REVIEW_START_DATE', None), ('COMPLIANCE_REVIEW_END

OrderedDict([('IAM', None), ('IAM_ID', None), ('YEAR', '2009'), ('COUNTRY', 'Azerbaijan'), ('PROJECT_NAME', ' East-West Highway Improvement Project'), ('PROJECT_ID', 'Azerbaijan: East-West Highway Improvement Project'), ('PROJECT_NUMBER', '35457-013'), ('RELATED_PROJECT_NUMBER', None), ('PROJECT_TYPE', ['Loan']), ('PROJECT_LOAN_AMOUNT', None), ('SECTOR', 'Transport'), ('ISSUES', ['Consultation', 'Information', 'Resettlement']), ('FILERS', None), ('FILING_DATE', ' 2 Sep 2009'), ('ENVIRONMENTAL_CATEGORY', None), ('COMPLAINT_STATUS', 'Closed'), ('DATE_CLOSED', 'Completed'), ('HYPERLINK', '/projects/35457-013/main'), ('REGISTRATION_START_DATE', ' 2 Sep 2009'), ('REGISTRATION_END_DATE', ' 3 Sep 2009'), ('ELIGIBILITY_START_DATE', ' 3 Sep 2009'), ('ELIGIBILITY_END_DATE', None), ('DISPUTE_RESOLUTION_START_DATE', None), ('DISPUTE_RESOLUTION_END_DATE', None), ('COMPLIANCE_REVIEW_START_DATE', None), ('COMPLIANCE_REVIEW_END_DATE', None), ('MONITORING_START_DATE', None), ('MONITORING_END_DATE', Non

OrderedDict([('IAM', None), ('IAM_ID', None), ('YEAR', '2009'), ('COUNTRY', "People's Republic of China"), ('PROJECT_NAME', ' Fuzhou Environmental Improvement Project'), ('PROJECT_ID', "People's Republic of China: Fuzhou Environmental Improvement Project"), ('PROJECT_NUMBER', '35340-013'), ('RELATED_PROJECT_NUMBER', None), ('PROJECT_TYPE', ['Loan']), ('PROJECT_LOAN_AMOUNT', None), ('SECTOR', 'Water and other urban infrastructure and services'), ('ISSUES', ['Resettlement']), ('FILERS', None), ('FILING_DATE', ' 15 Jan 2009\nEnglish\nChinese\n'), ('ENVIRONMENTAL_CATEGORY', None), ('COMPLAINT_STATUS', 'Closed'), ('DATE_CLOSED', 'Completed'), ('HYPERLINK', '/projects/35340-013/main'), ('REGISTRATION_START_DATE', ' 15 Jan 2009\nEnglish\nChinese\n'), ('REGISTRATION_END_DATE', ' 19 Jan 2009'), ('ELIGIBILITY_START_DATE', ' 19 Jan 2009'), ('ELIGIBILITY_END_DATE', None), ('DISPUTE_RESOLUTION_START_DATE', ' 23 Mar 2009\nEnglish\nChinese\n'), ('DISPUTE_RESOLUTION_END_DATE', 'Completed'), ('COMPLIAN

OrderedDict([('IAM', None), ('IAM_ID', None), ('YEAR', '2006'), ('COUNTRY', 'Bangladesh'), ('PROJECT_NAME', ' Khulna-Jessore Drainage Rehabilitation Project'), ('PROJECT_ID', 'Bangladesh: Khulna-Jessore Drainage Rehabilitation Project'), ('PROJECT_NUMBER', '21087'), ('RELATED_PROJECT_NUMBER', None), ('PROJECT_TYPE', ['Loan']), ('PROJECT_LOAN_AMOUNT', None), ('SECTOR', ''), ('ISSUES', ['Agricultural production', 'Community Participation']), ('FILERS', None), ('FILING_DATE', ' 4 Jan 2006'), ('ENVIRONMENTAL_CATEGORY', None), ('COMPLAINT_STATUS', 'Closed'), ('DATE_CLOSED', 'Completed'), ('HYPERLINK', '/projects/ln1289/main'), ('REGISTRATION_START_DATE', ' 4 Jan 2006'), ('REGISTRATION_END_DATE', ' 4 Jan 2006'), ('ELIGIBILITY_START_DATE', ' 4 Jan 2006'), ('ELIGIBILITY_END_DATE', None), ('DISPUTE_RESOLUTION_START_DATE', None), ('DISPUTE_RESOLUTION_END_DATE', None), ('COMPLIANCE_REVIEW_START_DATE', None), ('COMPLIANCE_REVIEW_END_DATE', None), ('MONITORING_START_DATE', None), ('MONITORING_END_D

datetime.datetime(2018, 6, 23, 17, 40, 24, 796783)