# Web-Scraping Project

### Preparing for Web-Scraping on the World Bank.

https://openknowledge.worldbank.org/discover

In [1]:
# Import Dependencies

# !pip install BeautifulSoup4 as bs4
# !pip install pandas
# !pip install splinter

from splinter import Browser
from bs4 import BeautifulSoup
import requests
import pandas as pd


# Windows Users

In [2]:
# I want to check my work
executable_path = {'executable_path': './chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

In [3]:
# Using googlechrom drive open test explorer
url = 'https://openknowledge.worldbank.org/discover'
browser.visit(url)

In [4]:
# Retrieve page from the request module
response = requests.get(url)

In [5]:
# Create BeautifulSoup Object; parse with 'html.parser'
soup = BeautifulSoup(response.text, 'html.parser')

In [6]:
# Print the main page html, inspect elements
print(soup.prettify())

<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
<!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
<!--[if IE 8]>    <html class="no-js lt-ie9" lang="en"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en">
 <!--<![endif]-->
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <meta content="no-cache" http-equiv="Pragma"/>
  <meta content="no-cache,no-Store" http-equiv="Cache-Control"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link href="/themes/OKR2/images/wb-favicon.png" rel="shortcut icon" type="image/png"/>
  <link href="/themes/OKR2/images/apple_touch_icons/AppIcon76x76@2x.png" rel="apple-touch-icon-precomposed"/>
  <link href="/themes/OKR2/images/apple_touch_icons/AppIcon72x72.png" rel="apple-to

In [7]:
# Results are returned as an iterable list
headers = soup.find_all('h4')
#     Check header
print(headers[0])

<h4>
<a class="no-decor" href="/handle/10986/31327">Women, Business and the Law 2019 : A Decade of Reform</a>
</h4>


In [8]:
# Loop through the headers on the main page

#         THIS PART NEEDS WORK... 'NoneType'  (solved, the class name is called "no-decor" not "no decor")
for i in headers:
    try:
        title = i.find('a', class_='no-decor').text
        if (title):
            print(title)
    except AttributeError as e:
        print(e)

Women, Business and the Law 2019 : A Decade of Reform
Global Economic Prospects, June 2019
Quality Unknown : The Invisible Water Crisis
Harvesting Prosperity : Technology and Productivity Growth in Agriculture
Belt and Road Economics : Opportunities and Risks of Transport Corridors
Innovative China : New Drivers of Growth
The World Bank Annual Report 2018
Poverty and Shared Prosperity 2018 : Piecing Together the Poverty Puzzle
Indonesia Economic Quarterly, June 2019 : Oceans of Opportunity
Commodity Markets Outlook, April 2019
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'


In [11]:
# Find the next page button and click the link to the next page
# page = soup.find('a', class_='next-page-link')
# print(page)

browser.is_element_present_by_css('a[class="next-page-link"]')
next_page = browser.find_by_css('a[class="next-page-link"]')
print(next_page[1]) # Check
next_page[1].click()

<splinter.driver.webdriver.WebDriverElement object at 0x0000021D2F461160>


In [13]:
# Find the first title and click the link
browser.is_element_present_by_css('a[class="no-decor"]')
report_link = browser.find_by_css('a[class="no-decor"]')
report_link.click() 
print(report_link) # Check 

<splinter.element_list.ElementList object at 0x0000021D2F457A58>


In [17]:
# Creates a temporary soup for the current page using similar code from above
html = browser.html
reports_soup = BeautifulSoup(html, 'html.parser')

In [18]:
# Create slide element to find the information to store
slide_element = reports_soup.select_one('div.ds-content-slider')
print(slide_element) # Check

<div class="ds-content-slider">
<div class="col-sm-3 col-md-2 sidebar-col vertical-slider invisible-xs">
<div class="word-break row" id="ds-options">
<div class="ds-sidebar-slider">
<ul class="nav col-xs-6 visible-xs">
<li>
<h2 class="sidebar-menu-trigger" data-target="aspect_browseArtifacts_Navigation_list_global">Browse<span class="glyphicon glyphicon-chevron-right pull-right"></span>
</h2>
</li>
<li>
<h2 class="sidebar-menu-trigger" data-target="aspect_browseArtifacts_Navigation_list_context">Browse this collection<span class="glyphicon glyphicon-chevron-right pull-right"></span>
</h2>
</li>
<li>
<h2 class="sidebar-menu-trigger" data-target="aspect_authorprofile_Navigation_list_authorprofile">Author profiles<span class="glyphicon glyphicon-chevron-right pull-right"></span>
</h2>
</li>
<li>
<h2 class="sidebar-menu-trigger" data-target="aspect_statistics_Navigation_list_statistics">Site statistics<span class="glyphicon glyphicon-chevron-right pull-right"></span>
</h2>
</li>
<li>
<h2 c

In [19]:
# Target the title header text
slide_element.find('h2', class_='ds-div-head')
print(slide_element.find('h2', class_='ds-div-head'))

<h2 class="ds-div-head">Women, Business and the Law 2019 : A Decade of Reform</h2>


In [21]:
# Store the journal title, summary, citation, link, publish date, and author

title = slide_element.find('a', class_='no-decor').get_text()
summary = slide_element.find('div', class_='okr-item-page-field-wrapper abstract').get_text()
citation = slide_element.find('div', class_='citation').get_text()
link = slide_element.find('div', class_='okr-item-page-field-wrapper uri').get_text() # can be problematic due to it scraping "URI" as well as the link
publish_date = slide_element.find('div', class_='simple-item-view-other word-break').get_text()
author = slide_element.find('div', class_='authorprofile-item-view-link').get_text()

#     Check
print("Title: " + title)
print("Abstract:" + summary)
print("Citation:" + citation)
print("Link:" + link)
print("Date:" + publish_date)
print("Author:" + author)

Title: World Development Report 2018 : Learning to Realize Education's Promise
Abstract:
AbstractThe World Bank Group’s Women, Business and the Law examines laws and regulations affecting women’s prospects as entrepreneurs and employees across 187 economies. Its goal is to inform policy discussions on how to remove legal restrictions on women and promote research on how to improve women’s economic inclusion. Women, Business and the Law 2019: A Decade of Reform introduces a new index measuring legal rights for women throughout their working lives in 187 economies. The index is composed of 35 data points grouped into eight indicators. The data covers a 10-year period not only to understand the current situation but to see how laws affecting women’s equality of opportunity have evolved over time. The index assesses economic rights at milestones spanning the arc of a woman’s working life: the ability to move freely; starting a job; getting paid; legal capacity within marriage; having child

In [22]:
# Go back to the previous page
browser.back()

In [23]:
# Storing the variables into a dataframe
data_df = pd.DataFrame([[title, summary, link, publish_date, author]], columns=["title", "summary", "link", "publish_date", "author"])


In [24]:
# Replace the "\n" in the dataframe
data_df = data_df.replace('\n',' ', regex=True)
data_df # Check

Unnamed: 0,title,summary,link,publish_date,author
0,World Development Report 2018 : Learning to Re...,"AbstractThe World Bank Group’s Women, Busines...",URI http://hdl.handle.net/10986/31327,2019-02-27,World Bank Group


In [25]:
# Saving the dataframe into CSV
data_df.to_csv("data.csv", index=False, encoding='utf-8')