In [8]:
!python3 -m jupyter_contrib_nbextensions install --user 

/Library/Frameworks/Python.framework/Versions/3.7/bin/python3: No module named jupyter_contrib_nbextensions.__main__; 'jupyter_contrib_nbextensions' is a package and cannot be directly executed


# Task 1: Data Acquisition

#### Please extract bilateral investment treaty data from the following url: https://investmentpolicy.unctad.org/international-investment-agreements/by-economy.

#### Specifically, you should scrape and build two datasets: 
#### 1. A country-level dataset containing the following: (points: 5)

##### • Variables:
###### – country: the name of the country (dtype: str)
###### – n_bits: the total number of bilateral investment treaties (dtype: int);
###### – n_bits_active: the total number of active bilateral investment treaties (dtype: int).

In [666]:
# Import libraries 
import requests 
import country_converter as coco
from bs4 import BeautifulSoup
import pandas as pd
import time 
import random

In [667]:
# URL that we'll scrape 
url = "https://investmentpolicy.unctad.org/international-investment-agreements/by-economy"
page = requests.get(url)
# Check connection 
page.status_code

200

In [668]:
soup = BeautifulSoup(page.content, 'html.parser')

In [669]:
# Country Name 
soup.find_all('td')[0].get_text()

'Afghanistan'

In [670]:
# BIT information 
soup.find_all('td')[1].get_text()

'4 (3 in force)'

In [671]:
content1 = [i.get_text() for i in soup.find_all('td')]

In [678]:
some_names == content1[::3]

True

In [672]:
# Check results 
content1[0:6]

['Afghanistan',
 '4 (3 in force)',
 '5 (4 in force)',
 'Albania',
 '44 (39 in force)',
 '7 (7 in force)']

In [673]:
# Create separate country name list 
country_name = content1[::3]
# Standardize country names 
country_name = coco.convert(names=some_names, to='name_short')
# Check data 
country_name[0:6]
# Create panda df of country_name 
df_country_name = pd.DataFrame(country_name)
df_country_name.columns = ['country']
df_country_name.head()

Channel Islands not found in regex
Yugoslavia (former) not found in regex


Unnamed: 0,country
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola


In [414]:
# Create separate valuelist 
BIT_name = content1[1::3]
# Check data 
BIT_name[0:6]

['4 (3 in force)',
 '44 (39 in force)',
 '45 (29 in force)',
 '1 (1 in force)',
 '17 (6 in force)',
 '0 ']

In [418]:
# Create pd of 
df_n_bits = pd.DataFrame(BIT_name)
# Specify column name 
df_n_bits.columns = ['n_bits']
# Split string 
df_n_bits["n_bits"] = (df_n_bits.n_bits.str.split('\(').str[0])
# Check data
df_n_bits.head()

Unnamed: 0,n_bits
0,4
1,44
2,45
3,1
4,17


In [470]:
# Convert Total Column to int 
df_n_bits["n_bits"] = pd.to_numeric(df_bit["n_bits"])
# Check type
type(df_n_bits["n_bits"][4])

numpy.int64

In [471]:
# Create pd of In_force values 
df_n_bits_active = pd.DataFrame(BIT_name)
# Specify column name 
df_n_bits_active.columns = ["n_bits_active"]
# Split string to locate 
df_n_bits_active["n_bits_active"] = (df_n_bits_active.n_bits_active.str.split('\(').str[1])
# Split string on the end 
df_n_bits_active["n_bits_active"] = (df_n_bits_active.n_bits_active.str.split(' ').str[0]).fillna(0)
# Check data 
df_n_bits_active.head()

Unnamed: 0,n_bits_active
0,3
1,39
2,29
3,1
4,6


In [472]:
# Convert Total Column to int 
df_n_bits_active['n_bits_active'] = pd.to_numeric(df_n_bits_active['n_bits_active'])
# Check type
type(df_n_bits_active['n_bits_active'][4])

numpy.int64

In [484]:
# Merge values 
value_merge = df_n_bits.merge(df_n_bits_active, left_index = True, right_index = True)
# Merge full df 
df_country_level = df_country_name.merge(value_merge, left_index = True, right_index = True)
df_country_level.dtypes

country          object
n_bits            int64
n_bits_active     int64
dtype: object

In [474]:
# Check data 
pd.options.display.max_rows = 500
df_country_level

Unnamed: 0,country,n_bits,n_bits_active
0,Afghanistan,4,3
1,Albania,44,39
2,Algeria,45,29
3,Andorra,1,1
4,Angola,17,6
5,Anguilla,0,0
6,Antigua and Barbuda,3,2
7,Argentina,55,49
8,Armenia,42,39
9,Aruba,0,0


##### • Restrict this dataset to only include countries that are active member states in the United Nations.

In [682]:
# Assign U.N. URL to object 
url = 'https://www.un.org/en/about-us/member-states'
page = requests.get(url)

In [683]:
# Check connection 
page.status_code

200

In [684]:
# Parse content 
soup = (BeautifulSoup(page.content, 'html.parser'))

In [685]:
soup.find_all('h2')[1].get_text()

'Afghanistan'

In [686]:
# Create list of countries
content1 = [i.get_text() for i in soup.find_all('h2')]
content1[0:6]

['Search the United Nations',
 'Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola']

In [687]:
len(content1)

194

In [681]:
# Convert country names to standarized format 
final_df = coco.convert(names=some_names, to='name_short')
# Convert list to pd df 
final_df = pd.DataFrame(final_df)
# Drop first row 
final_df = final_df.iloc[0: , :]
# Name column 
final_df.columns = ['country']
#Check data
final_df

Channel Islands not found in regex
Yugoslavia (former) not found in regex


Unnamed: 0,country
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola
5,Anguilla
6,Antigua and Barbuda
7,Argentina
8,Armenia
9,Aruba


In [491]:
# Merge U.N. country data with value data
final_df = final_df.merge(df_country_level, how ='left')
# Drop not found country names
final_df = final_df[final_df['country'] != 'not found']
# Check data
final_df.sample(10, random_state = 123)

Unnamed: 0,country,n_bits,n_bits_active
187,Seychelles,4,2
151,Nicaragua,18,13
134,Mayotte,0,0
210,Tokelau,0,0
83,Grenada,2,2
91,Vatican,0,0
195,South Sudan,1,0
82,Greenland,0,0
202,Switzerland,112,111
20,Bermuda,0,0


In [492]:
# Check that data types fulfill requirements 
final_df.dtypes

country          object
n_bits            int64
n_bits_active     int64
dtype: object

In [679]:
final_df

Unnamed: 0,country,n_bits,n_bits_active
0,Albania,44,39
1,Algeria,45,29
2,Andorra,1,1
3,Angola,17,6
4,Anguilla,0,0
5,Antigua and Barbuda,3,2
6,Argentina,55,49
7,Armenia,42,39
8,Aruba,0,0
9,Australia,15,15


#### 2. A dyad-level dataset containing the following: (points: 10)
##### •Variables:
##### – country_A: the name of the signatory country (dtype: str);
##### – country_B: the name of the party country that signed a bilateral investment treaty with the signatory (dtype: str);
##### – status: categorical field tracking the current status of the BITs agreement (e.g. “Signed”, “In force”, and “Terminated”) these fields should be coded "signed", "active", and "terminated". (dtype: str);
##### – year_signed: year the agreement was signed (dtype: int/float);
#####  –year_enforced: year the agreement was enforced (dtype: int/float);
##### – year_terminated: year the agreement was terminated (if applicable) (dtype: int/float); Note that this data can be found by following the hyperlinks for each country. For example, if you click on “Afghanistan” you link to the following page. You’ll need to systematically scrape each country’s information from these pages. Note that we always know who the signatory country is (i.e. country_A) as they are the country name with the hyperlink.
##### • Restrict this dataset to only include countries (both in terms of the country_A and country_B fields) that are active member states in the United Nations.
##### • Note that every relevant country present in the first dataset should be present in the country_A column in the second dataset. However, countries where n_bits == 0 will not have any dyadic data. Ignore these countries (in terms of the country_A field) when building dataset 2.
##### • Finally, be sure to put your scraper to sleep randomly using a uniform distribution to determine the rest time upon each extraction iteration.

In [575]:
url =  'https://investmentpolicy.unctad.org/international-investment-agreements/treaties/bilateral-investment-treaties/596/bolivia-plurinational-state-of---united-states-of-america-bit-1998-'

In [577]:
page = requests.get(url)
# Check connection 
page.status_code

200

In [578]:
soup = BeautifulSoup(page.content, 'html.parser')

In [619]:
soup.find_all('a')

[<a class="navbar-brand" href="/">
 <img alt="logo" src="/Content/images/logo.png"/><h2>Investment Policy Hub</h2>
 </a>,
 <a alt="" href="#" id="lnkLogin"><img alt="" id="imglogin" src="/Content/images/user.png"/></a>,
 <a href="#"><img alt="" src="/Content/images/translate.png"/></a>,
 <a href="#" id="recover-password">Lost password?</a>,
 <a class="close" href="#">+</a>,
 <a href="/">Home</a>,
 <a aria-expanded="false" aria-haspopup="true" class="dropdown-toggle" href="#" role="button">Country Navigator</a>,
 <a class="dropdown-button" href="/country-navigator">Country Navigator</a>,
 <a class="" href="/country-navigator">Country Navigator</a>,
 <a aria-expanded="false" aria-haspopup="true" class="dropdown-toggle" data-toggle="dropdown" href="#" id="lnkMenuPolicyTools" role="button">Policy Tools</a>,
 <a href="/investment-policy-framework">
 <div class="title">
 <div class="title__img"><img alt="ipf" src="/Content/images/ipf.png"/></div>
 <div class="title__text"><b>Investment Polic

In [642]:
x = soup.find_all(class_='col-sm-10')
x

[<div class="col-sm-10">
 <ul>
 <li>1. <a href="/international-investment-agreements/countries/1/afghanistan">Afghanistan</a></li>
 <li>2. <a href="/international-investment-agreements/countries/98/iran-islamic-republic-of">Iran, Islamic Republic of</a></li>
 </ul>
 </div>, <div class="col-sm-10">
 <p class="form-control-static">
                     Bilateral Investment Treaties
                 </p>
 </div>, <div class="col-sm-10">
 <p class="form-control-static">
                     In force
                 </p>
 </div>, <div class="col-sm-10">
 <p class="form-control-static">
                         28/05/2006
                     </p>
 </div>, <div class="col-sm-10">
 <p class="form-control-static">
                         02/02/2008
                     </p>
 </div>, <div class="col-sm-10">
 <p class="form-control-static">
                     Not mapped
                 </p>
 </div>]

In [638]:
story_content = [i.get_text() for i in soup.select("div > p ")] 
story_content[10]

'\nTotal in force:\n2270\n'

In [606]:
story_content = [i.get_text() for i in soup.select("div > div > div > div > div > ul > li")] 

In [607]:
story_content

["\n\n\n\n                        About\n                    \n\n\n\nUNCTAD's Work Programme on International Investment Agreements (IIAs) actively assists policymakers, government officials and other IIA stakeholders to reform IIAs with a view to making them more conducive to sustainable development and inclusive growth.\n\n\nInternational investment rulemaking is taking place at the bilateral, regional, interregional and multilateral levels. It requires policymakers, negotiators, civil society and other stakeholders to be well informed about foreign direct investment, international investment agreements (IIAs) and their impact on sustainable development. \n\n\nMain goals of UNCTAD’s Work Programme on IIAs\n\n•   Reform the international investment agreements (IIAs) regime to enhance its sustainable development dimension, \n•   Provide comprehensive analysis on key issues arising from the complexity of the international investment regime, \n•   Develop a wide range of tools to support

In [649]:
story_text = "\n".join(story_content)
print(story_text)




1. Afghanistan
2. Iran, Islamic Republic of




                    Bilateral Investment Treaties
                



                    In force
                



                        28/05/2006
                    



                        02/02/2008
                    



                    Not mapped
                



In [639]:
url  = 'https://investmentpolicy.unctad.org/international-investment-agreements/treaties/bilateral-investment-treaties/2/afghanistan---iran-islamic-republic-of-bit-2006-'

page = requests.get(url)
# Check connection 
page.status_code

200

In [640]:
soup = BeautifulSoup(page.content, 'html.parser')

In [641]:
x = soup.find_all(class_='col-sm-10')
x[1:7]

[<div class="col-sm-10">
 <p class="form-control-static">
                     Bilateral Investment Treaties
                 </p>
 </div>, <div class="col-sm-10">
 <p class="form-control-static">
                     In force
                 </p>
 </div>, <div class="col-sm-10">
 <p class="form-control-static">
                         28/05/2006
                     </p>
 </div>, <div class="col-sm-10">
 <p class="form-control-static">
                         02/02/2008
                     </p>
 </div>, <div class="col-sm-10">
 <p class="form-control-static">
                     Not mapped
                 </p>
 </div>]

In [644]:
main_bbc_page_url = 'https://investmentpolicy.unctad.org/international-investment-agreements/by-economy'
main_page = requests.get(main_bbc_page_url)
main_page.status_code
main_soup = BeautifulSoup(main_page.content,'html.parser')


tag = main_soup.find_all("a")[10]
tag.attrs.get("href")


# Extract relevant links
links = set()
for tag in main_soup.find_all("a"):
    href = tag.attrs.get("href")
    #if "world-us-canada" in href and "https:" not in href:
        #links.update(["https://www.bbc.com" + href])
links

set()

In [648]:
def bbc_scraper(url=None):

    # Download the webpage
    page = requests.get(url)

    # If a connection was reached
    if page.status_code == 200:

        # Parse
        soup = BeautifulSoup(page.content, 'html.parser')

        # Pull Headline
        #story_headline = soup.find_all("h1")[0].get_text()

        # Pull Date
        #story_date = soup.select("article > header > div:nth-child(2) > dl > div > dd > span > time")[0].get_text()

        # Pull story content
        story_content = [i.get_text() for i in soup.select("article > div > div")]

        # Return data
        return [story_content]
bbc_scraper

In [None]:
https://investmentpolicy.unctad.org/international-investment-agreements/countries/1/afghanistan

In [647]:
story_content = [i.get_text() for i in soup.find_all(class_='col-sm-10')]
story_text = " ".join(story_content)
story_text

'\n\n1. Afghanistan\n2. Iran, Islamic Republic of\n\n \n\r\n                    Bilateral Investment Treaties\r\n                \n \n\r\n                    In force\r\n                \n \n\r\n                        28/05/2006\r\n                    \n \n\r\n                        02/02/2008\r\n                    \n \n\r\n                    Not mapped\r\n                \n'

In [None]:
def link_scrape(urls=None,sleep=3):
    """Scrape multiple BBC URLS.

    Args:
        urls (list): list of valid BBC news urls.
        sleep (int): Integer value specifying how long the machine should be
                    put to sleep (random uniform). Defaults to 3.

    Returns:
        DataFrame: frame containing headline, date, and content fields
    """
    scraped_data = []
    for url in urls:

        print(url) # Keep track of where we are at.

        try:
            # Scrape the content This will break on URLs that we haven't
            # accounted for the structure on. So we'll use a try and except 
            # clause so our code continues even though it breaks on some urls. 
            scraped_data.append(bbc_scraper(url))
        except:
            print("URL doesn't work with scraper")

        # Put the system to sleep for a random draw of time (be kind)
        time.sleep(random.uniform(0,sleep))

    dat = pd.DataFrame(scraped_data,columns=["headline","date","content"])
    return dat


dat_content = link_scrape(urls=links)
dat_content