In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Have IPython display the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Download the HTML of the Wikipedia page.
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page
# that would only fail later when the tables are looked up.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_countries_by_past_and_future_population",
    timeout=30,
)
response.raise_for_status()
# NOTE: despite its name, `url` holds the page's HTML text, not the address;
# the name is kept so any existing reference to it still works.
url = response.text
# Parse the HTML with the html5lib parser:
soup = BeautifulSoup(url, 'html5lib')

In [3]:
# Gather every element whose CSS class is "wikitable" and keep only the first
# two matches (indices 0 and 1).
tables = soup.find_all(class_='wikitable')[:2]

In [4]:
# Build the list of country names from the first wikitable.
# The first <tr> is skipped (presumably the header row); every remaining row
# contributes the text of its first <a> link.
names = [row.find_all('a')[0].text for row in tables[0].find_all('tr')[1:]]

In [5]:
# Second wikitable (tables[1]): the header row supplies the dates, the
# remaining rows supply the populations.

# Offsets, counted after the leading cell of each row, of the columns to keep:
# only the populations, not the average annual growth (%) columns in between.
goodColumnsT2 = [0, 2, 4, 6, 8, 10, 12]

# Date labels from the header row, skipping its first <th>.
header_cells = tables[1].find_all('tr')[0].find_all('th')[1:]
dates = [th.text for offset, th in enumerate(header_cells) if offset in goodColumnsT2]

# popsAll2 collects one list of integer populations per table row.
popsAll2 = []
for row in tables[1].find_all('tr')[1:]:
    value_cells = row.find_all('td')[1:]   # skip the row's first <td>
    popsAll2.append([
        int(td.text.replace(',', ''))      # drop thousands separators before parsing
        for offset, td in enumerate(value_cells)
        if offset in goodColumnsT2
    ])

In [6]:
# Assemble the nested dictionary {country name: {date: population}}.

# One {date: population} mapping per population list, in the same order.
innerDicts = [dict(zip(dates, country_pops)) for country_pops in popsAll2]

# Pair each country name with the mapping at the same position.
finalDict = dict(zip(names, innerDicts))

In [7]:
# Turn the nested dictionary into a dataframe: one row per country, one column per date.
populationDF = (
    pd.DataFrame.from_dict(finalDict, orient='index')
    .fillna(0)        # missing values become 0
    .reset_index()    # move the country names out of the index into an 'index' column
)
populationDF.head(8)

Unnamed: 0,index,1985,1990,1995,2000,2005,2010,2015
0,Afghanistan,13120,13569,19446,22462,26335,29121,32565
1,Albania,2957,3245,3159,3159,3025,2987,3030
2,Algeria,22009,25191,28322,30639,32918,35950,39543
3,American Samoa,39,48,54,58,57,56,55
4,Andorra,45,53,64,66,77,85,86
5,Angola,8390,9486,11000,12683,14770,17043,19626
6,Anguilla,7,9,10,12,14,15,17
7,Antigua and Barbuda,65,65,69,76,82,87,93


In [8]:
# Countries to keep in the curated dataset.
countryList = [
    'India',
    'Philippines',
    'Bangladesh',
    'China',
    'Saudi Arabia',
    'Poland',
    'Russia',
    'Germany',
    'Ukraine',
    'Serbia',
    'Albania',
    'Ecuador',
    'Colombia',
    'Brazil',
    'Chile',
    'Uganda',
    'Kenya',
    'Ethiopia',
    'Morocco',
    'South Africa',
    'Nigeria',
    'Burundi',
    'United States',
    'Canada',
    'Mexico',
    'Dominican Republic',
    'Guatemala',
    'Haiti',
    'Australia',
    'New Zealand',
    'Solomon Islands',
    'Fiji',
]

In [9]:
# Keep the country column plus the 2000-2015 columns, restrict the rows to the
# countries in countryList, and rename the 'index' column to 'country'.
populationDF = (
    populationDF
    .loc[populationDF['index'].isin(countryList),
         ['index', '2000', '2005', '2010', '2015']]
    .rename(columns={'index': 'country'})
)
populationDF

Unnamed: 0,country,2000,2005,2010,2015
1,Albania,3159,3025,2987,3030
11,Australia,19054,20233,21516,22752
16,Bangladesh,132151,144139,156119,168958
27,Brazil,174316,186021,195835,204260
32,Burundi,6716,7789,9121,10743
35,Canada,31100,32387,33760,35100
40,Chile,15175,15980,16760,17509
41,China,1268302,1302285,1336681,1367486
42,Colombia,38911,41488,44206,46737
55,Dominican Republic,8469,9165,9824,10479


In [10]:
# Renumber the rows from 0, discarding the index labels left over from filtering.
populationDF = populationDF.reset_index(drop=True)
populationDF

Unnamed: 0,country,2000,2005,2010,2015
0,Albania,3159,3025,2987,3030
1,Australia,19054,20233,21516,22752
2,Bangladesh,132151,144139,156119,168958
3,Brazil,174316,186021,195835,204260
4,Burundi,6716,7789,9121,10743
5,Canada,31100,32387,33760,35100
6,Chile,15175,15980,16760,17509
7,China,1268302,1302285,1336681,1367486
8,Colombia,38911,41488,44206,46737
9,Dominican Republic,8469,9165,9824,10479


In [11]:
# Write the curated dataset to disk, leaving the row index out of the file.
populationDF.to_csv(path_or_buf='Clean_population.csv', index=False)