<h2> Web Scrapping of Location Dataset with geopy Python Packages</h2>

<b> By Michael Kumakech</b>

In [51]:
import requests # helps us to browse and request web content
import lxml.html as lh # An API that communicate with the html code
import pandas as pd # used for uploading the data in a frame for analysis
import numpy as np # used for solving scientific, data science and computation

In [52]:
africa_url = 'https://www.worldometers.info/population/countries-in-africa-by-population/' #assign the wiki page

page = requests.get(africa_url) # create a handle to for contents of the wiki page
doc = lh.fromstring(page.content) # store content of the wiki page under doc

tr_elements = doc.xpath('//tr') # parse data stored between tr in the html

[len(T) for T in tr_elements[:12]] # check the length of the first 12

[12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12]

In [53]:
tr_elements = doc.xpath('//tr') # parse first row as header

col = [] # create empty list
i = 0

for t in tr_elements[0]: # for each row, store each first element (header) and an empty list
    i+=1
    name=t.text_content()
    print("%d:%s" % (i,name))
    col.append((name,[]))

1:#
2:Country (or dependency)
3:Population (2020)
4:Yearly Change
5:Net Change
6:Density (P/Km²)
7:Land Area (Km²)
8:Migrants (net)
9:Fert. Rate
10:Med. Age
11:Urban Pop %
12:World Share


In [54]:
for j in range(1,len(tr_elements)): # Because header is the first row, data would be store in the subsequent rows.
    T = tr_elements[j] #T is j'th row
    
    if len(T)!=12: #if row is not size 3, //tr data is not from the table.
        break

    i = 0 #i is the index of the first column
    
    for t in T.iterchildren(): #iterate through each element of the row
        data=t.text_content()
            
        col[i][1].append(data) #append the data to the empty list of the i'th column
            
        i+=1 #increment i for the next column

In [55]:
[len(C) for (title,C) in col]

[58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58]

In [56]:
Dict = {title:column for (title,column) in col}
df = pd.DataFrame(Dict)

In [57]:
df.head()

Unnamed: 0,#,Country (or dependency),Population (2020),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,1,Nigeria,206139589,2.58 %,5175990,226,910770,-60000,5.4,18,52 %,2.64 %
1,2,Ethiopia,114963588,2.57 %,2884858,115,1000000,30000,4.3,19,21 %,1.47 %
2,3,Egypt,102334404,1.94 %,1946331,103,995450,-38033,3.3,25,43 %,1.31 %
3,4,DR Congo,89561403,3.19 %,2770836,40,2267050,23861,6.0,17,46 %,1.15 %
4,5,South Africa,59308690,1.28 %,750420,49,1213090,145405,2.4,28,67 %,0.76 %


In [58]:
df.tail()

Unnamed: 0,#,Country (or dependency),Population (2020),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
53,54,Cabo Verde,555987,1.10 %,6052,138,4030,-1342.0,2.3,28,68 %,0.01 %
54,55,Mayotte,272815,2.50 %,6665,728,375,0.0,3.7,20,46 %,0.00 %
55,56,Sao Tome & Principe,219159,1.91 %,4103,228,960,-1680.0,4.4,19,74 %,0.00 %
56,57,Seychelles,98347,0.62 %,608,214,460,-200.0,2.5,34,56 %,0.00 %
57,58,Saint Helena,6077,0.30 %,18,16,390,,N.A.,N.A.,27 %,0.00 %


<h2> PART2: Clean the Data set </h2>

In [59]:
df1 =df[['Country (or dependency)','Population (2020)','Yearly Change','Net Change','Density (P/Km²)','Land Area (Km²)','Fert. Rate','Med. Age']]

In [60]:
df1.head()

Unnamed: 0,Country (or dependency),Population (2020),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Fert. Rate,Med. Age
0,Nigeria,206139589,2.58 %,5175990,226,910770,5.4,18
1,Ethiopia,114963588,2.57 %,2884858,115,1000000,4.3,19
2,Egypt,102334404,1.94 %,1946331,103,995450,3.3,25
3,DR Congo,89561403,3.19 %,2770836,40,2267050,6.0,17
4,South Africa,59308690,1.28 %,750420,49,1213090,2.4,28


<b> Drop column " Yearly Change"</b>

In [61]:
df1 =df1.drop(columns = ['Yearly Change'])
df1.head()

Unnamed: 0,Country (or dependency),Population (2020),Net Change,Density (P/Km²),Land Area (Km²),Fert. Rate,Med. Age
0,Nigeria,206139589,5175990,226,910770,5.4,18
1,Ethiopia,114963588,2884858,115,1000000,4.3,19
2,Egypt,102334404,1946331,103,995450,3.3,25
3,DR Congo,89561403,2770836,40,2267050,6.0,17
4,South Africa,59308690,750420,49,1213090,2.4,28


<b> Change the columns names</b>

In [62]:
df2 =df1.rename(columns ={'Country (or dependency)':'Country','Population (2020)':'Population','Net Change':'NetChange','Density (P/Km²)':'PopDensity','Land Area (Km²)':'LandArea','Fert. Rate':'FertRate','Med. Age':'MedAge'})

In [63]:
df2.head()

Unnamed: 0,Country,Population,NetChange,PopDensity,LandArea,FertRate,MedAge
0,Nigeria,206139589,5175990,226,910770,5.4,18
1,Ethiopia,114963588,2884858,115,1000000,4.3,19
2,Egypt,102334404,1946331,103,995450,3.3,25
3,DR Congo,89561403,2770836,40,2267050,6.0,17
4,South Africa,59308690,750420,49,1213090,2.4,28


<b>Remove the special charater ','</b>

In [64]:
df2['Population'] = df2.Population.str.replace(',','')
df2['NetChange'] = df2.NetChange.str.replace(',','')
df2['LandArea'] = df2.LandArea.str.replace(',','')
df2.head()

Unnamed: 0,Country,Population,NetChange,PopDensity,LandArea,FertRate,MedAge
0,Nigeria,206139589,5175990,226,910770,5.4,18
1,Ethiopia,114963588,2884858,115,1000000,4.3,19
2,Egypt,102334404,1946331,103,995450,3.3,25
3,DR Congo,89561403,2770836,40,2267050,6.0,17
4,South Africa,59308690,750420,49,1213090,2.4,28


In [65]:
df2.tail()

Unnamed: 0,Country,Population,NetChange,PopDensity,LandArea,FertRate,MedAge
53,Cabo Verde,555987,6052,138,4030,2.3,28
54,Mayotte,272815,6665,728,375,3.7,20
55,Sao Tome & Principe,219159,4103,228,960,4.4,19
56,Seychelles,98347,608,214,460,2.5,34
57,Saint Helena,6077,18,16,390,N.A.,N.A.


<b>Check the data types</b>

In [66]:
df2.dtypes

Country       object
Population    object
NetChange     object
PopDensity    object
LandArea      object
FertRate      object
MedAge        object
dtype: object

<b> Deal with Missing Values</b>

In [67]:
import numpy as np

# replace "N.A." to NaN
df2.replace("N.A.", np.nan, inplace = True)
df2.head(5)

Unnamed: 0,Country,Population,NetChange,PopDensity,LandArea,FertRate,MedAge
0,Nigeria,206139589,5175990,226,910770,5.4,18
1,Ethiopia,114963588,2884858,115,1000000,4.3,19
2,Egypt,102334404,1946331,103,995450,3.3,25
3,DR Congo,89561403,2770836,40,2267050,6.0,17
4,South Africa,59308690,750420,49,1213090,2.4,28


In [68]:
missing_data = df2.isnull()
missing_data.tail(5)

Unnamed: 0,Country,Population,NetChange,PopDensity,LandArea,FertRate,MedAge
53,False,False,False,False,False,False,False
54,False,False,False,False,False,False,False
55,False,False,False,False,False,False,False
56,False,False,False,False,False,False,False
57,False,False,False,False,False,True,True


In [69]:
for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("") 

Country
False    58
Name: Country, dtype: int64

Population
False    58
Name: Population, dtype: int64

NetChange
False    58
Name: NetChange, dtype: int64

PopDensity
False    58
Name: PopDensity, dtype: int64

LandArea
False    58
Name: LandArea, dtype: int64

FertRate
False    57
True      1
Name: FertRate, dtype: int64

MedAge
False    57
True      1
Name: MedAge, dtype: int64



<b> Replace the missing values with average</b>

In [70]:
avg_FertRate = df2["FertRate"].astype("float").mean(axis=0)
print("Average of FertRate:", avg_FertRate)

Average of FertRate: 4.143859649122808


<b> Convert the 'object' to 'int' and 'float'</b>

In [71]:
avg_MedAge = df2["MedAge"].astype("float").mean(axis=0)
print("Average of MedAge:", avg_MedAge)

Average of MedAge: 21.45614035087719


In [72]:
df2["FertRate"].replace(np.nan, avg_FertRate, inplace=True)

In [73]:
df2["MedAge"].replace(np.nan, avg_MedAge, inplace=True)

In [74]:
df2.tail()

Unnamed: 0,Country,Population,NetChange,PopDensity,LandArea,FertRate,MedAge
53,Cabo Verde,555987,6052,138,4030,2.3,28.0
54,Mayotte,272815,6665,728,375,3.7,20.0
55,Sao Tome & Principe,219159,4103,228,960,4.4,19.0
56,Seychelles,98347,608,214,460,2.5,34.0
57,Saint Helena,6077,18,16,390,4.14386,21.45614


In [75]:
df2[["Population","NetChange"]] = df2[["Population","NetChange"]].astype(str).astype(int)
df2[["PopDensity","LandArea"]] = df2[["PopDensity","LandArea"]].astype(str).astype(int)
df2[["FertRate"]] = df2[["FertRate"]].astype(str).astype(float)
df2[["MedAge"]] = df2[["MedAge"]].astype(str).astype(float)

In [76]:
df2.dtypes

Country        object
Population      int32
NetChange       int32
PopDensity      int32
LandArea        int32
FertRate      float64
MedAge        float64
dtype: object

In [77]:
df2.describe()

Unnamed: 0,Population,NetChange,PopDensity,LandArea,FertRate,MedAge
count,58.0,58.0,58.0,58.0,58.0,58.0
mean,23113760.0,560930.2,119.0,511180.7,4.14386,21.45614
std,35061690.0,883014.2,158.623907,584848.6,1.16318,5.157417
min,6077.0,18.0,2.0,375.0,1.4,15.0
25%,2257207.0,47291.75,25.0,28680.0,3.325,18.0
50%,12006990.0,269752.5,61.5,269800.0,4.35,19.0
75%,27404730.0,667544.8,131.5,814062.5,4.775,22.75
max,206139600.0,5175990.0,728.0,2381740.0,7.0,37.0


<b> Good! Now, we obtain cleaned dataset with no missing values.</b>

<h2> PART 3: Location Data</h2>

In [35]:
!pip install geopy
from geopy.geocoders import Nominatim # library to covert address to latitude and longitude
!pip install geocoder
import geocoder



In [36]:
!pip install geopandas



<b> Get latitude and longitude for each rows of the dataframe.</b>

In [78]:
import geocoder
def get_latlng(arcgis_geocoder): # defining the function
    
    lat_lng_coords = None # initialising location to None
    
    while(lat_lng_coords is None): # geocode while loop to create latitude and longitude for each rows
        g = geocoder.arcgis('{}, Africa, World'.format(arcgis_geocoder))
        lat_lng_coords = g.latlng
    return lat_lng_coords

<b> Get the latitude and longitude based on Region_district</b>

In [79]:
african_country_code = df2['Country']
coordinates = [get_latlng(african_country_code) for african_country_code in african_country_code.tolist()]

<h2> PART 4: Latitude and Longitude parameters </h2>

<b> Put the Latitude and Longitude columns in the dataframe and print the first 12 rows.</b>

In [80]:
df_loc = df2

df_coordinates = pd.DataFrame(coordinates, columns = ['Latitude', 'Longitude'])

df_loc['Latitude'] = df_coordinates['Latitude']

df_loc['Longitude'] = df_coordinates['Longitude']

df_loc.head(12)

Unnamed: 0,Country,Population,NetChange,PopDensity,LandArea,FertRate,MedAge,Latitude,Longitude
0,Nigeria,206139589,5175990,226,910770,5.4,18.0,-29.66972,31.00363
1,Ethiopia,114963588,2884858,115,1000000,4.3,19.0,-22.83571,30.54837
2,Egypt,102334404,1946331,103,995450,3.3,25.0,-28.19261,28.33044
3,DR Congo,89561403,2770836,40,2267050,6.0,17.0,-29.74187,30.87469
4,South Africa,59308690,750420,49,1213090,2.4,28.0,-33.97981,18.46509
5,Tanzania,59734218,1728755,67,885800,4.9,18.0,-29.68723,30.9751
6,Kenya,53771296,1197323,94,569140,3.5,20.0,-0.08534,34.77269
7,Uganda,45741007,1471413,229,199810,5.0,17.0,0.35128,32.53793
8,Algeria,43851044,797990,18,2381740,3.1,29.0,-32.3731,19.05766
9,Sudan,43849260,1036022,25,1765048,4.4,20.0,-27.93038,27.78379


In [43]:
df_loc.tail()

Unnamed: 0,Country,Population,NetChange,PopDensity,LandArea,FertRate,MedAge,Latitude,Longitude
53,Cabo Verde,555987,6052,138,4030,2.3,28.0,15.083629,-23.624672
54,Mayotte,272815,6665,728,375,3.7,20.0,-12.820609,45.147849
55,Sao Tome & Principe,219159,4103,228,960,4.4,19.0,0.239311,6.60206
56,Seychelles,98347,608,214,460,2.5,34.0,-29.75948,31.0658
57,Saint Helena,6077,18,16,390,4.14386,21.45614,-26.24269,29.70601


In [81]:
import matplotlib.cm as cm
import matplotlib.colors as colors

import numpy as np

import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

from sklearn.cluster import KMeans

!pip -q install folium
print('folium installed...')
import folium # library for map rendering
print('folium imported...')
print('Done')

folium installed...
folium imported...
Done


<b> Using the geopy library to get the latitude and longitude values of Lesotho</b>

In [83]:
from geopy.geocoders import Nominatim

address = 'Lesotho, Africa'

geolocator = Nominatim(user_agent="ln_explorer")

location = geolocator.geocode(address)

latitude = location.latitude

longitude = location.longitude

print('The geographical coordinates of Lesotho are {}, {}.'.format(latitude, longitude))

The geographical coordinates of Lesotho are -29.6039267, 28.3350193.


<b>Coming up with the map of Africa with folium </b>

In [84]:
map_africa = folium.Map(location = [latitude, longitude], zoom_start=4)

map_africa

<folium.map.Marker at 0x23797dcbe50>