My Chrome extension, which extracts webpage data ethically, created a JSON file containing the relative URL of each state's Wikipedia page. I load that file here.

In [1]:
import json

# The export is a JSON list whose first element carries the scraped links under
# the "items" key. Open the file as UTF-8 explicitly so decoding does not depend
# on the platform's default encoding.
with open('wikipedia_states_urls.json', encoding='utf-8') as f:
    urls = json.load(f)[0]['items']

print(urls)

[{'name': 'Johor', 'url': '/wiki/Johor'}, {'name': 'Kedah', 'url': '/wiki/Kedah'}, {'name': 'Kelantan', 'url': '/wiki/Kelantan'}, {'name': 'Malacca', 'url': '/wiki/Malacca'}, {'name': 'Negeri Sembilan', 'url': '/wiki/Negeri_Sembilan'}, {'name': 'Pahang', 'url': '/wiki/Pahang'}, {'name': 'Perak', 'url': '/wiki/Perak'}, {'name': 'Perlis', 'url': '/wiki/Perlis'}, {'name': 'Penang', 'url': '/wiki/Penang'}, {'name': 'Selangor', 'url': '/wiki/Selangor'}, {'name': 'Terengganu', 'url': '/wiki/Terengganu'}, {'name': 'Sabah', 'url': '/wiki/Sabah'}, {'name': 'Sarawak', 'url': '/wiki/Sarawak'}, {'name': 'Kuala Lumpur', 'url': '/wiki/Kuala_Lumpur'}, {'name': 'Labuan', 'url': '/wiki/Labuan'}, {'name': 'Putrajaya', 'url': '/wiki/Putrajaya'}]


With requests and bs4, I visit each page, pausing 5 seconds between requests, and extract the coordinates and the population from it.

In [2]:
import requests
from bs4 import BeautifulSoup
import time
import re

# Visit every state's page and add its coordinates ('geo') and population
# ('pop') to the corresponding dict in `urls` (the dicts are mutated in place).
for url in urls:
    # Pause before each request so the scrape stays slow and polite.
    time.sleep(5)
    # url['url'] already begins with '/', so the host must not end with one;
    # otherwise the request goes to 'https://en.wikipedia.org//wiki/...'.
    page = requests.get('https://en.wikipedia.org' + url['url'], timeout=30)
    # Fail loudly on HTTP errors instead of parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    geo = soup.find('span', {'class': 'geo'})
    if geo is None:
        # Without this guard the failure is an opaque AttributeError on None.
        raise ValueError(f"No <span class='geo'> found on {page.url}")
    url['geo'] = geo.text

    # The population figure sits in the first <td> of the row following the
    # infobox header whose text contains "Population". If several headers
    # match, the last one wins; if none match, 'pop' is left unset.
    for th in soup.find_all('th', {'class': 'infobox-header'}):
        if 'Population' in th.text:
            cells = th.parent.find_next('tr').find_all('td', recursive=False)
            url['pop'] = cells[0].text
    print(url)

{'name': 'Johor', 'url': '/wiki/Johor', 'geo': '1.99083; 103.48278', 'pop': '3,700,000 (3rd)'}
{'name': 'Kedah', 'url': '/wiki/Kedah', 'geo': '6.12833; 100.36278', 'pop': '2,071,900'}
{'name': 'Kelantan', 'url': '/wiki/Kelantan', 'geo': '5.250; 102.000', 'pop': '2,001,000'}
{'name': 'Malacca', 'url': '/wiki/Malacca', 'geo': '2.200; 102.250', 'pop': '932,700'}
{'name': 'Negeri Sembilan', 'url': '/wiki/Negeri_Sembilan', 'geo': '2.750; 102.250', 'pop': '1,098,500'}
{'name': 'Pahang', 'url': '/wiki/Pahang', 'geo': '3.750; 102.500', 'pop': '1,623,200'}
{'name': 'Perak', 'url': '/wiki/Perak', 'geo': '4.750; 101.000', 'pop': '2,500,000 (5th)'}
{'name': 'Perlis', 'url': '/wiki/Perlis', 'geo': '6.500; 100.250', 'pop': '254,400'}
{'name': 'Penang', 'url': '/wiki/Penang', 'geo': '5.40250; 100.36500', 'pop': '1.783 million (as at end of year 2,020)'}
{'name': 'Selangor', 'url': '/wiki/Selangor', 'geo': '3.333; 101.500', 'pop': '6,448,400 (1st)'}
{'name': 'Terengganu', 'url': '/wiki/Terengganu', 'g

# Data cleaning

In [3]:
import pandas as pd

# Build one row per entry of `urls`; the dict keys become the columns.
df = pd.DataFrame(urls)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,name,url,geo,pop
0,Johor,/wiki/Johor,1.99083; 103.48278,"3,700,000 (3rd)"
1,Kedah,/wiki/Kedah,6.12833; 100.36278,2071900
2,Kelantan,/wiki/Kelantan,5.250; 102.000,2001000
3,Malacca,/wiki/Malacca,2.200; 102.250,932700
4,Negeri Sembilan,/wiki/Negeri_Sembilan,2.750; 102.250,1098500
5,Pahang,/wiki/Pahang,3.750; 102.500,1623200
6,Perak,/wiki/Perak,4.750; 101.000,"2,500,000 (5th)"
7,Perlis,/wiki/Perlis,6.500; 100.250,254400
8,Penang,/wiki/Penang,5.40250; 100.36500,"1.783 million (as at end of year 2,020)"
9,Selangor,/wiki/Selangor,3.333; 101.500,"6,448,400 (1st)"


In [4]:
# Penang's scraped population is prose ('1.783 million (...)') rather than a
# comma-grouped number, so overwrite it with the equivalent figure.
# Select the row by state name instead of a hard-coded position, so the fix
# still hits the right row if the order of the states ever changes.
df.loc[df['name'] == 'Penang', 'pop'] = '1,783,000'
df

Unnamed: 0,name,url,geo,pop
0,Johor,/wiki/Johor,1.99083; 103.48278,"3,700,000 (3rd)"
1,Kedah,/wiki/Kedah,6.12833; 100.36278,2071900
2,Kelantan,/wiki/Kelantan,5.250; 102.000,2001000
3,Malacca,/wiki/Malacca,2.200; 102.250,932700
4,Negeri Sembilan,/wiki/Negeri_Sembilan,2.750; 102.250,1098500
5,Pahang,/wiki/Pahang,3.750; 102.500,1623200
6,Perak,/wiki/Perak,4.750; 101.000,"2,500,000 (5th)"
7,Perlis,/wiki/Perlis,6.500; 100.250,254400
8,Penang,/wiki/Penang,5.40250; 100.36500,1783000
9,Selangor,/wiki/Selangor,3.333; 101.500,"6,448,400 (1st)"


In [5]:
# Remove the thousands separators. The vectorised .str accessor passes missing
# values through unchanged, whereas a Python-level lambda calling .replace()
# raises on them. regex=False treats ',' as a literal string.
df['pop'] = df['pop'].str.replace(',', '', regex=False)
df

Unnamed: 0,name,url,geo,pop
0,Johor,/wiki/Johor,1.99083; 103.48278,3700000 (3rd)
1,Kedah,/wiki/Kedah,6.12833; 100.36278,2071900
2,Kelantan,/wiki/Kelantan,5.250; 102.000,2001000
3,Malacca,/wiki/Malacca,2.200; 102.250,932700
4,Negeri Sembilan,/wiki/Negeri_Sembilan,2.750; 102.250,1098500
5,Pahang,/wiki/Pahang,3.750; 102.500,1623200
6,Perak,/wiki/Perak,4.750; 101.000,2500000 (5th)
7,Perlis,/wiki/Perlis,6.500; 100.250,254400
8,Penang,/wiki/Penang,5.40250; 100.36500,1783000
9,Selangor,/wiki/Selangor,3.333; 101.500,6448400 (1st)


In [6]:
import re

# Keep only the first run of digits (dropping suffixes such as ' (3rd)') and
# convert it to an integer. Casting to str first makes the cell safe to re-run
# after the column has already been converted to int. A value with no digits
# becomes NaN, so the final astype(int) fails loudly rather than guessing.
df['pop'] = (
    df['pop']
    .astype(str)
    .str.extract(r"([0-9]+)", expand=False)
    .astype(int)
)
df

Unnamed: 0,name,url,geo,pop
0,Johor,/wiki/Johor,1.99083; 103.48278,3700000
1,Kedah,/wiki/Kedah,6.12833; 100.36278,2071900
2,Kelantan,/wiki/Kelantan,5.250; 102.000,2001000
3,Malacca,/wiki/Malacca,2.200; 102.250,932700
4,Negeri Sembilan,/wiki/Negeri_Sembilan,2.750; 102.250,1098500
5,Pahang,/wiki/Pahang,3.750; 102.500,1623200
6,Perak,/wiki/Perak,4.750; 101.000,2500000
7,Perlis,/wiki/Perlis,6.500; 100.250,254400
8,Penang,/wiki/Penang,5.40250; 100.36500,1783000
9,Selangor,/wiki/Selangor,3.333; 101.500,6448400


In [7]:
# Split 'lat; lon' into two columns. `n` must be passed by keyword: pandas >= 2.0
# accepts only `pat` positionally and raises TypeError for a positional `n`.
# The resulting values are still strings; they are not converted to numbers here.
df[['latitude', 'longitude']] = df['geo'].str.split('; ', n=1, expand=True)
# The raw coordinate string and the wiki path are no longer needed.
df = df.drop(columns=['geo', 'url'])
df.head()

Unnamed: 0,name,pop,latitude,longitude
0,Johor,3700000,1.99083,103.48278
1,Kedah,2071900,6.12833,100.36278
2,Kelantan,2001000,5.25,102.0
3,Malacca,932700,2.2,102.25
4,Negeri Sembilan,1098500,2.75,102.25


Save the cleaned data to a CSV file.

In [8]:
# Write the frame to states.csv; index=False leaves out the row-index column.
df.to_csv('states.csv', index=False)