In [None]:
import unicodedata
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# Cities whose English Wikipedia pages are scraped further down.
cities = [
    'Berlin',
    'Hamburg',
    'Munich',
    'Cologne',
    'Frankfurt',
]

# a function to get city info
def _infobox_text(soup, selector, locate=None, normalize=False):
    """Return the text of the first element matching ``selector``, or ``None``.

    ``locate``, when given, maps the matched element to the element that
    actually holds the value (for example the cell following a label).
    ``None`` is returned when either the match or the located element is
    missing, so each selector is evaluated once and an absent row never
    raises. With ``normalize=True`` the text is NFKD-normalised.
    """
    node = soup.select_one(selector)
    if node is not None and locate is not None:
        node = locate(node)
    if node is None:
        return None
    text = node.get_text()
    return unicodedata.normalize('NFKD', text) if normalize else text


def _next_sibling(label):
    """Return the sibling element that directly follows ``label``."""
    return label.find_next_sibling()


def _next_sibling_cell(label):
    """Return the next ``<td>`` sibling of ``label``."""
    return label.find_next_sibling('td')


def _next_cell(node):
    """Return the first ``<td>`` that follows ``node`` in document order."""
    # find_next is the current spelling of the deprecated findNext alias.
    return node.find_next('td')


def City_info(soup):
    """Extract headline facts about a city from its parsed page.

    Parameters
    ----------
    soup
        Parsed page exposing the BeautifulSoup API used here (``h1`` and
        ``select_one``).

    Returns
    -------
    dict
        ``'city'`` is always present and holds the text of the first ``<h1>``
        (``None`` when the page has no ``<h1>``). The keys ``'mayor'``,
        ``'city_size'``, ``'elevation'``, ``'city_population'``,
        ``'urban_population'``, ``'metro_population'``, ``'lat'`` and
        ``'long'`` are added, in that order, only when the corresponding
        element is found. ``'mayor'``, ``'city_size'`` and ``'elevation'``
        are NFKD-normalised; the remaining values are the raw element text.
    """
    # (result key, CSS selector, locator of the value element, NFKD-normalise?)
    fields = (
        ('mayor', '.mergedrow:-soup-contains("Mayor")>.infobox-label',
         _next_sibling, True),
        ('city_size', '.mergedrow:-soup-contains("City")>.infobox-label',
         _next_sibling_cell, True),
        ('elevation', '.mergedtoprow:-soup-contains("Elevation")>.infobox-data',
         None, True),
        ('city_population', '.mergedtoprow:-soup-contains("Population")',
         _next_cell, False),
        ('urban_population', '.infobox-label>[title^=Urban]',
         _next_cell, False),
        ('metro_population', '.infobox-label>[title^=Metro]',
         _next_cell, False),
        ('lat', '.latitude', None, False),
        ('long', '.longitude', None, False),
    )

    heading = soup.h1
    # Keep the 'city' key even without a heading so every record has it.
    ret_dict = {'city': heading.get_text() if heading is not None else None}

    for key, selector, locate, normalize in fields:
        text = _infobox_text(soup, selector, locate, normalize)
        if text is not None:
            ret_dict[key] = text

    return ret_dict

### Scrape the English Wikipedia page of every city and collect one record per city.
### To request English content explicitly, define the header below and pass it
### to `requests.get()` as `headers=header`:
# header = {"Accept-Language": "en-US,en;q=0.5"}

list_of_city_info = []
for city in cities:
    url = f'https://en.wikipedia.org/wiki/{city}'
    # requests.get() treats its second positional argument as query params,
    # so the parser name is given to BeautifulSoup instead.
    web = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors rather than parsing an error page.
    web.raise_for_status()
    soup = bs(web.content, 'html.parser')
    list_of_city_info.append(City_info(soup))

# One row per city; columns are the keys returned by City_info.
df_cities = pd.DataFrame(list_of_city_info)

In [None]:
def _leading_number(text_column):
    """Convert a column of scraped strings to numbers.

    Thousands separators are removed, everything from the first ``[``
    (footnote marker) or ``(`` (parenthetical) onwards is discarded, and
    surrounding whitespace is stripped before ``pd.to_numeric`` is applied.
    Missing values stay missing, and the column may have any length.
    """
    cleaned = (
        text_column
        .str.replace(',', '', regex=False)
        .str.split('[').str[0]
        .str.split('(').str[0]
        .str.strip()
    )
    return pd.to_numeric(cleaned)


new_cities = (
    df_cities
    # The area is the part of 'city_size' that precedes the 'km2' unit.
    .assign(city_area=lambda x: x['city_size'].str.split('km2').str[0])
    .filter(['city', 'elevation', 'city_population', 'urban_population',
             'metro_population', 'lat', 'long', 'city_area'])
    # The positional index becomes the city identifier.
    .reset_index()
    .rename(columns={'index': 'city_id', 'city': 'city_name',
                     'lat': 'latitude', 'long': 'longitude'})
    .assign(
        city_area=lambda x: _leading_number(x['city_area']),
        city_population=lambda x: _leading_number(x['city_population']),
        urban_population=lambda x: _leading_number(x['urban_population']),
        metro_population=lambda x: _leading_number(x['metro_population']),
        municipality_iso_country=lambda x: x['city_name'] + ",DE",
    )
)
new_cities

In [None]:
# Write the cleaned table to disk. The output folder is created first so the
# export also works when 'CSV/' does not exist yet. The DataFrame index is
# written as the first, unnamed column.
demographics_csv = Path('CSV/demographics.csv')
demographics_csv.parent.mkdir(parents=True, exist_ok=True)
new_cities.to_csv(demographics_csv)