In [1]:
!pip install --upgrade beautifulsoup4



In [2]:
# Import necessary modules
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Function to extract website data from the soup object
def get_website(soup):
  # Find the element with class 'infobox-label' and text 'Website'
  website_elem = soup.find(class_="infobox-label", string="Website")
  if website_elem:
    # If found, return the next sibling element with class 'infobox-data'
    next_elem = website_elem.find_next(class_='infobox-data')
    return next_elem.text
  else:
      # If not found, return None
      return None

# Function to extract population data from the soup object
def get_population(soup):
    # Find the table header element containing the text 'Population'
    population_elem = soup.select_one('th.infobox-header:-soup-contains("Population")')
    # If found, find the next sibling and extract the first numerical data
    return population_elem.parent.find_next_sibling().find(string=re.compile(r'\d+')) if population_elem else None # note the different syntax for the if/else clause here

# Function to clean the data in the DataFrame
def clean_data(df):
    # Remove unnecessary characters from 'latitude' and 'longitude' columns
    df['latitude'] = df['latitude'].str.split('″').str[0].str.replace('°', '.', regex=False).str.replace('′', '', regex=False)
    df['longitude'] = df['longitude'].str.split('″').str[0].str.replace('°', '.', regex=False).str.replace('′', '', regex=False)

# Function to extract city info from its Wikipedia page
def get_city_info(city):
    # Construct the URL
    url = f'https://en.wikipedia.org/wiki/{city}'
    # Send a GET request
    r = requests.get(url)
    # Parse the response content with BeautifulSoup
    soup = BeautifulSoup(r.content, 'html.parser')

    try:
        # Construct a dictionary with necessary details
        response_dict = {
            'city': soup.select_one(".firstHeading").get_text(),
            'country': soup.select_one(".infobox-data").get_text(),
            'latitude': soup.select_one(".latitude").get_text(),
            'longitude': soup.select_one(".longitude").get_text(),
            'website': get_website(soup),
            'population': get_population(soup)
        }
    except AttributeError:
        # If any data is missing, print an error message and return None
        print(f'Failed to get data for {city}')
        return None

    return response_dict

# Function to scrape data for a list of cities and return a DataFrame
def recreate_wiki(cities):
    # Get info for each city
    city_data = [get_city_info(city) for city in cities]
    # Convert the list of dictionaries to a DataFrame
    cities_df = pd.DataFrame(city_data)
    # Clean the data
    clean_data(cities_df)
    # Return the DataFrame
    return cities_df

In [3]:
list_of_cities = ['Berlin', 'Hamburg', 'Bremen', 'Munich', 'Stuttgart']
recreate_wiki(list_of_cities)

Unnamed: 0,city,country,latitude,longitude,website,population
0,Berlin,Germany,52.3112,13.2418,berlin.de,3850809
1,Hamburg,Germany,53.33N,10.00E,hamburg.com,1906411
2,Bremen,Germany,53.5N,8.48E,Bremen online,563290
3,Munich,Germany,48.0815,11.3430,stadt.muenchen.de,1512491
4,Stuttgart,Germany,48.4639,09.1048,www.stuttgart-tourist.de,626275
