In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from geopy import Nominatim

In [46]:
# Wikipedia article whose tables list football stadiums by capacity.
url = 'https://en.wikipedia.org/wiki/List_of_association_football_stadiums_by_capacity'

# Fallback picture used for table rows that contain no <img> element.
NO_IMAGE = (
    'https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/'
    'No-image-available.png/480px-No-image-available.png'
)

In [47]:
def get_html_page(url, timeout=10):
    """Download a web page and parse it with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Defaults
            to 10, the value that used to be hard-coded.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend, or
        ``None`` when the request fails (any ``requests.RequestException``,
        including HTTP error statuses). Callers must check for ``None``.
    """
    print(f'Getting wikipedia html page {url}')
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so they are reported below.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f'Error getting wikipedia data {e}')
        return None
    return BeautifulSoup(response.content, 'html.parser')

In [48]:
def get_table_data(url, table_index=1):
    """Return the ``<tr>`` rows of one ``wikitable`` found on a page.

    Args:
        url: Address of the page to download.
        table_index: Zero-based position of the wanted table among the
            page's ``wikitable`` tables. Defaults to 1 (the second table),
            the index that used to be hard-coded.

    Returns:
        The list of ``tr`` elements inside the selected table.

    Raises:
        RuntimeError: If the page could not be downloaded.
        IndexError: If the page has no ``wikitable`` at ``table_index``.
    """
    soup = get_html_page(url)
    if soup is None:
        # get_html_page signals a failed request by returning None; fail
        # here with a clear message instead of an AttributeError below.
        raise RuntimeError(f'Could not download {url}')

    tables = soup.find_all("table", {"class": "wikitable"})
    try:
        table = tables[table_index]
    except IndexError:
        raise IndexError(
            f'Expected a wikitable at index {table_index} on {url}, '
            f'found {len(tables)} table(s)'
        ) from None
    return table.find_all("tr")

In [49]:
def get_geo_loc(country, city):
    """Look up coordinates for a city through the Nominatim geocoder.

    Args:
        country: Country name, used as the second part of the query.
        city: City name, used as the first part of the query.

    Returns:
        A ``(latitude, longitude)`` tuple taken from the geocoder result,
        or ``None`` when the geocoder returns nothing for the query.
    """
    geocoder = Nominatim(user_agent='geoapiExercises')
    query = f'{city}, {country}'
    match = geocoder.geocode(query)
    if not match:
        return None
    return match.latitude, match.longitude

In [50]:
# Rows collected by the most recent extract_data() call. Kept at module level
# for backward compatibility; it is reset on every call.
data = []


def _extract_description(desc_html):
    """Return the lead text of a parsed stadium page, or a fixed fallback."""
    fallback = 'No description available for this stadium'
    if desc_html is None:
        # The detail page could not be downloaded.
        return fallback
    sections = desc_html.find_all("section", {"class": "mf-section-0"})
    if not sections:
        return fallback
    paragraphs = sections[0].find_all("p")
    if not paragraphs:
        # A lead section without any <p> previously left the description
        # unset (or carried over from the previous row).
        return fallback
    if len(paragraphs) == 1:
        return paragraphs[0].text
    if len(paragraphs) == 2:
        return paragraphs[1].text
    return paragraphs[1].text + "-" + paragraphs[2].text


def extract_data(url):
    """Scrape the stadium table at ``url`` into a DataFrame.

    Every table row after the first is read cell by cell. The link in the
    first cell is followed on the mobile Wikipedia site to fetch a short
    description of the stadium.

    Args:
        url: Address of the page holding the stadium table.

    Returns:
        A ``pandas.DataFrame`` with the columns ``rank``, ``stadium``,
        ``description``, ``capacity``, ``region``, ``country``, ``city``,
        ``home_team`` and ``image_url``. ``rank`` is the row's position in
        the table; text columns hold the raw, uncleaned cell text.

    Side effects:
        The module-level ``data`` list is cleared and refilled with this
        call's rows, so repeated calls no longer accumulate duplicates.
    """
    table_rows = get_table_data(url)
    data.clear()
    for i in range(1, len(table_rows)):
        table_data = table_rows[i].find_all('td')
        if len(table_data) < 7:
            # Seven cells (indices 0-6) are read below; skip anything shorter.
            print(f'Skipping row {i}: expected at least 7 cells, found {len(table_data)}')
            continue

        link = table_data[0].find('a')
        href = link.get('href') if link is not None else None
        if href:
            description_url = 'https://en.m.wikipedia.org' + href
            description = _extract_description(get_html_page(description_url))
        else:
            # No article link in the first cell, so nothing to fetch.
            description = _extract_description(None)

        image = table_data[5].find('img')
        table_values = {
            'rank' : i,
            'stadium' : table_data[0].text,
            'description' : description,
            'capacity' : table_data[1].text,
            'region' : table_data[2].text,
            'country' : table_data[3].text,
            'city' : table_data[4].text,
            'home_team' : table_data[6].text,
            # The src attribute is concatenated after the 'https:' scheme.
            'image_url' : 'https:' + image.get('src') if image else NO_IMAGE
        }
        data.append(table_values)
    return pd.DataFrame(data)

In [51]:
# Scrape the stadium table; extract_data also requests one detail page per
# row, so this cell issues many HTTP requests (see the log below).
df_stadiums = extract_data(url)

Getting wikipedia html page https://en.wikipedia.org/wiki/List_of_association_football_stadiums_by_capacity
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Rungrado_1st_of_May_Stadium
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Michigan_Stadium
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Ohio_Stadium
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Melbourne_Cricket_Ground
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Camp_Nou
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Estadio_Azteca
Getting wikipedia html page https://en.m.wikipedia.org/wiki/FNB_Stadium
Getting wikipedia html page https://en.m.wikipedia.org/wiki/New_Administrative_Capital_Stadium
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Rose_Bowl_(stadium)
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Cotton_Bowl_(stadium)
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Wembley_Stadium
Getting wikipedia ht

In [52]:
# Save the scraped rows so later cells can reload them from disk.
# NOTE(review): presumably ./data must already exist — confirm before a fresh run.
df_stadiums.to_csv('./data/output.csv', index=False)

In [90]:
# Reload the saved scrape from disk as the input for the cleaning step.
df = pd.read_csv('./data/output.csv')

In [54]:
import re
def handle_capacity(text):
    """Normalise a capacity cell such as ``"114,000[1]"`` to bare digits.

    Args:
        text: Raw capacity value. Non-string input (for example an int that
            was already cleaned) is converted with ``str`` first.

    Returns:
        The value as a string with every ``(...)`` / ``[...]`` group removed,
        commas dropped and surrounding whitespace stripped.
    """
    # Raw string: "\(" and "\[" are invalid escapes in a normal literal.
    cleaned = re.sub(r"[\(\[].*?[\)\]]", "", str(text))
    return cleaned.replace(",", "").strip()

In [91]:
def transform_data(data: pd.DataFrame) -> pd.DataFrame:
    """Clean the scraped stadium table.

    Works on a copy, so the frame passed in is left untouched and the cell
    can be re-run safely.

    Args:
        data: Frame with at least the columns ``capacity``, ``city``,
            ``country``, ``home_team``, ``stadium`` and ``description``.

    Returns:
        A new DataFrame in which:
        - ``capacity`` is passed through ``handle_capacity`` and cast to int;
        - ``city``, ``country`` and ``home_team`` are whitespace-stripped;
        - ``stadium`` has the "♦" marker removed and is stripped;
        - ``description`` has ``(...)`` / ``[...]`` groups removed, newlines
          replaced by spaces, and is stripped.
        Missing values in the text columns are left as missing.
    """
    cleaned = data.copy()

    # astype(str) lets an already-numeric capacity column pass through
    # handle_capacity; the strip guards against stray whitespace before int().
    cleaned['capacity'] = (
        cleaned['capacity']
        .astype(str)
        .apply(handle_capacity)
        .str.strip()
        .astype(int)
    )

    # The .str accessor skips missing values instead of raising on them.
    for column in ('city', 'country', 'home_team'):
        cleaned[column] = cleaned[column].str.strip()

    cleaned['stadium'] = (
        cleaned['stadium'].str.replace("♦", "", regex=False).str.strip()
    )

    cleaned['description'] = (
        cleaned['description']
        .str.replace(r"[\(\[].*?[\)\]]", "", regex=True)
        .str.replace('\n', ' ', regex=False)
        .str.strip()
    )

    # Geocoding through get_geo_loc(country, city) is not applied here.
    return cleaned

In [92]:
# Apply the cleaning steps defined in transform_data to the reloaded CSV.
transformed_data = transform_data(df)

In [93]:
# Preview the first rows of the cleaned table.
transformed_data.head()

Unnamed: 0,rank,stadium,description,capacity,region,country,city,home_team,image_url
0,1,Rungrado 1st of May Stadium,The Rungrado 1st of May Stadium is a multi-pur...,114000,East Asia,North Korea,Pyongyang,"Korea DPR national football team, Korea DPR wo...",https://upload.wikimedia.org/wikipedia/commons...
1,2,Michigan Stadium,Formerly -Michigan Stadium was built in 1927 a...,107601,North America,United States,"Ann Arbor, Michigan",Michigan Wolverines football,https://upload.wikimedia.org/wikipedia/commons...
2,3,Ohio Stadium,Ohio Stadium is an American football stadium i...,102780,North America,United States,"Columbus, Ohio",Ohio State Buckeyes football,https://upload.wikimedia.org/wikipedia/commons...
3,4,Melbourne Cricket Ground,"The Melbourne Cricket Ground , also known loc...",100024,Oceania,Australia,"Melbourne, Victoria","Australia national cricket team, Victoria cric...",https://upload.wikimedia.org/wikipedia/commons...
4,5,Camp Nou,"-Camp Nou , meaning New Field, often referred ...",99354,Europe,Spain,"Barcelona, Catalonia",FC Barcelona,https://upload.wikimedia.org/wikipedia/commons...
