In [293]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [294]:
url ='https://en.wikipedia.org/wiki/List_of_association_football_stadiums_by_capacity'
NO_IMAGE = 'https://upload.wikimedia.org/wikipedia/commons/thumb/0/0a/No-image-available.png/480px-No-image-available.png'

In [297]:
def get_html_page(url):
    print(f'Getting wikipedia html page {url}')
    try:
        response = requests.get(url,timeout=10)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except requests.RequestException as e:
        print(f'Error getting wikipedia data {e}')

In [298]:
def get_table_data(url):
    soup = get_html_page(url)
    table = soup.find_all("table", {"class":"wikitable"})[1]
    table_rows = table.find_all("tr")
    return table_rows

In [299]:
data = []
def extract_data(url):
    table_rows = get_table_data(url)
    for i in range(1,len(table_rows)):
        table_data = table_rows[i].find_all('td')
    
        description_url = 'https://en.m.wikipedia.org' + table_data[0].find('a').get('href')
        desc_html = get_html_page(description_url)
        section_content = desc_html.find_all("section",{"class":"mf-section-0"})
        if section_content:
            p_len = len(section_content[0].find_all("p"))
            if p_len == 1:
                description = section_content[0].find_all("p")[0].text
            elif p_len == 2:
                description = section_content[0].find_all("p")[1].text
            elif p_len > 2:
                description = section_content[0].find_all("p")[1].text + "-" + section_content[0].find_all("p")[2].text
        else :
            description = 'No description available for this stadium'
            
        table_values = {
            'rank' : i,
            'stadium' : table_data[0].text,
            'description' : description,
            'capacity' : table_data[1].text,
            'region' : table_data[2].text,
            'country' : table_data[3].text,
            'city' : table_data[4].text,
            'image_url' : 'https:' + table_data[5].find('img').get('src') if table_data[5].find('img') else NO_IMAGE
        }
        data.append(table_values)
    return pd.DataFrame(data)

In [300]:
df_stadiums = extract_data(url)

Getting wikipedia html page https://en.wikipedia.org/wiki/List_of_association_football_stadiums_by_capacity
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Rungrado_1st_of_May_Stadium
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Michigan_Stadium
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Ohio_Stadium
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Melbourne_Cricket_Ground
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Camp_Nou
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Estadio_Azteca
Getting wikipedia html page https://en.m.wikipedia.org/wiki/FNB_Stadium
Getting wikipedia html page https://en.m.wikipedia.org/wiki/New_Administrative_Capital_Stadium
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Rose_Bowl_(stadium)
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Cotton_Bowl_(stadium)
Getting wikipedia html page https://en.m.wikipedia.org/wiki/Wembley_Stadium
Getting wikipedia ht

In [302]:
df_stadiums.tail(10)

Unnamed: 0,rank,stadium,description,capacity,region,country,city,image_url
368,369,Stadium Darul Makmur,Darul Makmur Stadium (Malay: Stadium Darul Mak...,40000,Southeast Asia,Malaysia,Kuantan\n,https://upload.wikimedia.org/wikipedia/commons...
369,370,Khalifa International Stadium,"Khalifa International Stadium (⫽kəˈliːfə⫽, Ara...",40000,West Asia,Qatar,Doha\n,https://upload.wikimedia.org/wikipedia/en/thum...
370,371,Stade de l’Amitié sino-gabonaise ♦,It was one of four stadiums used for the 2012 ...,40000,Africa,Gabon,Libreville\n,https://upload.wikimedia.org/wikipedia/commons...
371,372,Peoples Football Stadium,The stadium is part of a sports complex which ...,40000,South Asia,Pakistan,Karachi\n,https://upload.wikimedia.org/wikipedia/commons...
372,373,JRD Tata Sports Complex Stadium,"The JRD Tata Sports Complex Stadium, also know...",40000,South Asia,India,Jamshedpur\n,https://upload.wikimedia.org/wikipedia/commons...
373,374,Bao'an Stadium,The stadium features a cantilever membrane roo...,40000,East Asia,China\n,"Shenzhen, Guangdong\n",https://upload.wikimedia.org/wikipedia/commons...
374,375,Kunming Tuodong Sports Centre Stadium,Tuodong Sports Centre is one of the major spor...,40000,East Asia,China\n,"Kunming, Yunnan\n",https://upload.wikimedia.org/wikipedia/commons...
375,376,Taizhou Sports Centre Stadium,The Taizhou Sports Centre Stadium (Simplified ...,40000,East Asia,China\n,"Taizhou, Zhejiang\n",https://upload.wikimedia.org/wikipedia/commons...
376,377,Wuhu Olympic Stadium,31°18′40″N 118°22′33″E﻿ / ﻿31.311202°N 118.375...,40000,East Asia,China\n,"Wuhu, Anhui\n",https://upload.wikimedia.org/wikipedia/commons...
377,378,Huizhou Olympic Stadium,"The stadium holds 40,000 spectators.[1] It is ...",40000,East Asia,China\n,"Huizhou, Guangdong\n",https://upload.wikimedia.org/wikipedia/commons...
