In [None]:
## Update the 'pip' package manager and install the Beautiful Soup package

!pip3 install --upgrade pip
!pip3 install bs4

In [None]:
## Importing the packages we'll use below
from bs4 import BeautifulSoup
from urllib.request import urlopen

## URL for the Wikipedia page "List of countries in the Eurovision Song Contest"
countries_url = "https://en.wikipedia.org/wiki/List_of_countries_in_the_Eurovision_Song_Contest"

## Downloading the page's HTML source code and storing it in the variable 'page_html'.
#  The 'with' block closes the network connection as soon as the page has been read.
with urlopen(countries_url) as response:
    page_html = response.read().decode('utf8')

## Discarding the part of the page we don't need (everything before the "Table key" marker).
#  If the marker is missing we stop with a clear message rather than an IndexError.
table_key_marker = '<dt>Table key</dt>'
if table_key_marker not in page_html:
    raise ValueError('Marker ' + repr(table_key_marker) + ' not found in ' + countries_url)

page_chunk = page_html.split(table_key_marker)[1]

soup = BeautifulSoup(page_chunk, 'lxml') ## Parsing the HTML fragment with Beautiful Soup

In [None]:
## Turning the first HTML table into a list of lists:
#  one inner list per table row, holding the text of each <td> cell in that row.
#  (Rows without any <td> cells become empty lists.)

country_lol = [
    [cell.get_text() for cell in table_row.find_all('td')]
    for table_row in soup.find('table').find_all('tr')
]

In [None]:
## Building the list of country names from the first cell of each non-empty row

country_names = []

for cells in country_lol:
    if cells: ## Skipping rows that produced an empty list
        first_cell_text = cells[0]
        country_names.append(first_cell_text.replace('\xa0','')) ## Removing non-breaking spaces

print(len(country_names))

country_names

In [None]:
## Creating a list of URLs for each country

country_urls = []

for row in soup.find('table').find_all('tr'):
    first_cell = row.find('td')

    ## Rows are skipped only for the three expected reasons: no <td> cell, no link in the
    #  first cell, or a link without an 'href'. Any other error is no longer silently hidden.
    link = first_cell.find('a') if first_cell is not None else None
    href = link.get('href') if link is not None else None
    if href is None:
        continue

    url = href.replace('/wiki/', 'https://en.wikipedia.org/wiki/') ## Converting relative links to absolute URLs
    country_urls.append(url)

print(len(country_urls))

country_urls

In [None]:
## Showing each country name directly above its URL, separated by blank lines,
#  so we can check by eye that the two lists line up

for position, name in enumerate(country_names):
    print(name)
    print(country_urls[position])
    print()

In [None]:
## Scraping "Contestants" table from every country URL on our list

from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import time

## Marker that opens the "Contestants" section in each country page's HTML source
contestants_marker = '<span class="mw-headline" id="Contestants">'

dataframe_list = []

## 'country_names' and 'country_urls' are paired by position, so they must be in the same order.
for country, url in zip(country_names, country_urls):

    ## Downloading the country page; the 'with' block closes the connection once the page is read.
    #  Loop-specific names ('country_html', 'country_soup') are used so that the 'page_html' and
    #  'soup' variables created for the country-list page are not overwritten.
    with urlopen(url) as response:
        country_html = response.read().decode('utf8')

    time.sleep(0.1) ## Short pause after each download, before the next request

    ## Reporting and skipping pages without a "Contestants" section instead of stopping the whole loop
    if contestants_marker not in country_html:
        print('ERROR: ' + url)
        print('"Contestants" section not found')
        continue

    country_soup = BeautifulSoup(country_html.split(contestants_marker)[1], 'lxml')

    table = country_soup.find('table') ## First table after the section marker
    if table is None:
        print('ERROR: ' + url)
        print('No table found after the "Contestants" marker')
        continue

    header = [item.get_text() for item in table.find_all('th')]

    header = [item.split('[')[0].strip() for item in header] ## Keeping only the text before any '['

    if 'Language' not in header:
        header.append('Language')

    list_of_lists = []

    for row in table.find_all('tr'):
        row = [item.get_text() for item in row.find_all('td')]
        if len(row)>=4: ## Ignoring rows with fewer than four <td> cells
            while len(row)<len(header): ## Padding short rows with empty strings
                row.append('')
            list_of_lists.append(row)

    dataframe = pd.DataFrame(list_of_lists, columns=header) ## We're using pandas because columns on Wikipedia
                                                            # might be ordered differently.

    dataframe['Country'] = country

    ## Some pages call the title column 'Song' instead of 'Title'
    title_column = 'Title' if 'Title' in dataframe.columns else 'Song'

    try:
        reduced_dataframe = dataframe[['Country', 'Year', 'Language', title_column]]
    except KeyError as e: ## A required column is missing on this page: report it and move on
        print('ERROR: ' + url)
        print(header)
        print(e)
    else:
        ## Renaming before appending, so every stored dataframe uses the column name 'Title'
        dataframe_list.append(reduced_dataframe.rename(columns={'Song': 'Title'}))

In [None]:
## Counting how many per-country dataframes were collected in 'dataframe_list'
len(dataframe_list)

In [None]:
## Viewing a random country's dataframe
#  (spot check: picks one entry of 'dataframe_list' at random and displays it)

import random

random.choice(dataframe_list)

In [None]:
## Combining the per-country dataframes into one table.
#  'ignore_index=True' gives the combined table a fresh 0..n-1 index; without it, every
#  country's rows would keep their own 0-based labels and the index would contain duplicates.
master_table = pd.concat(dataframe_list, ignore_index=True)

In [None]:
import os
## Switching into the shared folder, then saving the combined table there as a CSV file
output_folder = '/sharedfolder/'
output_filename = 'Eurovision_Songs.csv'

os.chdir(output_folder)
master_table.to_csv(output_filename, index=None)