# collect data file links from football-data source pages

Using the freely available [football-data](https://www.football-data.co.uk) site, we have collected a list of pages, one per country.

For each country page, we scrape all the data file links and write them to a csv table, one table per country.

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [2]:
# root of the local data tree; every path below is built from it
DATA_PATH = '../data/'

# input: text file listing the source page urls, one per line
URL_FILEPATH = f'{DATA_PATH}config/football_data_source_pages.txt'

# output: csv table pairing each country with its source page url
EACH_COUNTRY_URL_FILEPATH = f'{DATA_PATH}source_urls/country_urls.csv'

# output: path prefix for the per-country '<country>_data_file_urls.csv' tables
DATAFILE_URLS_FILEPATH = f'{DATA_PATH}source_urls/'

# read a list of page urls, one per country

In [3]:
# load the source page urls from the config file:
# one url per line, one line per country/region
country_urls = []
with open(URL_FILEPATH) as url_file:
    country_urls.extend(url_file.read().splitlines())

# extract the country name from the url

In [4]:
# derive one lowercase country name per source page url
countries = []
for page_url in country_urls:
    # the last path segment of the url is the page file name
    page_name = page_url.rsplit('/', 1)[-1]
    # the text before the first '.' is the country name
    stem = page_name.partition('.')[0].lower()
    # some page names carry a trailing 'm' after the country; drop a single one
    countries.append(stem[:-1] if stem.endswith('m') else stem)

# output a country-link table as csv

In [5]:
# pair each country name with its source page url and save the table as csv
# (index=False: the row index is not written to the file)
country_table = pd.DataFrame({'country': countries, 'url': country_urls})
country_table.to_csv(EACH_COUNTRY_URL_FILEPATH, index=False)

# collect data file links
For each country page, fetch and parse it, then scrape all the data file links it contains. Output one table per country listing the links to that country's match-data files. Also report the number of data links found on each page.

In [6]:
# For every country page: download it, collect the links to .csv data files,
# write them to '<country>_data_file_urls.csv' and report how many were found.
for url in country_urls:
    # country name = page file name without extension, lowercased,
    # minus a single trailing 'm' when present
    country = url.split('/')[-1].split('.')[0].lower()
    if country.endswith('m'):
        country = country[:-1]

    # a timeout keeps one unresponsive page from hanging the whole run;
    # raise_for_status reports HTTP errors instead of parsing an error page
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup_page = BeautifulSoup(response.content, 'html.parser')

    # NOTE(review): the links are taken from the fifth <table> of the page;
    # presumably this matches the current site layout -- confirm if it changes
    table = soup_page.html.body.find_all('table')[4]

    # anchors without an href yield None and are skipped; the set removes
    # duplicates and sorting keeps the csv row order stable between runs
    hrefs = (link.get('href') for link in table.find_all('a'))
    data_urls = sorted({
        'https://www.football-data.co.uk/' + href
        for href in hrefs
        if href and href.endswith('.csv')
    })

    num_urls = len(data_urls)
    df_url = pd.DataFrame({'country': [country] * num_urls, 'data_url': data_urls})
    df_url.to_csv(DATAFILE_URLS_FILEPATH + country + '_data_file_urls.csv', index=False)
    print(country, '-- parsed', num_urls, 'data urls')

england -- parsed 148 data urls
scotland -- parsed 118 data urls
germany -- parsed 64 data urls
italy -- parsed 60 data urls
spain -- parsed 61 data urls
france -- parsed 61 data urls
netherlands -- parsed 32 data urls
belgium -- parsed 30 data urls
portugal -- parsed 31 data urls
turkey -- parsed 31 data urls
greece -- parsed 31 data urls
argentina -- parsed 1 data urls
austria -- parsed 1 data urls
brazil -- parsed 1 data urls
china -- parsed 1 data urls
denmark -- parsed 1 data urls
finland -- parsed 1 data urls
ireland -- parsed 1 data urls
japan -- parsed 1 data urls
mexico -- parsed 1 data urls
norway -- parsed 1 data urls
poland -- parsed 1 data urls
romania -- parsed 1 data urls
russia -- parsed 1 data urls
sweden -- parsed 1 data urls
switzerland -- parsed 1 data urls
usa -- parsed 1 data urls
