In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np

In [2]:
# Seed for the random number generator, so the draw is reproducible.
SEED = 1

# Value sent as the `season` query parameter when requesting each league table.
SEASON = 19

# League name -> value sent as the `table` query parameter.
# Insertion order matters: the leagues are scraped and concatenated in this order.
league_ids = dict(
    premier_league=1,
    championship=2,
    league_one=3,
    league_two=4,
)


In [3]:
# Seeded generator: the weighted draw made with `rng.choice` later in the
# notebook gives the same result on every fresh Restart & Run All.
rng = np.random.default_rng(SEED)

In [4]:
# Download each league table page and keep the elements carrying the `cats`
# CSS class, keyed by league name.
LEAGUE_TABLE_URL = 'https://thefishy.co.uk/leaguetable.php'
REQUEST_TIMEOUT_SECONDS = 30

raw_league_tables = {}

for league in league_ids:
    page = requests.get(
        LEAGUE_TABLE_URL,
        # Encoded by requests as ?table=<id>&season=<season>.
        params={'table': league_ids[league], 'season': SEASON},
        # Without a timeout a stalled connection would block the notebook forever.
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # Fail loudly on a 4xx/5xx response instead of silently parsing an error
    # page and ending up with an empty or wrong table.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')
    raw_table = soup.find_all(class_='cats')
    raw_league_tables[league] = raw_table

In [5]:
def strip_out_club_name(raw_row):
    """Return the text that follows the last '>' once the closing markup is cut off.

    The row is converted to its string form, the final ``TRAILING_CHARS``
    characters are discarded, and whatever comes after the last remaining
    ``'>'`` is returned. If no ``'>'`` remains, the whole trimmed string is
    returned.

    NOTE(review): this relies on every row ending in exactly 9 characters of
    closing markup (presumably ``</a></td>``) -- confirm against the scraped page.

    Parameters
    ----------
    raw_row
        Any object whose ``str()`` is the row's markup.

    Returns
    -------
    str
        The extracted club name.
    """
    TRAILING_CHARS = 9
    without_closing_markup = str(raw_row)[:-TRAILING_CHARS]
    # rpartition yields the full string as the tail when '>' is absent,
    # which matches slicing from rfind('>') + 1 == 0.
    _, _, club_name = without_closing_markup.rpartition('>')
    return club_name

In [6]:
def parse_league_table(league_soup):
    """Return the distinct club names found in one league's scraped elements.

    Each element whose string form contains the substring ``'team'`` is passed
    to ``strip_out_club_name``; all other elements are ignored. Duplicate names
    are dropped, keeping the first occurrence, so the result preserves the
    order in which clubs first appear.

    Parameters
    ----------
    league_soup
        Iterable of scraped elements for a single league.

    Returns
    -------
    list of str
        Unique club names in first-seen order.
    """
    team_rows = (row for row in league_soup if 'team' in str(row))
    # dict keys keep insertion order, giving an order-preserving de-duplication.
    unique_names = dict.fromkeys(strip_out_club_name(row) for row in team_rows)
    return list(unique_names)

In [7]:
# Concatenate the parsed tables league by league, following the order of
# `league_ids`, into one flat list of club names.
football_league_standings = [
    club
    for league in league_ids
    for club in parse_league_table(raw_league_tables[league])
]

In [8]:
# Flip the list in place so it runs from the last club scraped to the first;
# the entry counts assigned afterwards grow with the position in this list.
# NOTE(review): this mutates in place and is not idempotent -- re-running this
# cell on its own flips the order back. Re-run the cell that builds the list first.
football_league_standings.reverse()

In [9]:
# The club at 1-based position i of the reversed standings receives i entries,
# so the first club in the list gets 1 entry and the last gets the most.
number_of_entries = {
    club: entries
    for entries, club in enumerate(football_league_standings, start=1)
}

In [10]:
# Display the club -> number-of-entries mapping for inspection.
number_of_entries

{'Scunthorpe': 1,
 'Oldham': 2,
 'Barrow': 3,
 'Stevenage': 4,
 'Carlisle': 5,
 'Harrogate Town': 6,
 'Rochdale': 7,
 'Hartlepool': 8,
 'Walsall': 9,
 'Colchester': 10,
 'Bradford': 11,
 'Leyton Orient': 12,
 'Crawley Town': 13,
 'Newport County': 14,
 'Salford': 15,
 'Tranmere': 16,
 'Sutton Utd': 17,
 'Mansfield': 18,
 'Swindon': 19,
 'Port Vale': 20,
 'Northampton': 21,
 'Bristol Rovers': 22,
 'Exeter': 23,
 'Forest Green': 24,
 'Crewe': 25,
 'AFC Wimbledon': 26,
 'Doncaster': 27,
 'Gillingham': 28,
 'Fleetwood Town': 29,
 'Morecambe': 30,
 'Shrewsbury': 31,
 'Lincoln City': 32,
 'Burton Albion': 33,
 'Cheltenham': 34,
 'Cambridge Utd': 35,
 'Charlton': 36,
 'Accrington Stanley': 37,
 'Ipswich': 38,
 'Portsmouth': 39,
 'Bolton': 40,
 'Oxford Utd': 41,
 'Plymouth': 42,
 'Wycombe': 43,
 'Sunderland': 44,
 'Sheff Wed': 45,
 'MK Dons': 46,
 'Rotherham': 47,
 'Wigan': 48,
 'Barnsley': 49,
 'Derby': 50,
 'Peterborough': 51,
 'Reading': 52,
 'Birmingham': 53,
 'Hull': 54,
 'Cardiff': 55,
 

In [11]:
# Total number of entries across all clubs; this is the denominator used when
# converting each club's entry count into a probability.
total_entries = sum(number_of_entries.values())
total_entries

4278

In [12]:
# Each club's share of all entries, i.e. its probability of being drawn.
entries_as_probability = {
    club: entries / total_entries
    for club, entries in number_of_entries.items()
}

entries_as_probability

{'Scunthorpe': 0.0002337540906965872,
 'Oldham': 0.0004675081813931744,
 'Barrow': 0.0007012622720897616,
 'Stevenage': 0.0009350163627863488,
 'Carlisle': 0.0011687704534829358,
 'Harrogate Town': 0.001402524544179523,
 'Rochdale': 0.0016362786348761104,
 'Hartlepool': 0.0018700327255726976,
 'Walsall': 0.0021037868162692847,
 'Colchester': 0.0023375409069658717,
 'Bradford': 0.002571294997662459,
 'Leyton Orient': 0.002805049088359046,
 'Crawley Town': 0.0030388031790556337,
 'Newport County': 0.0032725572697522207,
 'Salford': 0.0035063113604488078,
 'Tranmere': 0.0037400654511453952,
 'Sutton Utd': 0.003973819541841982,
 'Mansfield': 0.004207573632538569,
 'Swindon': 0.004441327723235157,
 'Port Vale': 0.004675081813931743,
 'Northampton': 0.004908835904628331,
 'Bristol Rovers': 0.005142589995324918,
 'Exeter': 0.005376344086021506,
 'Forest Green': 0.005610098176718092,
 'Crewe': 0.00584385226741468,
 'AFC Wimbledon': 0.006077606358111267,
 'Doncaster': 0.006311360448807854,
 'Gi

In [13]:
# Two index-aligned lists taken from the same dict: probabilities[i] is the
# draw weight of clubs_to_draw_from[i].
clubs_to_draw_from = [*entries_as_probability]
probabilities = [*entries_as_probability.values()]

In [14]:
# Draw 4 distinct clubs (replace=False means no club is picked twice), each
# weighted by its share of the total entries.
rng.choice(clubs_to_draw_from, size=4, replace=False, p=probabilities)

array(['Middlesbrough', 'Chelsea', 'Cambridge Utd', 'Peterborough'],
      dtype='<U18')