In [1]:
import html
import re

import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
# Seed handed to the NumPy random generator so a fresh run repeats the draw.
SEED = 1

# Sent as the `season` query parameter when each league table is requested.
SEASON = 19

# League name -> value sent as the `table` query parameter.
# Insertion order matters: the parsed tables are concatenated in this order
# further down, and a club's position in that list sets its number of entries.
league_ids = dict(
    premier_league=1,
    championship=2,
    league_one=3,
    league_two=4,
)


In [3]:
# Seeded generator used for the weighted draw at the end of the notebook.
rng = np.random.default_rng(seed=SEED)

In [4]:
# Download every league table page and keep its `cats`-class elements,
# keyed by league name, for parsing in a later cell.
LEAGUE_TABLE_URL = 'https://thefishy.co.uk/leaguetable.php'
REQUEST_TIMEOUT_SECONDS = 30

raw_league_tables = {}

for league, league_id in league_ids.items():
    page = requests.get(
        LEAGUE_TABLE_URL,
        # Let requests build and encode the query string.
        params={'table': league_id, 'season': SEASON},
        # Without a timeout a stalled connection would hang the notebook.
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # Fail loudly on an HTTP error instead of parsing an error page, which
    # would quietly distort everything computed downstream.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')
    raw_table = soup.find_all(class_='cats')
    raw_league_tables[league] = raw_table

In [5]:
def strip_out_club_name(raw_row):
    """Return the club name held in a league-table cell.

    The cell is rendered to markup with ``str()``. The name is the text
    between the last opening tag and the run of closing tags that ends the
    cell, e.g. ``<td><a href="...">Club</a></td>`` gives ``'Club'``.

    Args:
        raw_row: Any object whose ``str()`` is the cell's HTML markup (the
            notebook passes elements returned by ``soup.find_all``).

    Returns:
        The club name with HTML entities decoded and surrounding whitespace
        removed; an empty string when the cell holds no text.
    """
    markup = str(raw_row)

    # Drop every trailing closing tag (and any whitespace around them) rather
    # than a fixed number of characters, so the result does not depend on
    # exactly which tags wrap the name or on a trailing newline.
    without_closing_tags = re.sub(r'(?:\s*</[^>]*>)+\s*$', '', markup)

    # Whatever follows the last remaining '>' is the innermost element's text.
    club_name = without_closing_tags[without_closing_tags.rfind('>') + 1:]

    # Rendered markup escapes characters such as '&' as '&amp;'; undo that.
    return html.unescape(club_name).strip()

In [6]:
def parse_league_table(league_soup):
    """Return the ordered, de-duplicated club names found in one league's rows.

    Only rows whose markup contains the substring 'team' are treated as club
    cells. A club that shows up more than once keeps its first position.
    """
    club_names = [
        strip_out_club_name(cell)
        for cell in league_soup
        if 'team' in str(cell)
    ]

    # dict keys are unique and keep insertion order, so this removes repeats
    # without reshuffling the table.
    return list(dict.fromkeys(club_names))

In [7]:
# Join the parsed tables end to end, in the order the leagues are listed in
# `league_ids`.
football_league_standings = [
    club
    for league in league_ids
    for club in parse_league_table(raw_league_tables[league])
]

In [8]:
# Order the clubs from the last place of the last league up to the first
# place of the first league. The list is rebuilt from the parsed tables
# instead of being reversed in place, so re-running this cell cannot flip the
# order back and invert every club's number of entries.
football_league_standings = [
    club
    for league in reversed(list(league_ids))
    for club in reversed(parse_league_table(raw_league_tables[league]))
]

In [9]:
# Entries per club: the club's 1-based position in `football_league_standings`.
number_of_entries = {
    club: entries
    for entries, club in enumerate(football_league_standings, start=1)
}

In [10]:
# Show the club -> number-of-entries mapping.
number_of_entries

{'Scunthorpe': 1,
 'Oldham': 2,
 'Barrow': 3,
 'Stevenage': 4,
 'Carlisle': 5,
 'Harrogate Town': 6,
 'Rochdale': 7,
 'Hartlepool': 8,
 'Walsall': 9,
 'Colchester': 10,
 'Bradford': 11,
 'Leyton Orient': 12,
 'Crawley Town': 13,
 'Newport County': 14,
 'Salford': 15,
 'Tranmere': 16,
 'Sutton Utd': 17,
 'Mansfield': 18,
 'Swindon': 19,
 'Port Vale': 20,
 'Northampton': 21,
 'Bristol Rovers': 22,
 'Exeter': 23,
 'Forest Green': 24,
 'Crewe': 25,
 'AFC Wimbledon': 26,
 'Doncaster': 27,
 'Gillingham': 28,
 'Fleetwood Town': 29,
 'Morecambe': 30,
 'Shrewsbury': 31,
 'Lincoln City': 32,
 'Burton Albion': 33,
 'Cheltenham': 34,
 'Cambridge Utd': 35,
 'Charlton': 36,
 'Accrington Stanley': 37,
 'Ipswich': 38,
 'Portsmouth': 39,
 'Bolton': 40,
 'Oxford Utd': 41,
 'Plymouth': 42,
 'Wycombe': 43,
 'Sunderland': 44,
 'Sheff Wed': 45,
 'MK Dons': 46,
 'Rotherham': 47,
 'Wigan': 48,
 'Barnsley': 49,
 'Derby': 50,
 'Peterborough': 51,
 'Reading': 52,
 'Birmingham': 53,
 'Hull': 54,
 'Cardiff': 55,
 

In [11]:
# Sum of every club's entries; used below as the denominator that turns
# entry counts into selection probabilities.
total_entries = sum(number_of_entries.values())
total_entries

4278

In [12]:
# Selection weight for each club: its share of all entries.
# Values follow the iteration order of `number_of_entries`.
entry_counts = np.array(list(number_of_entries.values()))
probabilities = entry_counts / total_entries
probabilities

array([0.00023375, 0.00046751, 0.00070126, 0.00093502, 0.00116877,
       0.00140252, 0.00163628, 0.00187003, 0.00210379, 0.00233754,
       0.00257129, 0.00280505, 0.0030388 , 0.00327256, 0.00350631,
       0.00374007, 0.00397382, 0.00420757, 0.00444133, 0.00467508,
       0.00490884, 0.00514259, 0.00537634, 0.0056101 , 0.00584385,
       0.00607761, 0.00631136, 0.00654511, 0.00677887, 0.00701262,
       0.00724638, 0.00748013, 0.00771388, 0.00794764, 0.00818139,
       0.00841515, 0.0086489 , 0.00888266, 0.00911641, 0.00935016,
       0.00958392, 0.00981767, 0.01005143, 0.01028518, 0.01051893,
       0.01075269, 0.01098644, 0.0112202 , 0.01145395, 0.0116877 ,
       0.01192146, 0.01215521, 0.01238897, 0.01262272, 0.01285647,
       0.01309023, 0.01332398, 0.01355774, 0.01379149, 0.01402525,
       0.014259  , 0.01449275, 0.01472651, 0.01496026, 0.01519402,
       0.01542777, 0.01566152, 0.01589528, 0.01612903, 0.01636279,
       0.01659654, 0.01683029, 0.01706405, 0.0172978 , 0.01753

In [13]:
# Club names in dict order, which is the same order as `probabilities`.
clubs_to_draw_from = list(number_of_entries)

In [14]:
# Pick distinct clubs at random, weighted by each club's share of the entries.
NUMBER_OF_CLUBS_TO_DRAW = 4

rng.choice(
    clubs_to_draw_from,
    size=NUMBER_OF_CLUBS_TO_DRAW,
    replace=False,
    p=probabilities,
)

array(['Middlesbrough', 'Chelsea', 'Cambridge Utd', 'Peterborough'],
      dtype='<U18')