In [None]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import pandas as pd
import unidecode
import seaborn as sns
from scipy import stats
import math
import warnings
sns.set(color_codes=True)
warnings.filterwarnings("ignore")
def remove_special_characters(name):
    """Remove digits, apostrophes and plus signs from a string and trim it.

    This was specifically written with the format of name strings from the
    premier league match summaries in mind.
    Strings of the form "john doe 90 + 4' " are returned as "john doe".

    :param str name: The full string of text containing the player name.
    :return: The name with digits, ``+`` and ``'`` removed and with leading and
             trailing whitespace stripped.
    :rtype: str

    """
    # delete every unwanted character in a single pass
    cleaned = name.translate(str.maketrans('', '', "0123456789+'"))

    # str.strip() returns a new string, so its result has to be used;
    # the minute markers usually leave blanks behind at the end of the name
    return cleaned.strip()



def remove_team_code(name):
    """Delete every known three-letter club code from a team name.

    :param str name: A team name of the form name+code, e.g. "BurnleyBUR".
    :return: The team name with every occurrence of a known code removed,
             i.e. "BurnleyBUR" returns "Burnley". Other characters are left untouched.
    :rtype: str

    """
    # Codes are applied one after another in this exact order; a few codes are
    # listed twice, which is kept so the result matches the sequential removal.
    known_codes = (
        "BUR", "BOU", "CRY", "WBA", "HUD", "ARS", "LIV", "BHA", "MUN", "WAT",
        "NEW", "CHE", "SOU", "MCI", "SWA", "AVL", "TOT", "LEI", "WHU", "EVE",
        "WOL", "CAR", "FUL", "LEE", "BRI", "FUL", "BRE", "WAT", "NOR",
    )

    stripped = name
    for code in known_codes:
        if code in stripped:
            stripped = stripped.replace(code, '')

    return stripped
def remove_n(team_name):
    """Strip the newline runs that surround scraped team names.

    :param str team_name: A scraped team name that may contain runs of newlines.
    :return: The name with every run of ten newlines, and then every run of
             three newlines, removed.
    :rtype: str

    """
    # longest run first, so the ten-newline block is not split into threes
    to_remove = ["\n\n\n\n\n\n\n\n\n\n", "\n\n\n"]
    for pattern in to_remove:
        team_name = team_name.replace(pattern, '')

    # return only after every pattern has been applied
    return team_name



def extract_epl_lineups(team_container, home):
    """Build the lineup dictionary for one side of a Premier League match page.

    :param Tag team_container: The matchLineupTeamContainer element for one team.
    :param bool home: True when team_container holds the home lineup, otherwise
                      the away lineup is assumed.
    :return: A dictionary keyed "home_player1", "home_player2", ... (or
             "away_player..."), with at most 16 slots, whose values are the cleaned
             player names as plain strings. Slots 12 to 16 are always present and
             hold '' when no player was found for them.
    :rtype: dict

    """
    # the key prefix depends on which side of the fixture is being read
    prefix = "home_player" if home == True else "away_player"

    # every player entry sits in a <div class="info"> inside the container
    info_divs = team_container.find_all("div", {"class": "info"})

    names = []
    for position, entry in enumerate(info_divs):
        # The first eleven entries are always kept. Later entries are kept only
        # when their nested div has text (presumably the marker of a substitute
        # who actually came on - verify against the page markup).
        if position >= 11 and entry.div.div.text == '':
            continue
        names.append(unidecode.unidecode(remove_special_characters(entry.div.text)))

    # pair slot keys 1..16 with the collected names; surplus names are dropped
    slot_keys = [prefix + str(number) for number in range(1, 17)]
    lineup = dict(zip(slot_keys, names))

    # make sure the substitute slots (12..16) exist even when unused
    for key in slot_keys[11:]:
        lineup.setdefault(key, "")

    return lineup




def extract_epl_match_details(match_ids):
    """Scrape match information from the premier league results website
    for the given match ids.

    :param list match_ids: The match ids of the games you're interested in.
                           May hold a single id or be empty.
    :return: A dictionary with one entry per match id. Each value is a list of
            five dictionaries: the team names, the score, the result (encoded as 1
            for a home team win, 0 for a draw, -1 for an away team win), the home
            team lineup and the away team lineup. They are formatted as follows:
            team_names = {"home_team":"Arsenal", "away_team":"Liverpool"}
            score = {"home_goals":2, "away_goals":3}
            result = {"result": -1}
            home_team = {"home_player1":"Petr Cech", ....., "home_player16": ""}
            away_team = {"away_player1":"Loris Karius", ...., "away_player16": ""}
            An empty match_ids gives an empty dictionary.
    :rtype: dict

    """

    # where the match data is being scraped from
    epl_url = "https://www.premierleague.com/match/"

    # initialize a dictionary to store the results
    data = dict()

    # this loop extracts the data for each game in turn
    for match_id in match_ids:

        # url for each match
        match_url = epl_url + str(match_id)

        # download the page; the context manager closes the connection even
        # when reading the response fails
        with uReq(match_url) as uClient:
            results_page = uClient.read()

        # parse the html
        results_soup = soup(results_page, "html.parser")

        # containers for the team names, the score and the two lineups
        home_team_container = results_soup.findAll("div", {"class":"team home"})
        away_team_container = results_soup.findAll("div", {"class":"team away"})
        score_container = results_soup.findAll("div", {"class":"matchScoreContainer"})
        team_lineups_container = results_soup.findAll("div", {"class":"matchLineupTeamContainer"})

        # extract team names from their containers
        team_names = {"home_team": remove_team_code(home_team_container[0].text),
                      "away_team": remove_team_code(away_team_container[0].text)}

        # the score text has the form "<home>-<away>"; split it once
        score_parts = score_container[0].text.split("-")
        score = {"home_goals": int(score_parts[0]),
                 "away_goals": int(score_parts[1])}

        # use score to determine match outcome, Win for home team = 1, Win for away team = -1, draw = 0
        if score["home_goals"] > score["away_goals"]:
            outcome = 1
        elif score["away_goals"] > score["home_goals"]:
            outcome = -1
        else:
            outcome = 0
        result = {"result": outcome}

        # extract lineups for the home and away team from their containers, this includes substitutes who made appearances
        home_team = extract_epl_lineups(team_lineups_container[0], home = True)
        away_team = extract_epl_lineups(team_lineups_container[1], home = False)

        # store under the match id
        data[match_id] = [team_names, score, result, home_team, away_team]

    return data
# main functions used for finding fifa ratings

def last_name(name):
    """Return the final whitespace-separated word of a player's name.

    :param str name: A string of the full name of the player.
    :return: The last word of the ASCII-transliterated name. A one-word name -
            "Ronaldinho" for example - is returned as it is (transliterated).
    :rtype: str

    """
    # transliterate accents first, then keep only the last token
    return unidecode.unidecode(name).split()[-1]
def first_name(name) :
    """Return the second-to-last whitespace-separated word of a player's name.

    :param str name: A string of the full name of the player.
    :return: The word immediately before the last one in the ASCII-transliterated
            name. For a two-word name this is the first name; for a one-word name
            the word itself is returned.
    :rtype: str

    """
    tokens = unidecode.unidecode(name).split()
    # NOTE(review): with three or more words this is the middle word, not the
    # first one ("Alexis Mac Allister" gives "Mac") - confirm this is intended.
    # The index is spelled len(tokens) - 2 on purpose: for a single token it
    # evaluates to -1 and so falls back to that token.
    return tokens[len(tokens) - 2]
def extract_fifa_rating(data_main, data_auxillary, name, club):
    """Look up the fifa rating for a player.

    The lookups are tried in this order and the first one that finds any row wins:
    full name in data_main, last name + club in data_main, first name + club in
    data_main, ASCII-transliterated full name in data_auxillary. When several rows
    match, the lowest rating among them is returned.

    :param DataFrame data_main: A master data store; this is the data from kaggle.
                                Needs the columns NAME, LAST_NAME, FIRST_NAME, CLUB and RATING.
    :param DataFrame data_auxillary: A supplementary dataframe that contains data missing
                                     in data_main. Needs the columns NAME and RATING.
    :param str name: A string containing the name of the player.
    :param str club: A string containing the player's club.
    :return: The base fifa rating if found, else np.nan.
    :rtype: int

    """
    # 1) the full name matches in the main dataset
    by_full_name = data_main.loc[data_main["NAME"] == name, "RATING"]
    if not by_full_name.empty:
        return by_full_name.min()

    # The club filter is built from data_main itself (not from a module-level
    # frame), so the function also works for frames other than the global one.
    same_club = data_main["CLUB"] == club

    # 2) the last name and club are enough to match
    by_last_name = data_main.loc[(data_main["LAST_NAME"] == last_name(name)) & same_club, "RATING"]
    if not by_last_name.empty:
        return by_last_name.min()

    # 3) the first name and club are enough to match
    by_first_name = data_main.loc[(data_main["FIRST_NAME"] == first_name(name)) & same_club, "RATING"]
    if not by_first_name.empty:
        return by_first_name.min()

    # 4) fall back to the supplementary dataset, matched on the transliterated name
    ascii_name = unidecode.unidecode(name)
    by_auxillary = data_auxillary.loc[data_auxillary["NAME"] == ascii_name, "RATING"]
    if not by_auxillary.empty:
        return by_auxillary.min()

    # nothing matched (np.nan rather than np.NaN, which newer NumPy no longer provides)
    return np.nan
       
# functions to format the scraped data (per-match lists of dictionaries) as a dataframe

def convert_to_dataframe(data, match_ids):
    """Turn the scraped match dictionary (see the documentation of
    extract_epl_match_details for its format) into a dataframe with one row per game.

    The frame always starts with these 35 columns, in this order: home_team,
    away_team, home_goals, away_goals, result, home_player1..home_player15,
    away_player1..away_player15. Any further keys found in the match dictionaries
    (for example home_player16 / away_player16) are added after them, in the
    order in which they are first seen.

    :param dict data: The data for all the games formatted as a dictionary.
    :param list match_ids: A list of all the match ids. These are the keys used to access the data parameter.
    :return: DataFrame with every row corresponding to a single game. The indexes correspond
            to match ids.
    :rtype: DataFrame

    """

    # columns that are always present, in their fixed order
    base_columns = ["home_team", "away_team", "home_goals", "away_goals", "result"]
    base_columns += ["home_player" + str(number) for number in range(1, 16)]
    base_columns += ["away_player" + str(number) for number in range(1, 16)]

    match_ids = list(match_ids)

    # flatten the five dictionaries of every match into one record per match
    records = []
    for match_id in match_ids:
        record = {}
        for part in data[match_id][:5]:
            record.update(part)
        records.append(record)

    # build the frame in one go; growing a frame row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2
    data_df = pd.DataFrame(records, index=match_ids)

    # fixed columns first (created empty if absent), unexpected extras after them
    extra_columns = [column for column in data_df.columns if column not in base_columns]
    data_df = data_df.reindex(columns=base_columns + extra_columns)

    return data_df

##### Extract Match Info #####

# there are 380 matches played in a season, there are 20 teams and each team plays every other team twice.
num_matches_2021 = 380

# match IDs for the season: consecutive ids starting at 66342
match_ids_2021 = [66342+i for i in range(num_matches_2021)]


# scrape the data for every match of the season
data_2021 = extract_epl_match_details(match_ids_2021)
data=data_2021

# convert data from dictionary to dataframe
df_2021 = convert_to_dataframe(data_2021,match_ids_2021)


##### Compile Player Names and Ratings #####

# import master FIFA22 file, includes just fifa ratings
fifa_data = pd.read_csv("https://raw.githubusercontent.com/Madhav-Mukund/myfiles/main/Raw_Rating_Data.csv")


# remove special characters from the name column (accented e's etc)
fifa_data["NAME"] = fifa_data["NAME"].transform(unidecode.unidecode)

# add last name / first name columns to the dataframe to aid querying
fifa_data["LAST_NAME"] = fifa_data["NAME"].transform(last_name)
fifa_data["FIRST_NAME"] = fifa_data["NAME"].transform(first_name)

# NOTE(review): this reads the same file as fifa_data - confirm whether a
# separate supplementary ratings file was meant here.
missing_fifa_data = pd.read_csv("https://raw.githubusercontent.com/Madhav-Mukund/myfiles/main/Raw_Rating_Data.csv")

# initialize dictionary to hold player data, keyed by player name
name_team_rating_dict = dict()

# extract player name, club, and rating for every player listed in slots 1-15 of any scraped match
for match_id in match_ids_2021:
    team_names = data[match_id][0]
    # index 3 of a match entry holds the home lineup, index 4 the away lineup
    for side, lineup in (("home", data[match_id][3]), ("away", data[match_id][4])):
        club = remove_team_code(team_names[side + "_team"]).replace('\n','')
        for slot in range(1,16):
            player = lineup[side + "_player" + str(slot)]
            if player != '':
                name = player.strip()
                rating = extract_fifa_rating(fifa_data, missing_fifa_data, name, club)
                # a player seen again later overwrites his earlier entry
                name_team_rating_dict[name] = [club,rating]

# convert player data dictionary to dataframe (one row per player, indexed by name);
# from_dict also copes with an empty dictionary
player_data = pd.DataFrame.from_dict(name_team_rating_dict, orient="index", columns=["CLUB", "RATING"])

##### Export Data #####

# export dataframes as csv files
df_2021.to_csv("epl_results_2021", sep = ',')

player_data.to_csv('epl_player_ratings', sep=',')


Running the cell above end to end takes around 15-20 minutes: it has to download and scrape every match page and then look up the rating of every player in the complete database.
To save time, we stored the results of that process in a public GitHub repository and load them from there for all further processing.

In [None]:

# Load the previously scraped match results and player ratings from the public repository.
df_2021 = pd.read_csv('https://raw.githubusercontent.com/Madhav-Mukund/myfiles/main/EPL_Results_2021%20')

# the ratings file carries a leftover index column that is not needed
player_data = pd.read_csv(
    'https://raw.githubusercontent.com/Madhav-Mukund/myfiles/main/All_player_ratings'
).drop(columns='Unnamed: 0')

player_data.head()

Unnamed: 0,NAME,CLUB,RATING
0,David Raya,Brentford,74
1,Kristoffer Ajer,Brentford,75
2,Ethan Pinnock,Brentford,74
3,Pontus Jansson,Brentford,75
4,Rico Henry,Brentford,74


In [None]:
# these functions compute the rating averages for teams, the differences, and append them to the match data dataframes 

def compute_team_average(names ,ratings ):
    """Compute the average rating for a list of players.

    Entries that are not strings (such as NaN for an unused slot), empty or
    blank strings, and players without a rating in ``ratings`` are left out of
    the average. Names are stripped of surrounding whitespace before the lookup;
    if a name occurs several times in ``ratings`` its lowest rating is used.

    :param: list names : A list of names of the players.
    :param: DataFrame ratings : A dataframe of player ratings with the columns NAME and RATING.
    :return: The average rating of the players that could be rated, or NaN when
             there is none.
    :rtype: float

    """

    total = 0.0
    count = 0

    for name in names:
        # unused slots are not strings (NaN) or are empty strings
        if not isinstance(name, str):
            continue
        key = name.strip()
        if not key:
            continue

        rating = ratings.loc[ratings['NAME'] == key, "RATING"].min()

        # min() of an empty selection is NaN: the player has no known rating
        # and must not turn the whole team average into NaN
        if pd.isna(rating):
            continue

        total = total + rating
        count = count + 1

    # no rated player at all: there is nothing to average
    if count == 0:
        return float("nan")

    return total/count


def add_averages_to_df(match_data,ratings, num_players=15):
    """Add the columns home_average_rating, away_average_rating and rating_diff
    to the match data dataframe and return it.

    The lineup columns are selected by name (home_player1..home_playerN and
    away_player1..away_playerN), so both teams are averaged over the same slots
    regardless of where those columns sit in the frame.

    :param: DataFrame match_data: This dataframe must contain the lineups for each game in the set.
                                  It is modified in place.
    :param: DataFrame ratings: This dataframe must contain player ratings.
    :param: int num_players: Number of lineup slots per team to average over (default 15).
    :return: The same match_data object with three columns added: the average rating of the
            home team, the average rating of the away team, and their difference (home - away).
    :rtype: DataFrame
    :raises KeyError: If one of the expected lineup columns is missing.

    """

    # Select the lineup columns by name. Positional slices are fragile here:
    # a slice that starts one column late drops the first away player.
    home_columns = ["home_player" + str(slot) for slot in range(1, num_players + 1)]
    away_columns = ["away_player" + str(slot) for slot in range(1, num_players + 1)]

    # one average per match, in row order
    home_team_rating = [compute_team_average(lineup, ratings)
                        for lineup in match_data[home_columns].values.tolist()]
    away_team_rating = [compute_team_average(lineup, ratings)
                        for lineup in match_data[away_columns].values.tolist()]
    average_rating_diff = [home - away for home, away in zip(home_team_rating, away_team_rating)]

    # Plain lists are assigned by position, so this also works when the frame's
    # index is not 0..n-1 (e.g. when it holds the match ids).
    match_data["home_average_rating"] = home_team_rating
    match_data["away_average_rating"] = away_team_rating
    match_data["rating_diff"] = average_rating_diff

    return match_data

# Give the two unnamed columns of the loaded results file descriptive names.
df_2021.rename(
    columns={'Unnamed: 0.1': 'Match_IDs', 'Unnamed: 0': 'Serial_No'},
    inplace=True,
)

In [None]:
# Preview the first five rows of the match table.
df_2021.head()

Unnamed: 0,Serial_No,Match_IDs,home_team,away_team,home_goals,away_goals,result,home_player1,home_player2,home_player3,...,away_player11,away_player12,away_player13,away_player14,away_player15,home_player16,away_player16,home_average_rating,away_average_rating,rating_diff
0,0,66342,Brentford,Arsenal,2,0,1,David Raya ...,Kristoffer Ajer ...,Ethan Pinnock ...,...,Folarin Balogun ...,Karl Hein ...,Cedric Soares ...,Rob Holding ...,Hector Bellerin ...,Yoane Wissa ...,Nuno Tavares ...,72.133333,74.25,-2.116667
1,1,66343,Burnley,Brighton and Hove Albion,1,2,-1,Nick Pope ...,Matthew Lowton ...,Ben Mee ...,...,Neal Maupay ...,Jason Steele ...,Michal Karbownik ...,Alexis Mac Allister ...,Adam Lallana ...,Nathan Collins ...,Taylor Richards ...,75.466667,74.0,1.466667
2,2,66344,Chelsea,Crystal Palace,3,0,1,Edouard Mendy ...,Trevoh Chalobah ...,Andreas Christensen ...,...,Wilfried Zaha ...,Jack Butland ...,Joachim Andersen ...,Reece Hannam ...,Nathaniel Clyne ...,Kurt Zouma ...,Martin Kelly ...,81.666667,75.25,6.416667
3,3,66345,Everton,Southampton,3,1,1,Jordan Pickford ...,Lucas Digne ...,Mason Holgate ...,...,Che Adams ...,Fraser Forster ...,Yan Valery ...,Kyle Walker-Peters ...,Jan Bednarek ...,Tom Davies ...,Stuart Armstrong ...,78.2,74.666667,3.533333
4,4,66346,Leicester City,Wolverhampton Wanderers,1,0,1,Kasper Schmeichel ...,Luke Thomas ...,Caglar Soyuncu ...,...,Adama Traore ...,John Ruddy ...,Rayan Ait-Nouri ...,Christian Marques ...,Nelson Semedo ...,Dennis Praet ...,Leander Dendoncker ...,79.533333,75.166667,4.366667


In [None]:
# Compute the average rating of both teams and their difference for every match.
# add_averages_to_df adds the columns to the frame it is given and returns that
# same frame, so rating_games_2021 and df_2021 refer to the same object.

rating_games_2021 = add_averages_to_df(df_2021,player_data)

rating_games_2021.head()

Unnamed: 0,Serial_No,Match_IDs,home_team,away_team,home_goals,away_goals,result,home_player1,home_player2,home_player3,...,away_player11,away_player12,away_player13,away_player14,away_player15,home_player16,away_player16,home_average_rating,away_average_rating,rating_diff
0,0,66342,Brentford,Arsenal,2,0,1,David Raya ...,Kristoffer Ajer ...,Ethan Pinnock ...,...,Folarin Balogun ...,Karl Hein ...,Cedric Soares ...,Rob Holding ...,Hector Bellerin ...,Yoane Wissa ...,Nuno Tavares ...,72.133333,74.25,-2.116667
1,1,66343,Burnley,Brighton and Hove Albion,1,2,-1,Nick Pope ...,Matthew Lowton ...,Ben Mee ...,...,Neal Maupay ...,Jason Steele ...,Michal Karbownik ...,Alexis Mac Allister ...,Adam Lallana ...,Nathan Collins ...,Taylor Richards ...,75.466667,74.0,1.466667
2,2,66344,Chelsea,Crystal Palace,3,0,1,Edouard Mendy ...,Trevoh Chalobah ...,Andreas Christensen ...,...,Wilfried Zaha ...,Jack Butland ...,Joachim Andersen ...,Reece Hannam ...,Nathaniel Clyne ...,Kurt Zouma ...,Martin Kelly ...,81.666667,75.25,6.416667
3,3,66345,Everton,Southampton,3,1,1,Jordan Pickford ...,Lucas Digne ...,Mason Holgate ...,...,Che Adams ...,Fraser Forster ...,Yan Valery ...,Kyle Walker-Peters ...,Jan Bednarek ...,Tom Davies ...,Stuart Armstrong ...,78.2,74.666667,3.533333
4,4,66346,Leicester City,Wolverhampton Wanderers,1,0,1,Kasper Schmeichel ...,Luke Thomas ...,Caglar Soyuncu ...,...,Adama Traore ...,John Ruddy ...,Rayan Ait-Nouri ...,Christian Marques ...,Nelson Semedo ...,Dennis Praet ...,Leander Dendoncker ...,79.533333,75.166667,4.366667


In [None]:
#Finding a general model for all teams
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import random

# NOTE(review): home_goals and away_goals fully determine `result`, so a model
# that is given them is reading the outcome rather than predicting it from the
# ratings - confirm that these two features are really meant to be included.
feature_columns = ['home_goals','away_goals','home_average_rating','away_average_rating']

# fixed seed so that the split, the score and the coefficients are reproducible
random.seed(42)

n_matches = len(rating_games_2021)
complete_list = np.arange(n_matches)
rand_list = random.sample(range(n_matches), int(0.75 * n_matches))  # 75% data for training the model
res_list = np.setdiff1d(complete_list, rand_list).tolist()  # remaining 25%, in ascending order

X = rating_games_2021[feature_columns]
Y = rating_games_2021['result']
regr = linear_model.LinearRegression()

# Select all training rows in one step; growing a frame row by row is quadratic
# and DataFrame.append no longer exists in pandas 2.
X_train = rating_games_2021.iloc[rand_list][feature_columns].reset_index(drop=True)
Y_train = rating_games_2021.iloc[rand_list]['result'].tolist()
regr.fit(X_train, Y_train)

X_test = rating_games_2021.iloc[res_list][feature_columns].reset_index(drop=True)
Y_test = rating_games_2021.iloc[res_list]['result'].tolist()
Y_pred = regr.predict(X_test)

# coefficient of determination (R^2) on the held-out matches
print(regr.score(X_test, Y_test))

# one coefficient per feature, in the order of feature_columns
print(regr.coef_)

0.7200839156249971
[ 0.33574253 -0.41168741 -0.00386327 -0.00212506]


In [None]:
# Display the feature rows that were held out for testing.
X_test


Unnamed: 0,home_goals,away_goals,home_average_rating,away_average_rating
0,1,2,75.466667,74.000000
1,5,1,80.733333,75.416667
2,0,2,76.733333,82.500000
3,2,0,73.066667,74.166667
4,1,1,74.933333,82.250000
...,...,...,...,...
90,0,4,73.600000,78.250000
91,1,2,71.866667,74.166667
92,3,1,73.533333,79.250000
93,4,1,79.466667,75.750000
