In [74]:
import pandas as pd
import requests as req
from bs4 import BeautifulSoup
import time
import numpy as np
import lxml
import chardet
import logging as log

fbRefFileName = 'matches23-17_updated.csv'

In [75]:
# create a function to convert string values to numerical values
def result_to_numeric(result):
    if result.lower() in ["win","w"]:
        return 1
    elif result.lower() in ["loss","l"]:
        return -1
    elif result.lower() in ["draw","d"]:
        return 0
    

In [76]:
df = pd.read_csv(fbRefFileName, encoding = 'latin-1')

df["result_num"] = df["result"].apply(result_to_numeric)

df["ga"] = pd.to_numeric(df["ga"], errors = 'coerce').convert_dtypes()
df["gf"] = pd.to_numeric(df["gf"], errors = 'coerce').convert_dtypes()

In [77]:
df = df.sort_values(by = ["date", "time"], ascending = True, ignore_index=True)
thisDf = df.copy(deep = True)
thisDf

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,notes,sh,sot,dist,fk,pk,pkatt,team_name,season,result_num
0,2017-07-21,2023-04-12 19:00:00,Liga MX,Apertura 2017 Regular Season,Fri,Away,D,0,0,Morelia,...,,0,0,,,0,0,Monterrey,2018,0
1,2017-07-21,2023-04-12 19:00:00,Liga MX,Apertura 2017 Regular Season,Fri,Home,D,0,0,Monterrey,...,,0,0,,,0,0,Morelia,2018,0
2,2017-07-21,2023-04-12 19:00:00,Liga MX,Apertura 2017 Regular Season,Fri,Away,W,2,0,Tijuana,...,,0,0,,,0,1,Cruz Azul,2018,1
3,2017-07-21,2023-04-12 19:00:00,Liga MX,Apertura 2017 Regular Season,Fri,Home,L,0,2,Cruz Azul,...,,0,0,,,0,0,Tijuana,2018,-1
4,2017-07-22,2023-04-12 17:00:00,Liga MX,Apertura 2017 Regular Season,Sat,Home,L,0,1,Querétaro,...,,0,0,,,0,0,America,2018,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3843,2023-04-09,2023-04-12 13:00:00,Liga MX,Clausura 2023 Regular Season,Sun,Home,W,3,1,Atlético,...,,10,5,18.8,2.0,0,0,UNAM,2023,1
3844,2023-04-09,2023-04-12 19:05:00,Liga MX,Clausura 2023 Regular Season,Sun,Away,W,4,1,Santos,...,,15,6,21.2,0.0,0,0,Pachuca,2023,1
3845,2023-04-09,2023-04-12 19:05:00,Liga MX,Clausura 2023 Regular Season,Sun,Home,L,1,4,Pachuca,...,,12,7,17.1,0.0,0,0,Santos Laguna,2023,-1
3846,2023-04-09,2023-04-12 20:10:00,Liga MX,Clausura 2023 Regular Season,Sun,Home,D,1,1,Atlas,...,,10,3,19.4,2.0,0,0,FC Juarez,2023,0


In [78]:
#Normalizing team and opponent names
class MissingDict(dict):
    #if name not included in following dict, keep the name the same instead of deleting
    __missing__ = lambda self, key:key
    
map_values = {
    "América": "America",
    "Atlético": "Atletico",
    "FC Juárez":"FC Juarez",
    "León":"Leon",
    "Mazatlán":"Mazatlan",
    "Querétaro":"Queretaro",
    "Santos Laguna":"Santos"
}  

mapping = MissingDict(**map_values)

In [79]:
thisDf["team_new"] = thisDf["team_name"].map(mapping)
thisDf["opponent_new"] = thisDf["opponent"].map(mapping)
#thisDf

In [80]:
def rollingAverages(teamDf, cols, newCols):
    teamDf = teamDf.sort_values("date")
    rolling_avgs=teamDf[cols].rolling(window=4, closed = 'left').mean()
    teamDf[newCols] = rolling_avgs
    
    #Drop rows when there are not enough previous matches info (at the beginning of the dataset)
    teamDf = teamDf.dropna(subset = newCols)

    return teamDf


In [88]:
def rollingStreaks1(teamDf, newCols):
    teamDf = teamDf.sort_values("date")
    
    streakKeys = ['w','d','l','u']
    streakDict = dict(zip(streakKeys,newCols))
    for key, value in streakDict.items():
    #    print(key, value)
    #    print(cols)
    #    roll_streak    = teamDf[cols].rolling(window=1, closed = 'left').apply(lambda x: calc_streaks1(x,key))                                                                     
    #    teamDf[value] = roll_streak
        teamDf[value]  = teamDf["result_num"].rolling(window=1, closed = 'left').apply(lambda y: calc_streaks1(y,key)[-1])                                                                     
        
    
    #Drop rows when there are not enough previous matches info (at the beginning of the dataset)
    teamDf = teamDf.dropna(subset = newCols)

    return teamDf

In [83]:
def rollingStreaks(teamDf, newCols, streakType):
    teamDf = teamDf.sort_values("date")
    roll_streak    = teamDf["result_num"].rolling(window=1, closed = 'left').apply(lambda x: calc_streaks1(x,streakType)[-1])                                                                     
    teamDf[newCol] = roll_streak
    
    #Drop rows when there are not enough previous matches info (at the beginning of the dataset)
    teamDf = teamDf.dropna(subset = newCol)

    return teamDf

In [84]:
def calc_streaks1(series, streakType):
    current_streak = 0
    streaks=[]
    
    if streakType.lower() in ['unbeaten','u']:
        numMatchResult = [ result_to_numeric('w'), result_to_numeric('d') ]
    else:
        numMatchResult = [ result_to_numeric(streakType.lower()) ]
    
    for actualResult in series:
        if actualResult not in numMatchResult:
            current_streak =  0
        else:
            current_streak += 1

        streaks.append(current_streak)
    
    #Shifting list 1 position to the right
#    streaks = [0] + streaks[:-1]
    return streaks

In [89]:
matchesByTeam = thisDf.groupby("team_new")

In [90]:
newCols = ['streak_w','streak_d','streak_l','streak_u']

matches_rolling = matchesByTeam.apply(lambda x: rollingStreaks1(x, newCols))


In [53]:
matches_rolling = matchesByTeam.apply(lambda x: rollingStreaks(x, 'w_streak','w'))
matches_rolling = matchesByTeam.apply(lambda x: rollingStreaks(x, 'd_streak','d'))
matches_rolling = matchesByTeam.apply(lambda x: rollingStreaks(x, 'l_streak','l'))
matches_rolling = matchesByTeam.apply(lambda x: rollingStreaks(x, 'u_streak','u'))

In [91]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,pkatt,team_name,season,result_num,team_new,opponent_new,streak_w,streak_d,streak_l,streak_u
team_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
America,28,2017-07-29,2023-04-12 19:06:00,Liga MX,Apertura 2017 Regular Season,Sat,Away,W,2,0,Pachuca,...,0,America,2018,1,America,Pachuca,0.0,0.0,1.0,0.0
America,46,2017-08-05,2023-04-12 21:00:00,Liga MX,Apertura 2017 Regular Season,Sat,Home,W,2,1,UNAM,...,2,America,2018,1,America,UNAM,1.0,0.0,0.0,1.0
America,56,2017-08-11,2023-04-12 21:00:00,Liga MX,Apertura 2017 Regular Season,Fri,Away,W,1,0,Atlas,...,1,America,2018,1,America,Atlas,1.0,0.0,0.0,1.0
America,76,2017-08-19,2023-04-12 17:00:00,Liga MX,Apertura 2017 Regular Season,Sat,Away,W,3,2,Lobos BUAP,...,0,America,2018,1,America,Lobos BUAP,1.0,0.0,0.0,1.0
America,99,2017-08-23,2023-04-12 19:00:00,Liga MX,Apertura 2017 Regular Season,Wed,Home,D,2,2,UANL,...,0,America,2018,0,America,UANL,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Veracruz,1591,2019-10-25,2023-04-12 19:06:00,Liga MX,Apertura 2019 Regular Season,Fri,Away,L,0,2,Tijuana,...,0,Veracruz,2020,-1,Veracruz,Tijuana,0.0,0.0,1.0,0.0
Veracruz,1611,2019-10-29,2023-04-12 19:00:00,Liga MX,Apertura 2019 Regular Season,Tue,Home,W,1,0,Puebla,...,0,Veracruz,2020,1,Veracruz,Puebla,0.0,0.0,1.0,0.0
Veracruz,1633,2019-11-02,2023-04-12 19:06:00,Liga MX,Apertura 2019 Regular Season,Sat,Away,D,1,1,Monterrey,...,0,Veracruz,2020,0,Veracruz,Monterrey,1.0,0.0,0.0,1.0
Veracruz,1647,2019-11-08,2023-04-12 21:00:00,Liga MX,Apertura 2019 Regular Season,Fri,Home,L,0,5,América,...,0,Veracruz,2020,-1,Veracruz,America,0.0,1.0,0.0,1.0


In [72]:
matches_rolling = matches_rolling.droplevel('team_new')

In [73]:
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,pkatt,team_name,season,result_num,team_new,opponent_new,streak_w,streak_d,streak_l,streak_u


In [67]:
matches1 = matches_rolling.groupby("team_new")
matches1.get_group("Guadalajara")

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,pkatt,team_name,season,result_num,team_new,opponent_new,streak_w,streak_d,streak_l,streak_u
921,2017-07-29,2023-04-12 17:00:00,Liga MX,Apertura 2017 Regular Season,Sat,Away,D,1,1,Cruz Azul,...,0,Guadalajara,2018,0,Guadalajara,Cruz Azul,0.0,1.0,0.0,1.0
922,2017-08-05,2023-04-12 21:06:00,Liga MX,Apertura 2017 Regular Season,Sat,Home,D,2,2,Necaxa,...,0,Guadalajara,2018,0,Guadalajara,Necaxa,0.0,1.0,0.0,1.0
923,2017-08-12,2023-04-12 19:00:00,Liga MX,Apertura 2017 Regular Season,Sat,Away,L,1,4,Monterrey,...,0,Guadalajara,2018,-1,Guadalajara,Monterrey,0.0,1.0,0.0,1.0
924,2017-08-19,2023-04-12 21:06:00,Liga MX,Apertura 2017 Regular Season,Sat,Home,L,0,1,Puebla,...,0,Guadalajara,2018,-1,Guadalajara,Puebla,0.0,0.0,1.0,0.0
925,2017-08-23,2023-04-12 20:30:00,Liga MX,Apertura 2017 Regular Season,Wed,Away,D,1,1,Santos,...,1,Guadalajara,2018,0,Guadalajara,Santos,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1121,2023-03-04,2023-04-12 21:05:00,Liga MX,Clausura 2023 Regular Season,Sat,Home,W,2,0,Santos,...,0,Guadalajara,2023,1,Guadalajara,Santos,1.0,0.0,0.0,1.0
1122,2023-03-10,2023-04-12 21:05:00,Liga MX,Clausura 2023 Regular Season,Fri,Away,L,0,1,Puebla,...,0,Guadalajara,2023,-1,Guadalajara,Puebla,1.0,0.0,0.0,1.0
1123,2023-03-18,2023-04-12 21:10:00,Liga MX,Clausura 2023 Regular Season,Sat,Home,L,2,4,América,...,0,Guadalajara,2023,-1,Guadalajara,America,0.0,0.0,1.0,0.0
1124,2023-04-01,2023-04-12 21:10:00,Liga MX,Clausura 2023 Regular Season,Sat,Away,D,3,3,Atlas,...,0,Guadalajara,2023,0,Guadalajara,Atlas,0.0,0.0,1.0,0.0


In [None]:
cols = ["gf","ga","sh","sot","dist","pk","pkatt"]
newCols = [f"{c}_avg" for c in cols]

In [None]:
matches_rolling = matchesByTeam.apply(lambda x: rollingAverages(x, cols, newCols))
matches_rolling

In [None]:
matches_rolling = matches_rolling.droplevel('team_new')



In [None]:
matches_rolling.index = range(matches_rolling.shape[0])

In [None]:
matches_rolling

In [None]:
matches = matches_rolling.copy(deep=True)

In [None]:
matches

In [None]:
def streakCalculator(df, streakCols):
    
    streakKeys = ['w','d','l','u']
    thisDict = dict(zip(streakKeys,streakCols))

    subset = df["result_num"]
    
    for key , value in thisDict.items():

        #df[value] = subset.rolling(window = len(subset), min_periods = 1, closed = 'left').apply(lambda x: calc_streaks1(x, key)[-1], raw= False) 
        df[value] = subset.rolling(window = 1, min_periods = 1, closed = 'left').apply(lambda x: calc_streaks1(x, key)[-1], raw= False) 

    #df = df.droplevel("team_new")
    #df.index = range(df.shape[0])
    
    return df

In [None]:
cols = ["result_num"]
newCols = ["s_w","s_d","s_l","s_u"]

In [None]:
matchesByTeam = matches.groupby("team_new")
matches_rolling = matchesByTeam.apply(lambda x: rollingAverages(x, cols, newCols))

#matches_streaks1 = pd.DataFrame()

In [None]:
roll_streak    = matchesByTeam["result_num"].rolling(window=1, closed = 'left').apply(lambda x: calc_streaks1(x,'w')[-1],raw = False)

In [None]:
matchesByTeam['win_streak']=roll_streak


In [None]:
result = rollingStreaks(matchesByTeam, cols, newCols)

In [None]:
for teamMatches in matchesByTeam:
    matches_streaks1 = matches_streaks1.append(matchesByTeam.apply(lambda x: rollingStreaks(x, cols, newCols)))

matches_streaks1

In [None]:
ubset.rolling(window = len(subset), min_periods = 1, closed = 'left').apply(lambda x: calc_streaks1(x, key)[-1], raw= False) 

In [None]:
groups = matches.groupby("team_new")
group = groups.get_group("Guadalajara")
group
#roll_streak     = group["result_num"].rolling(window=1, closed = 'left').apply(lambda x: calc_streaks1(x,'w'))                                                                     


In [None]:
print(matches["result_num"].dtypes)


In [None]:
grouped_matches = matches.groupby("team_new")
chivas = grouped_matches.get_group("Guadalajara")
test = streakCalculator(chivas, ["s_w","s_d","s_l","s_u"])
test[["result_num","s_w","s_d","s_l","s_u"]]

In [None]:
matches = matches.sort_values(by = ["date", "time"], ascending = True, ignore_index=True)

matches

In [None]:
matchesByTeam            = matches.groupby( "team_new")
matchesByTeamAndVenue    = matches.groupby(["team_new","venue"])
matchesByTeamAndOpp      = matches.groupby(["team_new","opponent_new"])
matchesByTeamOppAndVenue = matches.groupby(["team_new","opponent_new","venue"])

In [None]:
streakKeys = ['w','d','l','u']
streakCols = ['streak_w','streak_d','streak_l','streak_u']

theDict = dict(zip(streakKeys,streakCols))

for key, value in theDict.items():
    dftest[value] = subset.rolling(window = len(subset), min_periods = 1, closed = 'left').apply(lambda x: calc_streaks1(x, key)[-1], raw= False) 

dftest

In [None]:
streakCols = ['streak_w','streak_d','streak_l','streak_u']
matchesWithStreaks = matchesByTeam.apply(lambda x: streakCalculator(x, streakCols))
matchesWithStreaks                                        



In [None]:
subset = grouped_matches.get_group("Guadalajara")["result_num"]
dftest = grouped_matches.get_group("Guadalajara").copy()

streakKeys = ['w','d','l','unbeaten']

streakCols = ['win_streaks','draw_streaks','loss_streaks','unbeaten_streaks']

theDict = dict(zip(streakKeys,streakCols))

for key, value in theDict.items():
    dftest[value] = subset.rolling(window = len(subset), min_periods = 1, closed = 'left').apply(lambda x: calc_streaks1(x, key)[-1], raw= False) 

dftest


streakCols = ['streak_w','streak_d','streak_l','streak_u']
streakColsVenue =  ['streak_w','streak_d','streak_l','streak_u']


#Figure out what values are needed in new function. 
#Pass Dict or just columns names
#Does this works for every scenario?
matchesWithStreaks = matchesByTeam.apply(lambda x: streakCalculator(x, streakCols)
matchesWithStreaks = matchesByTeamAndVenue.apply(lambda x: streakCalculator(x, streakColsVenue)
matchesWithStreaks = matchesByTeamAndOpp.apply(lambda x: streakCalculator(x, streakColsOpp)
#Different function needed because there are only a few matches to get streaks from. Do no reset 
matchesWithStreaks = matchesByTeamOppAndVenue.apply(lambda x: streakCalculator(x, streakColsVenue)
                                         

In [None]:
'''
Scenario 1
W, L, D, U for each team

#new_cols = ['win_streaks','draw_streaks','loss_streaks','unbeaten_streaks']
#keys = ['w','d','l','unbeaten']


Scenario 2
W, L, D, U for each team while home

#new_cols = ['win_streaks_h','draw_streaks_h','loss_streaks_h','unbeaten_streaks_h']
#keys = ['w','d','l','unbeaten']


Scenario 3
W, L, D, U for each team while away

#new_cols = ['win_streaks_a','draw_streaks_a','loss_streaks_a','unbeaten_streaks_a']
#keys = ['w','d','l','unbeaten']

-----------

Stop setting to zero when streak is interrumpted from here?
Scenario 4
W, L, D, U for each team against opp

#new_cols = ['win_streaks_vs','draw_streaks_vs','loss_streaks_vs','unbeaten_streaks_vs']
#keys = ['w','d','l','unbeaten']



In [None]:
subset = grouped_matches.get_group("Guadalajara")["result_num"]
dftest = grouped_matches.get_group("Guadalajara").copy()

streakKeys = ['w','d','l','unbeaten']

streakCols = ['win_streaks','draw_streaks','loss_streaks','unbeaten_streaks']

theDict = dict(zip(streakKeys,streakCols))

for key, value in theDict.items():
    dftest[value] = subset.rolling(window = len(subset), min_periods = 1, closed = 'left').apply(lambda x: calc_streaks1(x, key)[-1], raw= False) 

dftest

#dftest[new_cols] = subset.rolling(window = len(subset), min_periods = 1, closed = 'left').apply(lambda x: calc_streaks1(x, key)[-1], raw= False) for key in keys 

#win_streaks = subset.rolling(window = len(subset), min_periods = 1, closed = 'left').apply(lambda x: calc_streaks1(x, 'w')[-1], raw= False)
#draw_streaks = subset.rolling(window = len(subset), min_periods = 1, closed = 'left').apply(lambda x: calc_streaks1(x, 'd')[-1], raw= False)
#loss_streaks = subset.rolling(window = len(subset), min_periods = 1, closed = 'left').apply(lambda x: calc_streaks1(x, 'l')[-1], raw= False)
#unbeaten_streaks = subset.rolling(window = len(subset), min_periods = 1, closed = 'left').apply(lambda x: calc_streaks1(x, 'u')[-1], raw= False)

#dftest["win_streak"] = win_streaks
#dftest#[["result_num","win_streak"]]

In [None]:
#matchesByTeamAndOpp.get_group(("America","Guadalajara"))
#matchesByTeamAndVenue.get_group(("Guadalajara","Away"))
matchesByTeamOppAndVenue.get_group(("Guadalajara","Atlas", "Home"))

In [None]:
pd.set_option('display.max_rows', 100)


In [None]:
roll_w_streaks=grouped_matches.get_group("Guadalajara")['result_num'].rolling(window=1, closed = 'left').apply(lambda x: calc_streaks1(x,'w')[-1])
grouped_matches.get_group("Guadalajara").loc(: ,'roll_w_streaks') = roll_w_streaks
grouped_matches.get_group("Guadalajara")#[['result_num','roll_w_streaks']]

In [None]:
grouped_matches = thisDf.groupby("team_name")

In [None]:
group = grouped_matches.get_group("Guadalajara")

In [None]:
group

In [None]:
group = group.sort_values("date")
rolling_avgs=group[cols].rolling(window=4, closed = 'left').mean()
rolling_avgs
group[newCols] = rolling_avgs
group= group.dropna(subset = newCols)    
group
    #Drop rows when there are not enough previous matches info (at the beginning of the dataset)
#    teamDf = teamDf.dropna(subset = newCols)
    

In [None]:
matches_rolling.to_csv('rolling.csv', index=False, encoding ="latin-1")


In [None]:
#TODO: Add shooting averages using differnt windows? 6,5,4,3 and 2 maybe
#Calculate streaks using the rolling method to with thegroup bys, or go back to the dictionary approaches

#Merge results so that we have only 1 row per match:
#  Home team info, shooting and streaks
#  Away team info, shooting and streaks
#  Change result to H, A, or D
#Check if group by works when used with 2 variables : team_name and opponent


In [None]:
matches_rolling[matches_rolling["team_name"] =="Guadalajara"]

In [None]:
mergedDf = thisDf.merge(thisDf, left_on = ["date", "team_new"], right_on = ["date","opponent_new"])
mergedDf

In [None]:
mergedDf[["date","venue_x","venue_y","team_new_x","team_new_y","opponent_x","opponent_y","result_num_x","result_num_y"]]

In [None]:
thisDf.columns

In [None]:
df_new = thisDf.groupby(thisDf[[['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation',
       'referee', 'match report', 'notes', 'sh', 'sot', 'dist', 'fk', 'pk',
       'pkatt', 'team_name', 'season', 'result_num']]])

In [None]:

#Consistent team names
thisDf = thisDf.replace("Santos Laguna", "Santos")

thisDf.replace("América", "America")
thisDf.replace("Atlético", "Atletico")
thisDf.replace("FC Juárez","FC Juarez")
thisDf.replace("León","Leon")
thisDf.replace("Mazatlán","Mazatlan")
thisDf.replace("Querétaro","Queretaro")