## This code performs an analysis of messidata.csv using the XGBoost algorithm.
### Data is from https://fbref.com/en/players/d70ce98e/matchlogs/2019-2020/summary/Lionel-Messi-Match-Logs. The data was collected for the 2019/2020 season, which includes his match attributes for both Argentina and Barcelona; however, I have considered only the Barcelona data for this analysis.
### Variables/attributes are: 
* date - date on which the game was played
* day - day of the week on which the game was played
* comp - competition in which the game was played, la liga, champions league, supercopa or copa del rey
* round - stage of competition at which the game was played
* venue - home, away or at a neutral venue
* result - result of the game
* squad - Barca or Argentina
* opponent - opposition team
* start - if Messi started the game or came on as a sub
* pos - position he played in FW: forward, AM: attacking mid, RW/LW: right/left wing 
* minutes - minutes played
* gls - goals scored by him
* ast - assists provided by him 
* pk - penalty kicks scored
* pkatt	- penalties attempted
* sh - shots taken not including penalties
* sot - shots on target not including penalties
* yellow - yellow cards shown by the ref to Messi
* red - red cards shown by the ref to Messi
* touches - number of times he touched the ball; receiving a pass, dribbling and then making a pass counts as one touch
* press - number of times applying pressure on the opposition player
* tkl - number of players tackled
* interceptions - interceptions made for the pass 
* blocks - blocking the ball by standing in its path
* xg - expected goals
* npxg - non penalty expected goals
* xa - expected assists
* sca - shot creation actions
* gca - goal creation actions
* passcomp - passes completed
* passattemp - passes attempted
* passcomppercent - pass completion percentage (target variable)
* progpass - progressive passes
* carries - number of times he controlled the ball
* progcarries - carries that move the ball towards the opponent's goal by at least 5 yards
* successdrib - successful dribbles
* attemptdrib - attempted dribbles
* matchreport - NA for this study
* actresult	- actual result of the game (win, loss or draw)
* totgls - total goals scored by Messi's team
* totglconceded - total goals conceded by Messi's team 


#### Pass completion percentage is our target variable. 


###### Import libraries

In [318]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

###### Import data 

In [319]:
# Load the match-log data; 'messidata.csv' is resolved relative to the
# current working directory.
df = pd.read_csv('messidata.csv')

In [320]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,date,day,comp,round,venue,result,squad,opponent,start,pos,...,passcomppercent,progpass,carries,progcarries,successdrib,attemptdrib,matchreport,actresult,totgls,totglconceded
0,9/17/19,Tue,Champions Lg,Group stage,Away,D 0–0,es Barcelona,de Dortmund,N,"RW,LW",...,78.3,2.0,33.0,4.0,1.0,5.0,Match Report,D,0,0
1,9/21/19,Sat,La Liga,Matchweek 5,Away,L 0–2,Barcelona,Granada,N,AM,...,68.6,5.0,33.0,7.0,3.0,5.0,Match Report,L,0,2
2,9/24/19,Tue,La Liga,Matchweek 6,Home,W 2–1,Barcelona,Villarreal,Y*,RW,...,75.0,3.0,19.0,2.0,1.0,2.0,Match Report,W,2,1
3,10/2/19,Wed,Champions Lg,Group stage,Home,W 2–1,es Barcelona,it Inter,Y*,RW,...,83.8,14.0,83.0,16.0,11.0,13.0,Match Report,W,2,1
4,10/6/19,Sun,La Liga,Matchweek 8,Home,W 4–0,Barcelona,Sevilla,Y*,RW,...,81.2,5.0,74.0,17.0,7.0,9.0,Match Report,W,4,0


###### Feature engineering

In [321]:
# Distribution of match outcomes (W = win, D = draw, L = loss).
df['actresult'].value_counts()

W    27
D     9
L     8
Name: actresult, dtype: int64

In [322]:
# Count of games started versus games entered as a substitute.
df['start'].value_counts()

Y*    42
N      2
Name: start, dtype: int64

In [323]:
# Remove columns that are not used as model inputs: identifiers, free-text
# fields, and 'result', whose information is already carried by 'actresult',
# 'totgls' and 'totglconceded'.
df = df.drop(
    columns=["date", "round", "result", "squad", "opponent", "pos", "matchreport"]
)

In [324]:
# Impute missing numeric values with the column mean.
# numeric_only=True is required because df still holds string columns at this
# point (day, comp, venue, start, actresult); without it pandas >= 2.0 raises
# a TypeError when it tries to average them. String columns are left as-is.
# NOTE(review): this also appears to fill missing values of the target
# 'passcomppercent', and the means are computed on the full data set before
# the train/test split; consider dropping rows with a missing target and
# imputing from the training split only.
df = df.fillna(df.mean(numeric_only=True))

In [325]:
# Encode the match outcome as an integer code: win -> 1, draw -> 2, loss -> 3.
outcome_codes = {'W': 1, 'D': 2, 'L': 3}
df['actresult'] = df['actresult'].replace(outcome_codes)

In [326]:
# One-hot encode the categorical match descriptors. Every new indicator
# column is named '<col>__<value>' (e.g. 'day__Sat'). The source columns are
# kept here and removed in a separate step.
ONE_HOT_COLS = ['day', 'comp', 'venue', 'start']
print("Starting DF shape: %d, %d" % df.shape)

for col in ONE_HOT_COLS:
    n_unique = len(df[col].unique())
    print("Adding One Hot values for %s (the column has %d unique values)" % (col, n_unique))

    # Encode the column directly instead of building a per-value lookup table
    # and merging it back. Concatenating along the columns cannot change the
    # row count, so no post-hoc length assertion is needed (an `assert` would
    # be stripped under `python -O` anyway).
    one_hot_df = pd.get_dummies(df[col], prefix=col, prefix_sep='__')
    df = pd.concat([df, one_hot_df], axis=1)
    print(df.shape)

Starting DF shape: 44, 34
Adding One Hot values for day (the column has 6 unique values)
(44, 40)
Adding One Hot values for comp (the column has 4 unique values)
(44, 44)
Adding One Hot values for venue (the column has 3 unique values)
(44, 47)
Adding One Hot values for start (the column has 2 unique values)
(44, 49)


In [327]:
# The raw categorical columns are redundant once their indicator columns
# exist, so remove them.
df = df.drop(columns=['day', 'comp', 'venue', 'start'])

In [328]:
# converting the actresult column to binary data columns
resultdummy = ['actresult']
df = pd.get_dummies(df,columns=resultdummy,drop_first=True)

In [329]:
# Inspect the frame after encoding.
df.head()

Unnamed: 0,minutes,gls,ast,pk,pkatt,sh,sot,yellow,red,touches,...,comp__Copa del Rey,comp__La Liga,comp__Supercopa de España,venue__Away,venue__Home,venue__Neutral,start__N,start__Y*,actresult_2,actresult_3
0,32,0,0,0,0,1,0,0,0,33.0,...,0,0,0,1,0,0,1,0,1,0
1,45,0,0,0,0,2,1,0,0,41.0,...,0,1,0,1,0,0,1,0,0,1
2,45,0,1,0,0,1,0,0,0,22.0,...,0,1,0,0,1,0,0,1,0,0
3,90,0,1,0,0,4,3,0,0,83.0,...,0,0,0,0,1,0,0,1,0,0
4,90,1,0,0,0,6,3,0,0,83.0,...,0,1,0,0,1,0,0,1,0,0


In [330]:
# Exclude the red-card column from the data set.
# NOTE(review): presumably it carries no usable signal for this season -
# confirm before relying on that.
df = df.drop(columns=['red'])

In [331]:
# List the remaining columns (the features plus the target).
df.columns

Index(['minutes', 'gls', 'ast', 'pk', 'pkatt', 'sh', 'sot', 'yellow',
       'touches', 'press', 'tkl', 'interceptions', 'blocks', 'xg', 'npxg',
       'xa', 'sca', 'gca', 'passcomp', 'passattemp', 'passcomppercent',
       'progpass', 'carries', 'progcarries', 'successdrib', 'attemptdrib',
       'totgls', 'totglconceded', 'day__Fri', 'day__Sat', 'day__Sun',
       'day__Thu', 'day__Tue', 'day__Wed', 'comp__Champions Lg',
       'comp__Copa del Rey', 'comp__La Liga', 'comp__Supercopa de España',
       'venue__Away', 'venue__Home', 'venue__Neutral', 'start__N', 'start__Y*',
       'actresult_2', 'actresult_3'],
      dtype='object')

###### Train Test Split

In [332]:
# Separate the predictors from the target (pass completion percentage).
# NOTE(review): 'passcomp' and 'passattemp' stay in X; the target looks like
# it is derived from them, so they may leak it - confirm this is intended.
X = df.drop(columns=['passcomppercent'])
y = df['passcomppercent']

In [333]:
# Check the feature matrix.
X.head()

Unnamed: 0,minutes,gls,ast,pk,pkatt,sh,sot,yellow,touches,press,...,comp__Copa del Rey,comp__La Liga,comp__Supercopa de España,venue__Away,venue__Home,venue__Neutral,start__N,start__Y*,actresult_2,actresult_3
0,32,0,0,0,0,1,0,0,33.0,4.0,...,0,0,0,1,0,0,1,0,1,0
1,45,0,0,0,0,2,1,0,41.0,2.0,...,0,1,0,1,0,0,1,0,0,1
2,45,0,1,0,0,1,0,0,22.0,4.0,...,0,1,0,0,1,0,0,1,0,0
3,90,0,1,0,0,4,3,0,83.0,9.0,...,0,0,0,0,1,0,0,1,0,0
4,90,1,0,0,0,6,3,0,83.0,6.0,...,0,1,0,0,1,0,0,1,0,0


In [334]:
# Check the target vector.
y.head()

0    78.3
1    68.6
2    75.0
3    83.8
4    81.2
Name: passcomppercent, dtype: float64

In [335]:
# Hold out 30% of the matches for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=22,
)

###### Model building

In [336]:
# Gradient-boosted tree regressor. The hyper-parameters were picked by manual
# trial and error (lambda/gamma tried at 1, 10, 20, 30, 40, 50, ...;
# max_depth at 3, 30, 1000, ...).
# NOTE(review): the earlier note on this cell reported the lowest MSE at
# lambda = gamma = 50, but the values set here are 40 and 34 - confirm which
# setting the reported results correspond to.
regressor = xgb.XGBRegressor(
    n_estimators=100,
    reg_lambda=40,  # L2 regularisation strength
    gamma=34,       # minimum loss reduction required to split further
    max_depth=300,
)
bst = regressor.fit(X_train, y_train)

# Rank the features by importance, highest first. The single column is named
# 'col1' and the row labels are the feature names.
importances = pd.Series(
    regressor.feature_importances_,
    index=X.columns,
    name='col1',
)
importances.sort_values(ascending=False).to_frame()

Unnamed: 0,col1
passcomp,0.546085
progcarries,0.453916
successdrib,0.0
totgls,0.0
totglconceded,0.0
day__Fri,0.0
day__Sat,0.0
day__Sun,0.0
day__Thu,0.0
day__Tue,0.0


###### Check model performance

In [337]:
# Predict pass completion percentage for the held-out matches.
y_pred = regressor.predict(X_test)
y_pred

array([81.36832 , 79.57849 , 81.36832 , 79.57849 , 81.36832 , 75.6782  ,
       81.36832 , 78.913445, 81.36832 , 75.6782  , 81.36832 , 78.913445,
       81.36832 , 79.57849 ], dtype=float32)

In [338]:
# Actual values for the same held-out matches, for comparison with y_pred.
y_test

42    83.00000
25    83.10000
19    78.84878
3     83.80000
22    78.84878
1     68.60000
32    78.20000
23    77.60000
12    81.00000
43    66.70000
27    68.40000
13    68.70000
10    83.30000
33    75.30000
Name: passcomppercent, dtype: float64

In [339]:
# Report held-out performance: mean squared error first, then the
# regressor's own score (R^2).
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = regressor.score(X_test, y_test)
print(test_mse)
print(test_r2)

34.4798040351702
0.045413815070494246


#### That is the lowest MSE (with the corresponding $R^{2}$) I could get from this model. The model definitely requires more parameter tuning.