## Data Preprocessing/Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv("C:/Users/USER/Downloads/understat.com.csv")

In [3]:
df.rename(columns = {"Unnamed: 0": "league", "Unnamed: 1":"year"}, inplace = True)

In [4]:
temp_df=df.copy()

In [5]:
temp_df["first_position"] = np.where(temp_df["position"]==1, 1, 0)

In [6]:
temp_df.head()

Unnamed: 0,league,year,position,team,matches,wins,draws,loses,scored,missed,...,xGA_diff,npxGA,npxGD,ppda_coef,oppda_coef,deep,deep_allowed,xpts,xpts_diff,first_position
0,La_liga,2014,1,Barcelona,38,30,4,4,110,21,...,7.444293,24.727907,73.049305,5.683535,16.367593,489,114,94.0813,0.0813,1
1,La_liga,2014,2,Real Madrid,38,30,2,6,118,38,...,4.607198,38.890805,47.21309,10.209085,12.92951,351,153,81.7489,-10.2511,0
2,La_liga,2014,3,Atletico Madrid,38,23,9,6,67,29,...,0.069107,26.839271,25.748737,8.982028,9.237091,197,123,73.1353,-4.8647,0
3,La_liga,2014,4,Valencia,38,22,11,5,70,32,...,7.392572,33.446477,16.257501,8.709827,7.870225,203,172,63.7068,-13.2932,0
4,La_liga,2014,5,Sevilla,38,23,7,8,71,45,...,2.862742,41.916529,20.17807,8.276148,9.477805,305,168,67.3867,-8.6133,0


In [7]:
categ_col = temp_df.select_dtypes(include=['object', 'category']).columns
categ_col

Index(['league', 'team'], dtype='object')

In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

le = LabelEncoder()
for i in categ_col:
    temp_df[i] = le.fit_transform(temp_df[i])

In [9]:
temp_df.head()

Unnamed: 0,league,year,position,team,matches,wins,draws,loses,scored,missed,...,xGA_diff,npxGA,npxGD,ppda_coef,oppda_coef,deep,deep_allowed,xpts,xpts_diff,first_position
0,2,2014,1,14,38,30,4,4,110,21,...,7.444293,24.727907,73.049305,5.683535,16.367593,489,114,94.0813,0.0813,1
1,2,2014,2,123,38,30,2,6,118,38,...,4.607198,38.890805,47.21309,10.209085,12.92951,351,153,81.7489,-10.2511,0
2,2,2014,3,12,38,23,9,6,67,29,...,0.069107,26.839271,25.748737,8.982028,9.237091,197,123,73.1353,-4.8647,0
3,2,2014,4,157,38,22,11,5,70,32,...,7.392572,33.446477,16.257501,8.709827,7.870225,203,172,63.7068,-13.2932,0
4,2,2014,5,138,38,23,7,8,71,45,...,2.862742,41.916529,20.17807,8.276148,9.477805,305,168,67.3867,-8.6133,0


In [10]:
x = temp_df.drop(columns=['wins','draws','loses','scored', 'missed', 'pts', 'first_position', 'position'], axis = 1)
y = temp_df['first_position']

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train),columns=x_train.columns)
x_test = pd.DataFrame(scaler.transform(x_test),columns=x_test.columns)

### Logistic Regression

In [13]:
clf = LogisticRegression(random_state=0).fit(x_train, y_train)
clf_pred = clf.predict(x_test)
clf_accuracy = accuracy_score(y_test, clf_pred)
clf_matrix = confusion_matrix(y_test, clf_pred)
clf_f1 = f1_score(clf_pred, y_test, average = 'macro')

In [14]:
print('Accuracy on Logistic Regression is: {}'.format(clf_accuracy))
print('F1_score on Logistic Regression is: {}'.format(clf_f1))

Accuracy on Logistic Regression is: 0.9781021897810219
F1_score on Logistic Regression is: 0.8579329415831316


### Decision Tree Classifier

In [15]:
dtc = DecisionTreeClassifier(random_state=1).fit(x_train, y_train)
dtc_pred = dtc.predict(x_test)
dtc_accuracy = accuracy_score(y_test, dtc_pred)
dtc_matrix = confusion_matrix(y_test, dtc_pred)
dtc_f1 = f1_score(dtc_pred, y_test, average = 'macro')

In [16]:
print('Accuracy on Decision Tree Classifier is: {}'.format(dtc_accuracy))
print('F1_score on Decision Tree Classifier is: {}'.format(dtc_f1))

Accuracy on Decision Tree Classifier is: 0.9343065693430657
F1_score on Decision Tree Classifier is: 0.636604774535809


### Extra Trees Classifier

In [17]:
etc = ExtraTreesClassifier(random_state=1)
etc.fit(x_train, y_train)
etc_pred = etc.predict(x_test)
etc_accuracy = accuracy_score(y_test, etc_pred)
etc_matrix = confusion_matrix(y_test, dtc_pred)
etc_f1 = f1_score(etc_pred, y_test, average = 'macro')

In [18]:
print('Accuracy on Extra Tree Classifier is: {}'.format(etc_accuracy))
print('F1_score on Extra Tree Classifier is: {}'.format(etc_f1))

Accuracy on Extra Tree Classifier is: 0.9562043795620438
F1_score on Extra Tree Classifier is: 0.6886363636363637


### CatBoost Classifier

In [19]:
cat = CatBoostClassifier()
cat.fit(x_train, y_train)
cat_pred = cat.predict(x_test)
cat_accuracy = accuracy_score(y_test, cat_pred)
cat_matrix = confusion_matrix(y_test, cat_pred)
cat_f1 = f1_score(cat_pred, y_test, average = 'macro')

Learning rate set to 0.007963
0:	learn: 0.6785604	total: 164ms	remaining: 2m 43s
1:	learn: 0.6644005	total: 183ms	remaining: 1m 31s
2:	learn: 0.6500363	total: 199ms	remaining: 1m 5s
3:	learn: 0.6368307	total: 220ms	remaining: 54.9s
4:	learn: 0.6227005	total: 235ms	remaining: 46.8s
5:	learn: 0.6084200	total: 249ms	remaining: 41.3s
6:	learn: 0.5948613	total: 264ms	remaining: 37.5s
7:	learn: 0.5816438	total: 326ms	remaining: 40.4s
8:	learn: 0.5694069	total: 340ms	remaining: 37.5s
9:	learn: 0.5572123	total: 356ms	remaining: 35.3s
10:	learn: 0.5463829	total: 372ms	remaining: 33.5s
11:	learn: 0.5343487	total: 387ms	remaining: 31.9s
12:	learn: 0.5217738	total: 406ms	remaining: 30.8s
13:	learn: 0.5132159	total: 421ms	remaining: 29.7s
14:	learn: 0.5018517	total: 466ms	remaining: 30.6s
15:	learn: 0.4920289	total: 481ms	remaining: 29.6s
16:	learn: 0.4835082	total: 495ms	remaining: 28.6s
17:	learn: 0.4724926	total: 508ms	remaining: 27.7s
18:	learn: 0.4614783	total: 522ms	remaining: 26.9s
19:	learn

164:	learn: 0.0712655	total: 1.64s	remaining: 8.32s
165:	learn: 0.0707790	total: 1.66s	remaining: 8.33s
166:	learn: 0.0702509	total: 1.67s	remaining: 8.31s
167:	learn: 0.0696852	total: 1.67s	remaining: 8.28s
168:	learn: 0.0692557	total: 1.68s	remaining: 8.26s
169:	learn: 0.0688434	total: 1.69s	remaining: 8.24s
170:	learn: 0.0683832	total: 1.7s	remaining: 8.26s
171:	learn: 0.0678889	total: 1.72s	remaining: 8.29s
172:	learn: 0.0674131	total: 1.74s	remaining: 8.3s
173:	learn: 0.0669248	total: 1.76s	remaining: 8.38s
174:	learn: 0.0665469	total: 1.78s	remaining: 8.39s
175:	learn: 0.0659812	total: 1.8s	remaining: 8.42s
176:	learn: 0.0656229	total: 1.81s	remaining: 8.43s
177:	learn: 0.0651766	total: 1.85s	remaining: 8.54s
178:	learn: 0.0647703	total: 1.86s	remaining: 8.55s
179:	learn: 0.0644213	total: 1.88s	remaining: 8.56s
180:	learn: 0.0639595	total: 1.91s	remaining: 8.64s
181:	learn: 0.0634608	total: 1.93s	remaining: 8.65s
182:	learn: 0.0631862	total: 1.94s	remaining: 8.66s
183:	learn: 0.0

337:	learn: 0.0294248	total: 3.22s	remaining: 6.3s
338:	learn: 0.0293242	total: 3.23s	remaining: 6.3s
339:	learn: 0.0292830	total: 3.24s	remaining: 6.3s
340:	learn: 0.0291391	total: 3.26s	remaining: 6.3s
341:	learn: 0.0290323	total: 3.28s	remaining: 6.31s
342:	learn: 0.0289770	total: 3.29s	remaining: 6.31s
343:	learn: 0.0288657	total: 3.31s	remaining: 6.31s
344:	learn: 0.0287606	total: 3.32s	remaining: 6.31s
345:	learn: 0.0286313	total: 3.34s	remaining: 6.31s
346:	learn: 0.0284923	total: 3.36s	remaining: 6.33s
347:	learn: 0.0283726	total: 3.38s	remaining: 6.33s
348:	learn: 0.0282848	total: 3.39s	remaining: 6.32s
349:	learn: 0.0281644	total: 3.41s	remaining: 6.33s
350:	learn: 0.0280395	total: 3.42s	remaining: 6.33s
351:	learn: 0.0279157	total: 3.45s	remaining: 6.36s
352:	learn: 0.0278304	total: 3.47s	remaining: 6.35s
353:	learn: 0.0277245	total: 3.5s	remaining: 6.38s
354:	learn: 0.0275936	total: 3.53s	remaining: 6.41s
355:	learn: 0.0274936	total: 3.54s	remaining: 6.41s
356:	learn: 0.027

520:	learn: 0.0165019	total: 4.83s	remaining: 4.44s
521:	learn: 0.0164481	total: 4.85s	remaining: 4.44s
522:	learn: 0.0164307	total: 4.86s	remaining: 4.44s
523:	learn: 0.0163948	total: 4.9s	remaining: 4.45s
524:	learn: 0.0163471	total: 4.91s	remaining: 4.45s
525:	learn: 0.0163118	total: 4.95s	remaining: 4.46s
526:	learn: 0.0162854	total: 4.96s	remaining: 4.46s
527:	learn: 0.0162579	total: 4.98s	remaining: 4.45s
528:	learn: 0.0162296	total: 5s	remaining: 4.45s
529:	learn: 0.0162001	total: 5.01s	remaining: 4.45s
530:	learn: 0.0161484	total: 5.04s	remaining: 4.45s
531:	learn: 0.0161160	total: 5.05s	remaining: 4.45s
532:	learn: 0.0160551	total: 5.07s	remaining: 4.44s
533:	learn: 0.0160355	total: 5.08s	remaining: 4.44s
534:	learn: 0.0159969	total: 5.09s	remaining: 4.43s
535:	learn: 0.0159722	total: 5.1s	remaining: 4.42s
536:	learn: 0.0159401	total: 5.11s	remaining: 4.41s
537:	learn: 0.0159191	total: 5.12s	remaining: 4.39s
538:	learn: 0.0158883	total: 5.12s	remaining: 4.38s
539:	learn: 0.015

696:	learn: 0.0111599	total: 6.83s	remaining: 2.97s
697:	learn: 0.0111230	total: 6.83s	remaining: 2.96s
698:	learn: 0.0110945	total: 6.84s	remaining: 2.95s
699:	learn: 0.0110862	total: 6.85s	remaining: 2.94s
700:	learn: 0.0110766	total: 6.86s	remaining: 2.92s
701:	learn: 0.0110487	total: 6.86s	remaining: 2.91s
702:	learn: 0.0110237	total: 6.87s	remaining: 2.9s
703:	learn: 0.0110091	total: 6.88s	remaining: 2.89s
704:	learn: 0.0109754	total: 6.88s	remaining: 2.88s
705:	learn: 0.0109527	total: 6.89s	remaining: 2.87s
706:	learn: 0.0109214	total: 6.89s	remaining: 2.86s
707:	learn: 0.0108917	total: 6.9s	remaining: 2.85s
708:	learn: 0.0108710	total: 6.91s	remaining: 2.83s
709:	learn: 0.0108517	total: 6.91s	remaining: 2.82s
710:	learn: 0.0108254	total: 6.92s	remaining: 2.81s
711:	learn: 0.0108083	total: 6.92s	remaining: 2.8s
712:	learn: 0.0107747	total: 6.93s	remaining: 2.79s
713:	learn: 0.0107462	total: 6.93s	remaining: 2.78s
714:	learn: 0.0107303	total: 6.94s	remaining: 2.77s
715:	learn: 0.0

871:	learn: 0.0078758	total: 8.58s	remaining: 1.26s
872:	learn: 0.0078677	total: 8.59s	remaining: 1.25s
873:	learn: 0.0078498	total: 8.6s	remaining: 1.24s
874:	learn: 0.0078387	total: 8.6s	remaining: 1.23s
875:	learn: 0.0078262	total: 8.61s	remaining: 1.22s
876:	learn: 0.0078188	total: 8.61s	remaining: 1.21s
877:	learn: 0.0078120	total: 8.62s	remaining: 1.2s
878:	learn: 0.0077994	total: 8.62s	remaining: 1.19s
879:	learn: 0.0077892	total: 8.63s	remaining: 1.18s
880:	learn: 0.0077833	total: 8.63s	remaining: 1.17s
881:	learn: 0.0077664	total: 8.64s	remaining: 1.16s
882:	learn: 0.0077533	total: 8.64s	remaining: 1.15s
883:	learn: 0.0077428	total: 8.65s	remaining: 1.13s
884:	learn: 0.0077265	total: 8.65s	remaining: 1.12s
885:	learn: 0.0077081	total: 8.66s	remaining: 1.11s
886:	learn: 0.0076999	total: 8.66s	remaining: 1.1s
887:	learn: 0.0076835	total: 8.67s	remaining: 1.09s
888:	learn: 0.0076540	total: 8.67s	remaining: 1.08s
889:	learn: 0.0076412	total: 8.68s	remaining: 1.07s
890:	learn: 0.00

In [20]:
print('Accuracy on CatBoost Classifier is: {}'.format(cat_accuracy))
print('F1_score on CatBoost Classifier is: {}'.format(cat_f1))

Accuracy on CatBoost Classifier is: 0.9708029197080292
F1_score on CatBoost Classifier is: 0.7924242424242424


### Gradient Boosting Classifier

In [21]:
gbc = GradientBoostingClassifier()
gbc.fit(x_train, y_train)
gbc_pred = gbc.predict(x_test)
gbc_accuracy = accuracy_score(y_test, gbc_pred)
gbc_matrix = confusion_matrix(y_test, gbc_pred)
gbc_f1 = f1_score(gbc_pred, y_test, average = 'macro')

In [22]:
print('Accuracy on Gradient Boosting Classifier is: {}'.format(gbc_accuracy))
print('F1_score on Gradient Boosting Classifier is: {}'.format(gbc_f1))

Accuracy on Gradient Boosting Classifier is: 0.9562043795620438
F1_score on Gradient Boosting Classifier is: 0.6886363636363637


### Random Forest Classifier

In [23]:
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
rfc_pred = rfc.predict(x_test)
rfc_accuracy = accuracy_score(y_test, rfc_pred)
rfc_matrix = confusion_matrix(y_test, rfc_pred)
rfc_f1 = f1_score(rfc_pred, y_test, average = 'macro')

In [24]:
print('Accuracy on Random Forest Classifier is: {}'.format(rfc_accuracy))
print('F1_score on Random Forest Classifier is: {}'.format(rfc_f1))

Accuracy on Random Forest Classifier is: 0.9635036496350365
F1_score on Random Forest Classifier is: 0.7127882599580713


### XGB Classifier

In [25]:
xgb = XGBClassifier()
xgb.fit(x_train, y_train)
xgb_pred = xgb.predict(x_test)
xgb_accuracy = accuracy_score(y_test, xgb_pred)
xgb_matrix = confusion_matrix(y_test, xgb_pred)
xgb_f1 = f1_score(xgb_pred, y_test, average = 'macro')

In [26]:
print('Accuracy on XGB Classifier is: {}'.format(xgb_accuracy))
print('F1_score on XGB Classifier is: {}'.format(xgb_f1))

Accuracy on XGB Classifier is: 0.9781021897810219
F1_score on XGB Classifier is: 0.8579329415831316


### Ridge Classifier

In [27]:
ridge = RidgeClassifier()
ridge.fit(x_train, y_train)
ridge_pred = ridge.predict(x_test)
ridge_accuracy = accuracy_score(y_test, ridge_pred)
ridge_matrix = confusion_matrix(y_test, ridge_pred)
ridge_f1 = f1_score(ridge_pred, y_test, average = 'macro')

In [28]:
print('Accuracy on XGB Classifier is: {}'.format(xgb_accuracy))
print('F1_score on XGB Classifier is: {}'.format(xgb_f1))

Accuracy on XGB Classifier is: 0.9781021897810219
F1_score on XGB Classifier is: 0.8579329415831316
