In [203]:
import sqlite3
import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
import pickle as pkl
import numpy as np

In [204]:
# Number of previous matches (per team and season) turned into lagged score features.
WIN_SIZE = 5

In [205]:
# Read every table used by this notebook into memory from the SQLite database.
# `cnx` stays referenced for the life of the kernel, so without an explicit close
# the file handle would remain open (and re-running the cell would open another).
cnx = sqlite3.connect('../input/soccer/database.sqlite')
try:
    match_df = pd.read_sql_query("SELECT * FROM Match", cnx)
    country_df = pd.read_sql_query("SELECT * FROM Country", cnx)
    league_df = pd.read_sql_query("SELECT * FROM League", cnx)
    team_df = pd.read_sql_query("SELECT * FROM Team_Attributes", cnx)
    player_df = pd.read_sql_query("SELECT * FROM Player_Attributes", cnx)
finally:
    # Release the database file even if one of the queries fails.
    cnx.close()

In [206]:
# Number of matches per season in the raw Match table, before any join drops rows.
print(match_df["season"].value_counts())

2008/2009    3326
2015/2016    3326
2014/2015    3325
2010/2011    3260
2012/2013    3260
2009/2010    3230
2011/2012    3220
2013/2014    3032
Name: season, dtype: int64


In [207]:
# Derive a calendar-year key on every table; it is the join key between matches
# and the team / player attribute rows.
match_df['year'] = pd.DatetimeIndex(match_df['date']).year
team_df['year'] = pd.DatetimeIndex(team_df['date']).year
player_df['year'] = pd.DatetimeIndex(player_df['date']).year

# Match identifiers, goals and the year key.
# The bookmaker odds columns (B365*, BW*, IW*, LB*, WH*, SJ*, VC*, GB*, BS*) are not selected.
match_info_cols = ['id', 'date', 'season', 'country_id', 'league_id', 'stage',
                   'home_team_api_id', 'away_team_api_id',
                   'home_team_goal', 'away_team_goal', 'year']

# home_player_1..11 followed by away_player_1..11 (player ids of both line-ups).
lineup_cols = [side + "_player_" + str(slot)
               for side in ("home", "away")
               for slot in range(1, 12)]

df = match_df[match_info_cols + lineup_cols]

In [208]:
# Mean overall rating per player and calendar year: one row per
# (player_api_id, year), ready to be joined onto the match line-ups.
rating_keys = ["player_api_id", "year"]
avg_player_df = (
    player_df
    .groupby(rating_keys, as_index=False)["overall_rating"]
    .mean()
)
avg_player_df.head()

Unnamed: 0,player_api_id,year,overall_rating
0,2625,2007,63.0
1,2625,2008,60.0
2,2625,2010,60.0
3,2625,2011,58.5
4,2625,2012,58.0


In [209]:
# MERGE SCORE OF HOME_TEAM TO DF
# Join the home side's team attributes recorded in the same calendar year.
# The inner join drops matches that have no attribute row for that team/year.
team_feature_cols = ["team_api_id",
                     "buildUpPlaySpeed", "buildUpPlayDribbling", "buildUpPlayPassing",
                     "buildUpPlayPositioningClass",
                     "chanceCreationPassing", "chanceCreationCrossing",
                     "chanceCreationShooting", "chanceCreationPositioningClass",
                     "defencePressure", "defenceAggression", "defenceTeamWidth",
                     "defenceDefenderLineClass",
                     "year"]
home_team_attrs = team_df[team_feature_cols]
df = df.merge(home_team_attrs,
              how="inner",
              left_on=["home_team_api_id", "year"],
              right_on=["team_api_id", "year"])
print(df.shape)

(18816, 46)


In [210]:
# Matches per season that survived the inner join with the home-team attributes.
print(df["season"].value_counts())

2014/2015    3199
2010/2011    3172
2012/2013    3150
2011/2012    3129
2013/2014    2947
2009/2010    1637
2015/2016    1582
Name: season, dtype: int64


In [211]:
# CHANGE FEATURES TEAM TO HOME_TEAM
# Suffix the freshly merged team attributes with "_home" so that the away-side
# merge can add the same attribute names without clashing.
home_feature_names = ["buildUpPlaySpeed", "buildUpPlayDribbling", "buildUpPlayPassing",
                      "buildUpPlayPositioningClass",
                      "chanceCreationPassing", "chanceCreationCrossing",
                      "chanceCreationShooting", "chanceCreationPositioningClass",
                      "defencePressure", "defenceAggression", "defenceTeamWidth",
                      "defenceDefenderLineClass"]
df = df.rename(columns={name: name + "_home" for name in home_feature_names})

In [212]:
# MERGE SCORE OF AWAY_TEAM
# Same join as for the home side, keyed on the away team id and the match year.
# `team_api_id` now exists on both sides, so pandas suffixes it with _x / _y.
away_feature_cols = ["team_api_id",
                     "buildUpPlaySpeed", "buildUpPlayDribbling", "buildUpPlayPassing",
                     "buildUpPlayPositioningClass",
                     "chanceCreationPassing", "chanceCreationCrossing",
                     "chanceCreationShooting", "chanceCreationPositioningClass",
                     "defencePressure", "defenceAggression", "defenceTeamWidth",
                     "defenceDefenderLineClass",
                     "year"]
away_team_attrs = team_df[away_feature_cols]
df = df.merge(away_team_attrs,
              how="inner",
              left_on=["away_team_api_id", "year"],
              right_on=["team_api_id", "year"])

In [213]:
# CHANGE FEATURES TEAM TO AWAY_TEAM
# The un-suffixed attribute columns at this point come from the away-side merge;
# tag them with "_away".
away_feature_names = ["buildUpPlaySpeed", "buildUpPlayDribbling", "buildUpPlayPassing",
                      "buildUpPlayPositioningClass",
                      "chanceCreationPassing", "chanceCreationCrossing",
                      "chanceCreationShooting", "chanceCreationPositioningClass",
                      "defencePressure", "defenceAggression", "defenceTeamWidth",
                      "defenceDefenderLineClass"]
df = df.rename(columns={name: name + "_away" for name in away_feature_names})

In [214]:
# Mean overall rating across all Player_Attributes rows. It is the fallback value
# for line-up slots that get no rating from the per-player/year join.
rating_avg= player_df["overall_rating"].describe()["mean"]
rating_avg

68.6000152886831

In [215]:
# MERGE AVERAGE PLAYERS SCORING
# For every line-up column (home_player_*, away_player_*), look up that player's
# average rating in the match year and store it as overall_rating_<n>, numbering
# the columns in line-up order. Slots without a rating fall back to the global mean.
left_columns = df.filter(like="_player_").columns
right_column = "player_api_id"

print(df.shape)
for slot, player_col in enumerate(left_columns, start=1):
    rating_col = "overall_rating_" + str(slot)
    merged = df.merge(avg_player_df,
                      how="left",
                      left_on=[player_col, "year"],
                      right_on=[right_column, "year"])
    # The join key from the ratings table is redundant with the line-up column.
    merged = merged.drop(columns=[right_column])
    merged = merged.rename(columns={"overall_rating": rating_col})
    df = merged.fillna({rating_col: rating_avg})
print(df.shape)
print(df.columns)

(18243, 59)
(18243, 81)
Index(['id', 'date', 'season', 'country_id', 'league_id', 'stage',
       'home_team_api_id', 'away_team_api_id', 'home_team_goal',
       'away_team_goal', 'year', 'home_player_1', 'home_player_2',
       'home_player_3', 'home_player_4', 'home_player_5', 'home_player_6',
       'home_player_7', 'home_player_8', 'home_player_9', 'home_player_10',
       'home_player_11', 'away_player_1', 'away_player_2', 'away_player_3',
       'away_player_4', 'away_player_5', 'away_player_6', 'away_player_7',
       'away_player_8', 'away_player_9', 'away_player_10', 'away_player_11',
       'team_api_id_x', 'buildUpPlaySpeed_home', 'buildUpPlayDribbling_home',
       'buildUpPlayPassing_home', 'buildUpPlayPositioningClass_home',
       'chanceCreationPassing_home', 'chanceCreationCrossing_home',
       'chanceCreationShooting_home', 'chanceCreationPositioningClass_home',
       'defencePressure_home', 'defenceAggression_home',
       'defenceTeamWidth_home', 'defenceDefender

In [216]:
# CALCULATE THE TARGET FUNCTION
# FIRST: PREDICTING HOME_WIN, TIE, AWAY_WIN
def result_game(home_goals, away_goals):
    """Classify a match from its final score.

    Parameters
    ----------
    home_goals, away_goals : numbers
        Goals scored by the home and the away side.

    Returns
    -------
    str
        "TIE" when the goal difference is zero, "HOME_WIN" when it is positive,
        otherwise "AWAY_WIN".
    """
    margin = home_goals - away_goals

    if margin == 0:
        return "TIE"
    if margin > 0:
        return "HOME_WIN"
    # Everything that is neither zero nor positive lands here.
    return "AWAY_WIN"

In [217]:
# Label every match with its outcome. Only the two goal columns are needed, so
# walk them in parallel instead of using `df.apply(..., axis=1)`, which builds a
# Series containing every column for each row before calling the function.
df['result'] = [result_game(home_goals, away_goals)
                for home_goals, away_goals in zip(df['home_team_goal'], df['away_team_goal'])]

In [218]:
# Report the number of missing cells, replace every one of them with 0, and
# confirm that nothing is left missing.
missing_before = df.isnull().sum().sum()
print(missing_before)

df = df.fillna(0)

missing_after = df.isnull().sum().sum()
print(missing_after)

31377
0


In [219]:
# REMOVE THE REPEATED FEATURES AND FEATURES WITH HIGHER NUMBER OF MISSING VALUES
print(df.shape)
redundant_cols = ['home_team_goal', 'away_team_goal',   # outcome is already stored in `result`
                  'team_api_id_x', 'team_api_id_y',     # join keys left over from the team merges
                  'buildUpPlayDribbling_home', 'buildUpPlayDribbling_away']
# Raw player ids of both line-ups; their ratings were merged in as overall_rating_*.
player_id_cols = list(df.filter(like="_player_").columns)
df = df.drop(columns=redundant_cols + player_id_cols)
print(df.shape)

(18243, 82)
(18243, 54)


In [220]:
# Inspect the column dtypes; object columns are the ones still needing encoding.
df.dtypes

id                                       int64
date                                    object
season                                  object
country_id                               int64
league_id                                int64
stage                                    int64
home_team_api_id                         int64
away_team_api_id                         int64
year                                     int64
buildUpPlaySpeed_home                    int64
buildUpPlayPassing_home                  int64
buildUpPlayPositioningClass_home        object
chanceCreationPassing_home               int64
chanceCreationCrossing_home              int64
chanceCreationShooting_home              int64
chanceCreationPositioningClass_home     object
defencePressure_home                     int64
defenceAggression_home                   int64
defenceTeamWidth_home                    int64
defenceDefenderLineClass_home           object
buildUpPlaySpeed_away                    int64
buildUpPlayPa

In [221]:
# CONVERT CATEGORICAL VARIABLES TO ONE HOT
# country_id plus the three tactical "class" attributes of each side.
team_class_features = ['buildUpPlayPositioningClass',
                       'chanceCreationPositioningClass',
                       'defenceDefenderLineClass']
one_hot_cols = ["country_id"] + [feature + "_" + side
                                 for side in ("home", "away")
                                 for feature in team_class_features]
df_categorical = pd.get_dummies(df, columns=one_hot_cols)

In [222]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the outcome label into `result_code`. The fitted encoder stays
# available as `lb_make` so the code/label correspondence can be read back.
lb_make = LabelEncoder().fit(df_categorical["result"])
df_categorical["result_code"] = lb_make.transform(df_categorical["result"])

In [223]:
# Lookup table from encoded integer back to the outcome label.
encoded_classes = lb_make.transform(lb_make.classes_)
label_mapping = {code: label for code, label in zip(encoded_classes, lb_make.classes_)}
label_mapping

{0: 'AWAY_WIN', 1: 'HOME_WIN', 2: 'TIE'}

In [224]:
# Preview the first rows of the one-hot encoded frame, including `result_code`.
df_categorical.head()

Unnamed: 0,id,date,season,league_id,stage,home_team_api_id,away_team_api_id,year,buildUpPlaySpeed_home,buildUpPlayPassing_home,...,chanceCreationPositioningClass_home_Organised,defenceDefenderLineClass_home_Cover,defenceDefenderLineClass_home_Offside Trap,buildUpPlayPositioningClass_away_Free Form,buildUpPlayPositioningClass_away_Organised,chanceCreationPositioningClass_away_Free Form,chanceCreationPositioningClass_away_Organised,defenceDefenderLineClass_away_Cover,defenceDefenderLineClass_away_Offside Trap,result_code
0,377,2010-02-03 00:00:00,2009/2010,1,19,8635,8342,2010,50,35,...,1,1,0,0,1,0,1,1,0,1
1,557,2010-11-07 00:00:00,2010/2011,1,14,8635,8342,2010,50,35,...,1,1,0,0,1,0,1,1,0,2
2,736,2010-09-19 00:00:00,2010/2011,1,7,9986,8342,2010,40,50,...,1,1,0,0,1,0,1,1,0,0
3,602,2010-12-10 00:00:00,2010/2011,1,19,8203,8342,2010,65,60,...,1,1,0,0,1,0,1,1,0,0
4,447,2010-02-28 00:00:00,2009/2010,1,28,9997,8342,2010,50,50,...,1,1,0,0,1,0,1,1,0,2


In [225]:
# List every column present after one-hot encoding.
df_categorical.columns

Index(['id', 'date', 'season', 'league_id', 'stage', 'home_team_api_id',
       'away_team_api_id', 'year', 'buildUpPlaySpeed_home',
       'buildUpPlayPassing_home', 'chanceCreationPassing_home',
       'chanceCreationCrossing_home', 'chanceCreationShooting_home',
       'defencePressure_home', 'defenceAggression_home',
       'defenceTeamWidth_home', 'buildUpPlaySpeed_away',
       'buildUpPlayPassing_away', 'chanceCreationPassing_away',
       'chanceCreationCrossing_away', 'chanceCreationShooting_away',
       'defencePressure_away', 'defenceAggression_away',
       'defenceTeamWidth_away', 'overall_rating_1', 'overall_rating_2',
       'overall_rating_3', 'overall_rating_4', 'overall_rating_5',
       'overall_rating_6', 'overall_rating_7', 'overall_rating_8',
       'overall_rating_9', 'overall_rating_10', 'overall_rating_11',
       'overall_rating_12', 'overall_rating_13', 'overall_rating_14',
       'overall_rating_15', 'overall_rating_16', 'overall_rating_17',
       'overall

In [None]:
print(df_categorical.shape)

# `result_code` values as assigned by the LabelEncoder fitted on `result`
# (AWAY_WIN=0, HOME_WIN=1, TIE=2).
AWAY_WIN_CODE, HOME_WIN_CODE, TIE_CODE = 0, 1, 2


def _team_points(matches, team_col, win_code, loss_code):
    """Return one row per match for the side in `team_col`, with the points it earned.

    Parameters
    ----------
    matches : DataFrame with "id", "date", "season", "result_code" and `team_col`.
    team_col : name of the column holding the team id of the side of interest.
    win_code, loss_code : `result_code` values meaning that side won / lost.

    Returns
    -------
    DataFrame with columns id, `team_col`, date, season, result_code and a float
    `score` column: 3 for a win, 1 for a tie, 0 for a loss, NaN for any other code.
    """
    side = matches[["id", team_col, "date", "season", "result_code"]]
    outcome = side["result_code"]
    points = np.select([outcome == win_code, outcome == TIE_CODE, outcome == loss_code],
                       [3, 1, 0],
                       default=np.nan)
    # `assign` returns a new frame; writing the column directly into a
    # column-subset of `matches` would raise SettingWithCopyWarning.
    return side.assign(score=points)


df_home = _team_points(df_categorical, "home_team_api_id", HOME_WIN_CODE, AWAY_WIN_CODE)
df_away = _team_points(df_categorical, "away_team_api_id", AWAY_WIN_CODE, HOME_WIN_CODE)

# Stack both perspectives into one long table (two rows per match, one per team)
# and order it by date. `date` holds "YYYY-MM-DD HH:MM:SS" strings, so the
# lexicographic sort is chronological.
df_concat = pd.concat([df_home.rename(columns={"home_team_api_id": "team_api_id"}),
                       df_away.rename(columns={"away_team_api_id": "team_api_id"})], axis=0)
df_concat = df_concat.sort_values("date", ascending=True)

print(df_concat.shape)

In [227]:
# Lag features: score_m-k holds the points the same team earned k rows earlier
# within the same season (rows are ordered by date). It is NaN when the team has
# fewer than k earlier rows in that season.
score_by_team_season = df_concat.groupby(['team_api_id', 'season'])['score']
for lag in range(1, WIN_SIZE + 1):
    df_concat['score_m-' + str(lag)] = score_by_team_season.shift(lag)

In [228]:
# Preview the per-team table with its lag columns (the earliest rows have NaN lags).
df_concat.head()

Unnamed: 0,id,team_api_id,date,season,result_code,score,score_m-1,score_m-2,score_m-3,score_m-4,score_m-5
13890,19987,9938,2010-01-02 00:00:00,2009/2010,0,0.0,,,,,
13825,19985,9800,2010-01-02 00:00:00,2009/2010,1,3.0,,,,,
15294,21972,8634,2010-01-02 00:00:00,2009/2010,2,1.0,,,,,
13825,19985,8597,2010-01-02 00:00:00,2009/2010,1,0.0,,,,,
15272,21970,9906,2010-01-02 00:00:00,2009/2010,1,3.0,,,,,


In [229]:
# Drop every row containing a NaN - in particular the rows whose lag window is
# not yet complete - and show the first remaining rows of one team as a check.
print(df_concat.shape)
df_concat = df_concat.dropna()
print(df_concat.shape)
sample_team_rows = df_concat[df_concat["team_api_id"] == 9938]
sample_team_rows.head(6)

(36486, 11)
(30106, 11)


Unnamed: 0,id,team_api_id,date,season,result_code,score,score_m-1,score_m-2,score_m-3,score_m-4,score_m-5
13772,20012,9938,2010-01-30 00:00:00,2009/2010,2,1.0,1.0,3.0,3.0,3.0,0.0
13974,20018,9938,2010-02-10 00:00:00,2009/2010,0,0.0,1.0,1.0,3.0,3.0,3.0
13769,20027,9938,2010-02-13 00:00:00,2009/2010,0,3.0,0.0,1.0,1.0,3.0,3.0
13780,20031,9938,2010-02-20 00:00:00,2009/2010,1,0.0,3.0,0.0,1.0,1.0,3.0
13903,20038,9938,2010-02-27 00:00:00,2009/2010,1,3.0,0.0,3.0,0.0,1.0,1.0
13958,20047,9938,2010-03-07 00:00:00,2009/2010,1,3.0,3.0,0.0,3.0,0.0,1.0


In [230]:
print(df_categorical.shape)

# Columns taken from the per-team table: the points of the match itself plus one
# lag column per step of the WIN_SIZE window (derived from WIN_SIZE so that the
# merge follows the window length used to build the lags).
form_cols = ["score"] + ["score_m-" + str(lag) for lag in range(1, WIN_SIZE + 1)]


def _merge_team_form(matches, team_col, prefix):
    """Left-join the form columns of the team in `team_col` onto `matches`.

    Rows are matched on (team id, date). The joined columns are renamed to
    `<prefix>_score`, `<prefix>_score_m-1`, ... and the redundant `team_api_id`
    key from the per-team table is dropped. Matches without a corresponding
    per-team row keep NaN in the joined columns.
    """
    team_form = df_concat[["team_api_id", "date"] + form_cols]
    return (matches.merge(team_form,
                          left_on=[team_col, "date"],
                          right_on=["team_api_id", "date"],
                          how="left")
                   .drop(columns=["team_api_id"])
                   .rename(columns={name: prefix + "_" + name for name in form_cols}))


df_temp = _merge_team_form(df_categorical, "home_team_api_id", "home")
print(df_temp.shape)

df_temp = _merge_team_form(df_temp, "away_team_api_id", "away")
print(df_temp.shape)

(18243, 71)
(18243, 77)
(18243, 83)


In [231]:
# Keep only matches with no missing value in any column (this removes matches
# where either side got no form columns from the left joins), then report what
# is left per season.
df_temp = df_temp.dropna()
print(df_temp.shape)
season_counts = df_temp["season"].value_counts()
print(season_counts)

(14838, 83)
2010/2011    2595
2014/2015    2566
2011/2012    2544
2012/2013    2541
2013/2014    2407
2009/2010    1154
2015/2016    1031
Name: season, dtype: int64


In [232]:
# Write the engineered dataset to CSV in the working directory, without the index column.
df_temp.to_csv("soccer_dataset.csv", index=False)

In [233]:
# Season-based hold-out: seasons 2014/2015 and 2015/2016 form the test set,
# every other season is used for training.
test_seasons = ["2014/2015", "2015/2016"]
in_test = df_temp["season"].isin(test_seasons)
df_train = df_temp[~in_test]
df_test = df_temp[in_test]

n_train = df_train.shape[0]
n_test = df_test.shape[0]
print(df_train.shape)
print(df_test.shape)
print(n_train + n_test)
# Share of the data held out for testing, in percent.
print(100 * n_test / (n_test + n_train))

(11241, 83)
(3597, 83)
14838
24.24181156490093


In [234]:
# List the columns of the final frame before choosing which ones become features.
df_temp.columns

Index(['id', 'date', 'season', 'league_id', 'stage', 'home_team_api_id',
       'away_team_api_id', 'year', 'buildUpPlaySpeed_home',
       'buildUpPlayPassing_home', 'chanceCreationPassing_home',
       'chanceCreationCrossing_home', 'chanceCreationShooting_home',
       'defencePressure_home', 'defenceAggression_home',
       'defenceTeamWidth_home', 'buildUpPlaySpeed_away',
       'buildUpPlayPassing_away', 'chanceCreationPassing_away',
       'chanceCreationCrossing_away', 'chanceCreationShooting_away',
       'defencePressure_away', 'defenceAggression_away',
       'defenceTeamWidth_away', 'overall_rating_1', 'overall_rating_2',
       'overall_rating_3', 'overall_rating_4', 'overall_rating_5',
       'overall_rating_6', 'overall_rating_7', 'overall_rating_8',
       'overall_rating_9', 'overall_rating_10', 'overall_rating_11',
       'overall_rating_12', 'overall_rating_13', 'overall_rating_14',
       'overall_rating_15', 'overall_rating_16', 'overall_rating_17',
       'overall

In [235]:
# CREATE FEATURES AND TARGET

# Columns that must not be used as model inputs:
#  - result / result_code are the target,
#  - home_score / away_score are the points of the match being predicted,
#  - the rest are identifiers and raw date/season fields.
non_feature_cols = ["date", "season", 'result_code', 'result',
                    "league_id", "id", "home_score", "away_score",
                    "home_team_api_id", "away_team_api_id"]

X_train = df_train.drop(columns=non_feature_cols)
y_train = df_train['result_code']

X_test = df_test.drop(columns=non_feature_cols)
y_test = df_test['result_code']

print(X_train.shape)
print(X_test.shape)

(11241, 73)
(3597, 73)


In [236]:
# from sklearn.model_selection import train_test_split
# # CREATE FEATURES AND TARGET

# X = df_categorical.drop(['result_code','result', "league_id", "id", 
#                          "home_team_api_id", "away_team_api_id"], axis=1)
# y = df_categorical['result_code']

# # Split train and test

# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)
# print(X_train.shape)
# print(X_test.shape)
# y

In [237]:
# NAIVE BAYES
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

# PCA followed by Gaussian Naive Bayes; the number of retained components is the
# only hyper-parameter searched.
pipe = Pipeline(steps=[('pca', PCA()), ('nbc', GaussianNB())])

# Candidate component counts: from 5 up to (but excluding) the feature count,
# in steps of about one twentieth of the feature count.
feature_len = X_train.shape[1]
component_step = np.around(feature_len / 20)
parameters_GNB = {
    'pca__n_components': np.arange(5, feature_len, component_step).astype("int"),
}

# Five stratified random splits of the training data, scored by accuracy.
cv_splitter = StratifiedShuffleSplit(n_splits=5, random_state=42)
grid = GridSearchCV(estimator=pipe,
                    param_grid=parameters_GNB,
                    cv=cv_splitter,
                    scoring='accuracy',
                    verbose=10)
grid.fit(X_train, y_train)
print(grid.best_params_)

Fitting 5 folds for each of 17 candidates, totalling 85 fits
[CV] pca__n_components=5 .............................................
[CV] ................. pca__n_components=5, score=0.504, total=   0.1s
[CV] pca__n_components=5 .............................................
[CV] ................. pca__n_components=5, score=0.508, total=   0.1s
[CV] pca__n_components=5 .............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV] ................. pca__n_components=5, score=0.515, total=   0.1s
[CV] pca__n_components=5 .............................................
[CV] ................. pca__n_components=5, score=0.511, total=   0.1s
[CV] pca__n_components=5 .............................................
[CV] ................. pca__n_components=5, score=0.493, total=   0.1s
[CV] pca__n_components=9 .............................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.4s remaining:    0.0s


[CV] ................. pca__n_components=9, score=0.537, total=   0.1s
[CV] pca__n_components=9 .............................................
[CV] ................. pca__n_components=9, score=0.530, total=   0.1s
[CV] pca__n_components=9 .............................................
[CV] ................. pca__n_components=9, score=0.535, total=   0.1s
[CV] pca__n_components=9 .............................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.6s remaining:    0.0s


[CV] ................. pca__n_components=9, score=0.545, total=   0.1s
[CV] pca__n_components=9 .............................................
[CV] ................. pca__n_components=9, score=0.542, total=   0.1s
[CV] pca__n_components=13 ............................................
[CV] ................ pca__n_components=13, score=0.540, total=   0.1s
[CV] pca__n_components=13 ............................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.7s remaining:    0.0s


[CV] ................ pca__n_components=13, score=0.533, total=   0.1s
[CV] pca__n_components=13 ............................................
[CV] ................ pca__n_components=13, score=0.532, total=   0.1s
[CV] pca__n_components=13 ............................................
[CV] ................ pca__n_components=13, score=0.542, total=   0.1s
[CV] pca__n_components=13 ............................................
[CV] ................ pca__n_components=13, score=0.535, total=   0.1s
[CV] pca__n_components=17 ............................................
[CV] ................ pca__n_components=17, score=0.542, total=   0.1s
[CV] pca__n_components=17 ............................................
[CV] ................ pca__n_components=17, score=0.535, total=   0.1s
[CV] pca__n_components=17 ............................................
[CV] ................ pca__n_components=17, score=0.533, total=   0.1s
[CV] pca__n_components=17 ............................................
[CV] .

[Parallel(n_jobs=1)]: Done  85 out of  85 | elapsed:    9.6s finished


In [238]:
# Predict the held-out seasons with the fitted grid search and report
# per-class precision / recall / F1 (classes are the encoded result codes).
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions)) 

              precision    recall  f1-score   support

           0       0.50      0.47      0.49      1067
           1       0.53      0.85      0.66      1625
           2       0.40      0.00      0.00       905

    accuracy                           0.52      3597
   macro avg       0.48      0.44      0.38      3597
weighted avg       0.49      0.52      0.44      3597



In [239]:
# Persist the best pipeline selected by the grid search as a pickle file.
with open("gaussian_nb_pca_model.pkl", "wb") as model_file:
    pkl.dump(grid.best_estimator_, model_file)

In [240]:
# # RANDOM FOREST
# from sklearn.ensemble import RandomForestClassifier

# rfc=RandomForestClassifier(random_state=42)

# param_grid = { 
#     'n_estimators': [100, 200, 500],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'max_depth' : [6,8,10],
#     'criterion' :['gini', 'entropy']
# }

# grid = GridSearchCV(estimator=rfc, param_grid=param_grid, 
#                     cv=StratifiedShuffleSplit(n_splits=5, random_state=42), 
#                     scoring='accuracy', verbose=10)
# grid.fit(X_train, y_train)
# print(grid.best_params_)

In [241]:
# # SVM 
# from sklearn.svm import SVC
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler

# svm_pipe= Pipeline([("norm", StandardScaler()),
#                     ("svm", SVC())])

# param_grid = {'svm__C': [0.1,1, 10], 'svm__gamma': ["scale", "auto"],
#               'svm__kernel': ['rbf', 'poly', 'sigmoid']}
# grid = GridSearchCV(svm_pipe, param_grid, refit=True, verbose=10, scoring='accuracy')
# grid.fit(X_train,y_train)
# print(grid.best_params_)

In [242]:
# grid_predictions = grid.predict(X_test)
# print(classification_report(y_test, grid_predictions)) 

In [243]:
# # CHECK IF FEATURE_IMPORTANCE RETURNS FEATURES IN THE CORRECT ORDER
# features_df= pd.DataFrame({"feature_name": df_categorical.drop(['result_code','result', "league_id", "id", 
#                                                                 "home_team_api_id", "away_team_api_id"], axis=1)
#                                                          .columns.tolist(), 
#                            "feature_importance": grid.best_estimator_.feature_importances_})
# features_df.sort_values("feature_importance", ascending=False)[:50]