In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Root folder of the challenge data; every path below is derived from it
base_path = './data'

# Training statistics: one CSV per (player | team) x (away | home) combination
path_player_away_train = base_path + '/Train_Data/train_away_player_statistics_df.csv'
path_player_home_train = base_path + '/Train_Data/train_home_player_statistics_df.csv'
path_team_away_train = base_path + '/Train_Data/train_away_team_statistics_df.csv'
path_team_home_train = base_path + '/Train_Data/train_home_team_statistics_df.csv'

# Training targets and the supplementary target file
path_target = base_path + '/Y_train_1rknArQ.csv'
path_target_sup = base_path + '/benchmark_and_extras/Y_train_supp.csv'

In [3]:
# Load the training targets from the CSV configured in `path_target`
train_target = pd.read_csv(filepath_or_buffer=path_target, sep=',')

In [4]:
# Collapse each one-hot outcome row into a single class label: 1, 2 or 3 for the
# first, second or third outcome column.
# The first column of `train_target` is excluded from the lookup because it is not
# an outcome flag (presumably the match ID - confirm). Searching it as well made
# any row whose first-column value equals 1 receive the spurious label 0.
# The `+ 1` keeps the 1-based labels that the later `pred - 1` conversion expects.
encoded_target = train_target.iloc[:, 1:].apply(
    lambda x: np.where(x.to_numpy() == 1)[0][0] + 1, axis=1
)

In [5]:
# Team-level training table holding both HOME_* and AWAY_* statistic columns
train_team = pd.read_csv(filepath_or_buffer='train_team.csv', sep=',')
# Preview the first five rows
train_team.head(5)

Unnamed: 0,ID,HOME_LEAGUE,HOME_TEAM_NAME,HOME_TEAM_SHOTS_TOTAL_season_sum,HOME_TEAM_SHOTS_INSIDEBOX_season_sum,HOME_TEAM_SHOTS_OFF_TARGET_season_sum,HOME_TEAM_SHOTS_ON_TARGET_season_sum,HOME_TEAM_SHOTS_OUTSIDEBOX_season_sum,HOME_TEAM_PASSES_season_sum,HOME_TEAM_SUCCESSFUL_PASSES_season_sum,...,AWAY_TEAM_YELLOWCARDS_5_last_match_std,AWAY_TEAM_REDCARDS_5_last_match_std,AWAY_TEAM_OFFSIDES_5_last_match_std,AWAY_TEAM_ATTACKS_5_last_match_std,AWAY_TEAM_PENALTIES_5_last_match_std,AWAY_TEAM_SUBSTITUTIONS_5_last_match_std,AWAY_TEAM_BALL_SAFE_5_last_match_std,AWAY_TEAM_DANGEROUS_ATTACKS_5_last_match_std,AWAY_TEAM_INJURIES_5_last_match_std,AWAY_TEAM_GOALS_5_last_match_std
0,0,Ligue 1,Toulouse,3.0,2.0,5.0,2.0,1.0,2.0,2.0,...,5.0,5.0,4.0,0.0,6.0,8.0,4.0,3.0,2.0,3.0
1,1,Ligue 2,Brest,6.0,8.0,3.0,6.0,5.0,8.0,7.0,...,0.0,0.0,3.0,1.0,8.0,4.0,10.0,0.0,5.0,3.0
2,2,Serie A,Sampdoria,4.0,2.0,5.0,2.0,8.0,1.0,1.0,...,6.0,10.0,4.0,4.0,0.0,8.0,3.0,0.0,9.0,6.0
3,3,League One,Coventry City,7.0,5.0,5.0,6.0,6.0,9.0,9.0,...,0.0,0.0,1.0,2.0,0.0,5.0,6.0,3.0,,2.0
4,4,Premier League,Wolverhampton Wanderers,3.0,3.0,2.0,3.0,4.0,4.0,3.0,...,1.0,0.0,4.0,4.0,9.0,4.0,1.0,4.0,6.0,5.0


In [6]:
# Hold out 20% of the rows for evaluation. The ID and the league / team-name
# columns are removed so they are not passed to the model as features.
X_train, X_test, y_train, y_test = train_test_split(
    train_team.drop(
        ['ID', 'HOME_LEAGUE', 'HOME_TEAM_NAME', 'AWAY_LEAGUE', 'AWAY_TEAM_NAME'],
        axis=1,
    ),
    encoded_target,
    random_state=42,
    test_size=0.2,
)

In [7]:
# Report how many rows ended up in the training and hold-out partitions
print(
    f"Nombre de données d'entrainement: {X_train.shape[0]}",
    f"Nombre de données de test: {X_test.shape[0]}",
    sep="\n",
)

Nombre de données d'entrainement: 9842
Nombre de données de test: 2461


In [8]:
# Hyper-parameter grid: 3 x 3 x 2 x 2 = 36 candidate settings
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Seeded forest so that repeated runs give the same trees
rf_clf = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid, scored by accuracy with 5-fold CV on all cores
grid_search = GridSearchCV(rf_clf, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Keep the winning configuration and the estimator refit with it
best_params = grid_search.best_params_
best_rf_clf = grid_search.best_estimator_
print(f"Best Parameters: {best_params}")

# Predictions of the selected model on the hold-out split
y_pred = best_rf_clf.predict(X_test)



Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}


In [9]:
# Accuracy of the selected model on the hold-out split
grid_search.score(X=X_test, y=y_test)

0.497765136123527

Score di 0.497765 (sul sito del data challenge: 0.485, "RF_B"), parametri: Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}

In [10]:
# Locations of the test-set team statistics (one file per side)
path_team_home_test = f'{base_path}/Test_Data/test_home_team_statistics_df.csv'
path_team_away_test = f'{base_path}/Test_Data/test_away_team_statistics_df.csv'

# Read each side and prefix its column names so both fit in a single frame
test_team_home = pd.read_csv(path_team_home_test, sep=',').add_prefix('HOME_')
test_team_away = pd.read_csv(path_team_away_test, sep=',').add_prefix('AWAY_')

# Put both sides next to each other. The away frame's first column is left out
# (presumably its ID, which would duplicate HOME_ID - confirm); HOME_ID becomes ID.
test_team = pd.concat(
    [test_team_home, test_team_away.iloc[:, 1:]], axis=1, join='inner'
).rename(columns={'HOME_ID': 'ID'})

In [11]:
# Predict a class label for every test row; the ID column is not a feature
predictions = grid_search.predict(test_team.drop('ID', axis=1))

In [12]:
# Show which distinct class labels appear among the predictions
print({label for label in predictions})

{1, 2, 3}


In [13]:
# Turn every predicted class label (1, 2 or 3) into a one-hot row of length 3,
# with the 1 placed at position `label - 1`.
p = []
for pred in predictions:
    # Reject labels outside 1..3 explicitly: a label of 0 would give index -1,
    # which NumPy interprets as "last element" and would silently flag slot 3.
    if pred not in (1, 2, 3):
        raise ValueError(f"Unexpected class label {pred!r}; expected 1, 2 or 3")
    _p = np.zeros(3, dtype=np.int32)
    _p[pred - 1] = 1
    p.append(_p)


In [14]:
# Wrap the one-hot rows in a frame with one column per match outcome
pred_sub = pd.DataFrame(
    p,
    columns=['HOME_WINS', 'DRAW', 'AWAY_WINS'],
)

In [15]:
# Attach the test IDs to the one-hot predictions.
# The three outcome columns are selected explicitly so the cell is idempotent:
# re-running it no longer stacks another ID column onto an already-merged frame.
pred_sub = pd.concat(
    [test_team['ID'], pred_sub[['HOME_WINS', 'DRAW', 'AWAY_WINS']]],
    join='inner',
    axis=1,
)

In [16]:
# Write the submission file without the pandas row index
pred_sub.to_csv(path_or_buf='./submission_RF.csv', index=False)

Variant with no Bootstrap

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Hyper-parameter grid: 3 x 3 x 2 x 2 = 36 candidate settings
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Seeded forest with bootstrap sampling switched off
rf_clf = RandomForestClassifier(bootstrap=False, random_state=42)

# Exhaustive search over the grid, scored by accuracy with 5-fold CV on all cores
grid_search = GridSearchCV(rf_clf, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Keep the winning configuration and the estimator refit with it
best_params = grid_search.best_params_
best_rf_clf = grid_search.best_estimator_
print(f"Best Parameters: {best_params}")

# Predictions of the selected model on the hold-out split
y_pred = best_rf_clf.predict(X_test)





Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150}


In [18]:
# Accuracy of the selected model on the hold-out split
grid_search.score(X=X_test, y=y_test)

0.4965461194636327

In [19]:
# Locations of the test-set team statistics (one file per side)
path_team_home_test = f'{base_path}/Test_Data/test_home_team_statistics_df.csv'
path_team_away_test = f'{base_path}/Test_Data/test_away_team_statistics_df.csv'

# Read each side and prefix its column names so both fit in a single frame
test_team_home = pd.read_csv(path_team_home_test, sep=',').add_prefix('HOME_')
test_team_away = pd.read_csv(path_team_away_test, sep=',').add_prefix('AWAY_')

# Put both sides next to each other. The away frame's first column is left out
# (presumably its ID, which would duplicate HOME_ID - confirm); HOME_ID becomes ID.
test_team = pd.concat(
    [test_team_home, test_team_away.iloc[:, 1:]], axis=1, join='inner'
).rename(columns={'HOME_ID': 'ID'})

In [20]:
# Predict a class label for every test row; the ID column is not a feature
predictions = grid_search.predict(test_team.drop('ID', axis=1))

In [21]:
# Turn every predicted class label (1, 2 or 3) into a one-hot row of length 3,
# with the 1 placed at position `label - 1`.
p = []
for pred in predictions:
    # Reject labels outside 1..3 explicitly: a label of 0 would give index -1,
    # which NumPy interprets as "last element" and would silently flag slot 3.
    if pred not in (1, 2, 3):
        raise ValueError(f"Unexpected class label {pred!r}; expected 1, 2 or 3")
    _p = np.zeros(3, dtype=np.int32)
    _p[pred - 1] = 1
    p.append(_p)


In [22]:
# Wrap the one-hot rows in a frame with one column per match outcome
pred_sub = pd.DataFrame(
    p,
    columns=['HOME_WINS', 'DRAW', 'AWAY_WINS'],
)

In [23]:
# Attach the test IDs to the one-hot predictions.
# The three outcome columns are selected explicitly so the cell is idempotent:
# re-running it no longer stacks another ID column onto an already-merged frame.
pred_sub = pd.concat(
    [test_team['ID'], pred_sub[['HOME_WINS', 'DRAW', 'AWAY_WINS']]],
    join='inner',
    axis=1,
)

In [24]:
# Write the submission file without the pandas row index
pred_sub.to_csv(path_or_buf='./submission_RF_NO_Bootstrap.csv', index=False)

Variant with nested cross-validation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score

# Inner loop: hyper-parameter grid searched with 5-fold CV, scored by accuracy
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

rf_clf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, 
                           cv=5, n_jobs=-1, scoring='accuracy')

# Outer loop: 10 stratified, shuffled folds; the whole grid search is re-run
# inside each fold, so the outer score is not tuned on the data it is measured on
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

nested_scores = cross_val_score(grid_search, X_train, y_train, 
                                 cv=outer_cv, scoring='accuracy', n_jobs=-1)

# Report the nested estimate: it is the only result this variant adds and was
# previously computed but never displayed
print(f"Nested CV accuracy: {nested_scores.mean():.4f} +/- {nested_scores.std():.4f} "
      f"({len(nested_scores)} outer folds)")

# cross_val_score works on clones of `grid_search`, so the search object itself
# still has to be fitted before it can be used for scoring and prediction
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Predictions of the selected model on the hold-out split
y_pred = best_model.predict(X_test)



In [25]:
# Accuracy of the selected model on the hold-out split
grid_search.score(X=X_test, y=y_test)

0.497765136123527

In [26]:
# Locations of the test-set team statistics (one file per side)
path_team_home_test = f'{base_path}/Test_Data/test_home_team_statistics_df.csv'
path_team_away_test = f'{base_path}/Test_Data/test_away_team_statistics_df.csv'

# Read each side and prefix its column names so both fit in a single frame
test_team_home = pd.read_csv(path_team_home_test, sep=',').add_prefix('HOME_')
test_team_away = pd.read_csv(path_team_away_test, sep=',').add_prefix('AWAY_')

# Put both sides next to each other. The away frame's first column is left out
# (presumably its ID, which would duplicate HOME_ID - confirm); HOME_ID becomes ID.
test_team = pd.concat(
    [test_team_home, test_team_away.iloc[:, 1:]], axis=1, join='inner'
).rename(columns={'HOME_ID': 'ID'})

In [27]:
# Predict a class label for every test row; the ID column is not a feature
predictions = grid_search.predict(test_team.drop('ID', axis=1))

In [28]:
# Turn every predicted class label (1, 2 or 3) into a one-hot row of length 3,
# with the 1 placed at position `label - 1`.
p = []
for pred in predictions:
    # Reject labels outside 1..3 explicitly: a label of 0 would give index -1,
    # which NumPy interprets as "last element" and would silently flag slot 3.
    if pred not in (1, 2, 3):
        raise ValueError(f"Unexpected class label {pred!r}; expected 1, 2 or 3")
    _p = np.zeros(3, dtype=np.int32)
    _p[pred - 1] = 1
    p.append(_p)

In [29]:
# Wrap the one-hot rows in a frame with one column per match outcome
pred_sub = pd.DataFrame(
    p,
    columns=['HOME_WINS', 'DRAW', 'AWAY_WINS'],
)

In [30]:
# Attach the test IDs to the one-hot predictions.
# The three outcome columns are selected explicitly so the cell is idempotent:
# re-running it no longer stacks another ID column onto an already-merged frame.
pred_sub = pd.concat(
    [test_team['ID'], pred_sub[['HOME_WINS', 'DRAW', 'AWAY_WINS']]],
    join='inner',
    axis=1,
)

In [31]:
# Write the submission file without the pandas row index
pred_sub.to_csv(path_or_buf='./submission_RF_NestedCrossoVal.csv', index=False)