# Analyse de game professionelles de League of Legends

In [1]:
import pandas as pd 

dataset = pd.read_csv("data/lol_pre_match_dataset.csv").dropna()
print(f"Il y a {len(dataset)} parties prêtes à l'analyse.")
mapped_dataset = dataset.copy()

Il y a 8971 parties prêtes à l'analyse.


In [2]:
print(f"Pour chaque ligne, voici les informaions que l'on a :\n{[col for col in dataset.columns]}")
# 0 = Blue win, 1 = Red win

Pour chaque ligne, voici les informaions que l'on a :
['gameid', 'team_blue', 'team_red', 'league', 'patch', 'pick_blue_1', 'pick_blue_2', 'pick_blue_3', 'pick_blue_4', 'pick_blue_5', 'pick_red_1', 'pick_red_2', 'pick_red_3', 'pick_red_4', 'pick_red_5', 'ban_blue_1', 'ban_blue_2', 'ban_blue_3', 'ban_blue_4', 'ban_blue_5', 'ban_red_1', 'ban_red_2', 'ban_red_3', 'ban_red_4', 'ban_red_5', 'result']


In [3]:
# Liste de tous les champions_set qui apparaissent dans le dataset
# Liste de tous les champions qui apparaissent dans le dataset
champions_set = set()
for col in [
    "pick_blue_1", "pick_blue_2", "pick_blue_3", "pick_blue_4", "pick_blue_5",
    "pick_red_1", "pick_red_2", "pick_red_3", "pick_red_4", "pick_red_5",
    "ban_blue_1", "ban_blue_2", "ban_blue_3", "ban_blue_4", "ban_blue_5",
    "ban_red_1", "ban_red_2", "ban_red_3", "ban_red_4", "ban_red_5"
]:
    for champ in dataset[col]:
        champ = str(champ).strip()
        if champ:
            champions_set.add(champ)

print(f"Nous allons mapper aléatoirement le nom des {len(champions_set)} champions utilisés dans le dataset.")
champions_dict = {champ: id for id, champ in enumerate(champions_set)}
del champions_set

# Remplacement des noms par les IDs dans chaque colonne pick/ban
for col in [
    "pick_blue_1", "pick_blue_2", "pick_blue_3", "pick_blue_4", "pick_blue_5",
    "pick_red_1", "pick_red_2", "pick_red_3", "pick_red_4", "pick_red_5",
    "ban_blue_1", "ban_blue_2", "ban_blue_3", "ban_blue_4", "ban_blue_5",
    "ban_red_1", "ban_red_2", "ban_red_3", "ban_red_4", "ban_red_5"
]:
    mapped_dataset[col] = dataset[col].apply(lambda x: champions_dict.get(str(x).strip(), -1))

# print(f"Voici un exemple de mapping : {list(champions_dict.items())[:5]}")
# print(f"\nVoici un exemple de ligne du dataset mappé :\n{mapped_dataset.iloc[0]}")


Nous allons mapper aléatoirement le nom des 169 champions utilisés dans le dataset.


In [4]:
teams_set = set()
for col in ["team_blue", "team_red"]:
    for cell in dataset[col]:
        if isinstance(cell, str):
            teams_set.add(cell.strip())
teams_dict = {}
for id, team in enumerate(teams_set):
    teams_dict[team] = id
del teams_set
def change_teams_ids(teams):
    return teams_dict.get(teams, -1)
for col in ["team_blue", "team_red"]:
    mapped_dataset[col] = dataset[col].apply(change_teams_ids)
# print(f"Voici un exemple de mapping : {list(teams_dict.items())[:3]}")
# print(f"\nVoici un exemple de ligne du dataset mappé :\n{mapped_dataset.iloc[0]}")

In [5]:
league_set = set()
for cell in dataset["league"]:
    if isinstance(cell, str):
        league_set.add(cell.strip())
league_dict = {}
for id, league in enumerate(league_set):
    league_dict[league] = id
del league_set
mapped_dataset["league"] = dataset["league"].apply(lambda x: league_dict.get(str(x).strip(), -1))
# print(f"\nVoici un exemple de ligne du dataset mappé :\n{mapped_dataset.iloc[0]}")


In [6]:
assert mapped_dataset.isnull().sum().sum() == 0, "Il y a encore des valeurs nulles dans le dataset !"   
assert all([pd.api.types.is_numeric_dtype(mapped_dataset[col]) for col in mapped_dataset.columns if col != "gameid"]), "Le dataset contient des colonnes non numériques !"
assert all(mapped_dataset["result"].isin([0, 1])), "La colonne 'result' contient des valeurs autres que 0 et 1 !"
print("Le dataset mappé est prêt pour l'analyse !")

Le dataset mappé est prêt pour l'analyse !


In [7]:
X = mapped_dataset.drop(columns=["gameid", "result"])
y = mapped_dataset["result"]

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Séparation train/test (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y, shuffle=True)

# Exemple d'entraînement et d'évaluation avec un modèle
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(model.get_params())
# Métriques de base
print("Accuracy:", accuracy_score(y_test, y_pred)) # (TP + TN) / (TP + TN + FP + FN)
print("Precision:", precision_score(y_test, y_pred)) # (TP) / (TP + FP)
print("Recall:", recall_score(y_test, y_pred)) # (TP) / (TP + FN)
print("F1-score:", f1_score(y_test, y_pred)) # 2 * (Precision * Recall) / (Precision + Recall)
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))
print("Ratio de victoires dans le test set:", y_test.mean())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
Accuracy: 0.5587743732590529
Precision: 0.5474956822107081
Recall: 0.3742621015348288
F1-score: 0.44460028050490885
Confusion matrix:
 [[686 262]
 [530 317]]
Classification report:
               precision    recall  f1-score   support

           0       0.56      0.72      0.63       948
           1       0.55      0.37      0.44       847

    accuracy                           0.56      1795
   macro avg       0.56      0.55      0.54      1795
weighted avg       0.56      0.56      0.54      1795

Ratio de victoires dans le test set: 0.471866295264624


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Logistic Regression (baseline rapide)
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)
print("LogisticRegression accuracy:", logreg.score(X_test, y_test))

# Gradient Boosting
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
print("GradientBoostingClassifier accuracy:", gb.score(X_test, y_test))

# HistGradientBoosting (rapide sur gros jeux de données)
hgb = HistGradientBoostingClassifier(random_state=42)
hgb.fit(X_train, y_train)
print("HistGradientBoostingClassifier accuracy:", hgb.score(X_test, y_test))

# XGBoost (nécessite 'pip install xgboost')
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)
print("XGBClassifier accuracy:", xgb.score(X_test, y_test))

# SVM (attention, peut être lent si beaucoup de données)
svc = SVC(random_state=42)
svc.fit(X_train, y_train)
print("SVC accuracy:", svc.score(X_test, y_test))

# KNN (simple, mais rarement le meilleur sur ce type de données)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print("KNeighborsClassifier accuracy:", knn.score(X_test, y_test))

LogisticRegression accuracy: 0.5247910863509749


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


GradientBoostingClassifier accuracy: 0.5793871866295265
HistGradientBoostingClassifier accuracy: 0.5604456824512535
XGBClassifier accuracy: 0.5621169916434541
SVC accuracy: 0.5403899721448467
KNeighborsClassifier accuracy: 0.5125348189415042


In [10]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=XGBClassifier(eval_metric='logloss', random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("Meilleurs paramètres :", grid_search.best_params_)
print("Meilleur score de validation croisée :", grid_search.best_score_)

# Évaluation sur le test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))

Fitting 3 folds for each of 72 candidates, totalling 216 fits


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a rece

Meilleurs paramètres : {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100, 'subsample': 0.8}
Meilleur score de validation croisée : 0.5546265328874025
Accuracy: 0.5376044568245125
Precision: 0.5112881806108898
Recall: 0.45454545454545453
F1-score: 0.48125
Confusion matrix:
 [[580 368]
 [462 385]]
Classification report:
               precision    recall  f1-score   support

           0       0.56      0.61      0.58       948
           1       0.51      0.45      0.48       847

    accuracy                           0.54      1795
   macro avg       0.53      0.53      0.53      1795
weighted avg       0.54      0.54      0.53      1795

