# Analyse de parties professionnelles de League of Legends

In [13]:
import pandas as pd 

# Read the pre-match CSV, then keep only the rows that have no missing value at all.
dataset = pd.read_csv("data/lol_pre_match_dataset.csv")
dataset = dataset.dropna(axis=0, how="any")
print(f"Il y a {len(dataset)} parties prêtes à l'analyse.")
# Independent copy; its columns are replaced by integer ids in the following cells.
mapped_dataset = dataset.copy(deep=True)

Il y a 8971 parties prêtes à l'analyse.


In [14]:
# Show which columns are available for every game.
# Outcome encoding: 0 = Blue win, 1 = Red win
column_names = list(dataset.columns)
print(f"Pour chaque ligne, voici les informaions que l'on a :\n{column_names}")

Pour chaque ligne, voici les informaions que l'on a :
['gameid', 'team_blue', 'team_red', 'league', 'patch', 'pick_blue_1', 'pick_blue_2', 'pick_blue_3', 'pick_blue_4', 'pick_blue_5', 'pick_red_1', 'pick_red_2', 'pick_red_3', 'pick_red_4', 'pick_red_5', 'ban_blue_1', 'ban_blue_2', 'ban_blue_3', 'ban_blue_4', 'ban_blue_5', 'ban_red_1', 'ban_red_2', 'ban_red_3', 'ban_red_4', 'ban_red_5', 'result']


In [15]:
# Every pick/ban column of the draft, in the order
# pick_blue_1..5, pick_red_1..5, ban_blue_1..5, ban_red_1..5.
DRAFT_COLUMNS = [
    f"{phase}_{side}_{slot}"
    for phase in ("pick", "ban")
    for side in ("blue", "red")
    for slot in range(1, 6)
]

# Set of all champion names (whitespace-trimmed) that appear in the dataset.
champions_set = set()
for col in DRAFT_COLUMNS:
    cleaned_names = (str(champ).strip() for champ in dataset[col])
    champions_set.update(name for name in cleaned_names if name)

print(f"Nous allons mapper aléatoirement le nom des {len(champions_set)} champions utilisés dans le dataset.")

# The ids carry no meaning, but they must be reproducible: iterating a set of
# strings yields a different order on every kernel start (hash randomization),
# which would change the encoded features and therefore the model scores.
# Sorting the names first makes the mapping stable across runs.
champions_dict = {champ: champ_id for champ_id, champ in enumerate(sorted(champions_set))}
del champions_set

# Replace the champion names by their ids in every pick/ban column
# (-1 for a name that is not in the mapping).
for col in DRAFT_COLUMNS:
    mapped_dataset[col] = dataset[col].apply(lambda x: champions_dict.get(str(x).strip(), -1))

# print(f"Voici un exemple de mapping : {list(champions_dict.items())[:5]}")
# print(f"\nVoici un exemple de ligne du dataset mappé :\n{mapped_dataset.iloc[0]}")


Nous allons mapper aléatoirement le nom des 169 champions utilisés dans le dataset.


In [16]:
# Set of all team names (whitespace-trimmed) seen on either side.
teams_set = set()
for col in ["team_blue", "team_red"]:
    teams_set.update(cell.strip() for cell in dataset[col] if isinstance(cell, str))

# Sorted so that the ids are identical on every kernel start (set iteration
# order of strings is not stable across processes). A comprehension is used
# so that no module-level loop variable shadows the builtin `id`.
teams_dict = {team: team_id for team_id, team in enumerate(sorted(teams_set))}
del teams_set


def change_teams_ids(teams):
    """Return the integer id of a team name, or -1 when it is unknown.

    The name is stripped before the lookup because the keys of `teams_dict`
    were stored stripped; without this, a name with surrounding whitespace
    would silently be encoded as -1.

    Args:
        teams: A single cell of a team column (normally a string).

    Returns:
        The id stored in `teams_dict`, or -1 if the value is not a known team.
    """
    if isinstance(teams, str):
        teams = teams.strip()
    return teams_dict.get(teams, -1)


for col in ["team_blue", "team_red"]:
    mapped_dataset[col] = dataset[col].apply(change_teams_ids)
# print(f"Voici un exemple de mapping : {list(teams_dict.items())[:3]}")
# print(f"\nVoici un exemple de ligne du dataset mappé :\n{mapped_dataset.iloc[0]}")

In [17]:
# Set of all league names (whitespace-trimmed) present in the dataset.
league_set = {cell.strip() for cell in dataset["league"] if isinstance(cell, str)}

# Sorted so that the ids are identical on every kernel start (set iteration
# order of strings is not stable across processes). A comprehension is used
# so that no module-level loop variable shadows the builtin `id`.
league_dict = {league_name: league_id for league_id, league_name in enumerate(sorted(league_set))}
del league_set

# Replace each league name by its id (-1 for a name that is not in the mapping).
mapped_dataset["league"] = dataset["league"].apply(lambda x: league_dict.get(str(x).strip(), -1))
# print(f"\nVoici un exemple de ligne du dataset mappé :\n{mapped_dataset.iloc[0]}")


In [18]:
# Sanity checks on the encoded frame before any modelling.

# 1. No missing value anywhere.
assert not mapped_dataset.isnull().values.any(), "Il y a encore des valeurs nulles dans le dataset !"

# 2. Every column except the game identifier has a numeric dtype.
checked_columns = [name for name in mapped_dataset.columns if name != "gameid"]
assert all(
    pd.api.types.is_numeric_dtype(mapped_dataset[name]) for name in checked_columns
), "Le dataset contient des colonnes non numériques !"

# 3. The target only takes the values 0 and 1.
assert mapped_dataset["result"].isin([0, 1]).all(), "La colonne 'result' contient des valeurs autres que 0 et 1 !"

print("Le dataset mappé est prêt pour l'analyse !")

Le dataset mappé est prêt pour l'analyse !


In [19]:
# Feature matrix: every encoded column except the game identifier and the target.
X = mapped_dataset.drop(["gameid", "result"], axis=1)
# Target vector: the "result" column.
y = mapped_dataset.loc[:, "result"]

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Train/test split (80% train, 20% test), shuffled, stratified on the target, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import time

def make_result_dict(name, y_test, y_pred, fit_time, predict_time):
    """Build one row of the model-comparison table.

    Args:
        name: Label stored in the "Modèle" column.
        y_test: True labels of the test set.
        y_pred: Labels predicted for the test set.
        fit_time: Training duration, stored rounded under "Fit_time(s)".
        predict_time: Prediction duration; multiplied by 1000 before being
            stored under "Predict_time(ms)".

    Returns:
        A dict with the model name, accuracy, precision, recall and F1-score
        (each rounded to 3 decimals) and the two rounded timings.
    """
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-score", f1_score),
    )
    row = {"Modèle": name}
    for column, scorer in scorers:
        row[column] = round(scorer(y_test, y_pred), 3)
    row["Fit_time(s)"] = round(fit_time, 3)
    row["Predict_time(ms)"] = round(predict_time * 1000, 3)
    return row

def _fit_and_score(name, model, X_train, X_test, y_train, y_test):
    """Fit `model`, predict on the test set, and return its result row.

    Shared implementation of every `*_model` function below, so that all
    models are timed and scored in exactly the same way.

    Args:
        name: Label passed to `make_result_dict` for the "Modèle" column.
        model: Unfitted estimator exposing `fit(X, y)` and `predict(X)`.
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.

    Returns:
        The dict produced by `make_result_dict` for this model.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump and is too coarse for very short fits.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    fit_time = time.perf_counter() - start

    start = time.perf_counter()
    y_pred = model.predict(X_test)
    predict_time = time.perf_counter() - start

    return make_result_dict(name, y_test, y_pred, fit_time, predict_time)


def random_forest_model(X_train, X_test, y_train, y_test):
    """Train and evaluate a RandomForestClassifier (random_state=42)."""
    model = RandomForestClassifier(random_state=42)
    return _fit_and_score("Random Forest", model, X_train, X_test, y_train, y_test)


def logistic_regression_model(X_train, X_test, y_train, y_test):
    """Train and evaluate a LogisticRegression (max_iter=1000, random_state=42)."""
    model = LogisticRegression(max_iter=1000, random_state=42)
    return _fit_and_score("Logistic Regression", model, X_train, X_test, y_train, y_test)


def gradient_boosting_model(X_train, X_test, y_train, y_test):
    """Train and evaluate a GradientBoostingClassifier (random_state=42)."""
    model = GradientBoostingClassifier(random_state=42)
    return _fit_and_score("Gradient Boosting", model, X_train, X_test, y_train, y_test)


def hist_gradient_boosting_model(X_train, X_test, y_train, y_test):
    """Train and evaluate a HistGradientBoostingClassifier (random_state=42)."""
    model = HistGradientBoostingClassifier(random_state=42)
    return _fit_and_score("HistGradientBoosting", model, X_train, X_test, y_train, y_test)


def xgboost_model(X_train, X_test, y_train, y_test):
    """Train and evaluate an XGBClassifier (eval_metric='logloss', random_state=42)."""
    model = XGBClassifier(eval_metric='logloss', random_state=42)
    return _fit_and_score("XGBoost", model, X_train, X_test, y_train, y_test)


def svm_model(X_train, X_test, y_train, y_test):
    """Train and evaluate an SVC (random_state=42)."""
    model = SVC(random_state=42)
    return _fit_and_score("SVM", model, X_train, X_test, y_train, y_test)


def knn_model(X_train, X_test, y_train, y_test):
    """Train and evaluate a KNeighborsClassifier with its default settings."""
    model = KNeighborsClassifier()
    return _fit_and_score("KNN", model, X_train, X_test, y_train, y_test)

def models_from_data(X_train, X_test, y_train, y_test, label="", print_results=True):
    """Run every model on the given split and gather the results in one table.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        label: Value written in the "Spécificité" column of every row
            (used to tell apart runs on different feature encodings).
        print_results: When true, print a heading and display the table.

    Returns:
        A DataFrame with one row per model.
    """
    runners = (
        random_forest_model,
        logistic_regression_model,
        gradient_boosting_model,
        hist_gradient_boosting_model,
        xgboost_model,
        svm_model,
        knn_model,
    )

    rows = []
    for run_model in runners:
        row = run_model(X_train, X_test, y_train, y_test)
        rows.append({**row, "Spécificité": label})

    results_df = pd.DataFrame(rows)
    if not print_results:
        return results_df

    print("\nTableau comparatif des modèles :")
    display(results_df)
    return results_df


In [22]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'n_estimators': [100, 200],
#     'max_depth': [3, 6, 10],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'subsample': [0.8, 1.0],
#     'colsample_bytree': [0.8, 1.0]
# }

# grid_search = GridSearchCV(
#     estimator=XGBClassifier(eval_metric='logloss', random_state=42),
#     param_grid=param_grid,
#     cv=3,
#     scoring='accuracy',
#     n_jobs=-1,
#     verbose=1
# )

# grid_search.fit(X_train, y_train)

# print("Meilleurs paramètres :", grid_search.best_params_)
# print("Meilleur score de validation croisée :", grid_search.best_score_)

# # Évaluation sur le test set
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test)
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Precision:", precision_score(y_test, y_pred))
# print("Recall:", recall_score(y_test, y_pred))
# print("F1-score:", f1_score(y_test, y_pred))
# print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
# print("Classification report:\n", classification_report(y_test, y_pred))

In [23]:
# One-hot encode the raw (string) features. gameid and result are dropped
# because they are an identifier and the target, not features.
X_encoded = pd.get_dummies(
    dataset.drop(columns=["gameid", "result"]),
    columns=[
        "pick_blue_1", "pick_blue_2", "pick_blue_3", "pick_blue_4", "pick_blue_5",
        "pick_red_1", "pick_red_2", "pick_red_3", "pick_red_4", "pick_red_5",
        "ban_blue_1", "ban_blue_2", "ban_blue_3", "ban_blue_4", "ban_blue_5",
        "ban_red_1", "ban_red_2", "ban_red_3", "ban_red_4", "ban_red_5",
        "team_blue", "team_red", "league", "patch"
    ]
)

# Reuse the rows of the existing split instead of calling train_test_split a
# second time: a second call would overwrite y_train / y_test, and the
# "mapping int" evaluation below would only be valid as long as both calls
# happened to produce the same partition. Selecting by index guarantees that
# both encodings are compared on exactly the same games.
X_encoded_train = X_encoded.loc[X_train.index]
X_encoded_test = X_encoded.loc[X_test.index]
assert X_encoded_train.index.equals(y_train.index) and X_encoded_test.index.equals(y_test.index), \
    "Les splits one-hot et mapping int ne sont pas alignés !"

res_dict_encoded = models_from_data(X_train, X_test, y_train, y_test, label="mapping int", print_results=True)
res_one_hot_encoded = models_from_data(X_encoded_train, X_encoded_test, y_train, y_test, label="one hot", print_results=True)



Tableau comparatif des modèles :


Unnamed: 0,Modèle,Accuracy,Precision,Recall,F1-score,Fit_time(s),Predict_time(ms),Spécificité
0,Random Forest,0.538,0.515,0.345,0.413,1.061,19.067,mapping int
1,Logistic Regression,0.532,0.509,0.234,0.32,0.016,0.646,mapping int
2,Gradient Boosting,0.555,0.54,0.384,0.449,1.23,2.107,mapping int
3,HistGradientBoosting,0.554,0.532,0.464,0.496,0.159,2.705,mapping int
4,XGBoost,0.543,0.518,0.46,0.487,0.254,6.881,mapping int
5,SVM,0.519,0.482,0.275,0.35,0.893,318.975,mapping int
6,KNN,0.486,0.454,0.432,0.443,0.001,10.598,mapping int



Tableau comparatif des modèles :


Unnamed: 0,Modèle,Accuracy,Precision,Recall,F1-score,Fit_time(s),Predict_time(ms),Spécificité
0,Random Forest,0.584,0.575,0.455,0.508,5.609,76.32,one hot
1,Logistic Regression,0.603,0.579,0.586,0.582,0.583,13.058,one hot
2,Gradient Boosting,0.577,0.6,0.312,0.41,10.26,32.936,one hot
3,HistGradientBoosting,0.588,0.573,0.501,0.534,5.15,20.692,one hot
4,XGBoost,0.599,0.587,0.509,0.545,2.753,245.616,one hot
5,SVM,0.594,0.581,0.498,0.537,52.824,17719.759,one hot
6,KNN,0.557,0.532,0.503,0.517,0.097,320.2,one hot


In [24]:
# Merge both comparison tables and rank all runs by F1-score (best first).
# ignore_index avoids the duplicate 0..6 labels that concat would otherwise
# keep from each table; the final reset makes the index the rank of each run.
res = (
    pd.concat([res_dict_encoded, res_one_hot_encoded], axis=0, ignore_index=True)
    .sort_values("F1-score", ascending=False)
    .reset_index(drop=True)
)
display(res)

Unnamed: 0,Modèle,Accuracy,Precision,Recall,F1-score,Fit_time(s),Predict_time(ms),Spécificité
1,Logistic Regression,0.603,0.579,0.586,0.582,0.583,13.058,one hot
4,XGBoost,0.599,0.587,0.509,0.545,2.753,245.616,one hot
5,SVM,0.594,0.581,0.498,0.537,52.824,17719.759,one hot
3,HistGradientBoosting,0.588,0.573,0.501,0.534,5.15,20.692,one hot
6,KNN,0.557,0.532,0.503,0.517,0.097,320.2,one hot
0,Random Forest,0.584,0.575,0.455,0.508,5.609,76.32,one hot
3,HistGradientBoosting,0.554,0.532,0.464,0.496,0.159,2.705,mapping int
4,XGBoost,0.543,0.518,0.46,0.487,0.254,6.881,mapping int
2,Gradient Boosting,0.555,0.54,0.384,0.449,1.23,2.107,mapping int
6,KNN,0.486,0.454,0.432,0.443,0.001,10.598,mapping int
