In [2]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV,KFold,GroupKFold
import matplotlib.pyplot as plt
from os.path import exists
import csv

Lecture du fichier contenant les données sur les participants

In [3]:
# Location of the participants metadata table (tab-separated values).
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
infodfPath = r"C:\Users\live_\Documents\Coding_practice\PytorchPractice\MedidationEegCode\data\participants.tsv"
# read_table uses a tab as its default separator, which matches the .tsv format
infodf = pd.read_table(infodfPath)

Sélection des identifiants des participants du groupe contrôle (à exclure ensuite) et de ceux, hors groupe contrôle, ayant au moins 10 ans de pratique

In [4]:
# Metadata rows of the participants that belong to the control group
dfisCtr = infodf.loc[infodf["group"].eq("ctr")]

# Distinct identifiers of the control-group participants
idsCtr = dfisCtr["participant_id"].unique()
# Keep only the number: the last three characters of each identifier, parsed as int
# (assumes every identifier ends with a three-digit number -- TODO confirm)
idsCtrPure = [int(participant[-3:]) for participant in idsCtr]

# Metadata rows of the participants outside the control group
dfnotCtr = infodf.loc[infodf["group"].ne("ctr")]
# Among those, participants with at least ten years of practice
df10 = dfnotCtr.loc[dfnotCtr["years_of_practice"].ge(10)]
# Their distinct identifiers, then the numeric part as above
df10Ids = df10["participant_id"].unique()
ids10 = [int(participant[-3:]) for participant in df10Ids]


Fonction d'entraînement du XGBoost et fonction de sauvegarde des résultats dans un fichier CSV

In [5]:
def XgboostGroupKfoldGridSearch(df,columnsToDrop,param_grid,n_splits=10,target_col="MeditationState",group_col="subId"):
    """Grid-search XGBoost hyperparameters with group-wise cross-validation.

    Every combination in ``param_grid`` is scored by accuracy with a
    ``GroupKFold`` split, using ``df[group_col]`` as the groups so that the
    rows of one group are never split between the training and test folds.

    Args:
        df: DataFrame holding the feature columns together with the target
            and group columns.
        columnsToDrop: Column labels removed from ``df`` to build the feature
            matrix. The target and group columns are not removed
            automatically, so they must be listed here.
        param_grid: Mapping of ``XGBClassifier`` parameter names to the
            candidate values to try; passed unchanged to ``GridSearchCV``.
        n_splits: Number of folds. Defaults to 10, the value that used to be
            hard-coded. ``GroupKFold`` rejects a value larger than the number
            of distinct groups, so lower it for small subsets.
        target_col: Name of the label column (default ``"MeditationState"``).
        group_col: Name of the grouping column (default ``"subId"``).

    Returns:
        The fitted ``GridSearchCV`` object (``best_score_``, ``best_params_``
        and ``cv_results_`` are read by the result-saving code).
    """
    features = df.drop(columns=columnsToDrop)
    labels = df[target_col]
    groups = df[group_col]

    grid_search = GridSearchCV(
        XGBClassifier(),
        param_grid,
        scoring="accuracy",
        n_jobs=-1,  # use every available core
        cv=GroupKFold(n_splits=n_splits),
        verbose=10,
    )
    # fit() returns the fitted search object itself
    return grid_search.fit(features, labels, groups=groups)

def resultSavingCsv(grid_result,path,columnComb,dfread):
    """Append the score of every grid-search candidate to a CSV file.

    The header row is written only when ``path`` does not exist yet, so
    successive calls accumulate rows in the same file. The best score and
    each candidate's score are also printed.

    Args:
        grid_result: Fitted search object exposing ``best_score_``,
            ``best_params_`` and ``cv_results_`` (with the keys
            ``mean_test_score``, ``std_test_score`` and ``params``).
        path: Path of the CSV file to create or append to.
        columnComb: Label of the feature-column combination.
        dfread: Label of the dataset; concatenated to ``columnComb`` to form
            the value of the ``Columns`` field.

    Notes:
        A hyperparameter missing from a candidate's parameter dict is written
        as an empty cell instead of raising ``KeyError``, so grids that do not
        tune all six logged parameters can still be saved.
    """
    hyperparams = ["colsample_bylevel","gamma","learning_rate","max_depth","n_estimators","subsample"]
    header = ["model"] + hyperparams + ["accuracy","accuracy_std","Columns"]

    # Decide before opening: opening in append mode creates the file.
    write_header = not exists(path)

    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']

    # newline="" lets the csv module control line endings (otherwise blank
    # rows appear on Windows); the with-block closes the file even on error.
    with open(path, "a", newline="") as resultfile:
        writer = csv.writer(resultfile)
        if write_header:
            writer.writerow(header)
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, param))
            tuned = [param.get(name, "") for name in hyperparams]
            writer.writerow(["xgboost"] + tuned + [mean, stdev, columnComb+dfread])

Chemins d'accès des fichiers, combinaisons de colonnes et hyperparamètres

In [40]:

# (label suffix, CSV file) pairs; the suffix is appended to the column-combination
# label when results are saved.
# NOTE(review): the file names start with "editation" -- possibly a typo for
# "Meditation"; confirm against the files on disk before changing.
dfsPaths = [
    ("", "editationDataFourierSimpleFeatures.csv"),
    ("_G_extended", "editationDataFourierSimpleFeaturesGExtended.csv"),
]

# (label, columns to remove before training) pairs. Both entries remove the
# subject id and the target; "waves_only" also removes the six summary statistics.
columnsToDropCombinations = [
    ("allFeatures", ["subId", "MeditationState"]),
    (
        "waves_only",
        ["Mean", "Std", "Min", "Max", "Kurtosis", "Skewness", "subId", "MeditationState"],
    ),
]

# Candidate values per hyperparameter: 1 * 8 * 1 * 3 * 4 * 1 = 96 combinations
n_estimators = [100]
max_depth = range(2, 10)  # depths 2 through 9
learning_rate = [0.01]
subsample = [0.5, 0.75, 1.0]
colsample_bylevel = [0.4, 0.6, 0.8, 1.0]
gamma = [0]

param_grid = {
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "learning_rate": learning_rate,
    "subsample": subsample,
    "colsample_bylevel": colsample_bylevel,
    "gamma": gamma,
}




Fitting 10 folds for each of 96 candidates, totalling 960 fits


Entraînement des modèles sur chaque jeu de données avec les hyperparamètres

In [None]:
# Run the grid search on every dataset, for every column combination and for
# three participant subsets, appending each search's scores to its own CSV file.
for dfPath in dfsPaths:

    df= pd.read_csv(dfPath[1])
    # Remove the saved row-index column once, so that all three subsets are
    # trained on the same columns (previously the all-subjects run kept
    # "Unnamed: 0" and used it as a feature).
    dfDataAll = df.drop(columns="Unnamed: 0")
    # Rows of the participants that are not in the control group
    dfDatanoCtr = dfDataAll[~dfDataAll["subId"].isin(idsCtrPure)]
    # Rows of the participants with at least ten years of practice
    dfData10 = dfDataAll[dfDataAll["subId"].isin(ids10)]
    for columnToDrop in columnsToDropCombinations:

        # Each result is saved right after its search, so a failure in a
        # later search does not discard the searches that already finished.
        grid_resultsAllSubjects = XgboostGroupKfoldGridSearch(dfDataAll,columnToDrop[1],param_grid)
        resultSavingCsv(grid_resultsAllSubjects,"xgboostModelsResultsAllSubjects.csv",columnToDrop[0],dfPath[0])

        grid_resultsNoCtr = XgboostGroupKfoldGridSearch(dfDatanoCtr,columnToDrop[1],param_grid)
        resultSavingCsv(grid_resultsNoCtr,"xgboostModelsResultsNoCtr.csv",columnToDrop[0],dfPath[0])

        grid_results10Years = XgboostGroupKfoldGridSearch(dfData10,columnToDrop[1],param_grid)
        resultSavingCsv(grid_results10Years,"xgboostModelsResults10YearsPractice.csv",columnToDrop[0],dfPath[0])