Travail réalisé par Marc le Chevoir, Claire Espérou et Adrien Schaffner, encadrés par Philippe Besse.


# Etude du jeu de données "Maths" ou "Portugais" avec Python

# 1 - Importation des données

Choisir le jeu de données : soit l'évaluation en Maths, soit en Portugais

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import datetime

In [None]:
# Load the semicolon-separated student dataset; the first row holds the column names.
# Swap which of the two read_csv lines is commented out to switch between the
# Maths file and the Portuguese file.
Data=pd.read_csv("./student/student-mat.csv",sep=";",header=0)
#Data=pd.read_csv("./student/student-por.csv",sep=";",header=0)
# Bare expression: shows the DataFrame as the notebook cell output.
Data

In [None]:
# Remove every row whose final grade G3 equals 0.
# A single boolean mask replaces the original row-by-row drop loop (which
# copied the whole frame on each drop); the kept rows retain their original
# index labels, exactly as with drop().
Data = Data[Data["G3"] != 0]

In [None]:
# Show the DataFrame again, after the G3 == 0 rows have been removed.
Data

In [None]:
# (number of rows, number of columns) of the filtered DataFrame.
np.shape(Data)

### Modifications pour variables qualitatives

In [None]:
# One-hot encode the listed categorical columns with pd.get_dummies.
Variables_descri=pd.get_dummies(Data[["school","sex","address","famsize","Pstatus","Mjob","Fjob","reason","guardian","schoolsup","famsup","paid","activities","nursery","higher","internet","romantic"]])
# Columns used as-is (not dummy-encoded). Neither list contains the target
# column "G3".
Variables_numeric = Data[["age","Medu","Fedu","traveltime","studytime","failures","famrel","freetime","goout","Dalc","Walc","health","absences"]]

In [None]:
# Feature matrix: dummy-encoded columns followed by the numeric columns,
# concatenated column-wise (axis=1).
Data_num = pd.concat([Variables_descri,Variables_numeric],axis=1)

# 2 - Séparation des données en un échantillon "Apprentissage" et un "Test"

In [None]:
# Target variable: the "G3" column of the filtered data.
Y = Data["G3"]

In [None]:
from sklearn.model_selection import train_test_split  
# Hold out a test set whose size is 20% of the rows, truncated to an integer
# count; random_state=11 fixes the split.
X_train,X_test,Y_train,Y_test=train_test_split(Data_num,Y,test_size=int(0.2*np.shape(Data_num)[0]),random_state=11)

In [None]:
# Shapes of the training and test feature matrices.
print(np.shape(X_train))
print(np.shape(X_test))

In [None]:
# Show the training feature matrix as the cell output.
X_train

# 3 - Modèle linéaire

## 3.1 - Construction d'un modèle sans optimisation de paramètres

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Lasso regression with the library's default settings (no argument given),
# fitted on the training sample.
reg = linear_model.Lasso()
reg.fit(X_train, Y_train)

In [None]:
# Predict the test sample with the default Lasso model and report the mean
# squared error against the observed grades.
prev_RL=reg.predict(X_test)
print("MSE=",mean_squared_error(Y_test,prev_RL))

In [None]:
# Residual plot: predicted value on the x axis, residual (observed - predicted)
# on the y axis.
plt.plot(prev_RL,Y_test-prev_RL,"o")
plt.xlabel(u"Prédites")
plt.ylabel(u"Résidus")
# Zero-residual reference line, drawn between the hard-coded abscissas 5 and 15.
plt.hlines(0,5,15)
plt.show()

In [None]:
# List the variables kept by the Lasso model, i.e. those whose coefficient is
# not exactly zero, as "<column>____<coefficient>".
ceofs_var = reg.fit(X_train, Y_train).coef_
# Pair each column with its coefficient instead of indexing over a hard-coded
# count of 56 columns, which breaks as soon as the number of dummy columns
# differs.
for nom_var, coef in zip(X_train.columns, ceofs_var):
    if coef != 0:
        print(nom_var+"____"+str(coef))

On cherche à améliorer le modèle en optimisant le paramètre alpha de la méthode lasso.

## 3.2 - Optimisation du modèle linéaire en optimisant le paramètre alpha

In [None]:
# Tune the Lasso penalty alpha by 5-fold cross-validated grid search and
# report how long the search took.
t1=datetime.datetime.now()

# Grid of alpha values to try.
param=[{"alpha":[0.01,0.05,0.1,0.15,0.2,0.3,0.4,0.5,1]}]
reg = GridSearchCV(linear_model.Lasso(), param,cv=5,n_jobs=-1)
regOpt=reg.fit(X_train, Y_train)

# Best cross-validated score and the alpha that achieved it.
print("Meilleur R2 = %f, Meilleur paramètre = %s" % (regOpt.best_score_,regOpt.best_params_))


t2=datetime.datetime.now()
# Elapsed wall-clock seconds. Subtracting the two datetimes is correct across
# minute/hour boundaries, unlike comparing only their .second/.microsecond
# fields (which could even give a negative duration).
print((t2 - t1).total_seconds())

## 3.3 - Prévision sur l'échantillon test

In [None]:
# Predict the test sample with the grid-searched Lasso model and report the
# mean squared error.
prev_RL=regOpt.predict(X_test)
print("MSE=",mean_squared_error(Y_test,prev_RL))
prev_RL # Predicted grades (shown as the cell output)

In [None]:
# Residual plot for the tuned model: predicted value vs. (observed - predicted).
plt.plot(prev_RL,Y_test-prev_RL,"o")
plt.xlabel(u"Prédites")
plt.ylabel(u"Résidus")
# Zero-residual reference line, drawn between the hard-coded abscissas 5 and 15.
plt.hlines(0,5,15)
plt.show()

In [None]:
print(prev_RL)
# Round the predictions to whole grades so they can be compared more easily
# with the observed (integer) grades. np.round handles the whole array at once
# and uses the same round-half-to-even rule as the built-in round().
prev_RL = np.round(prev_RL)
print(prev_RL)

In [None]:
# Contingency table: predicted grades as rows, observed grades as columns.
table=pd.crosstab(prev_RL,Y_test)
print(table)

In [None]:
# MSE recomputed on prev_RL; when the rounding cell has been run before this
# one, this is the error of the rounded predictions.
print("MSE=",mean_squared_error(Y_test,prev_RL))

## 3.4 - Calcul du temps moyen d'exécution de l'algorithme

In [None]:
# Average wall-clock time of the Lasso grid search over 50 repetitions.

# Grid of alpha values to try (identical for every repetition).
param=[{"alpha":[0.01,0.05,0.1,0.15,0.2,0.3,0.4,0.5,1]}]

t_tot = []
for i in range(0,50):
    t1=datetime.datetime.now()
    reg = GridSearchCV(linear_model.Lasso(), param,cv=5,n_jobs=-1)
    regOpt=reg.fit(X_train, Y_train)
    t2=datetime.datetime.now()
    # Subtract the datetimes themselves: comparing only the .second and
    # .microsecond fields gave negative values whenever a run crossed a
    # minute boundary.
    t_tot.append((t2 - t1).total_seconds())

# Kept so the name t_b still exists; with correct durations this filter no
# longer discards any measurement.
t_b=[t for t in t_tot if t > 0]

print(t_b)
print(np.mean(t_b))

## 3.5 - Analyse des variables retenues sur 25 modèles obtenus par validation croisée de Monte Carlo

In [None]:
from sklearn.utils import check_random_state
# Seeded generator shared by all splits. The original called
# check_random_state(13) and discarded the result, so the splits were not
# actually reproducible.
rng = check_random_state(13)

reg = linear_model.Lasso()

# Number of Monte Carlo iterations
B=25

# Estimators with their parameter grids
listMethGrid=[
    [reg,{"alpha":[0.01,0.05,0.1,0.15,0.2]}],
    ]
# One row per iteration, one column per method. np.empty does not initialise
# the values; every cell is overwritten in the loop below.
arrayErreur=np.empty((B,len(listMethGrid)))
arrayMSE=np.empty((B,len(listMethGrid)))
arrayParam=[] # coefficient vector of the selected model for each iteration
for i in range(B):   # iterate over B random train/test splits
    # NOTE: this overwrites the notebook-level X_train/X_test/Y_train/Y_test,
    # so later cells work on the last split drawn here.
    X_train,X_test,Y_train,Y_test=train_test_split(Data_num,Data["G3"],test_size=int(0.2*np.shape(Data_num)[0]),random_state=rng)
    # tune each method and compute its error on the test sample
    for j,(method, grid_list) in enumerate(listMethGrid):
        methodGrid=GridSearchCV(method,grid_list,cv=5,n_jobs=-1).fit(X_train, Y_train)
        # With the default refit=True, best_estimator_ is already fitted on the
        # whole training sample, so it is not fitted again.
        methodOpt = methodGrid.best_estimator_
        methFit = methodOpt
        methPred = methodOpt.predict(X_test)
        arrayParam.append(methodOpt.coef_)
        arrayMSE[i,j]=mean_squared_error(Y_test,methPred)
        arrayErreur[i,j]=1-methFit.score(X_test,Y_test)


In [None]:
# For each stored model, list the variables with a non-zero coefficient.
# Iterating over arrayParam itself avoids the hard-coded count of 25, which
# silently goes out of sync if the number of iterations changes.
for coefs_modele in arrayParam:
    for nom_var, coef in zip(X_train.columns, coefs_modele):
        if coef != 0:
            print(nom_var+"____"+str(coef))
    print('******************************')
    print('******************************')

# 4 - Arbre de décision

## 4.1 - Construction du modèle avec optimisation de la profondeur de l'arbre

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Tune the tree depth by 10-fold cross-validated grid search and report how
# long the search took.
t1=datetime.datetime.now()

# Candidate depths: 2 to 9
param=[{"max_depth":list(range(2,10))}]
tree= GridSearchCV(DecisionTreeRegressor(),param,cv=10,n_jobs=-1)
treeOpt=tree.fit(X_train, Y_train)

# The value printed is 1 - best cross-validated score, with the chosen depth.
print("Meilleur score = %f, Meilleur paramètre = %s" % (1. - treeOpt.best_score_,treeOpt.best_params_))


t2=datetime.datetime.now()
# Elapsed wall-clock seconds; subtracting the datetimes stays correct when the
# run crosses a minute boundary, unlike comparing .second/.microsecond only.
print((t2 - t1).total_seconds())


In [None]:
# Estimation de l'erreur de prévision
1-treeOpt.score(X_test,Y_test)

## 4.2 - Prévision sur l'échantillon test

In [None]:
# prévision de l'échantillon test
# Predict the test sample with the tuned tree.
prev_DT = treeOpt.predict(X_test)

#print(prev_DT)

# Contingency table: predicted values as rows, observed grades as columns.
# The tree predictions are not rounded in this cell.
table=pd.crosstab(prev_DT,Y_test)
print(table)

In [None]:
# Mean squared error of the tree predictions on the test sample.
print("MSE=",mean_squared_error(Y_test,prev_DT))

## 4.3 - Aperçu graphique de l'arbre modélisé

In [None]:
# Refit a tree at the selected depth and render it to "treeOpt.png".
# StringIO comes from the standard library: sklearn.externals.six has been
# removed from scikit-learn, so the former import fails on current versions.
from io import StringIO
from sklearn.tree import export_graphviz
import pydotplus
treeG=DecisionTreeRegressor(max_depth=treeOpt.best_params_['max_depth'])
treeG.fit(X_train,Y_train)
# Write the Graphviz description into an in-memory text buffer...
dot_data = StringIO()
export_graphviz(treeG, out_file=dot_data)
# ...then turn it into a graph object and save it as a PNG file.
graph=pydotplus.graph_from_dot_data(dot_data.getvalue())

graph.write_png("treeOpt.png")

In [None]:
# Display the PNG file of the tree inside the notebook.
from IPython.display import Image
Image(filename='treeOpt.png')

In [None]:
# Names of the columns at positions 48, 28 and 53 of the feature matrix.
# NOTE(review): presumably these are the feature indices shown in the tree
# picture — confirm; the positions are hard-coded and depend on column order.
print(Data_num.columns[48])
print(Data_num.columns[28])
print(Data_num.columns[53])

## 4.4 - Calcul du temps moyen d'exécution de l'algorithme

In [None]:
# Average wall-clock time of the tree grid search over 50 repetitions.

# Candidate depths (identical for every repetition).
param=[{"max_depth":list(range(2,10))}]

t_tot = np.zeros(50)
for i in range(0,50):
    t1=datetime.datetime.now()
    tree= GridSearchCV(DecisionTreeRegressor(),param,cv=10,n_jobs=-1)
    treeOpt=tree.fit(X_train, Y_train)
    t2=datetime.datetime.now()
    # Subtract the datetimes themselves: comparing only the .second and
    # .microsecond fields gave negative values whenever a run crossed a
    # minute boundary.
    t_tot[i]=(t2 - t1).total_seconds()
print(t_tot)
print(np.mean(t_tot))

# Kept so the name t_b still exists; with correct durations this filter no
# longer discards any measurement.
t_b=[t for t in t_tot if t > 0]

print(t_b)
print(np.mean(t_b))

## 4.5 - Analyse des variables retenues sur 25 modèles obtenus par validation croisée de Monte Carlo

In [None]:
from sklearn.utils import check_random_state
import time
# Seeded generator shared by all splits. The original called
# check_random_state(13) and discarded the result, so the splits were not
# actually reproducible.
rng = check_random_state(13)

# Estimator
arbre = DecisionTreeRegressor()

# Number of Monte Carlo iterations
B=25

# Estimators with their parameter grids
listMethGrid=[
    [arbre,{"max_depth":[2,3,4,5,6,7,8,9,10]}]#,
    ]

# One row per iteration, one column per method. Sized from listMethGrid so no
# uninitialised extra columns are left in the arrays.
arrayErreur=np.empty((B,len(listMethGrid)))
arrayMSE=np.empty((B,len(listMethGrid)))
for i in range(B):   # iterate over B random train/test splits
    # NOTE: this overwrites the notebook-level X_train/X_test/Y_train/Y_test,
    # so later cells work on the last split drawn here.
    X_train,X_test,Y_train,Y_test=train_test_split(Data_num,Data["G3"],test_size=int(0.2*np.shape(Data_num)[0]),random_state=rng)
    # tune each method and compute its error on the test sample
    for j,(method, grid_list) in enumerate(listMethGrid):
        methodGrid=GridSearchCV(method,grid_list,cv=5,n_jobs=-1).fit(X_train, Y_train)
        # With the default refit=True, best_estimator_ is already fitted on the
        # whole training sample, so it is not fitted again.
        methodOpt = methodGrid.best_estimator_
        methFit = methodOpt
        methPred = methodOpt.predict(X_test)
        arrayMSE[i,j]=mean_squared_error(Y_test,methPred)
        arrayErreur[i,j]=1-methFit.score(X_test,Y_test)
        # Variables actually used by the tree (non-zero importance). The loop
        # no longer reuses the name i, which shadowed the iteration index and
        # would corrupt arrayMSE[i,j] as soon as a second method is added.
        for nom_var, importance in zip(X_train.columns, methFit.feature_importances_):
            if importance != 0:
                print(nom_var+"____"+str(importance))
        print('******************************')
        print('******************************')

# 5 - Random Forest

## 5.1 - Construction du modèle

In [None]:
from sklearn.ensemble import RandomForestRegressor 

Optimisation par validation croisée

In [None]:
# Tune max_features of a 100-tree random forest by 5-fold cross-validated grid
# search and report how long the search took.
t1=datetime.datetime.now()

# Candidate values of max_features: 1 to 9
param=[{"max_features":list(range(1,10,1))}]

rf= GridSearchCV(
    RandomForestRegressor(n_estimators=100),
    param,cv=5,n_jobs=-1)

rfOpt=rf.fit(X_train, Y_train)

# The value printed is 1 - best cross-validated score, then the chosen value.
print("Meilleur score = %f" % (1. - rfOpt.best_score_,))
print("Meilleur paramètre = %s" % (rfOpt.best_params_))

t2=datetime.datetime.now()
# Elapsed wall-clock seconds; subtracting the datetimes stays correct when the
# run crosses a minute boundary, unlike comparing .second/.microsecond only.
print((t2 - t1).total_seconds())


In [None]:
# définition des paramètres
forest = RandomForestRegressor(n_estimators=500, 
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1, 
    max_features=rfOpt.best_params_["max_features"], 
    max_leaf_nodes=None,
    bootstrap=True, oob_score=True)

# apprentissage
rfFit = forest.fit(X_train,Y_train)
print(1-rfFit.oob_score_)

In [None]:
# erreur de prévision sur le test
1-rfFit.score(X_test,Y_test)

## 5.2 - Prévision sur l'échantillon test

In [None]:
# prévision
prev_RF = rfFit.predict(X_test)
print(prev_RF)

for i in range (0, np.size(prev_RF)):
    prev_RF[i] = round(prev_RF[i]) #On arrondi les valeurs prédites pour les comparer
                            # plus facilement aux valeurs réelles
print(prev_RF)

In [None]:
# matrice de confusion
table=pd.crosstab(prev_RF,Y_test)
print(table)

In [None]:
# MSE computed on prev_RF; when the rounding cell has been run before this
# one, this is the error of the rounded predictions.
print("MSE=",mean_squared_error(Y_test,prev_RF))

## 5.3 - Importance des variables dans le modèle créé

In [None]:
# Importance décroissante des variables
importances = rfFit.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print(Data_num.columns[indices[f]], importances[indices[f]])

## 5.4 - Calcul du temps moyen d'exécution de l'algorithme

In [None]:
# Average wall-clock time of the random-forest grid search over 50 repetitions.

# Candidate values of max_features (identical for every repetition).
param=[{"max_features":list(range(1,10,1))}]

t_tot = np.zeros(50)
for i in range(0,50):
    t1=datetime.datetime.now()

    rf= GridSearchCV(
        RandomForestRegressor(n_estimators=100),
        param,cv=5,n_jobs=-1)

    rfOpt=rf.fit(X_train, Y_train)

    t2=datetime.datetime.now()
    # Subtract the datetimes themselves: comparing only the .second and
    # .microsecond fields gave negative values whenever a run crossed a
    # minute boundary.
    t_tot[i]=(t2 - t1).total_seconds()

# Kept so the name t_b still exists; with correct durations this filter no
# longer discards any measurement.
t_b=[t for t in t_tot if t > 0]

print(t_b)
print(np.mean(t_b))

# 6 - Algorithme de boosting : GBM

## 6.1 - Construction du modèle

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Tune n_estimators and learning_rate of a gradient boosting regressor by
# 5-fold cross-validated grid search and report how long the search took.
t1=datetime.datetime.now()

# Grid over the two parameters
paramGrid = [
  {'n_estimators': list(range(20,201,20)), 'learning_rate': [0.04,0.06,0.08,0.1,0.12,0.14]}
 ]
gbmC= GridSearchCV(GradientBoostingRegressor(),paramGrid,cv=5,n_jobs=-1)
gbmOpt=gbmC.fit(X_train, Y_train)

# The value printed is 1 - best cross-validated score, with the chosen parameters.
print("Meilleur score = %f, Meilleur paramètre = %s" % (1. - gbmOpt.best_score_,gbmOpt.best_params_))


t2=datetime.datetime.now()
# Elapsed wall-clock seconds; subtracting the datetimes stays correct when the
# run crosses a minute boundary, unlike comparing .second/.microsecond only.
print((t2 - t1).total_seconds())


## 6.2 - Prévision sur l'échantillon test

In [None]:
# Prévision de l'échantillon test
# Predict the test sample with the tuned boosting model.
yChap = gbmOpt.predict(X_test)

# Round the predictions to whole grades so they can be compared more easily
# with the observed (integer) grades. np.round handles the whole array at once
# and uses the same round-half-to-even rule as the built-in round().
yChap = np.round(yChap)
print(yChap)


In [None]:
# matrice de confusion
table=pd.crosstab(yChap,Y_test)
print(table)

In [None]:
# Erreur de prévision sur le test
print("Erreur de test gbm opt = %f" % (1-gbmOpt.score(X_test,Y_test)))
print("MSE=",mean_squared_error(Y_test,yChap))

## 6.3 - Calcul du temps moyen d'exécution de l'algorithme

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Average wall-clock time of the gradient boosting grid search over 50
# repetitions.

# Grid over the two parameters (identical for every repetition).
paramGrid = [
  {'n_estimators': list(range(20,201,20)), 'learning_rate': [0.04,0.06,0.08,0.1,0.12,0.14]}
 ]

t_tot = np.zeros(50)
for i in range(0,50):
    t1=datetime.datetime.now()

    gbmC= GridSearchCV(GradientBoostingRegressor(),paramGrid,cv=5,n_jobs=-1)
    gbmOpt=gbmC.fit(X_train, Y_train)

    t2=datetime.datetime.now()
    # Subtract the datetimes themselves: comparing only the .second and
    # .microsecond fields gave negative values whenever a run crossed a
    # minute boundary.
    t_tot[i]=(t2 - t1).total_seconds()

# Kept so the name t_b still exists; with correct durations this filter no
# longer discards any measurement.
t_b=[t for t in t_tot if t > 0]

print(t_b)
print(np.mean(t_b))

# 7 - Algorithme de boosting : XGB

## 7.1 - Création du modèle

In [None]:
import os

# Location of the MinGW-w64 binaries on the authors' Windows machine.
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'

# Prepend the directory to PATH only when it actually exists. On other
# machines the entry is useless, and on POSIX systems the "C:" in it would be
# split by the ":" path separator. os.pathsep replaces the hard-coded ";" and
# .get() tolerates an unset PATH.
if os.path.isdir(mingw_path):
    os.environ['PATH'] = mingw_path + os.pathsep + os.environ.get('PATH', '')

In [None]:
import xgboost as xgb

# Tune n_estimators and learning_rate of an XGBoost regressor by 5-fold
# cross-validated grid search and report how long the search took.
t1=datetime.datetime.now()

paramGrid = [
  {'n_estimators': list(range(10,201,10)), 'learning_rate': [0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1]}
 ]
gbmC= GridSearchCV(xgb.XGBRegressor(),paramGrid,cv=5,n_jobs=-1)
gbmOpt=gbmC.fit(X_train, Y_train)
# The value printed is 1 - best cross-validated score, with the chosen parameters.
print("Meilleur score = %f, Meilleur paramètre = %s" % (1. - gbmOpt.best_score_,gbmOpt.best_params_))

t2=datetime.datetime.now()
# Elapsed wall-clock seconds; subtracting the datetimes is also correct across
# an hour boundary, where the former minute*60+second arithmetic wrapped.
print((t2 - t1).total_seconds())


## 7.2 - Prévision sur l'échantillon test

In [None]:
# Predict the test sample with the tuned XGBoost model.
pred_xgb = gbmOpt.predict(X_test)

# Round the predictions to whole grades so they can be compared more easily
# with the observed (integer) grades. np.round handles the whole array at once
# and uses the same round-half-to-even rule as the built-in round().
pred_xgb = np.round(pred_xgb)

In [None]:
# Contingency table: predicted grades as rows, observed grades as columns.
table=pd.crosstab(pred_xgb,Y_test)
print(table)

In [None]:
# MSE computed on pred_xgb (rounded when the rounding cell has been run first).
# NOTE(review): pred_xgb is presumably 1-D, in which case np.transpose leaves
# it unchanged — confirm.
print("MSE=",mean_squared_error(Y_test,np.transpose(pred_xgb)))

## 7.3 - Calcul du temps moyen d'exécution de l'algorithme

In [None]:
import xgboost as xgb

# Average wall-clock time of the XGBoost grid search over 50 repetitions.

# Grid over the two parameters (identical for every repetition).
paramGrid = [
  {'n_estimators': list(range(10,201,10)), 'learning_rate': [0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1]}
 ]

t_tot = np.zeros(50)
for i in range(0,50):
    print(i)  # progress indicator
    t1=datetime.datetime.now()

    gbmC= GridSearchCV(xgb.XGBRegressor(),paramGrid,cv=5,n_jobs=-1)
    gbmOpt=gbmC.fit(X_train, Y_train)

    t2=datetime.datetime.now()
    # Subtract the datetimes themselves: the former minute*60+second
    # arithmetic gave negative values whenever a run crossed an hour boundary.
    t_tot[i]=(t2 - t1).total_seconds()

# Kept so the name t_b still exists; with correct durations this filter no
# longer discards any measurement.
t_b=[t for t in t_tot if t > 0]

print(t_b)
print(np.mean(t_b))

# 8 - Validation croisée de Monte Carlo sur tous les modèles pour comparer les erreurs de prédiction

In [None]:
from sklearn.utils import check_random_state
import xgboost as xgb
import time
# Seeded generator shared by all splits. The original called
# check_random_state(13) and discarded the result, so the splits were not
# actually reproducible.
rng = check_random_state(13)

# Estimators
reg   = linear_model.Lasso()
arbre = DecisionTreeRegressor()
rf    = RandomForestRegressor(n_estimators=200)
gbm   = GradientBoostingRegressor()
# Named xgbReg so the imported xgb module is not overwritten by the estimator
# (re-running the cell, or any later xgb.XGBRegressor() call, failed before).
xgbReg = xgb.XGBRegressor()

# Number of Monte Carlo iterations
B=50

# Estimators with their parameter grids
listMethGrid=[
    [reg,{"alpha":[0.01,0.05,0.1,0.15,0.2]}],
    [arbre,{"max_depth":[2,3,4,5,6,7,8,9,10]}],
    [rf,{"max_features":[2,3,4,5,6,7,8,9,10]}],
    [gbm,{'n_estimators': list(range(50,301,20)), 'learning_rate': [0.04,0.06,0.08,0.1,0.12,0.14,0.16]}],
    [xgbReg,{'n_estimators': list(range(10,201,10)), 'learning_rate': [0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1]}]
    ]

# One row per iteration, one column per method. np.empty does not initialise
# the values; every cell is overwritten in the loop below.
arrayErreur=np.empty((B,len(listMethGrid)))
arrayMSE=np.empty((B,len(listMethGrid)))
for i in range(B):   # iterate over B random train/test splits
    # NOTE: this overwrites the notebook-level X_train/X_test/Y_train/Y_test.
    X_train,X_test,Y_train,Y_test=train_test_split(Data_num,Data["G3"],test_size=int(0.2*np.shape(Data_num)[0]),random_state=rng)
    # tune each method and compute its error on the test sample
    for j,(method, grid_list) in enumerate(listMethGrid):
        methodGrid=GridSearchCV(method,grid_list,cv=5,n_jobs=-1).fit(X_train, Y_train)
        # With the default refit=True, best_estimator_ is already fitted on the
        # whole training sample; fitting it again only cost time.
        methodOpt = methodGrid.best_estimator_
        methFit = methodOpt
        methPred = methodOpt.predict(X_test)
        arrayMSE[i,j]=mean_squared_error(Y_test,methPred)
        arrayErreur[i,j]=1-methFit.score(X_test,Y_test)

In [None]:
# Wrap both result arrays in DataFrames with one named column per method.
dataframeErreur, dataframeMSE = (
    pd.DataFrame(tableau, columns=["reg","Arbre","RF","GBM","XGB"])
    for tableau in (arrayErreur, arrayMSE)
)

In [None]:
# Distribution des erreurs
dataframeErreur[["reg","Arbre","RF","GBM","XGB"]].boxplot(return_type='dict')
plt.show()

In [None]:
# Box plots of the mean squared error over the iterations, one box per method.
dataframeMSE[["reg","Arbre","RF","GBM","XGB"]].boxplot(return_type='dict')
plt.show()

In [None]:
# Average MSE of each method over all iterations, printed in column order.
for methode in ("reg","Arbre","RF","GBM","XGB"):
    print(np.mean(dataframeMSE[methode]))

# 9 - Comparaison des moyennes des erreurs des différentes méthodes

In [None]:
import scipy.stats as stats

In [None]:
# One-way ANOVA comparing the mean error of the five methods (one sample per
# column of arrayErreur).
# NOTE(review): every column is measured on the same train/test splits, so the
# samples are paired; confirm that a one-way ANOVA is appropriate here.
stats.f_oneway(arrayErreur[:,0],arrayErreur[:,1],arrayErreur[:,2],arrayErreur[:,3],arrayErreur[:,4])

En enlevant "arbre" qui a une moyenne d'erreur de prédiction clairement différente des autres modèles.

In [None]:
# Same one-way ANOVA without column 1 (the decision tree).
stats.f_oneway(arrayErreur[:,0],arrayErreur[:,2],arrayErreur[:,3],arrayErreur[:,4])