# Groupe Geoffroy Dufay, Louis Lapassat : diabète.

In [2]:
#########################
### loading libraries ###
#########################

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets

##############################
### fixing some parameters ###
##############################

sns.set(style="darkgrid", palette="muted", font_scale=1.5)
random_seed = 141421

# Loading the dataset

In [23]:
diabete = datasets.load_diabetes()
df = pd.DataFrame(data=diabete.data, columns=diabete.feature_names)
df['target'] = diabete.target
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


Ten baseline variables, age, sex, body mass index, average blood pressure, and six blood serum measurements were obtained for each of n = 442 diabetes patients, as well as the response of interest, a quantitative measure of disease progression one year after baseline. 

**Note**: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1). For instance if `f` is a feature we have:

$$ \frac{f - mean(f)}{std(f)} \times n\_samples$$ 

In [45]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import GridSearchCV
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
import itertools
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings("ignore")

def tune_plot(model, param, X, y, normalize=1, iter_gridsearch=1, nb_cv=5, test_size=0.3, seed=141421):
    
    """
    Tune a given model with GridSearchCV and give for accuracy results with a confusion matrix.
    
    First you need to make sure that you imported the library for your model!
    
    model = classificator
    param = parameters of your classificator that you want to tune 
    X = data for prediction (DataFrame pref)
    y = target to predict (DataFrame pref)
    normalize = True/False (normalize both X and y with StandardScaler and LabelEncoder)
    iter_gridsearch = number of iteration for GridSearchCV (can be usefull if your model involve randomness)
    nb_cv = number of cross validation
    test_size = test size in percent (splitting train and test set)
    seed = seed for random state
    """
    
    print(" ************************* ", str(model), " ************************* ")
    
    ######################
    ### Normalize data ###
    ######################
    
    if normalize:
        le = LabelEncoder()
        le.fit(y)
        yy = le.transform(y)

        scaler = StandardScaler()
        scaler.fit(X)
        XX = scaler.transform(X)
    else:
        yy = y
        XX = X
    
    ####################
    ### GridSearchCV ###
    ####################
    
    if iter_gridsearch > 1:
        keys = param.keys()
        dic_result = {key: [] for key in keys}
        for i in tqdm(range(iter_gridsearch)):
            clf = model()
            result_grid_search = GridSearchCV(estimator=clf, param_grid=param, cv=nb_cv, return_train_score=False)
            result_grid_search.fit(XX, yy)
            for key in keys:
                dic_result[key].append(result_grid_search.best_params_[key])
        best_param = {}
        for key in keys:
            track = {}
            for item in dic_result[key]: 
                if item not in track:
                    track[item] = 0
                else:
                    track[item] += 1
            best_param[key] = max(track, key=track.get)
    else:
        clf = model()
        result_grid_search = GridSearchCV(estimator=clf, param_grid=param, cv=nb_cv)
        result_grid_search.fit(XX, yy)
        best_param = result_grid_search.best_params_
    
    print('best parameters: ', best_param)
    
    ####################
    ### update model ###
    ####################
    
    clf = model(**best_param)
    
    ########################
    ### train / test set ###
    ########################
    
    x_train, x_test, y_train, y_test = train_test_split(XX, yy, test_size=test_size, random_state=seed)
    
    #################
    ### fit model ###
    #################    
    
    clf.fit(x_train, y_train)
    
    #############
    ### Score ###
    #############
    
    print("Mean squared error (on test set): %.2f" % mean_squared_error(y_test, clf.predict(x_test)))
    print('Variance score (max_value=1 for perfect prediction): %.2f' % r2_score(y_test, clf.predict(x_test)))
    
    return best_param

In [49]:
param = { 
    'hidden_layer_sizes' : [(20,), (25,), (10,)], # (100,) is default
    'activation' : ['relu', 'logistic', 'tanh', 'identity'], # 'relu' is default
    'solver' : ['lbfgs', 'adam', 'sgd'], # 'adam' is default
    'alpha' : [0.0001, 0.001, 0.01] # 0.0001 is default
    }

best_param_MLP = tune_plot(MLPRegressor, param,
                          df.drop(['target'], axis=1), df['target'], normalize=True,
                          iter_gridsearch=2, nb_cv=3, test_size=0.3, seed=141421)

 *************************  <class 'sklearn.neural_network.multilayer_perceptron.MLPClassifier'>  ************************* 


HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

best parameters:  {'hidden_layer_sizes': (15,), 'activation': 'logistic', 'solver': 'adam', 'alpha': 0.001}
Mean squared error (on test set): 3424.76
Variance score (max_value=1 for perfect prediction): -0.05
