# Proyecto: Student performance

Lo principal fue tomar los datos de los dos datasets suministrados por UCI (https://archive.ics.uci.edu/ml/datasets/Student+Performance) y juntarlos en un sólo dataset para tener toda la información en un mismo lugar para revisar desde un mismo dataset si se encuentran valores atípicos o nulos. Luego, se volverán a usar ambos datasets por separado y se harán los respectivos cambios si se encuentra alguna inconsistencia con las tablas.

In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np

In [2]:
data_mat = pd.read_csv('student-mat.csv', sep=";")
data_por = pd.read_csv('student-por.csv', sep=";")

data = pd.concat([data_mat, data_por])
print("Número de muestras: ", data.shape[0])
print("Número de características: ", data.shape[1])

Número de muestras:  1044
Número de características:  33


In [3]:
data.head(10)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
5,GP,M,16,U,LE3,T,4,3,services,other,...,5,4,2,1,2,5,10,15,15,15
6,GP,M,16,U,LE3,T,2,2,other,other,...,4,4,4,1,1,3,0,12,12,11
7,GP,F,17,U,GT3,A,4,4,other,teacher,...,4,1,4,1,1,1,6,6,5,6
8,GP,M,15,U,LE3,A,3,2,services,other,...,4,2,2,1,1,1,0,16,18,19
9,GP,M,15,U,GT3,T,3,4,other,other,...,5,5,1,1,1,5,0,14,15,15


## Limpieza de datos

Con la información configurada en un sólo dataset se procede a realizar la limpieza de datos observando los valores que se encuentran en cada columna con el fin de encontrar valores atípicos o nulos.

In [4]:
data.dtypes

school        object
sex           object
age            int64
address       object
famsize       object
Pstatus       object
Medu           int64
Fedu           int64
Mjob          object
Fjob          object
reason        object
guardian      object
traveltime     int64
studytime      int64
failures       int64
schoolsup     object
famsup        object
paid          object
activities    object
nursery       object
higher        object
internet      object
romantic      object
famrel         int64
freetime       int64
goout          int64
Dalc           int64
Walc           int64
health         int64
absences       int64
G1             int64
G2             int64
G3             int64
dtype: object

In [5]:
data['school'].value_counts()

GP    772
MS    272
Name: school, dtype: int64

In [6]:
data['sex'].value_counts()

F    591
M    453
Name: sex, dtype: int64

In [7]:
data['age'].value_counts()

16    281
17    277
18    222
15    194
19     56
20      9
21      3
22      2
Name: age, dtype: int64

In [8]:
data['address'].value_counts()

U    759
R    285
Name: address, dtype: int64

In [9]:
data['famsize'].value_counts()

GT3    738
LE3    306
Name: famsize, dtype: int64

In [10]:
data['Pstatus'].value_counts() 

T    923
A    121
Name: Pstatus, dtype: int64

In [11]:
data['Medu'].value_counts() 

4    306
2    289
3    238
1    202
0      9
Name: Medu, dtype: int64

In [12]:
data['Fedu'].value_counts() 

2    324
1    256
3    231
4    224
0      9
Name: Fedu, dtype: int64

In [13]:
data['Mjob'].value_counts() 

other       399
services    239
at_home     194
teacher     130
health       82
Name: Mjob, dtype: int64

In [14]:
data['Fjob'].value_counts() 

other       584
services    292
teacher      65
at_home      62
health       41
Name: Fjob, dtype: int64

In [15]:
data['reason'].value_counts() 

course        430
home          258
reputation    248
other         108
Name: reason, dtype: int64

In [16]:
data['guardian'].value_counts() 

mother    728
father    243
other      73
Name: guardian, dtype: int64

In [17]:
data['traveltime'].value_counts() 

1    623
2    320
3     77
4     24
Name: traveltime, dtype: int64

In [18]:
data['studytime'].value_counts() 

2    503
1    317
3    162
4     62
Name: studytime, dtype: int64

In [19]:
data['failures'].value_counts() 

0    861
1    120
2     33
3     30
Name: failures, dtype: int64

In [20]:
data['schoolsup'].value_counts() 

no     925
yes    119
Name: schoolsup, dtype: int64

In [21]:
data['famsup'].value_counts() 

yes    640
no     404
Name: famsup, dtype: int64

In [22]:
data['paid'].value_counts() 

no     824
yes    220
Name: paid, dtype: int64

In [23]:
data['activities'].value_counts() 

no     528
yes    516
Name: activities, dtype: int64

In [24]:
data['nursery'].value_counts() 

yes    835
no     209
Name: nursery, dtype: int64

In [25]:
data['higher'].value_counts() 

yes    955
no      89
Name: higher, dtype: int64

In [26]:
data['internet'].value_counts() 

yes    827
no     217
Name: internet, dtype: int64

In [27]:
data['romantic'].value_counts() 

no     673
yes    371
Name: romantic, dtype: int64

In [28]:
data['famrel'].value_counts() 

4    512
5    286
3    169
2     47
1     30
Name: famrel, dtype: int64

In [29]:
data['freetime'].value_counts() 

3    408
4    293
2    171
5    108
1     64
Name: freetime, dtype: int64

In [30]:
data['goout'].value_counts() 

3    335
2    248
4    227
5    163
1     71
Name: goout, dtype: int64

In [31]:
data['Dalc'].value_counts() 

1    727
2    196
3     69
5     26
4     26
Name: Dalc, dtype: int64

In [32]:
data['Walc'].value_counts() 

1    398
2    235
3    200
4    138
5     73
Name: Walc, dtype: int64

In [33]:
data['health'].value_counts() 

5    395
3    215
4    174
1    137
2    123
Name: health, dtype: int64

In [34]:
data['absences'].value_counts() 

0     359
2     175
4     146
6      80
8      64
10     38
12     24
14     20
5      17
16     17
1      15
3      15
7      10
9      10
18      8
11      8
15      5
22      5
20      4
13      4
21      3
24      2
26      2
30      2
75      1
56      1
19      1
23      1
25      1
28      1
32      1
38      1
40      1
54      1
17      1
Name: absences, dtype: int64

In [35]:
data['G1'].value_counts() 

10    146
11    130
12    117
13    105
14    101
9      96
8      83
7      70
15     59
16     44
6      33
17     24
18     15
5      12
19      4
4       3
3       1
0       1
Name: G1, dtype: int64

In [36]:
data['G2'].value_counts() 

11    138
10    129
12    127
9     122
13    117
14     77
15     72
8      72
16     38
7      37
18     26
17     25
6      21
0      20
5      18
19      4
4       1
Name: G2, dtype: int64

In [37]:
data['G3'].value_counts() 

10    153
11    151
13    113
12    103
14     90
15     82
8      67
9      63
0      53
16     52
17     35
18     27
7      19
6      18
5       8
19      7
4       1
1       1
20      1
Name: G3, dtype: int64

In [38]:
data.isnull().sum()

school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64

#### No se encontraron valores atípicos o nulos en toda la tabla, por lo tanto, se puede proceder a seguir utilizando ambos datasets por aparte.
Ya teniendo en cuenta esto, se procede a usar LabelEncoder en ambos datasets ya que hay datos de tipo object y los modelos funcionarán mejor si todos los datos son de tipo numérico.

In [39]:
data_mat = data_mat.apply(preprocessing.LabelEncoder().fit_transform)
data_mat.head(10)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,0,0,3,1,0,0,4,4,0,4,...,3,2,3,0,0,2,6,2,3,3
1,0,0,2,1,0,1,1,1,0,2,...,4,2,2,0,0,2,4,2,2,3
2,0,0,0,1,1,1,1,1,0,2,...,3,2,1,1,2,2,10,4,5,7
3,0,0,0,1,0,1,4,2,1,3,...,2,1,1,0,0,4,2,12,11,12
4,0,0,1,1,0,1,3,3,2,2,...,3,2,1,0,1,4,4,3,7,7
5,0,1,1,1,1,1,4,3,3,2,...,4,3,1,0,1,4,10,12,12,12
6,0,1,1,1,1,1,2,2,2,2,...,3,3,3,0,0,2,0,9,9,8
7,0,0,2,1,0,0,4,4,2,4,...,3,0,3,0,0,0,6,3,2,3
8,0,1,0,1,1,0,3,2,3,2,...,3,1,1,0,0,0,0,13,15,16
9,0,1,0,1,0,1,3,4,2,2,...,4,4,0,0,0,4,0,11,12,12


In [40]:
data_por = data_por.apply(preprocessing.LabelEncoder().fit_transform)
data_por.head(10)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,0,0,3,1,0,0,4,4,0,4,...,3,2,3,0,0,2,4,0,7,8
1,0,0,2,1,0,1,1,1,0,2,...,4,2,2,0,0,2,2,6,7,8
2,0,0,0,1,1,1,1,1,0,2,...,3,2,1,1,2,2,6,9,9,9
3,0,0,0,1,0,1,4,2,1,3,...,2,1,1,0,0,4,0,11,10,11
4,0,0,1,1,0,1,3,3,2,2,...,3,2,1,0,1,4,0,8,9,10
5,0,1,1,1,1,1,4,3,3,2,...,4,3,1,0,1,4,6,9,8,10
6,0,1,1,1,1,1,2,2,2,2,...,3,3,3,0,0,2,0,10,8,10
7,0,0,2,1,0,0,4,4,2,4,...,3,0,3,0,0,0,2,7,9,10
8,0,1,0,1,1,0,3,2,3,2,...,3,1,1,0,0,0,0,12,12,14
9,0,1,0,1,0,1,3,4,2,2,...,4,4,0,0,0,4,0,9,8,10


In [41]:
X_mat = data_mat.drop('G3', axis=1)
y_mat = data_mat["G3"]
print(X_mat.shape, y_mat.shape)

(395, 32) (395,)


In [42]:
X_por = data_por.drop('G3', axis=1)
y_por = data_por["G3"]
print(X_por.shape, y_por.shape)

(649, 32) (649,)


## Entrenar modelos de regresión

In [43]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [44]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.model_selection import KFold

In [45]:
skf = KFold(n_splits=10)

In [74]:
def make_table(data):
    df = pd.DataFrame()
    df = df.assign(mean_fit_time = data["mean_fit_time"])

    df = df.assign(params = data["params"])

    df = df.assign(mean_test_mean_squared_error= data["mean_test_mean_squared_error"])
    df = df.assign(rank_test_mean_squared_error = data["rank_test_mean_squared_error"])
    df = df.assign(mean_train_mean_squared_error = data["mean_train_mean_squared_error"])

    return df

In [112]:
from sklearn.linear_model import LogisticRegression

scoring = {'mean_squared_error': 'neg_mean_squared_error'} 
parameters = {'logisticregression__solver':('newton-cg', 'sag','saga', 'lbfgs'),
               'logisticregression__penalty':('l2','none')}

pp = make_pipeline(StandardScaler(), LogisticRegression(multi_class="multinomial", max_iter=150))

gs = GridSearchCV(pp, parameters, cv=skf, scoring=scoring, refit='mean_squared_error', return_train_score=True, n_jobs=-1, verbose=10)

gs.fit(X_mat, y_mat)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   30.7s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  1.3min finished


GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=False),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('logisticregression',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                                           fit_intercept=True,
                                                           intercept_scaling=1,
                                                           l1_ratio=None,
                                                           max

In [113]:
results_lg = gs
lg = pd.DataFrame(results_lg.cv_results_)
print("El conjunto de hiperparámetros que tuvo mejor resultado: ", results_lg.best_estimator_)
print("El index del mejor conjunto de hiperparámetros: ", results_lg.best_index_)
table_lg = make_table(lg).head(10)
table_lg

El conjunto de hiperparámetros que tuvo mejor resultado:  Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=150,
                                    multi_class='multinomial', n_jobs=None,
                                    penalty='none', random_state=None,
                                    solver='sag', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)
El index del mejor conjunto de hiperparámetros:  5


Unnamed: 0,mean_fit_time,params,mean_test_mean_squared_error,rank_test_mean_squared_error,mean_train_mean_squared_error
0,0.257496,"{'logisticregression__penalty': 'l2', 'logisti...",-5.797468,4,-1.172326
1,0.348799,"{'logisticregression__penalty': 'l2', 'logisti...",-5.797468,4,-1.172607
2,0.583903,"{'logisticregression__penalty': 'l2', 'logisti...",-5.939241,7,-1.219592
3,0.20101,"{'logisticregression__penalty': 'l2', 'logisti...",-5.797468,4,-1.172326
4,21.790826,"{'logisticregression__penalty': 'none', 'logis...",-8.827848,8,-0.019418
5,0.770993,"{'logisticregression__penalty': 'none', 'logis...",-5.159494,1,-0.453433
6,0.995498,"{'logisticregression__penalty': 'none', 'logis...",-5.739241,3,-0.626149
7,0.449894,"{'logisticregression__penalty': 'none', 'logis...",-5.643038,2,-0.040795


In [83]:
from sklearn.ensemble import RandomForestRegressor

scoring = {'mean_squared_error': 'neg_mean_squared_error'} 
parameters = {'randomforestregressor__n_estimators': [10, 50, 150, 200],
              'randomforestregressor__criterion':('mse','mae'),
              'randomforestregressor__max_features':('auto', 'sqrt', 'log2')}

pp = make_pipeline(StandardScaler(), RandomForestRegressor())

gs = GridSearchCV(pp, parameters, cv=skf, scoring=scoring, refit='mean_squared_error', return_train_score=True, n_jobs=-1, verbose=10)

gs.fit(X_mat, y_mat)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   43.7s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=False),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              min_im...
                                ve

In [98]:
results_rf = gs
rf = pd.DataFrame(results_rf.cv_results_)
print("El conjunto de hiperparámetros que tuvo mejor resultado: ", results_rf.best_estimator_)
print("El index del mejor conjunto de hiperparámetros: ", results_rf.best_index_)
table_rf = make_table(rf).head(10)
table_rf

El conjunto de hiperparámetros que tuvo mejor resultado:  Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('randomforestregressor',
                 RandomForestRegressor(bootstrap=True, criterion='mse',
                                       max_depth=None, max_features='auto',
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=200, n_jobs=None,
                                       oob_score=False, random_state=None,
                                       verbose=0, warm_start=False))],
         verbose=False)
El index del mejor conjunto de hiperpará

Unnamed: 0,mean_fit_time,params,mean_test_mean_squared_error,rank_test_mean_squared_error,mean_train_mean_squared_error
0,0.069991,"{'randomforestregressor__criterion': 'mse', 'r...",-1.803671,5,-0.307172
1,0.219483,"{'randomforestregressor__criterion': 'mse', 'r...",-1.672644,3,-0.210119
2,0.584834,"{'randomforestregressor__criterion': 'mse', 'r...",-1.622687,2,-0.206668
3,0.993656,"{'randomforestregressor__criterion': 'mse', 'r...",-1.603938,1,-0.200926
4,0.047498,"{'randomforestregressor__criterion': 'mse', 'r...",-3.696987,22,-0.658872
5,0.187899,"{'randomforestregressor__criterion': 'mse', 'r...",-3.123149,19,-0.418053
6,0.450005,"{'randomforestregressor__criterion': 'mse', 'r...",-3.003816,9,-0.385295
7,0.662196,"{'randomforestregressor__criterion': 'mse', 'r...",-3.016599,11,-0.385076
8,0.079592,"{'randomforestregressor__criterion': 'mse', 'r...",-3.788152,23,-0.658967
9,0.242499,"{'randomforestregressor__criterion': 'mse', 'r...",-3.072002,16,-0.424801


In [92]:
from sklearn.neural_network  import MLPRegressor

scoring = {'mean_squared_error': 'neg_mean_squared_error'} 
parameters = {'mlpregressor__hidden_layer_sizes': [30, 40, 45],
              'mlpregressor__activation': ('identity', 'logistic', 'tanh', 'relu'),
              'mlpregressor__solver':('lbfgs', 'sgd', 'adam'),
              'mlpregressor__learning_rate':('constant', 'invscaling')}

pp = make_pipeline(StandardScaler(), MLPRegressor())

gs = GridSearchCV(pp, parameters, cv=skf, scoring=scoring, refit='mean_squared_error', return_train_score=True, n_jobs=-1, verbose=10)

gs.fit(X_mat, y_mat)

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   30.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   33.9s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=False),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('mlpregressor',
                                        MLPRegressor(activation='relu',
                                                     alpha=0.0001,
                                                     batch_size='auto',
                                                     beta_1=0.9, beta_2=0.999,
                                                     early_stopping=False,
                                                     epsilon=1e-08,
                                                     hi...
             param_grid=

In [97]:
results_mlp = gs
mlp = pd.DataFrame(results_mlp.cv_results_)
print("El conjunto de hiperparámetros que tuvo mejor resultado: ", results_mlp.best_estimator_)
print("El index del mejor conjunto de hiperparámetros: ", results_mlp.best_index_)
table_mlp = make_table(mlp).head(10)
table_mlp

El conjunto de hiperparámetros que tuvo mejor resultado:  Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('mlpregressor',
                 MLPRegressor(activation='identity', alpha=0.0001,
                              batch_size='auto', beta_1=0.9, beta_2=0.999,
                              early_stopping=False, epsilon=1e-08,
                              hidden_layer_sizes=40, learning_rate='constant',
                              learning_rate_init=0.001, max_iter=200,
                              momentum=0.9, n_iter_no_change=10,
                              nesterovs_momentum=True, power_t=0.5,
                              random_state=None, shuffle=True, solver='lbfgs',
                              tol=0.0001, validation_fraction=0.1,
                              verbose=False, warm_start=False))],
         verbose=False)
El index del mejor conjunto de hiperparámetros:  

Unnamed: 0,mean_fit_time,params,mean_test_mean_squared_error,rank_test_mean_squared_error,mean_train_mean_squared_error
0,0.080768,"{'mlpregressor__activation': 'identity', 'mlpr...",-2.002375,2,-1.602666
1,0.709054,"{'mlpregressor__activation': 'identity', 'mlpr...",-2.006431,8,-1.604793
2,0.769835,"{'mlpregressor__activation': 'identity', 'mlpr...",-4.313965,38,-4.001295
3,0.080699,"{'mlpregressor__activation': 'identity', 'mlpr...",-2.00242,5,-1.602666
4,0.637888,"{'mlpregressor__activation': 'identity', 'mlpr...",-49.390572,68,-50.639832
5,0.748348,"{'mlpregressor__activation': 'identity', 'mlpr...",-3.7644,28,-3.491021
6,0.099507,"{'mlpregressor__activation': 'identity', 'mlpr...",-2.002331,1,-1.602666
7,0.703575,"{'mlpregressor__activation': 'identity', 'mlpr...",-2.006017,7,-1.604479
8,1.037747,"{'mlpregressor__activation': 'identity', 'mlpr...",-2.701039,19,-2.332393
9,0.129904,"{'mlpregressor__activation': 'identity', 'mlpr...",-2.002411,4,-1.602666


In [119]:
final_results = pd.DataFrame(columns = ["mean_fit_time", "params", "mean_test_mean_squared_error", "rank_test_mean_squared_error",
                                            "mean_train_mean_squared_error"])
final_results.loc[0] = table_rf.iloc[results_rf.best_index_]
final_results.loc[1] = table_mlp.iloc[results_mlp.best_index_]
final_results.loc[2] = table_lg.iloc[results_lg.best_index_]
final_results

Unnamed: 0,mean_fit_time,params,mean_test_mean_squared_error,rank_test_mean_squared_error,mean_train_mean_squared_error
0,0.993656,"{'randomforestregressor__criterion': 'mse', 'r...",-1.603938,1,-0.200926
1,0.099507,"{'mlpregressor__activation': 'identity', 'mlpr...",-2.002331,1,-1.602666
2,0.770993,"{'logisticregression__penalty': 'none', 'logis...",-5.159494,1,-0.453433


## Curvas de aprendizaje

In [131]:
train_sizes, train_scores, valid_scores = learning_curve(MLPRegressor(activation='identity', alpha=0.0001,
                              batch_size='auto', beta_1=0.9, beta_2=0.999,
                              early_stopping=False, epsilon=1e-08,
                              hidden_layer_sizes=40, learning_rate='constant',
                              learning_rate_init=0.001, max_iter=200,
                              momentum=0.9, n_iter_no_change=10,
                              nesterovs_momentum=True, power_t=0.5,
                              random_state=None, shuffle=True, solver='lbfgs',
                              tol=0.0001, validation_fraction=0.1,
                              verbose=False, warm_start=False), X_mat, y_mat, train_sizes=[50, 80, 110], cv=5)

In [133]:
print(train_sizes)
print(train_scores)
print(valid_scores)

[ 50  80 110]
[[0.98841595 0.98171718 0.98171498 0.98171416 0.98171502]
 [0.97018061 0.95077869 0.95269375 0.95269463 0.95266937]
 [0.94826794 0.93941709 0.95202523 0.95200455 0.95200238]]
[[0.70812659 0.70699751 0.74600686 0.5417128  0.54145114]
 [0.82924293 0.89220147 0.86614803 0.72589981 0.65017751]
 [0.85652941 0.88478524 0.85231019 0.75596841 0.75851996]]


In [134]:
train_sizes, train_scores, valid_scores = learning_curve(RandomForestRegressor(bootstrap=True, criterion='mse',
                                       max_depth=None, max_features='auto',
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=200, n_jobs=None,
                                       oob_score=False, random_state=None,
                                       verbose=0, warm_start=False), X_mat, y_mat, train_sizes=[50, 80, 110], cv=5)

In [135]:
print(train_sizes)
print(train_scores)
print(valid_scores)

[ 50  80 110]
[[0.98868318 0.97752764 0.97898698 0.98118566 0.98015503]
 [0.99157212 0.98712756 0.98682184 0.98775586 0.986303  ]
 [0.98891296 0.98626827 0.98974896 0.98987057 0.9902073 ]]
[[0.88992503 0.82228005 0.83722499 0.7423948  0.71265937]
 [0.87795891 0.8704512  0.84042227 0.75108436 0.70388978]
 [0.89629269 0.91864468 0.84320694 0.76621178 0.70734057]]


In [137]:
train_sizes, train_scores, valid_scores = learning_curve(LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=150,
                                    multi_class='multinomial', n_jobs=None,
                                    penalty='none', random_state=None,
                                    solver='sag', tol=0.0001, verbose=0,
                                    warm_start=False), X_mat, y_mat, train_sizes=[50, 80, 110], cv=5)

  


In [138]:
print(train_sizes)
print(train_scores)
print(valid_scores)

[ 50  80 110]
[[1.         1.         1.         1.         1.        ]
 [0.9875     0.9875     0.9625     0.9625     0.9625    ]
 [0.92727273 0.97272727 0.94545455 0.94545455 0.94545455]]
[[0.15730337 0.19512195 0.18181818 0.14864865 0.10958904]
 [0.23595506 0.19512195 0.18181818 0.16216216 0.23287671]
 [0.25842697 0.20731707 0.11688312 0.21621622 0.21917808]]
