In [1]:
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import KFold
import pandas as pd

%load_ext autoreload
%autoreload 2
import pre_ml
import mlpipeline

In [2]:
# Load the dataset through the project's pre_ml helper.
covid = pre_ml.data()

# Report (rows, columns), then preview the first five records
# (DataFrame.head() defaults to n=5).
n_rows, n_cols = covid.shape
print((n_rows, n_cols))
covid.head()

Requesting data to datos abiertos Mexico
Getting zip raw data into directory, will delete soon
Raw data deleted. If you specified filename, clean data will be saved in data directory
(87512, 21)


Unnamed: 0,entidad_res,municipio_res,CVE_MUN,embarazo,edad,diabetes,epoc,asma,inmusupr,hipertension,...,obesidad,tabaquismo,pobreza,Densidad_pob,medicos,camas_hosp,enfermeras,hospitalizado,muertos,covid_grave
0,15,57,15057,0.0,40,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,32.9,5792.941701,0.648218,4.691344,13.579627,1,1,1
1,2,4,2004,0.0,46,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,31.1,1449.318075,0.726447,4.889549,11.70139,1,0,1
2,8,19,8019,0.0,33,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,25.8,113.262819,1.653685,10.343429,29.239674,0,0,0
3,15,9,15009,0.0,37,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,43.7,288.09549,3.483171,10.999487,44.364596,0,0,0
4,1,1,1001,0.0,53,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,30.2,816.521806,1.871147,7.60933,24.376882,0,0,0


In [3]:
# Class balance of each candidate target: (printed heading, column name).
for heading, column in (('hospitalizado', 'hospitalizado'),
                        ('muerto', 'muertos'),
                        ('covid grave', 'covid_grave')):
    print(heading)
    print(covid[column].value_counts())

hospitalizado
0    56596
1    30916
Name: hospitalizado, dtype: int64
muerto
0    77733
1     9779
Name: muertos, dtype: int64
covid grave
0    55597
1    31915
Name: covid_grave, dtype: int64


# Predicting Deaths 
(using SMOTE)

The correct application of oversampling during k-fold cross-validation is to apply the method to the training dataset only, then evaluate the model on the stratified but non-transformed test set.

The ideal way to conduct this experiment is to oversample the training set inside each cross-validation iteration, not before the process begins. This prevents data leakage from the validation set into the training set during cross-validation, and reflects how a model trained on a balanced training set would perform when applied to an imbalanced, unseen test set.

* In each iteration, exclude some data for validation. The excluded data should not be used for feature selection, oversampling, or model building.
* Oversample the minority class in the training set only, i.e. without the data already excluded for validation.
* Repeat K times, where K is the number of folds.

This can be achieved by defining a Pipeline that first transforms the training dataset with SMOTE then fits the model.
This pipeline can then be evaluated using repeated k-fold cross-validation.

In [4]:
# Continuous covariates to normalize. Declared once so the normalization call
# and the sanity check below always refer to the same columns.
continuous_cols = ('edad', 'pobreza', 'Densidad_pob', 'medicos', 'camas_hosp', 'enfermeras')

# Split into training and testing partitions. The arguments 0.2 and 1 are
# forwarded as-is to mlpipeline.split_data (presumably test fraction and
# random seed -- confirm against mlpipeline).
train, test = mlpipeline.split_data(covid, 0.2, 1)
print(train.shape)

# Normalize the continuous variables of both partitions.
train, test = mlpipeline.normalize(train, test, list(continuous_cols))

# Sanity check: show mean and std of the normalized training columns.
train[list(continuous_cols)].describe().loc[['mean', 'std'], :]

(70009, 21)


Unnamed: 0,edad,pobreza,Densidad_pob,medicos,camas_hosp,enfermeras
mean,-1.936488e-16,-4.235813e-16,-7.287201000000001e-17,-1.333619e-16,-1.063647e-16,2.9635970000000004e-17
std,1.000007,1.000007,1.000007,1.000007,1.000007,1.000007


In [None]:
# Predictors and target for the death model.
features = [
    'edad', 'diabetes', 'epoc', 'asma', 'inmusupr', 'hipertension', 'cardiovascular',
    'obesidad', 'tabaquismo', 'pobreza', 'medicos', 'camas_hosp', 'enfermeras',
]
target = ['muertos']

MODELS = {'LogisticRegression': LogisticRegression()}

# One parameter dict per regularization strength C; the penalty is fixed to L2.
GRID = {
    'LogisticRegression': [
        {'penalty': 'l2', 'C': strength, 'random_state': 0, 'solver': 'lbfgs'}
        for strength in (0.01, 0.1, 1, 10, 100)
    ]
}

results, best_model = mlpipeline.grid_search(train, features, target, MODELS, GRID)

Training model: LogisticRegression | {'penalty': 'l2', 'C': 0.01, 'random_state': 0, 'solver': 'lbfgs'}
Training model: LogisticRegression | {'penalty': 'l2', 'C': 0.1, 'random_state': 0, 'solver': 'lbfgs'}
Training model: LogisticRegression | {'penalty': 'l2', 'C': 1, 'random_state': 0, 'solver': 'lbfgs'}
Training model: LogisticRegression | {'penalty': 'l2', 'C': 10, 'random_state': 0, 'solver': 'lbfgs'}


In [7]:
# Print the selected estimator and its fitted coefficients on separate lines,
# then display the per-parameter results as the cell output.
print(best_model, best_model.coef_, sep='\n')
results

LogisticRegression(C=100, random_state=0)
[[ 1.04752972  0.65278619  0.0098159  -0.46089242  0.43589441  0.26404304
  -0.06407189  0.41484987  0.00944827  0.07669974 -0.00806718 -0.00991199
  -0.07321002]]


Unnamed: 0,params,precision,accuracy,recall
0,"{'penalty': 'l2', 'C': 0.01, 'random_state': 0...",0.727006,0.720122,0.247696
1,"{'penalty': 'l2', 'C': 0.1, 'random_state': 0,...",0.726387,0.719908,0.247418
2,"{'penalty': 'l2', 'C': 1, 'random_state': 0, '...",0.726137,0.719879,0.247352
3,"{'penalty': 'l2', 'C': 10, 'random_state': 0, ...",0.726137,0.719865,0.247342
4,"{'penalty': 'l2', 'C': 100, 'random_state': 0,...",0.726137,0.719865,0.247342


# Predicting Hospitalization

In [7]:
# Predictors and target for the hospitalization model.
features = ['edad', 'diabetes', 'epoc', 'asma', 'inmusupr', 'hipertension', 'cardiovascular',
            'obesidad', 'tabaquismo', 'pobreza', 'medicos', 'camas_hosp', 'enfermeras']
target = ['hospitalizado']

# SMOTE is a pipeline step, so oversampling is re-fit on the training portion
# of every CV split and the held-out fold is scored without resampling.
pipeline = make_pipeline(SMOTE(random_state=0), LogisticRegression(random_state=0))

# 10-fold stratified CV repeated 3 times (30 fits per parameter combination).
kf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Hyperparameter grid. Keys use the step name make_pipeline derives from the
# estimator class name ("logisticregression").
params = {'logisticregression__penalty': ['l2'],
          'logisticregression__C': [1, 3, 5],
          'logisticregression__solver': ['lbfgs']}

# `iid=True` is no longer passed: the parameter was deprecated in scikit-learn
# 0.22 and removed in 0.24, where it raises a TypeError. Mean scores are now a
# plain average over folds; with near-equal stratified folds the difference
# from the old size-weighted average is negligible.
grid = GridSearchCV(estimator=pipeline,
                    param_grid=params,
                    cv=kf,
                    return_train_score=True,
                    scoring=['accuracy', 'precision', 'recall'],
                    refit='accuracy')

# Fit on plain arrays; ravel() flattens the single-column target to 1-D.
grid.fit(train[features].values, train[target].values.ravel())

# Tabulate the mean held-out metrics per parameter combination.
log_results = pd.DataFrame(grid.cv_results_)
log_results[['params', 'mean_test_accuracy', 'mean_test_precision', 'mean_test_recall']]



Unnamed: 0,params,mean_test_accuracy,mean_test_precision,mean_test_recall
0,"{'logisticregression__C': 1, 'logisticregressi...",0.704162,0.568851,0.673631
1,"{'logisticregression__C': 3, 'logisticregressi...",0.704157,0.568846,0.673618
2,"{'logisticregression__C': 5, 'logisticregressi...",0.704157,0.568846,0.673618


# Predicting Hospitalization/Death

In [10]:
# Predictors and target for the combined hospitalization/death model.
features = ['edad', 'diabetes', 'epoc', 'asma', 'inmusupr', 'hipertension', 'cardiovascular',
            'obesidad', 'tabaquismo', 'pobreza', 'medicos', 'camas_hosp', 'enfermeras']
target = ['covid_grave']

# SMOTE is a pipeline step, so oversampling is re-fit on the training portion
# of every CV split and the held-out fold is scored without resampling.
pipeline = make_pipeline(SMOTE(random_state=0), LogisticRegression(random_state=0))

# 10-fold stratified CV repeated 3 times (30 fits per parameter combination).
kf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Hyperparameter grid. Keys use the step name make_pipeline derives from the
# estimator class name ("logisticregression").
params = {'logisticregression__penalty': ['l2'],
          'logisticregression__C': [1, 3, 5],
          'logisticregression__solver': ['lbfgs']}

# `iid=True` is no longer passed: the parameter was deprecated in scikit-learn
# 0.22 and removed in 0.24, where it raises a TypeError. Mean scores are now a
# plain average over folds; with near-equal stratified folds the difference
# from the old size-weighted average is negligible.
grid = GridSearchCV(estimator=pipeline,
                    param_grid=params,
                    cv=kf,
                    return_train_score=True,
                    scoring=['accuracy', 'precision', 'recall'],
                    refit='accuracy')

# Fit on plain arrays; ravel() flattens the single-column target to 1-D.
grid.fit(train[features].values, train[target].values.ravel())

# Tabulate the mean held-out metrics per parameter combination.
log_results = pd.DataFrame(grid.cv_results_)
log_results[['params', 'mean_test_accuracy', 'mean_test_precision', 'mean_test_recall']]



Unnamed: 0,params,mean_test_accuracy,mean_test_precision,mean_test_recall
0,"{'logisticregression__C': 1, 'logisticregressi...",0.710237,0.589686,0.678989
1,"{'logisticregression__C': 3, 'logisticregressi...",0.710223,0.58967,0.678963
2,"{'logisticregression__C': 5, 'logisticregressi...",0.710228,0.589676,0.678963
