In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("C:/Users/noeam/Documents/git/github/proyecto_pt2/outputs/completo_seleccionadas.csv")
df = df.astype("category")
df

Unnamed: 0,obesidad,ejer_act,ejer_1,ejer_5
0,0,M,M,B
1,0,M,M,M
2,0,M,M,M
3,0,M,B,B
4,0,B,B,B
...,...,...,...,...
1793,0,B,B,B
1794,0,M,M,B
1795,0,M,M,B
1796,0,M,B,B


# Feature Engineering

In [3]:
df['ejer_act'] = df['ejer_act'].replace('B',1).replace('M',0)
df['ejer_1'] = df['ejer_1'].replace('B',1).replace('M',0)
df['ejer_5'] = df['ejer_5'].replace('B',1).replace('M',0)

In [4]:
label = df['obesidad']

In [5]:
df.drop(['obesidad'], axis=1, inplace=True)

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_val, label_train, label_val = train_test_split(df, label, test_size=0.20, random_state=123)

In [8]:
X_train.shape, label_train.shape, X_val.shape, label_val.shape

((1438, 3), (1438,), (360, 3), (360,))

In [9]:
from sklearn.preprocessing import StandardScaler
s = StandardScaler()

X_train_scaled = s.fit_transform(X_train)
X_val_scaled = s.transform(X_val)

# Modeling

In [10]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='liblinear').fit(X_train_scaled, label_train)

In [11]:
from sklearn.linear_model import LogisticRegressionCV

# L1 regularized logistic regression
lr_l1 = LogisticRegressionCV(Cs=15, cv=4, penalty='l1', solver='liblinear').fit(X_train_scaled, label_train)  #C=10

In [12]:
# L2 regularized logistic regression
lr_l2 = LogisticRegressionCV(Cs=15, cv=4, penalty='l2', solver='liblinear').fit(X_train_scaled, label_train)  #C=10

In [13]:
# Predict the class and the probability for each
y_pred = list()
y_prob = list()

coeff_labels = ['lr', 'l1', 'l2']
coeff_models = [lr, lr_l1, lr_l2]

for lab,mod in zip(coeff_labels, coeff_models):
    y_pred.append(pd.Series(mod.predict(X_val_scaled), name=lab))
    y_prob.append(pd.Series(mod.predict_proba(X_val_scaled).max(axis=1), name=lab))

y_pred = pd.concat(y_pred, axis=1)
y_prob = pd.concat(y_prob, axis=1)

y_pred.head()

Unnamed: 0,lr,l1,l2
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0


In [14]:
y_prob.head()

Unnamed: 0,lr,l1,l2
0,0.715919,0.5,0.506305
1,0.715919,0.5,0.506305
2,0.715919,0.5,0.506305
3,0.715919,0.5,0.506305
4,0.828147,0.5,0.512975


# Error metrics

In [15]:
from sklearn.metrics import classification_report
print('Classification report for Logistic regression without regularization:')
print(classification_report(label_val,y_pred['lr']))

print('Classification report for Logistic regression with L1(Lasso) regularization:')
print(classification_report(label_val,y_pred['l1']))

print('Classification report for Logistic regression with L2(Ridge) regularization:')
print(classification_report(label_val,y_pred['l2']))

Classification report for Logistic regression without regularization:
              precision    recall  f1-score   support

           0       0.77      1.00      0.87       278
           1       0.00      0.00      0.00        82

    accuracy                           0.77       360
   macro avg       0.39      0.50      0.44       360
weighted avg       0.60      0.77      0.67       360

Classification report for Logistic regression with L1(Lasso) regularization:
              precision    recall  f1-score   support

           0       0.77      1.00      0.87       278
           1       0.00      0.00      0.00        82

    accuracy                           0.77       360
   macro avg       0.39      0.50      0.44       360
weighted avg       0.60      0.77      0.67       360

Classification report for Logistic regression with L2(Ridge) regularization:
              precision    recall  f1-score   support

           0       0.77      1.00      0.87       278
           1 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.preprocessing import label_binarize

metrics = list()
cm = dict()

for lab in coeff_labels:

    # Precision, recall, f-score from the multi-class support function, we will average them because we will have one value per class
    precision, recall, fscore, _ = score(label_val, y_pred[lab], average='weighted')

    # The usual way to calculate accuracy
    accuracy = accuracy_score(label_val, y_pred[lab])

    # ROC-AUC scores can be calculated by binarizing the data
    auc = roc_auc_score(label_binarize(label_val, classes=[0,1]),
                        label_binarize(y_pred[lab], classes=[0,1]),
                        average='weighted')

    # Last, the confusion matrix
    cm[lab] = confusion_matrix(label_val, y_pred[lab])

    metrics.append(pd.Series({'precision':precision, 'recall':recall,
                              'fscore':fscore, 'accuracy':accuracy,
                              'auc':auc},
                             name=lab))

metrics = pd.concat(metrics, axis=1)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
metrics

Unnamed: 0,lr,l1,l2
precision,0.596327,0.596327,0.596327
recall,0.772222,0.772222,0.772222
fscore,0.672971,0.672971,0.672971
accuracy,0.772222,0.772222,0.772222
auc,0.5,0.5,0.5


## Displaying the confusion matrix for each model:

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
from matplotlib import pyplot as plt

f, (ax1, ax2, ax3) = plt.subplots(1,3,figsize=(18,5))

disp1 = ConfusionMatrixDisplay(confusion_matrix=cm['lr'], display_labels=lr.classes_)
disp1.plot(cmap='Blues', ax=ax1)
ax1.set_title('Logistic')

disp2 = ConfusionMatrixDisplay(confusion_matrix=cm['l1'], display_labels=lr.classes_)
disp2.plot(cmap='Blues', ax=ax2)
ax2.set_title('Logistic Lasso')

disp3 = ConfusionMatrixDisplay(confusion_matrix=cm['l2'], display_labels=lr.classes_)
disp3.plot(cmap='Blues', ax=ax3)
ax3.set_title('Logistic Ridge')

#f.suptitle('Confusion matrix of models')
f.tight_layout()

In [None]:
pd.DataFrame(label).value_counts()