In [2]:
import pdb
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import inspect
import tensorflow as tf
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import regularizers, callbacks

2024-03-20 12:08:29.809415: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-20 12:08:29.809471: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-20 12:08:29.810659: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-20 12:08:29.820548: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:


def get_input_output(df, input_cols=None, output_cols=None, remove_cols=None):
    """Split a DataFrame into a feature matrix and a one-dimensional target.

    Args:
        df: DataFrame holding both the feature and the target columns.
        input_cols: Feature column names. When None, every column of ``df``
            that is in neither ``remove_cols`` nor ``output_cols`` is used.
        output_cols: Target column names. Defaults to ``['smoking']``.
        remove_cols: Columns left out of the inferred features.
            Defaults to ``['id']``.

    Returns:
        dict with:
            'X': ``df[input_cols].values``.
            'y': the target values reshaped to a single axis of length
                 ``len(df)`` (so this only works with one output column).
            'columns': the feature column names that were used.
    """
    remove_cols = ['id'] if remove_cols is None else remove_cols
    output_cols = ['smoking'] if output_cols is None else output_cols
    if input_cols is None:
        input_cols = [
            name for name in df.columns
            if not (name in remove_cols or name in output_cols)
        ]

    features = df[input_cols].values
    targets = df[output_cols].values
    n_rows = targets.shape[0]
    return {
        'X': features,
        'y': targets.reshape((n_rows,)),
        'columns': input_cols,
    }

def scaler_preprocess(X):
    """Standardize the feature matrix with a StandardScaler fitted on X itself.

    Args:
        X: Feature matrix accepted by ``StandardScaler``.

    Returns:
        dict with a single key 'X' holding the transformed matrix.
    """
    standardized = StandardScaler().fit_transform(X)
    return {'X': standardized}

def PCA_preprocess(X):
    """Append the PCA projection of X to X as extra feature columns.

    Args:
        X: Two-dimensional feature matrix.

    Returns:
        dict with a single key 'X': the original columns followed by the
        columns produced by ``PCA().fit_transform(X)``.
    """
    components = PCA().fit_transform(X)
    return {'X': np.concatenate((X, components), axis=1)}

def preprocess_eyesight(df):
    """Replace the two eyesight columns with boolean "which side is larger" flags.

    The frame is modified in place: 'best_eyesight_is_left' and
    'best_eyesight_is_right' are added, 'eyesight(left)' and
    'eyesight(right)' are dropped, and every remaining missing value in the
    frame (any column) is filled with True.

    Args:
        df: DataFrame containing 'eyesight(left)' and 'eyesight(right)'.

    Returns:
        dict with key 'df' holding the same (mutated) DataFrame object.
    """
    left_col, right_col = 'eyesight(left)', 'eyesight(right)'
    left, right = df[left_col], df[right_col]

    # Both flags stay False when the two values are equal.
    df.loc[:, 'best_eyesight_is_left'] = left > right
    df.loc[:, 'best_eyesight_is_right'] = left < right

    df.drop([left_col, right_col], axis=1, inplace=True)
    # NOTE(review): this fills NaNs in *all* columns, not only the new flags
    # -- confirm that is intended.
    df.fillna(True, inplace=True)

    return {'df': df}

def preprocess_hearing(df):
    """Add left/right hearing comparison flags and shift both hearing columns by -1.

    The frame is modified in place. 'best_hearing_is_left' is True where
    'hearing(left)' is greater than 'hearing(right)', and
    'best_hearing_is_right' where it is smaller. Afterwards 1 is subtracted
    from both hearing columns (presumably to turn 1/2 codes into 0/1 --
    confirm against the dataset description).

    Args:
        df: DataFrame containing 'hearing(left)' and 'hearing(right)'.

    Returns:
        dict with key 'df' holding the same (mutated) DataFrame object.
    """
    left_col, right_col = 'hearing(left)', 'hearing(right)'

    # The flags are computed before the columns are shifted.
    df.loc[:, 'best_hearing_is_left'] = df[left_col] > df[right_col]
    df.loc[:, 'best_hearing_is_right'] = df[left_col] < df[right_col]

    for side in (left_col, right_col):
        df.loc[:, side] = df.loc[:, side] - 1

    return {'df': df}

def preprocess_thermos(df, thermos_columns=None):
    """Replace numeric columns by boolean "value >= threshold" columns.

    For every column in ``thermos_columns`` the thresholds come from
    ``generate_thermos``; one boolean column named ``'<col>_<key>'`` is added
    per ``(key, threshold)`` pair, and the original column is dropped at the
    end. The frame is modified in place.

    Args:
        df: DataFrame containing the columns to encode.
        thermos_columns: Column names to encode. Defaults to the list of
            numeric measurement columns below.

    Returns:
        dict with key 'df' holding the same (mutated) DataFrame object.
    """
    if thermos_columns is None:
        thermos_columns = [
            'weight(kg)', 'waist(cm)', 'systolic', 'age', 'height(cm)',
            'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride',
            'HDL', 'LDL', 'hemoglobin', 'serum creatinine', 'AST', 'ALT', 'Gtp',
        ]

    for col in thermos_columns:
        thresholds = generate_thermos(df[col].values.flatten())
        for key, value in thresholds:
            # True where the value reaches the threshold, False elsewhere.
            df.loc[:, f'{col}_{key}'] = df[col] >= value

    df.drop(thermos_columns, axis=1, inplace=True)
    return {'df': df}

def preprocess_one_hot(df, one_hot_columns=None):
    """Replace categorical columns by one boolean column per distinct value.

    For every column in ``one_hot_columns`` the distinct values are indexed
    by ``generate_one_hot``; a column named ``'<col>_<key>'`` is added for
    each ``(key, value)`` pair, True where the original column equals
    ``value``. The original columns are dropped at the end. The frame is
    modified in place.

    Args:
        df: DataFrame containing the columns to encode.
        one_hot_columns: Column names to encode. Defaults to
            ``['Urine protein']``.

    Returns:
        dict with key 'df' holding the same (mutated) DataFrame object.
    """
    if one_hot_columns is None:
        one_hot_columns = ['Urine protein']

    for col in one_hot_columns:
        categories = generate_one_hot(df[col].unique())
        for key, value in categories:
            df.loc[:, f'{col}_{key}'] = df[col] == value

    df.drop(one_hot_columns, axis=1, inplace=True)
    return {'df': df}
def generate_thermos(values):
    """Build the ordered thresholds for a thermometer encoding.

    The thresholds are the distinct 0th, 10th, ..., 90th percentiles of
    ``values``. They are sorted *before* being numbered, so key ``i`` always
    labels the i-th smallest threshold. (Numbering the raw set first made the
    keys follow arbitrary set iteration order.)

    Args:
        values: One-dimensional array-like of numbers.

    Returns:
        List of ``(key, threshold)`` tuples in ascending threshold order,
        with keys ``0..k-1``. Empty input yields an empty list.
    """
    if np.size(values) == 0:
        # A percentile of nothing is undefined; produce no thresholds.
        return []

    deciles = np.percentile(values, list(range(0, 100, 10)))
    # set() removes repeated percentiles (common for low-cardinality data).
    thresholds = sorted(set(deciles))
    return list(enumerate(thresholds))

def generate_one_hot(values):
    """Assign a stable integer key to every distinct value.

    The distinct values are sorted before being numbered so that the same
    data always gives the same key-to-value mapping. (Numbering a raw set
    follows hash order, which for strings can change between interpreter
    sessions.) If the values cannot be ordered against each other, the
    order of first appearance is used instead.

    Args:
        values: Iterable of hashable category values.

    Returns:
        List of ``(key, value)`` tuples with keys ``0..k-1``.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    distinct = list(dict.fromkeys(values))
    try:
        ordered = sorted(distinct)
    except TypeError:
        # Mixed, mutually unorderable types: fall back to first-seen order.
        ordered = distinct
    return list(enumerate(ordered))

def preprocess(df, preprocessing_functions):
    """Run a chain of preprocessing steps and return the model inputs.

    Shared state is kept in a dict that starts as ``{'df': <copy of df>}``.
    Each step is called with the state entries whose keys match the step's
    positional parameter names, and the dict it returns is merged back into
    the state.

    The pipeline works on a copy of ``df``: several steps drop or overwrite
    columns in place, so running them on the caller's frame would make a
    second run fail on the already-removed columns.

    Args:
        df: Source DataFrame; left unmodified.
        preprocessing_functions: Callables applied in order. Each must return
            a dict; together they must produce the keys 'X', 'y' and
            'columns'.

    Returns:
        Tuple ``(X, y, columns)`` taken from the final state.

    Raises:
        KeyError: If 'X', 'y' or 'columns' was not produced by any step.
    """
    input_data = {'df': df.copy()}

    for step in preprocessing_functions:
        wanted = inspect.getfullargspec(step).args
        # Pass only the pieces of state this step asks for by name.
        kwargs = {name: input_data[name] for name in wanted if name in input_data}
        input_data.update(step(**kwargs))

    missing = [key for key in ('X', 'y', 'columns') if key not in input_data]
    if missing:
        raise KeyError(f"preprocessing steps did not produce: {missing}")

    return input_data['X'], input_data['y'], input_data['columns']

def get_MLPClassifier_for_grid_search():
    """Return the scikit-learn MLP candidate for the grid search.

    Returns:
        Tuple ``(name, estimator, parameter_grid)``: the label 'MLPC', a
        default ``MLPClassifier`` and a grid over ``alpha`` and
        ``hidden_layer_sizes`` (300 and 100 units followed by a last hidden
        layer of 10 to 50 units).
    """
    parameters = {
        'alpha': [0.1, 0.15, 0.25],
        'hidden_layer_sizes': [(300, 100, last) for last in (10, 20, 30, 40, 50)],
    }
    return 'MLPC', MLPClassifier(), parameters

def get_keras_MLPClassifier_for_grid_search():
    """Return a SciKeras MLP candidate for the grid search.

    The classifier used to be built and then discarded (the function
    returned None). It now returns the same ``(name, estimator,
    parameter_grid)`` triple as the other ``get_*_for_grid_search``
    factories.

    Returns:
        Tuple ``(name, estimator, parameter_grid)``. The grid holds only the
        values the classifier is configured with; add more values to the
        lists to search over them.
    """
    # NOTE(review): `get_clf_model` is not defined in the visible part of this
    # notebook. It must exist (and accept `hidden_layer_sizes`) before this
    # factory is called -- confirm.
    clf = KerasClassifier(
        model=get_clf_model,
        hidden_layer_sizes=(100,),
        optimizer="adam",
        optimizer__learning_rate=0.001,
        epochs=50,
        verbose=0,
    )
    parameters = {
        'hidden_layer_sizes': [(100,)],
        'optimizer__learning_rate': [0.001],
    }
    return 'KerasMLP', clf, parameters
def print_grid_search_results(results):
    """Print the winning parameter set, then every evaluated parameter set.

    Args:
        results: Mapping shaped like ``GridSearchCV.cv_results_`` for a scorer
            named "AUC": it must provide 'params', 'mean_test_AUC',
            'std_test_AUC' and 'rank_test_AUC'.
    """
    def describe(params):
        # "key: value" pairs separated by single spaces.
        return " ".join(f"{key}: {value}" for key, value in params.items())

    rows = list(zip(results['params'], results['mean_test_AUC'], results['std_test_AUC']))

    # The winner is the entry with the smallest rank.
    best_params, best_mean, best_std = rows[np.argmin(results['rank_test_AUC'])]
    print(f'Winner is [{describe(best_params)}] {best_mean:.4f} mean. {best_std:.2f} std')

    for params, mean, std in rows:
        print(f"[{describe(params)}] {mean:.4f} mean. {std:.2f} std")

def estandar(cols):
    """Return a builder for the "standard" two-hidden-layer binary classifier.

    The network has ``cols`` inputs, Dense layers of 300 and 30 ReLU units
    (both with L1L2 kernel regularization) and a single sigmoid output. It is
    compiled with the adam optimizer, binary cross-entropy loss and an
    accuracy metric.

    A new model is created on every call of the returned builder. Returning
    one pre-built instance would make every fit (each cross-validation fold
    and each grid candidate) continue training the same weights.

    Args:
        cols: Number of input features.

    Returns:
        Zero-argument callable that returns a freshly compiled model.
    """
    def build_model():
        model = Sequential()
        model.add(Input((cols,)))
        model.add(Dense(300, activation='relu', kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4)))
        model.add(Dense(30, activation='relu', kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4)))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model

    return build_model

def deep(cols):
    """Return a builder for the "deep" four-hidden-layer binary classifier.

    The network has ``cols`` inputs, four Dense layers of 30 ReLU units (each
    with L1L2 kernel regularization) and a single sigmoid output. It is
    compiled with the adam optimizer, binary cross-entropy loss and an
    accuracy metric.

    A new model is created on every call of the returned builder. Returning
    one pre-built instance would make every fit (each cross-validation fold
    and each grid candidate) continue training the same weights.

    Args:
        cols: Number of input features.

    Returns:
        Zero-argument callable that returns a freshly compiled model.
    """
    def build_model():
        model = Sequential()
        model.add(Input((cols,)))
        # Four identical hidden layers.
        for _ in range(4):
            model.add(Dense(30, activation='relu', kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4)))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model

    return build_model

def get_KerasClassifier_for_grid_search(cols, validation_split=0.1):
    """Return the Keras candidates (``deep`` and ``estandar``) for the grid search.

    Early stopping needs a metric that actually exists during training. The
    default ``EarlyStopping()`` watches ``val_loss``, which is only computed
    when validation data is supplied, so a validation split is now passed to
    the classifier. With ``validation_split`` set to 0 the callback watches
    the training loss instead.

    Args:
        cols: Number of input features, forwarded to the model builders.
        validation_split: Fraction of each training set held out for
            validation-based early stopping. Use 0 to train on everything.

    Returns:
        Tuple ``(name, estimator, parameter_grid)``: the label "Keras", a
        ``KerasClassifier`` and a grid over the two model builders with
        5 epochs and a batch size of 100.
    """
    monitor = 'val_loss' if validation_split else 'loss'
    es = callbacks.EarlyStopping(monitor=monitor)
    clf = KerasClassifier(verbose=3, callbacks=[es], validation_split=validation_split)
    parameters = {'model': [deep(cols), estandar(cols)], 'epochs': [5], 'batch_size': [100]}
    return "Keras", clf, parameters

In [4]:
df = pd.read_csv('../../data/smoking/train.csv')

# The training set includes the "smoking" column, which marks whether each person is a smoker.
# It is used here to train and tune the model. The tuned model would then classify test.csv, which lacks that column; that step is not part of this notebook.
df

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,55,165,60,81.0,0.5,0.6,1,1,135,...,40,75,16.5,1,1.0,22,25,27,0,1
1,1,70,165,65,89.0,0.6,0.7,2,2,146,...,57,126,16.2,1,1.1,27,23,37,1,0
2,2,20,170,75,81.0,0.4,0.5,1,1,118,...,45,93,17.4,1,0.8,27,31,53,0,1
3,3,35,180,95,105.0,1.5,1.2,1,1,131,...,38,102,15.9,1,1.0,20,27,30,1,0
4,4,30,165,60,80.5,1.5,1.0,1,1,121,...,44,93,15.4,1,0.8,19,13,17,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159251,159251,40,155,45,69.0,1.5,2.0,1,1,127,...,72,159,14.5,1,0.8,25,26,13,0,0
159252,159252,50,155,75,82.0,1.0,1.0,1,1,120,...,64,108,14.5,1,0.6,21,20,18,0,0
159253,159253,40,160,50,66.0,1.5,1.0,1,1,114,...,87,93,10.9,1,0.6,15,9,12,0,0
159254,159254,50,165,75,92.0,1.2,1.0,1,1,121,...,55,80,14.4,1,1.1,22,17,37,0,1


In [None]:
# Preprocessing steps, applied in order. Each one reshapes the data so it is
# easier to handle and gives better results.
preprocessing_functions = [
    preprocess_eyesight,
    preprocess_hearing,
    # preprocess_thermos,
    preprocess_one_hot,
    get_input_output,
    scaler_preprocess,
]
X, y, cols = preprocess(df, preprocessing_functions)

# Candidate classifiers. Each brings its own parameter grid, which is explored
# with a grid search (https://en.wikipedia.org/wiki/Hyperparameter_optimization)
# to find the best-performing configuration.
classifiers_to_explore = [
    # get_MLPClassifier_for_grid_search(),
    get_KerasClassifier_for_grid_search(len(cols)),
]
for name, clf, parameters in classifiers_to_explore:
    # One grid search per classifier, scored and refitted on ROC AUC.
    gs = GridSearchCV(
        clf,
        parameters,
        n_jobs=1,
        scoring={"AUC": "roc_auc"},
        refit="AUC",
        return_train_score=True,
        verbose=3,
    )

    gs.fit(X, y)
    results = gs.cv_results_
    print(name)
    print_grid_search_results(results)

# The best classifier is the one with the highest mean AUC reported above.