# Data preparation and model selection


In [15]:
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.metrics import make_scorer
from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import NuSVR
from numpy import random
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import NuSVR
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.neural_network import MLPRegressor

import warnings
warnings.simplefilter("ignore")

#TODO Deep learning model with Tensorflow

In [16]:
X1 = pd.read_csv("data/X1.csv", index_col=0)
Y1 = pd.read_csv("data/Y1.csv", header=None, names=['Weight'])
X2 = pd.read_csv("data/X2.csv", index_col=0)

## Data preparation

### Data encoding

In [17]:
class DataEncoder(BaseEstimator):
    def __init__(self): # no *args or **kargs
        pass
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X, y=None):
        columns = ['Gender', 'Age', 'Height', 'Family history with overweight', 'Consumption of high caloric food',
           'Consumption frequency of vegetables', 'Number of main meals daily', 'Food Consumption between meals', 'Smoke',
           'Water consumption daily', 'Calories consumption monitoring', 'Physical activity frequency', 'Time using technology devices',
           'Alcohol consumption', 'Usual transportation used']
        X.columns = columns
        X['Consumption frequency of vegetables'] = X['Consumption frequency of vegetables'].replace(
            {1: '1. Never', 2: '2. Sometimes', 3: '3. Always'})
        X['Number of main meals daily'] = X['Number of main meals daily'].replace(
            {1: '1', 2: '2', 3: '3', 4: '3+'})
        X['Water consumption daily'] = X['Water consumption daily'].replace(
            {1: '1. Less than 1L', 2: '2. Between 1L and 2L', 3: '3. More than 2L'})
        X['Physical activity frequency'] = X['Physical activity frequency'].replace(
            {0: '1. I do not have', 1: '2. 1 or 2 days', 2: '3. 2 or 4 days', 3: '4. 4 or 5 days'})
        X['Time using technology devices'] = X['Time using technology devices'].replace(
            {0: '1. 0–2 hours', 1: '2. 3–5 hours', 2: '3. More than 5 hours'})
        X['Food Consumption between meals'] = X['Food Consumption between meals'].replace(
            {'no' : '1. No', 'Sometimes' : '2. Sometimes', 'Frequently': '3. Frequently','Always': '4. Always'})

        #X['Height'] = X['Height']*100

        X["Usual transportation used"] = X["Usual transportation used"].apply(lambda x : "Public_Transportation" if x == "Public_Transportation" else "Other")
        X["Public Transportation"] = X.apply(lambda x : "yes" if x['Usual transportation used'] == "Public_Transportation" else "no", axis=1)
        X.drop("Usual transportation used", inplace=True, axis=1)

        ordinal_cat = ['Consumption frequency of vegetables', 'Number of main meals daily', 'Food Consumption between meals',
               'Water consumption daily', 'Physical activity frequency', 'Time using technology devices',
               'Alcohol consumption']
        for cat in ordinal_cat:
            X[cat] = X[cat].map(lambda x: sorted(X[cat].unique()).index(x)+1)

        # Handling binary categorical features
        X[['Family history with overweight', 'Consumption of high caloric food', 'Smoke', 'Calories consumption monitoring','Public Transportation']] = X[[
            'Family history with overweight', 'Consumption of high caloric food', 'Smoke', 'Calories consumption monitoring','Public Transportation']].replace(to_replace=['no', 'yes'], value=[0, 1])
        X[['Gender']] = X[['Gender']].replace(
            to_replace=['Female', 'Male'], value=[0, 1])

        X.drop(["Smoke","Calories consumption monitoring"], inplace=True, axis=1)

        X['Age'] = X['Age'].map(lambda x: np.log(x))

        #X[ordinal_cat] = X[ordinal_cat].apply(lambda x : np.exp(x))

        return X.astype(np.float64),y

### Handling outliers

In [4]:
# class OutlierImputer(BaseEstimator):
#     def __init__(self): # no *args or **kargs
#         pass
#     def fit(self, X, y=None):
#         return self # nothing else to do
#     def transform(self, X, y):
#         df = pd.concat([X,y], axis=1)
#         for col in ['Age', 'Height', 'Weight']:
#             upper_lim = df[col].quantile(.95)
#             lower_lim = df[col].quantile(.05)

#             df.loc[(df[col] > upper_lim), col] = np.nan
#             df.loc[(df[col] < lower_lim), col] = np.nan
        
#         df.dropna(inplace=True)
#         return df.drop("Weight",axis=1), df['Weight']


## Model selection (Regression)

In [34]:

def score_weight_class(bmi_pred, bmi_true, low, high):
    tol = 1
    vpred = (bmi_pred >= low - tol) & (bmi_pred < high+tol)
    vtrue = (bmi_true >= low) & (bmi_true < high)
    if vtrue.sum() == 0:
        print("no true samples here")
        return 0
    rmse = np.sqrt(((bmi_true[vtrue]-bmi_pred[vtrue])**2).mean())
    rmse = rmse/(high-low+tol)  # normalize rmse in interval
    acc = (vpred & vtrue).sum()/vtrue.sum()
    return rmse*(1-acc)


def score_regression(ytrue, ypred, height):
    bmi_pred = ypred/(height*height)
    bmi_true = ytrue/(height*height)
    scores = []
    for bmi_low, bmi_high in zip([0, 18.5, 25, 30], [18.5, 25, 30, 100]):
        scores.append(score_weight_class(bmi_pred, bmi_true,
                                         low=bmi_low, high=bmi_high))
    return np.mean(scores)

# def _score(estimator, X, y):
#     ypred = estimator.predict(X)
#     return score_regression(np.array(y), np.array(ypred), np.array(X[2]))


In [44]:
# TODO Ensemble methods https://scikit-learn.org/stable/modules/ensemble.html
from sklearn.svm import SVR

X,y = DataEncoder().transform(X1.copy(),Y1.copy())
n_columns = len(X.columns)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    # 'Lasso': Lasso(),
    # 'Ridge': Ridge(),
    # 'KNeighborsRegressor': KNeighborsRegressor(),
    # 'SVR' : SVR(),
    'MLPRegressor' : MLPRegressor()
}

params = {
    # 'Lasso__alpha': random.uniform(low=0.0001, high=50, size=400),

    # 'Ridge__alpha': random.uniform(low=0.0001, high=50, size=400),
    # 'Ridge__solver' : ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'],

    # 'KNeighborsRegressor__weights': ['uniform', 'distance'],
    # 'KNeighborsRegressor__n_neighbors': random.randint(5, 200, size=400),
    # 'KNeighborsRegressor__algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    # 'KNeighborsRegressor__leaf_size': random.randint(20, 200, size=400),

    # 'SVR__kernel': ["rbf", "sigmoid", "linear", "poly"],
    # 'SVR__C': random.uniform(low=0.001, high=200, size=400),
    # 'SVR__epsilon': random.uniform(low=0.001, high=200, size=400),
    
    "MLPRegressor__hidden_layer_sizes": list(zip(random.randint(10, 30, size=200),random.randint(1, 200, size=200))), 
    "MLPRegressor__solver": ["lbfgs", "sgd", "adam"],
    "MLPRegressor__learning_rate": ['constant', 'invscaling', 'adaptive'], 
    "MLPRegressor__alpha": random.uniform(low=0.0001, high=200, size=200),
    "MLPRegressor__activation": ['logistic', 'tanh', 'relu', 'identity'],
    'MLPRegressor__max_iter': random.randint(200, 1000, size=200),
}

In [45]:
mlflow.set_experiment("Obsesity Regression")

for key in models.keys():
    with mlflow.start_run():
        print(f"Running RandomizedSearchCV for {key}.")
        model = models[key]
        param = {k : v for k,v in params.items() if key in k or 'KernelPCA' in k}
        pipeline = Pipeline(steps=[
            ('Scaler', StandardScaler()),
            (key,model)
        ])
        
        rs = RandomizedSearchCV(pipeline, param, n_iter=400, cv=KFold(n_splits=5), n_jobs=-2,
                                verbose=8, scoring=['neg_mean_absolute_error','neg_root_mean_squared_error'], refit='neg_mean_absolute_error',
                                random_state=42
                                )
        
        rs.fit(X_train, y_train)
        mlflow.log_param("Best Estimator", key)
        mlflow.log_metric("score_train_mae", float(rs.best_score_))
        preds = rs.predict(X_test)
        mlflow.log_metric("score_test_mae", mean_absolute_error(preds,y_test))
        mlflow.log_metric("score_test_rmse", mean_squared_error(preds,y_test,squared=False))
        mlflow.log_metric("score_test_reg", score_regression(y_test['Weight'].ravel(),preds.ravel(),X_test['Height'].ravel()))
        
        for key in rs.best_params_:
            mlflow.log_param(key.split('__')[1], rs.best_params_[key])
        mlflow.sklearn.log_model(rs.best_estimator_, "best_model")


Running RandomizedSearchCV for MLPRegressor.
Fitting 5 folds for each of 400 candidates, totalling 2000 fits


In [48]:
#Getting the best experients from Mlflow
df_experients = mlflow.search_runs(filter_string="metrics.score_test_reg < 0.12")

#Getting the best run id
best_experients_run_id = df_experients.loc[df_experients['metrics.score_test_reg'].idxmin()]['run_id']
best_experients_reg_score = df_experients.loc[df_experients['metrics.score_test_reg'].idxmin()]['metrics.score_test_reg']
#Load model
model = mlflow.sklearn.load_model("runs:/" + best_experients_run_id + "/best_model")
print(model)

Pipeline(steps=[('Scaler', StandardScaler()),
                ('MLPRegressor',
                 MLPRegressor(activation='logistic', alpha=23.707441290955774,
                              hidden_layer_sizes=(114, 196),
                              learning_rate='adaptive', max_iter=454,
                              solver='sgd'))])


In [49]:
#Predict Y2
X,_ = DataEncoder().transform(X2.copy())
y_pred = model.predict(X)
series_y = pd.Series(y_pred)
series_y = series_y.append(pd.Series(best_experients_reg_score))
series_y.to_csv('data/Y2.csv',index=False,header=False,sep='\n')


## Model selection (Classification)


### Getting the labels

In [60]:
def get_label(height, weight):
    bmi = weight/height**2
    if 0 < bmi < 18.5:
        return 1
    if 18.5 < bmi < 25.0:
        return 2
    if 25.0 < bmi < 30.0:
        return 3
    if bmi >=30.0:
        return 4

y_pred = model.predict(X_test)
df_true = pd.concat([X_test,y_test], axis=1)
df_pred = pd.concat([X_test,pd.DataFrame(y_pred, columns=['Weight'])], axis=1)
true_labels = df_true.apply(lambda x : get_label(x['Height'],x['Weight']),axis=1).ravel()
pred_labels = df_true.apply(lambda x : get_label(x['Height'],x['Weight']),axis=1).ravel()

In [61]:
from sklearn.metrics import confusion_matrix
confusion_matrix(true_labels, pred_labels)

array([[ 3,  0,  0,  0],
       [ 0, 32,  0,  0],
       [ 0,  0, 12,  0],
       [ 0,  0,  0,  3]], dtype=int64)

In [None]:

df_class = df.copy()


def get_label(height, weight):
    bmi = weight/height**2
    if 0 < bmi < 18.5:
        return 1
    if 18.5 < bmi < 25.0:
        return 2
    if 25.0 < bmi < 30.0:
        return 3
    if bmi >=30.0:
        return 4

df_class['label'] = df_class.apply(lambda x : get_label(x['Height'],x['Weight']),axis=1)
df_class.drop('Weight', inplace=True,axis=1)


X = pd.get_dummies(df_class.drop('label', axis=1))
y = df_class['label']



In [None]:
mlflow.set_experiment("Obsesity Classification")

# TODO Ensemble methods https://scikit-learn.org/stable/modules/ensemble.html
models = {
    'RandomForestClassifier' : RandomForestClassifier()
}

params = {
    'RandomForestClassifier': {
        'n_estimators': random.randint(1, 1000, 50),
        'max_depth': random.randint(1, 1000, 50),
        'max_features': ['sqrt', 'log2', 'auto'],
        'min_samples_split': random.randint(1, 1000, 50),
        'min_samples_leaf': random.randint(1, 1000, 50),
    }
}

# X = pd.DataFrame(pipeline.fit_transform(df_class)).drop(18, axis=1)
# y = df_class['label']
# y = y.astype(int)
X
cv = 5
for key in models.keys():
    with mlflow.start_run():
        print("Running RandomizedSearchCV for %s." % key)
        model = models[key]
        param = params[key]
        gs = RandomizedSearchCV(model, param, n_iter=50, cv=cv, n_jobs=-2,
                                verbose=3, scoring='accuracy'
                                )
        gs.fit(X_resampled, y_resampled)
        mlflow.log_param("Classifier", key)
        mlflow.log_metric("best_score", float(gs.best_score_))
        mlflow.log_metric("CV", cv)
        for key in gs.best_params_:
            mlflow.log_param(key, gs.best_params_[key])
        mlflow.sklearn.log_model(gs.best_estimator_, "best_model")
# def eval(predict, target):

#     rmse = np.sqrt(mean_squared_error(predict, target))
#     mae = mean_absolute_error(predict, target)
#     r2 = r2_score(predict, target)
#     return rmse, mae, r2
from sklearn.metrics import confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.1)

lg = KNeighborsClassifier()
lg.fit(X_train, y_train)
preds = lg.predict(X_test)

conf_matrix = confusion_matrix(y_test, preds, labels=[1,2,3,4])

sns.heatmap(conf_matrix,annot=True,fmt='g',xticklabels=['underweight','normal','overweight','obese'],yticklabels=['underweight','normal','overweight','obese'])
