# Data preparation and model selection


In [20]:
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.metrics import make_scorer
from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import NuSVR
from numpy import random
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import NuSVR
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold

import warnings
warnings.simplefilter("ignore")

#TODO Deep learning model with Tensorflow

In [21]:
X1 = pd.read_csv("data/X1.csv", index_col=0)
Y1 = pd.read_csv("data/Y1.csv", header=None, names=['Weight'])
X2 = pd.read_csv("data/X2.csv", index_col=0)

## Data preparation

### Data encoding

In [22]:
class DataEncoder(BaseEstimator):
    def __init__(self): # no *args or **kargs
        pass
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X, y=None):
        columns = ['Gender', 'Age', 'Height', 'Family history with overweight', 'Consumption of high caloric food',
           'Consumption frequency of vegetables', 'Number of main meals daily', 'Food Consumption between meals', 'Smoke',
           'Water consumption daily', 'Calories consumption monitoring', 'Physical activity frequency', 'Time using technology devices',
           'Alcohol consumption', 'Usual transportation used']
        X.columns = columns
        X['Consumption frequency of vegetables'] = X['Consumption frequency of vegetables'].replace(
            {1: '1. Never', 2: '2. Sometimes', 3: '3. Always'})
        X['Number of main meals daily'] = X['Number of main meals daily'].replace(
            {1: '1', 2: '2', 3: '3', 4: '3+'})
        X['Water consumption daily'] = X['Water consumption daily'].replace(
            {1: '1. Less than 1L', 2: '2. Between 1L and 2L', 3: '3. More than 2L'})
        X['Physical activity frequency'] = X['Physical activity frequency'].replace(
            {0: '1. I do not have', 1: '2. 1 or 2 days', 2: '3. 2 or 4 days', 3: '4. 4 or 5 days'})
        X['Time using technology devices'] = X['Time using technology devices'].replace(
            {0: '1. 0–2 hours', 1: '2. 3–5 hours', 2: '3. More than 5 hours'})
        
        ordinal_cat = ['Consumption frequency of vegetables', 'Number of main meals daily', 'Food Consumption between meals',
               'Water consumption daily', 'Physical activity frequency', 'Time using technology devices',
               'Alcohol consumption']
        for cat in ordinal_cat:
            X[cat] = X[cat].map(lambda x: sorted(X[cat].unique()).index(x)+1)

        # Handling binary categorical features
        X[['Family history with overweight', 'Consumption of high caloric food', 'Smoke', 'Calories consumption monitoring']] = X[[
            'Family history with overweight', 'Consumption of high caloric food', 'Smoke', 'Calories consumption monitoring']].replace(to_replace=['no', 'yes'], value=[0, 1])
        X[['Gender']] = X[['Gender']].replace(
            to_replace=['Female', 'Male'], value=[0, 1])

        X['Age'] = X['Age'].map(lambda x: np.log(x))

        X[ordinal_cat] = X[ordinal_cat].apply(lambda x : np.exp(x))

        return pd.get_dummies(X),y

### Handling outliers

In [23]:
class OutlierImputer(BaseEstimator):
    def __init__(self): # no *args or **kargs
        pass
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X, y):
        df = pd.concat([X,y], axis=1)
        for col in ['Age', 'Height', 'Weight']:
            upper_lim = df[col].quantile(.95)
            lower_lim = df[col].quantile(.05)

            df.loc[(df[col] > upper_lim), col] = np.nan
            df.loc[(df[col] < lower_lim), col] = np.nan
        
        df.dropna(inplace=True)
        return df.drop("Weight",axis=1), df['Weight']


## Model selection (Regression)

In [None]:

# def score_weight_class(bmi_pred, bmi_true, low, high):
#     tol = 1
#     vpred = (bmi_pred >= low - tol) & (bmi_pred < high+tol)
#     vtrue = (bmi_true >= low) & (bmi_true < high)
#     if vtrue.sum() == 0:
#         print("no true samples here")
#         return 0
#     rmse = np.sqrt(((bmi_true[vtrue]-bmi_pred[vtrue])**2).mean())
#     rmse = rmse/(high-low+tol)  # normalize rmse in interval
#     acc = (vpred & vtrue).sum()/vtrue.sum()
#     return rmse*(1-acc)


# def score_regression(ytrue, ypred, height):
#     bmi_pred = ypred/(height*height)
#     bmi_true = ytrue/(height*height)
#     scores = []
#     for bmi_low, bmi_high in zip([0, 18.5, 25, 30], [18.5, 25, 30, 100]):
#         scores.append(score_weight_class(bmi_pred, bmi_true,
#                                          low=bmi_low, high=bmi_high))
#     return np.mean(scores)

# def _score(estimator, X, y):
#     ypred = estimator.predict_proba(X)
#     print(ypred)
#     return score_regression(y, ypred, X[6])


In [24]:
# TODO Ensemble methods https://scikit-learn.org/stable/modules/ensemble.html

models = {
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'ElasticNet': ElasticNet(),
    'NuSVR': NuSVR(),
}

params = {
    'Lasso__alpha': random.uniform(low=0.0001, high=100, size=400),

    'Ridge__alpha': random.uniform(low=0.0001, high=100, size=400),

    'KNeighborsRegressor__weights': ['uniform', 'distance'],
    'KNeighborsRegressor__n_neighbors': random.randint(5, 1000, size=400),

    'ElasticNet__alpha':  random.uniform(low=0.0001, high=100, size=400),
    'ElasticNet__l1_ratio': random.uniform(low=0.0001, high=1, size=400),

    'NuSVR__nu': random.uniform(low=0.0001, high=1, size=400),
    'NuSVR__C': random.uniform(low=0.001, high=100, size=400),
    'NuSVR__kernel': ['linear', 'rbf'],
    
}

X,y = OutlierImputer().transform(X1.copy(),Y1.copy())
X,y = DataEncoder().transform(X,y)

train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# random_state=42,k=k,pert=0.04
mlflow.set_experiment("Obsesity Regression")
for cv in [3,5,10]:
    for key in models.keys():
        with mlflow.start_run():
            print("Running RandomizedSearchCV for %s." % key)
            model = models[key]
            param = {k : v for k,v in params.items() if key in k }
            pipeline = Pipeline(steps=[
                ('Scaler', StandardScaler()),
                (key,model)
            ])
            
            gs = RandomizedSearchCV(pipeline, param, n_iter=400, cv=KFold(n_splits=cv), n_jobs=-2,
                                    verbose=8, scoring='neg_mean_absolute_error', refit='neg_mean_absolute_error',
                                    random_state=42
                                    )
            
            gs.fit(train_x, train_y)
            mlflow.log_param("__MODEL__", key)
            mlflow.log_metric("score_train_mae", float(gs.best_score_))
            mlflow.log_metric("CV", cv)
            preds = gs.predict(test_x)
            mlflow.log_metric("score_test_mae", mean_absolute_error(preds,test_y))
            mlflow.log_metric("score_test_rmse", mean_squared_error(preds,test_y,squared=False))
            
            for key in gs.best_params_:
                mlflow.log_param(key.split('__')[1], gs.best_params_[key])
            mlflow.sklearn.log_model(gs.best_estimator_, "best_model")


Running RandomizedSearchCV for Lasso.
Fitting 3 folds for each of 400 candidates, totalling 1200 fits
Running RandomizedSearchCV for Ridge.
Fitting 3 folds for each of 400 candidates, totalling 1200 fits
Running RandomizedSearchCV for KNeighborsRegressor.
Fitting 3 folds for each of 400 candidates, totalling 1200 fits
Running RandomizedSearchCV for ElasticNet.
Fitting 3 folds for each of 400 candidates, totalling 1200 fits
Running RandomizedSearchCV for NuSVR.
Fitting 3 folds for each of 400 candidates, totalling 1200 fits
Running RandomizedSearchCV for Lasso.
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
Running RandomizedSearchCV for Ridge.
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
Running RandomizedSearchCV for KNeighborsRegressor.
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
Running RandomizedSearchCV for ElasticNet.
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
Running RandomizedSearchCV for NuSVR.
Fittin

In [29]:
#Getting the best experients from Mlflow
df_experients = mlflow.search_runs(filter_string="metrics.score_test_mae < 9")

#Getting the best run id
best_experients_run_id = df_experients.loc[df_experients['metrics.score_test_mae'].idxmin()]
best_experients_run_id

run_id                                            b9db28108ff04907bc77c7f217d73dd4
experiment_id                                                                    1
status                                                                    FINISHED
artifact_uri                     file:///C:/Users/marci/Dropbox%20%28PCSAMU%20S...
start_time                                        2021-11-12 10:51:52.839000+00:00
end_time                                          2021-11-12 10:52:04.354000+00:00
metrics.score_test_rmse                                                   9.454336
metrics.CV                                                                     3.0
metrics.score_test_mae                                                    7.013637
metrics.score_train_mae                                                  -7.984738
params.__MODEL__                                                             Ridge
params.C                                                                      None
para

In [30]:
#Getting the best experients from Mlflow
df_experients = mlflow.search_runs(filter_string="metrics.score_test_mae < 9")

#Getting the best run id
best_experients_run_id = df_experients.loc[df_experients['metrics.score_test_mae'].idxmin()]['run_id']
best_experients_mae_score = df_experients.loc[df_experients['metrics.score_test_mae'].idxmin()]['metrics.score_test_mae']
#Load model
model = mlflow.sklearn.load_model("runs:/" + best_experients_run_id + "/best_model")
print(model)

Pipeline(steps=[('Scaler', StandardScaler()),
                ('Ridge', Ridge(alpha=28.167307751568504))])


In [35]:
#Predict Y2
X,_ = DataEncoder().transform(X2.copy())
y_pred = model.predict(X)
series_y = pd.Series(y_pred)
series_y = series_y.append(pd.Series(best_experients_mae_score))
series_y.to_csv('data/Y2.csv',index=False,header=False,sep='\n')


## Model selection (Classification)


### Getting the labels

In [None]:
def get_label(height, weight):
    bmi = weight/height**2
    if 0 < bmi < 18.5:
        return 1
    if 18.5 < bmi < 25.0:
        return 2
    if 25.0 < bmi < 30.0:
        return 3
    if bmi >=30.0:
        return 4

df = pd.concat([X1,Y1], axis=1)
labels = df.apply(lambda x : get_label(x['Height'],x['Weight']),axis=1)

In [None]:

df_class = df.copy()


def get_label(height, weight):
    bmi = weight/height**2
    if 0 < bmi < 18.5:
        return 1
    if 18.5 < bmi < 25.0:
        return 2
    if 25.0 < bmi < 30.0:
        return 3
    if bmi >=30.0:
        return 4

df_class['label'] = df_class.apply(lambda x : get_label(x['Height'],x['Weight']),axis=1)
df_class.drop('Weight', inplace=True,axis=1)


X = pd.get_dummies(df_class.drop('label', axis=1))
y = df_class['label']



In [None]:
mlflow.set_experiment("Obsesity Classification")

# TODO Ensemble methods https://scikit-learn.org/stable/modules/ensemble.html
models = {
    'RandomForestClassifier' : RandomForestClassifier()
}

params = {
    'RandomForestClassifier': {
        'n_estimators': random.randint(1, 1000, 50),
        'max_depth': random.randint(1, 1000, 50),
        'max_features': ['sqrt', 'log2', 'auto'],
        'min_samples_split': random.randint(1, 1000, 50),
        'min_samples_leaf': random.randint(1, 1000, 50),
    }
}

# X = pd.DataFrame(pipeline.fit_transform(df_class)).drop(18, axis=1)
# y = df_class['label']
# y = y.astype(int)
X
cv = 5
for key in models.keys():
    with mlflow.start_run():
        print("Running RandomizedSearchCV for %s." % key)
        model = models[key]
        param = params[key]
        gs = RandomizedSearchCV(model, param, n_iter=50, cv=cv, n_jobs=-2,
                                verbose=3, scoring='accuracy'
                                )
        gs.fit(X_resampled, y_resampled)
        mlflow.log_param("Classifier", key)
        mlflow.log_metric("best_score", float(gs.best_score_))
        mlflow.log_metric("CV", cv)
        for key in gs.best_params_:
            mlflow.log_param(key, gs.best_params_[key])
        mlflow.sklearn.log_model(gs.best_estimator_, "best_model")
# def eval(predict, target):

#     rmse = np.sqrt(mean_squared_error(predict, target))
#     mae = mean_absolute_error(predict, target)
#     r2 = r2_score(predict, target)
#     return rmse, mae, r2
from sklearn.metrics import confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.1)

lg = KNeighborsClassifier()
lg.fit(X_train, y_train)
preds = lg.predict(X_test)

conf_matrix = confusion_matrix(y_test, preds, labels=[1,2,3,4])

sns.heatmap(conf_matrix,annot=True,fmt='g',xticklabels=['underweight','normal','overweight','obese'],yticklabels=['underweight','normal','overweight','obese'])
