## **IMPORT NECESSARY LIBRARIES**

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pathlib import Path

## Remove outliers

In [2]:
data = pd.read_csv('data/edited-train-set.csv')

# Per-feature fences: 1.5 * IQR below the first quartile / above the third quartile
outlier_columns = ["Height", "Weight", "BMI"]
quartile = data[outlier_columns].quantile([0.25, 0.75])
q1, q3 = quartile.iloc[0], quartile.iloc[1]
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr

# A row is an outlier when any screened feature falls outside its fences.
# The fences are fixed before any row is removed, so one combined mask
# removes the same rows as dropping feature by feature.
screened = data[outlier_columns]
outside_fences = screened.lt(lower, axis='columns') | screened.gt(upper, axis='columns')
outlier_rows = outside_fences.any(axis='columns').to_numpy()
data.drop(index=data.index[outlier_rows], inplace=True)

#### > CREATE A CSV FILE FOR NEW DATASET

In [3]:
# Destination of the outlier-free dataset
filepath = Path('data/remove-outliers.csv')

# Make sure the destination folder exists (missing parents are created,
# an already existing folder is not an error)
output_dir = filepath.parent
output_dir.mkdir(parents=True, exist_ok=True)

# Write the frame without its index column
data.to_csv(filepath, index=False)

## Separate features from target

In [4]:
# Name of the label column; every other column is used as a feature
target = 'NObeyesdad'
features = data.columns.drop(target)

# X: feature matrix, y: label vector
X = data.loc[:, features]
y = data[target]

## Split train and test set

In [5]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Report the resulting shapes and the container type of the training features
print(
    f'Shape of X_train is {X_train.shape} and y_train is {y_train.shape}',
    f'Shape of X_test is {X_test.shape} and y_test is {y_test.shape}',
    type(X_train),
    sep='\n',
)

Shape of X_train is (16603, 17) and y_train is (16603,)
Shape of X_test is (4151, 17) and y_test is (4151,)
<class 'pandas.core.frame.DataFrame'>


## Feature scaling and encoding

In [6]:
# Shared scaler used by processingData: fitted when called with isTest=False,
# then reused (transform only) when called with isTest=True.
scaler = StandardScaler()

def processingData(X, isTest=False):
    """Standardise numeric columns and one-hot encode the remaining columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Columns picked by ``select_dtypes(include='number')``
        go through the module-level ``scaler``; all other columns go through
        ``pd.get_dummies``.
    isTest : bool, default False
        ``False``: fit ``scaler`` on ``X`` and remember the resulting dummy
        column layout. ``True``: reuse the fitted ``scaler`` and align the
        dummy columns to the remembered training layout.

    Returns
    -------
    pandas.DataFrame
        Scaled numeric columns followed by the dummy columns, indexed 0..n-1.
    """
    X_numeric = X.select_dtypes(include='number')
    X_categorical = X.select_dtypes(exclude='number')
    numeric_columns = X_numeric.columns

    # Fit only on training data; test data must reuse the training statistics.
    if isTest:
        scaled = scaler.transform(X_numeric)
    else:
        scaled = scaler.fit_transform(X_numeric)
    X_numeric = pd.DataFrame(scaled).set_axis(numeric_columns, axis=1)

    train_dummy_columns = getattr(processingData, 'dummy_columns_', None)
    if isTest and train_dummy_columns is not None:
        # Encode every level present in the test data, then project onto the
        # training layout. Encoding test data independently with
        # drop_first=True could drop a different baseline level or yield a
        # different column set than the one seen during training.
        # Levels absent from the test data become all-zero columns; levels
        # never seen in training are discarded.
        X_categorical = pd.get_dummies(X_categorical).reindex(
            columns=train_dummy_columns, fill_value=0
        )
    else:
        X_categorical = pd.get_dummies(X_categorical, drop_first=True)
        if not isTest:
            # Remember the layout so later test-mode calls can match it.
            processingData.dummy_columns_ = X_categorical.columns

    # Give both halves the same positional index so concat pairs rows by position.
    X_categorical = X_categorical.set_axis(X_numeric.index, axis=0)
    X_scaled = pd.concat([X_numeric, X_categorical], axis=1)
    return X_scaled.reset_index(drop=True)

# Training split first (fits the scaler), then the test split with the fitted scaler
X_train, X_test = processingData(X_train), processingData(X_test, True)

# Show both processed frames, then confirm the row counts still match the labels
print(X_train, X_test, sep='\n')
print(
    f'Shape of X_train is {X_train.shape} and y_train is {y_train.shape}',
    f'Shape of X_test is {X_test.shape} and y_test is {y_test.shape}',
    sep='\n',
)

         Height    Weight       BMI      FCVC       NCP      CH2O       FAF  \
0      0.568422 -0.491415 -0.691813  1.039138  0.334952  1.592230  0.023253   
1     -0.578257 -0.681063 -0.545590 -0.834965 -2.493233 -0.052491  0.023253   
2      0.224418 -0.222115 -0.301085 -1.209785  0.334952 -0.266305 -0.012592   
3     -0.578257 -0.870711 -0.764925  1.039138  0.334952 -1.697212  0.023253   
4      0.224418 -1.363795 -1.520013 -0.834965  0.334952 -0.052491 -0.179870   
...         ...       ...       ...       ...       ...       ...       ...   
16598 -0.922261  0.839912  1.400861  1.039138  0.334952  0.605398 -1.159640   
16599 -0.692925  0.881635  1.327749  1.039138  0.334952  1.082367 -1.171588   
16600 -0.578257  0.908185  1.298984  1.039138  0.334952 -1.582082 -1.171588   
16601 -0.578257  0.908185  1.298984  1.039138  0.334952  1.213945 -1.171588   
16602 -0.692925 -1.629302 -1.621890  1.039138  0.334952 -0.134727 -0.359096   

            TUE  Gender_Male  Age_30-39  ...  CAEC_

## Feature scaling and encoding

In [7]:
# Shared scaler used by processingData: fitted when called with isTest=False,
# then reused (transform only) when called with isTest=True.
# NOTE(review): an identical cell earlier in the notebook already creates and
# fits a scaler; this assignment replaces it with a new, unfitted instance.
scaler = StandardScaler()

def processingData(X, isTest=False):
    """Standardise numeric columns and one-hot encode the remaining columns.

    NOTE(review): this repeats the definition from the earlier
    "Feature scaling and encoding" cell; being the later one, it is the
    definition in effect once this cell has run.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Columns picked by ``select_dtypes(include='number')``
        go through the module-level ``scaler``; all other columns go through
        ``pd.get_dummies``.
    isTest : bool, default False
        ``False``: fit ``scaler`` on ``X`` and remember the resulting dummy
        column layout. ``True``: reuse the fitted ``scaler`` and align the
        dummy columns to the remembered training layout.

    Returns
    -------
    pandas.DataFrame
        Scaled numeric columns followed by the dummy columns, indexed 0..n-1.
    """
    X_numeric = X.select_dtypes(include='number')
    X_categorical = X.select_dtypes(exclude='number')
    numeric_columns = X_numeric.columns

    # Fit only on training data; test data must reuse the training statistics.
    if isTest:
        scaled = scaler.transform(X_numeric)
    else:
        scaled = scaler.fit_transform(X_numeric)
    X_numeric = pd.DataFrame(scaled).set_axis(numeric_columns, axis=1)

    train_dummy_columns = getattr(processingData, 'dummy_columns_', None)
    if isTest and train_dummy_columns is not None:
        # Encode every level present in the test data, then project onto the
        # training layout. Encoding test data independently with
        # drop_first=True could drop a different baseline level or yield a
        # different column set than the one seen during training.
        # Levels absent from the test data become all-zero columns; levels
        # never seen in training are discarded.
        X_categorical = pd.get_dummies(X_categorical).reindex(
            columns=train_dummy_columns, fill_value=0
        )
    else:
        X_categorical = pd.get_dummies(X_categorical, drop_first=True)
        if not isTest:
            # Remember the layout so later test-mode calls can match it.
            processingData.dummy_columns_ = X_categorical.columns

    # Give both halves the same positional index so concat pairs rows by position.
    X_categorical = X_categorical.set_axis(X_numeric.index, axis=0)
    X_scaled = pd.concat([X_numeric, X_categorical], axis=1)
    return X_scaled.reset_index(drop=True)

# NOTE(review): X_train / X_test were already transformed by the identical cell
# earlier in the notebook, so this call processes them a second time — confirm
# whether this repeated cell is still needed.
# Training split first (fits the scaler), then the test split with the fitted scaler
X_train, X_test = processingData(X_train), processingData(X_test, True)

# Show both processed frames, then confirm the row counts still match the labels
print(X_train, X_test, sep='\n')
print(
    f'Shape of X_train is {X_train.shape} and y_train is {y_train.shape}',
    f'Shape of X_test is {X_test.shape} and y_test is {y_test.shape}',
    sep='\n',
)

         Height    Weight       BMI      FCVC       NCP      CH2O       FAF  \
0      0.568422 -0.491415 -0.691813  1.039138  0.334952  1.592230  0.023253   
1     -0.578257 -0.681063 -0.545590 -0.834965 -2.493233 -0.052491  0.023253   
2      0.224418 -0.222115 -0.301085 -1.209785  0.334952 -0.266305 -0.012592   
3     -0.578257 -0.870711 -0.764925  1.039138  0.334952 -1.697212  0.023253   
4      0.224418 -1.363795 -1.520013 -0.834965  0.334952 -0.052491 -0.179870   
...         ...       ...       ...       ...       ...       ...       ...   
16598 -0.922261  0.839912  1.400861  1.039138  0.334952  0.605398 -1.159640   
16599 -0.692925  0.881635  1.327749  1.039138  0.334952  1.082367 -1.171588   
16600 -0.578257  0.908185  1.298984  1.039138  0.334952 -1.582082 -1.171588   
16601 -0.578257  0.908185  1.298984  1.039138  0.334952  1.213945 -1.171588   
16602 -0.692925 -1.629302 -1.621890  1.039138  0.334952 -0.134727 -0.359096   

            TUE  Gender_Male  Age_30-39  ...  CAEC_