# Explanation
This is the same work as the other notebook, but here instead of predicting the body density, we will try to predict directly the body fat percentage and see if we can obtain better models.
The same dataset is used.

In [2]:
## Import Necessary libraries

## Preprocessing libraries
import pandas as pd
import numpy as np

## Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

## Identify outliers with Z-Score Method
import scipy.stats as stats

## Train-test sklearn library
from sklearn.model_selection import train_test_split

## Preprocess the data to normalize it
from sklearn.preprocessing import PowerTransformer

## Machine Learning Model library
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression , ElasticNet , Lasso , Ridge
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import BayesianRidge

## Evaluation metrics library
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

## Save the model
import joblib

In [3]:
## Import Dataset

df = pd.read_csv('../body_fat/bodyfat.csv')
df.head()

Unnamed: 0,Density,BodyFat,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm,Wrist
0,1.0708,12.3,23,154.25,67.75,36.2,93.1,85.2,94.5,59.0,37.3,21.9,32.0,27.4,17.1
1,1.0853,6.1,22,173.25,72.25,38.5,93.6,83.0,98.7,58.7,37.3,23.4,30.5,28.9,18.2
2,1.0414,25.3,22,154.0,66.25,34.0,95.8,87.9,99.2,59.6,38.9,24.0,28.8,25.2,16.6
3,1.0751,10.4,26,184.75,72.25,37.4,101.8,86.4,101.2,60.1,37.3,22.8,32.4,29.4,18.2
4,1.034,28.7,24,184.25,71.25,34.4,97.3,100.0,101.9,63.2,42.2,24.0,32.2,27.7,17.7


The dataset is the same, so we can directly go to the data preparation and analysis.

In [4]:
#Convert to metric units:
df['Height'] = (df['Height'] * 0.0254).round(2) # converting to m

df['Weight'] = (df['Weight'] * 0.454).round(2) # converting to kg
df.head()

Unnamed: 0,Density,BodyFat,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm,Wrist
0,1.0708,12.3,23,70.03,1.72,36.2,93.1,85.2,94.5,59.0,37.3,21.9,32.0,27.4,17.1
1,1.0853,6.1,22,78.66,1.84,38.5,93.6,83.0,98.7,58.7,37.3,23.4,30.5,28.9,18.2
2,1.0414,25.3,22,69.92,1.68,34.0,95.8,87.9,99.2,59.6,38.9,24.0,28.8,25.2,16.6
3,1.0751,10.4,26,83.88,1.84,37.4,101.8,86.4,101.2,60.1,37.3,22.8,32.4,29.4,18.2
4,1.034,28.7,24,83.65,1.81,34.4,97.3,100.0,101.9,63.2,42.2,24.0,32.2,27.7,17.7


In [5]:
print(df[df['BodyFat'] < 3]) # people with extremly low body-fat (under essential fat threshold)
print(df[df['Height'] < 1]) # men lower than 1 m

     Density  BodyFat  Age  Weight  Height  Neck  Chest  Abdomen   Hip  Thigh  \
171   1.0983      0.7   35   57.09    1.66  34.0   90.8     75.0  89.2   50.0   
181   1.1089      0.0   40   53.80    1.73  33.8   79.3     69.4  85.0   47.2   

     Knee  Ankle  Biceps  Forearm  Wrist  
171  34.8   22.0    24.8     25.9   16.9  
181  33.5   20.2    27.7     24.6   16.5  
    Density  BodyFat  Age  Weight  Height  Neck  Chest  Abdomen    Hip  Thigh  \
41    1.025     32.9   44   93.07    0.75  36.6  106.0    104.3  115.5   70.6   

    Knee  Ankle  Biceps  Forearm  Wrist  
41  42.5   23.7    33.6     28.7   17.4  


In [6]:
df.drop([41, 171, 181], axis=0, inplace=True)
df.describe()

Unnamed: 0,Density,BodyFat,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm,Wrist
count,249.0,249.0,249.0,249.0,249.0,249.0,249.0,249.0,249.0,249.0,249.0,249.0,249.0,249.0,249.0
mean,1.055311,19.246586,44.947791,81.391205,1.787108,38.030522,100.93012,92.672289,99.94498,59.447791,38.610442,23.116064,32.316466,28.691165,18.245382
std,0.018545,8.201219,12.658403,13.198728,0.066385,2.415865,8.339071,10.663334,7.043041,5.141098,2.379646,1.693198,2.986639,2.008591,0.927312
min,0.995,3.0,22.0,56.75,1.63,31.1,83.4,70.4,85.3,49.3,33.0,19.1,25.3,21.0,15.8
25%,1.0414,12.5,36.0,72.3,1.74,36.4,94.6,85.2,95.6,56.1,37.1,22.0,30.3,27.3,17.6
50%,1.0549,19.2,43.0,80.24,1.78,38.0,99.7,91.0,99.3,59.0,38.5,22.8,32.1,28.8,18.3
75%,1.0703,25.3,54.0,89.44,1.84,39.5,105.3,99.2,103.5,62.3,39.9,24.0,34.4,30.0,18.8
max,1.0991,47.5,81.0,164.87,1.97,51.2,136.2,148.1,147.7,87.3,49.1,33.9,45.0,34.9,21.4


# 1. Data analysis

We go directly to the data preparation.

In [7]:
#Insert BMI columns to the dataset:

h_squared = df['Height'] ** 2

df['BMI'] = df['Weight'] / h_squared #creating bmi

#rounding the value to 1 decimal
df['BMI'] = round(df['BMI'], 1)
df.head()

Unnamed: 0,Density,BodyFat,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm,Wrist,BMI
0,1.0708,12.3,23,70.03,1.72,36.2,93.1,85.2,94.5,59.0,37.3,21.9,32.0,27.4,17.1,23.7
1,1.0853,6.1,22,78.66,1.84,38.5,93.6,83.0,98.7,58.7,37.3,23.4,30.5,28.9,18.2,23.2
2,1.0414,25.3,22,69.92,1.68,34.0,95.8,87.9,99.2,59.6,38.9,24.0,28.8,25.2,16.6,24.8
3,1.0751,10.4,26,83.88,1.84,37.4,101.8,86.4,101.2,60.1,37.3,22.8,32.4,29.4,18.2,24.8
4,1.034,28.7,24,83.65,1.81,34.4,97.3,100.0,101.9,63.2,42.2,24.0,32.2,27.7,17.7,25.5


In [8]:
df['ACratio'] = df['Abdomen']/df['Chest']
df['HTratio'] = df['Hip']/df['Thigh']
df.drop(['Weight','Height','Abdomen','Chest','Hip','Thigh'],axis=1,inplace=True)
df.head()

Unnamed: 0,Density,BodyFat,Age,Neck,Knee,Ankle,Biceps,Forearm,Wrist,BMI,ACratio,HTratio
0,1.0708,12.3,23,36.2,37.3,21.9,32.0,27.4,17.1,23.7,0.915145,1.601695
1,1.0853,6.1,22,38.5,37.3,23.4,30.5,28.9,18.2,23.2,0.886752,1.681431
2,1.0414,25.3,22,34.0,38.9,24.0,28.8,25.2,16.6,24.8,0.917537,1.66443
3,1.0751,10.4,26,37.4,37.3,22.8,32.4,29.4,18.2,24.8,0.848723,1.68386
4,1.034,28.7,24,34.4,42.2,24.0,32.2,27.7,17.7,25.5,1.027749,1.612342


Here is where the change starts. We will try to predict Body Fat directly.

In [9]:
X = df.drop(['BodyFat','Density'],axis=1)
y = df['BodyFat']
X.head()

Unnamed: 0,Age,Neck,Knee,Ankle,Biceps,Forearm,Wrist,BMI,ACratio,HTratio
0,23,36.2,37.3,21.9,32.0,27.4,17.1,23.7,0.915145,1.601695
1,22,38.5,37.3,23.4,30.5,28.9,18.2,23.2,0.886752,1.681431
2,22,34.0,38.9,24.0,28.8,25.2,16.6,24.8,0.917537,1.66443
3,26,37.4,37.3,22.8,32.4,29.4,18.2,24.8,0.848723,1.68386
4,24,34.4,42.2,24.0,32.2,27.7,17.7,25.5,1.027749,1.612342


In [12]:
# Remove outliers
z = np.abs(stats.zscore(X))

#only keep rows in dataframe with all z-scores less than absolute value of 3 
X_clean_BF = X[(z<3).all(axis=1)]
y_clean_BF = y[(z<3).all(axis=1)]
#find how many rows are left in the dataframe 
X_clean_BF.shape

(239, 10)

# 2. Model extra3_BF
This model takes all the input data.
## 2.1 Drop certain data if needed
Here, no data is going to be dropped
## 2.2 Train-test split
We will use the default 25/75 split in train_test_split from scikit-learn.
We will set random_state=42: This sets a fixed random seed to ensure the split is reproducible. Using a specific random state ensures you always get the same split of training and testing data each time you run the code. You can set it to any number, but using 42 is a common convention.

In [13]:
X_extra3_BF = X_clean_BF
y_extra3_BF = y_clean_BF
X_extra3_BF_train,X_extra3_BF_test,y_extra3_BF_train,y_extra3_BF_test = train_test_split(X_extra3_BF,y_extra3_BF,random_state=42)

## 2.3 Applying Power Transformation to Normalize Data

In [14]:
trans_extra3_BF = PowerTransformer()
X_extra3_BF_train_transformed = pd.DataFrame(trans_extra3_BF.fit_transform(X_extra3_BF_train), columns=X_extra3_BF_train.columns)
X_extra3_BF_test_transformed = pd.DataFrame(trans_extra3_BF.transform(X_extra3_BF_test), columns=X_extra3_BF_test.columns)

## 2.4 Save the Power Transformation

# TO DO later if the model performs better!

## 2.5. Test Machine Learning Models and check the metrics
We will use several Machine Learning Models and we will test them with the following metrics:
- MAE (Mean Absolute Error) to interpret the average error in predicting body density (which gives a more interpretable result).
- RMSE (Root Mean Squared Error) to emphasize larger errors more (especially if predicting extremely high or low body density is critical).
- R² score to understand how much of the variation in body fat the model can explain, which gives you a sense of the model’s overall performance.

In [15]:
# Models
kernel = KernelRidge()
random = RandomForestRegressor()
linear = LinearRegression()
elastic = ElasticNet()
lasso  = Lasso()
ridge = Ridge()
svr=SVR()
grad = GradientBoostingRegressor()
sgd = SGDRegressor()
bay = BayesianRidge()
clf = [linear,elastic,lasso,ridge,svr,grad,sgd,bay,random,kernel]

In [16]:
# Build models and their metrics
hashmap_extra3_BF={}
def compute(model):
    model.fit(X_extra3_BF_train_transformed,y_extra3_BF_train)
    y_extra3_BF_pred = model.predict(X_extra3_BF_test_transformed)
    r2=r2_score(y_extra3_BF_test,y_extra3_BF_pred)
    rmse=root_mean_squared_error(y_extra3_BF_test,y_extra3_BF_pred)
    mae=mean_absolute_error(y_extra3_BF_test,y_extra3_BF_pred)
    hashmap_extra3_BF[str(model)]=(r2,rmse,mae)

In [17]:
for i in clf:
    compute(i)

In [18]:
# Show the scores of models

score_extra3_BF=pd.DataFrame(hashmap_extra3_BF)
score_extra3_BF = score_extra3_BF.transpose()
score_extra3_BF.columns=['R2_score','RMSE','MAE']
score_extra3_BF = score_extra3_BF.sort_values('R2_score',ascending=False)

In [19]:
score_extra3_BF

Unnamed: 0,R2_score,RMSE,MAE
BayesianRidge(),0.649167,4.93352,4.157464
SGDRegressor(),0.647675,4.943999,4.162983
Ridge(),0.645954,4.956065,4.171679
LinearRegression(),0.644845,4.963816,4.175614
Lasso(),0.633321,5.043709,4.193833
RandomForestRegressor(),0.615775,5.162969,4.258333
GradientBoostingRegressor(),0.605651,5.230547,4.230224
ElasticNet(),0.584245,5.370633,4.55504
SVR(),0.493862,5.925723,4.992596
KernelRidge(),-4.60659,19.722244,19.089434


The difference is not large, but R2 is sligtly lower (worse) and RMSE and MAE are slightly higher (worse), when compared to the model that predicts the body density, recalculated with Siri equation to the body fat. Therefore, we will stop here and the original models will be used.