In [14]:
import pandas as pd


# Path of the body-fat CSV, resolved relative to the notebook's working directory.
data_file_path = "data/bodyfat.csv"

# Read the whole file into a DataFrame; the later cells all operate on `df`.
df = pd.read_csv(filepath_or_buffer=data_file_path)

In [15]:
# Remove the 'Original' column, which is not used by the rest of the notebook.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError because the
# column has already been dropped.
df = df.drop(columns=['Original'], errors='ignore')

# Expand the single-letter codes to full labels; values that are neither 'M'
# nor 'F' are left untouched by replace().
df['Sex'] = df['Sex'].replace({'M': 'Male', 'F': 'Female'})

# Per-column count of missing values.
print("NaN values:")
display(df.isna().sum())

NaN values:


BodyFat    0
Sex        0
Age        0
Weight     0
Height     0
Neck       0
Chest      0
Abdomen    0
Hip        0
Thigh      0
Knee       0
Ankle      0
Biceps     0
Forearm    0
Wrist      0
dtype: int64

In [16]:
# Integer codes used to encode the 'Sex' column.
sex_mapping = {'Male': 1, 'Female': 0}

# Encode only while the column still holds labels. Series.map() turns every
# value that is not a key of the mapping into NaN, so re-running the original
# unguarded cell on the already-encoded column would wipe it out.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    # Fail loudly on labels the mapping does not cover instead of silently
    # introducing NaN after the missing-value check has already been done.
    unknown_labels = set(df['Sex'].unique()) - set(sex_mapping)
    if unknown_labels:
        raise ValueError(
            f"Unexpected 'Sex' labels: {sorted(map(str, unknown_labels))}"
        )
    df['Sex'] = df['Sex'].map(sex_mapping)

display(df.head())

Unnamed: 0,BodyFat,Sex,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm,Wrist
0,12.3,1,23,69.97,1.72,36.2,93.1,85.2,94.5,59.0,37.3,21.9,32.0,27.4,17.1
1,6.1,1,22,78.59,1.84,38.5,93.6,83.0,98.7,58.7,37.3,23.4,30.5,28.9,18.2
2,25.3,1,22,69.85,1.68,34.0,95.8,87.9,99.2,59.6,38.9,24.0,28.8,25.2,16.6
3,10.4,1,26,83.8,1.84,37.4,101.8,86.4,101.2,60.1,37.3,22.8,32.4,29.4,18.2
4,28.7,1,24,83.58,1.81,34.4,97.3,100.0,101.9,63.2,42.2,24.0,32.2,27.7,17.7


In [17]:
from sklearn.model_selection import train_test_split


# Separate the regression target ('BodyFat') from the predictor columns.
y = df['BodyFat']
x = df.drop(columns='BodyFat')

# Hold out a fraction of the rows for testing; the fixed seed makes the
# split repeatable across runs.
test_size = 0.2
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=test_size,
    random_state=42,
)

In [18]:
# Shapes of the two partitions produced by the split.
print("size of train data:", x_train.shape)
print("size of test data:", x_test.shape)

# Preview the first training rows and their labels. Each heading and its
# preview are emitted by one print() call, separated by a newline, which
# yields the same text as two consecutive print() calls.
print("\nexamples in train data:", x_train.head(), sep="\n")
print("\nexamples in train label:", y_train.head(), sep="\n")

size of train data: (348, 14)
size of test data: (88, 14)

examples in train data:
     Sex  Age   Weight  Height  Neck  Chest  Abdomen    Hip  Thigh  Knee  \
265    0   18  53.9784  1.6002  30.5   86.0     66.4   92.3   48.5  35.0   
405    0   19  64.4112  1.5367  34.0   94.0     77.0   99.0   55.5  35.0   
31     1   29  72.6900  1.8100  37.3   93.5     84.5  100.6   58.5  38.8   
84     1   72  76.2000  1.7600  38.5  101.4     99.8   96.2   56.3  36.6   
299    0   21  67.5864  1.6637  32.0   86.3     72.3  107.3   57.8  39.0   

     Ankle  Biceps  Forearm  Wrist  
265   20.0    24.5     21.6   15.6  
405   22.5    32.5     26.5   17.3  
31    21.5    30.1     26.4   17.9  
84    22.0    29.7     26.3   18.0  
299   23.0    29.7     25.1   16.7  

examples in train label:
265    14.93
405    26.64
31      5.70
84     27.00
299    32.23
Name: BodyFat, dtype: float64


In [19]:
from sklearn.preprocessing import StandardScaler


# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transformation to both splits so no statistics
# from the test rows influence the scaling.
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [33]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# Random-forest hyperparameters.
n_estimators = 500    # number of trees in the ensemble
max_depth = 8         # depth limit for each tree
max_samples = 0.65    # fraction of training rows drawn for each tree
random_state = 0      # seed, so the fitted forest is reproducible

model = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_samples=max_samples,
    random_state=random_state,
)

# Fit on the standardized training features. fit() returns the estimator, so
# it stays the cell's last expression and its repr is shown as the cell output.
model.fit(x_train_scaled, y_train)

In [34]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def _regression_scores(y_true, y_pred, ndigits=3):
    """Return (MSE, MAE, R-squared) for the predictions, rounded to `ndigits`.

    The built-in round() is applied to a plain float rather than calling
    `.round()` on the metric result: `.round()` only exists on numpy scalars,
    so it breaks when a metric function returns a built-in float.
    """
    return tuple(
        round(float(metric(y_true, y_pred)), ndigits)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )


# eval on train data
y_train_pred = model.predict(x_train_scaled)
mse_train, mae_train, r2_train = _regression_scores(y_train, y_train_pred)

# eval on test data
y_test_pred = model.predict(x_test_scaled)
mse_test, mae_test, r2_test = _regression_scores(y_test, y_test_pred)

print(f"Train | Mean Squared Error: {mse_train} | Mean Absolute Error: {mae_train} | R-squared: {r2_train}")
print(f"Test | Mean Squared Error: {mse_test} | Mean Absolute Error: {mae_test} | R-squared: {r2_test}")

Train | Mean Squared Error: 6.361 | Mean Absolute Error: 2.059 | R-squared: 0.892
Test | Mean Squared Error: 17.907 | Mean Absolute Error: 3.376 | R-squared: 0.613


In [36]:
import pickle

# Persist the fitted random forest.
with open('rfr_bf.pkl', 'wb') as file:
    pickle.dump(model, file)

# The forest was fit on standardized features (x_train_scaled), so anything that
# loads it must apply the same fitted StandardScaler to its inputs before calling
# predict(). Persist the scaler next to the model so that is possible.
# NOTE(review): only unpickle these files from trusted sources; pickle.load can
# execute arbitrary code.
with open('scaler_bf.pkl', 'wb') as file:
    pickle.dump(scaler, file)