## **Import libraries**

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## **Import dataset**

In [2]:
# Load the obesity data set; the relative path is resolved against the
# notebook's current working directory.
dataset = pd.read_csv('obesity_data.csv')

In [3]:
# Preview the first rows (head() defaults to five) to check the column names
# and the kind of values each column holds.
print(dataset.head())

   Age  Gender      Height     Weight        BMI  PhysicalActivityLevel  \
0   56    Male  173.575262  71.982051  23.891783                      4   
1   69    Male  164.127306  89.959256  33.395209                      2   
2   46  Female  168.072202  72.930629  25.817737                      4   
3   32    Male  168.459633  84.886912  29.912247                      3   
4   60    Male  183.568568  69.038945        NaN                      3   

  ObesityCategory  
0   Normal weight  
1           Obese  
2      Overweight  
3      Overweight  
4   Normal weight  


In [4]:
# Separate the features (X) from the target (y).
# The target is the last column; the features are every column strictly
# between the first and the last one.
# NOTE(review): the 1:-1 slice leaves out column 0 (Age) — confirm that
# dropping Age from the feature matrix is intentional.
y = dataset.iloc[:, -1].to_numpy()
X = dataset.iloc[:, 1:-1].to_numpy()

In [5]:
# Inspect the raw feature matrix; missing entries show up as nan.
print(X)

[['Male' 173.57526243837 71.98205082004 23.891782623968 4]
 ['Male' 164.12730582234 89.959255532644 33.395209450798 2]
 ['Female' 168.07220212761 72.930629265276 25.817737455643 4]
 ['Male' 168.45963284033 84.886912472418 29.912246975759 3]
 ['Male' 183.56856774219 69.038945498073 nan 3]
 ['Female' 166.40562725975 61.145867749603 22.081628323425 4]
 ['Male' 183.56633382183 92.208520833128 27.36434055905 3]
 ['Male' 142.87509508152 59.359745680959 29.078966474628 1]
 ['Male' 183.47855781329 75.157672346864 22.325576683307 4]
 ['Male' nan 81.533460448367 24.353244006477 2]
 ['Male' 179.02254673389 82.622390567063 25.779963285893 4]
 ['Female' 149.88081996796 52.5183599467 23.378628760763 1]
 ['Male' 180.18886995222 85.779256087424 26.41960695286 1]
 ['Male' 169.49880824325 55.315671308391 nan 1]
 ['Male' 144.70662560075 82.16055483025 39.236163052315 1]
 ['Male' 182.98177707835 78.027602065736 23.304113045545 1]
 ['Male' 184.44173078549 nan 24.18650154871 2]
 ['Female' nan 51.92494718351

In [6]:
# Inspect the raw target labels (still strings at this point).
print(y)

['Normal weight' 'Obese' 'Overweight' 'Overweight' 'Normal weight'
 'Normal weight' 'Overweight' 'Overweight' 'Normal weight' 'Normal weight'
 'Overweight' 'Normal weight' 'Overweight' 'Normal weight' 'Obese'
 'Normal weight' 'Normal weight' 'Normal weight' 'Overweight' 'Overweight'
 'Normal weight' 'Underweight' 'Obese' 'Normal weight' 'Underweight'
 'Underweight' 'Normal weight' 'Obese' 'Overweight' 'Obese' 'Overweight'
 'Obese' 'Normal weight' 'Normal weight' 'Overweight' 'Normal weight'
 'Normal weight' 'Obese' 'Obese' 'Obese' 'Underweight' 'Overweight'
 'Overweight' 'Normal weight' 'Normal weight' 'Underweight' 'Obese'
 'Normal weight' 'Normal weight' 'Normal weight' 'Overweight'
 'Normal weight' 'Underweight' 'Overweight' 'Underweight' 'Underweight'
 'Overweight' 'Overweight' 'Overweight' 'Normal weight' 'Overweight'
 'Obese' 'Overweight' 'Normal weight' 'Normal weight' 'Normal weight'
 'Underweight' 'Obese' 'Overweight' 'Obese' 'Obese' 'Normal weight'
 'Overweight' 'Obese' 'Norm

In [7]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line after the summary.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    100 non-null    int64  
 1   Gender                 100 non-null    object 
 2   Height                 94 non-null     float64
 3   Weight                 95 non-null     float64
 4   BMI                    91 non-null     float64
 5   PhysicalActivityLevel  100 non-null    int64  
 6   ObesityCategory        100 non-null    object 
dtypes: float64(3), int64(2), object(2)
memory usage: 5.6+ KB
None


## **Menangani Missing Value**

In [8]:
# Count the missing values in each column.
print(dataset.isnull().sum())

Age                      0
Gender                   0
Height                   6
Weight                   5
BMI                      9
PhysicalActivityLevel    0
ObesityCategory          0
dtype: int64


In [9]:
from sklearn.impute import SimpleImputer

# Replace every NaN in columns 1-4 of X (Height, Weight, BMI,
# PhysicalActivityLevel) with the mean of its column. Column 0 (Gender)
# holds strings and is therefore left out of the slice.
# NOTE(review): the means are learned from the full data set, before the
# train/test split further down — confirm that letting test rows influence
# the imputed values is acceptable here.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:5] = imputer.fit_transform(X[:, 1:5])

In [10]:
# Inspect X after imputation: the former nan entries now hold column means.
print(X)

[['Male' 173.57526243837 71.98205082004 23.891782623968 4.0]
 ['Male' 164.12730582234 89.959255532644 33.395209450798 2.0]
 ['Female' 168.07220212761 72.930629265276 25.817737455643 4.0]
 ['Male' 168.45963284033 84.886912472418 29.912246975759 3.0]
 ['Male' 183.56856774219 69.038945498073 24.647862924293513 3.0]
 ['Female' 166.40562725975 61.145867749603 22.081628323425 4.0]
 ['Male' 183.56633382183 92.208520833128 27.36434055905 3.0]
 ['Male' 142.87509508152 59.359745680959 29.078966474628 1.0]
 ['Male' 183.47855781329 75.157672346864 22.325576683307 4.0]
 ['Male' 170.8731509163998 81.533460448367 24.353244006477 2.0]
 ['Male' 179.02254673389 82.622390567063 25.779963285893 4.0]
 ['Female' 149.88081996796 52.5183599467 23.378628760763 1.0]
 ['Male' 180.18886995222 85.779256087424 26.41960695286 1.0]
 ['Male' 169.49880824325 55.315671308391 24.647862924293513 1.0]
 ['Male' 144.70662560075 82.16055483025 39.236163052315 1.0]
 ['Male' 182.98177707835 78.027602065736 23.304113045545 1.0]


## **Encoding data kategori (Atribut)**

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 of X (Gender). All remaining columns are passed
# through unchanged and end up after the encoded columns.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))



In [12]:
# Show the result: the Gender column is now two 0/1 indicator columns at the
# front of X.
print(X)

[[0.0 1.0 173.57526243837 71.98205082004 23.891782623968 4.0]
 [0.0 1.0 164.12730582234 89.959255532644 33.395209450798 2.0]
 [1.0 0.0 168.07220212761 72.930629265276 25.817737455643 4.0]
 [0.0 1.0 168.45963284033 84.886912472418 29.912246975759 3.0]
 [0.0 1.0 183.56856774219 69.038945498073 24.647862924293513 3.0]
 [1.0 0.0 166.40562725975 61.145867749603 22.081628323425 4.0]
 [0.0 1.0 183.56633382183 92.208520833128 27.36434055905 3.0]
 [0.0 1.0 142.87509508152 59.359745680959 29.078966474628 1.0]
 [0.0 1.0 183.47855781329 75.157672346864 22.325576683307 4.0]
 [0.0 1.0 170.8731509163998 81.533460448367 24.353244006477 2.0]
 [0.0 1.0 179.02254673389 82.622390567063 25.779963285893 4.0]
 [1.0 0.0 149.88081996796 52.5183599467 23.378628760763 1.0]
 [0.0 1.0 180.18886995222 85.779256087424 26.41960695286 1.0]
 [0.0 1.0 169.49880824325 55.315671308391 24.647862924293513 1.0]
 [0.0 1.0 144.70662560075 82.16055483025 39.236163052315 1.0]
 [0.0 1.0 182.98177707835 78.027602065736 23.30411304

## **Encoding data kategori (Label)**

In [13]:
from sklearn.preprocessing import LabelEncoder

# Turn the string class labels in y into integer codes. The fitted encoder
# stays available as `le`.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [14]:
# Inspect the encoded target: every class label is now an integer code.
print(y)

[0 1 2 2 0 0 2 2 0 0 2 0 2 0 1 0 0 0 2 2 0 3 1 0 3 3 0 1 2 1 2 1 0 0 2 0 0
 1 1 1 3 2 2 0 0 3 1 0 0 0 2 0 3 2 3 3 2 2 2 0 2 1 2 0 0 0 3 1 2 1 1 0 2 1
 0 0 3 3 0 0 3 0 0 2 0 3 0 0 0 0 3 0 1 3 0 0 0 2 2 1]


## **Membagi dataset ke dalam training set dan test set**

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Inspect the training features produced by the split.
print(X_train)

[[1.0 0.0 166.68413375737 37.658484529378 13.554213249636 1.0]
 [1.0 0.0 170.43271694744 57.069251778933 19.646999946733 2.0]
 [0.0 1.0 172.56161458434 62.105582031595 24.647862924293513 4.0]
 [0.0 1.0 154.08685504124 67.208164783717 28.306805067049 3.0]
 [1.0 0.0 169.96987396816 89.895132969808 31.116609835642 3.0]
 [0.0 1.0 182.98177707835 78.027602065736 23.304113045545 1.0]
 [0.0 1.0 174.545634823 33.213896148516 10.901891092284 3.0]
 [0.0 1.0 181.01656713996 63.501101747687 19.379591186017 4.0]
 [0.0 1.0 170.8731509163998 81.533460448367 24.353244006477 2.0]
 [0.0 1.0 183.78274786336 92.719553598705 24.647862924293513 4.0]
 [1.0 0.0 149.88081996796 52.5183599467 23.378628760763 1.0]
 [0.0 1.0 178.0185733204 64.969064158393 20.501044044821 2.0]
 [1.0 0.0 171.02128920871 35.699230776768 12.205582384203 1.0]
 [0.0 1.0 176.40903849409 89.952746294593 28.904985863755 4.0]
 [1.0 0.0 190.16887544187 42.915917906399 11.866961386675 1.0]
 [1.0 0.0 166.40562725975 61.145867749603 22.0816283

In [17]:
# Inspect the held-out test features produced by the split.
print(X_test)

[[0.0 1.0 179.74850284696 86.063428470094 26.637169812729 2.0]
 [1.0 0.0 170.54318886454 76.974712446116 26.465452059158 4.0]
 [0.0 1.0 169.12891156638 88.331849535596 30.880307483772 1.0]
 [1.0 0.0 170.8731509163998 47.724667625787 18.373122686814 2.0]
 [1.0 0.0 167.54930381684 67.380432222851 24.002061114809 2.0]
 [1.0 0.0 167.00261423391 83.763178099962 30.033545730575 2.0]
 [0.0 1.0 143.52747120102 83.79621421492 40.677507335848 2.0]
 [1.0 0.0 175.06734662487 53.558154628929 17.474924408752 4.0]
 [0.0 1.0 179.02254673389 82.622390567063 25.779963285893 4.0]
 [0.0 1.0 173.57526243837 71.98205082004 23.891782623968 4.0]
 [0.0 1.0 171.754210753 78.818946141016 26.718731616103 1.0]
 [0.0 1.0 169.75722720108 70.89119445815815 25.50025605552 1.0]
 [1.0 0.0 176.83585745201 100.53517478558 32.149733119416 1.0]
 [1.0 0.0 175.67853473771 70.102046475892 22.713982947471 2.0]
 [1.0 0.0 171.5136947457 53.871031176605 18.312923597353 1.0]
 [0.0 1.0 183.56856774219 69.038945498073 24.647862924293

In [18]:
# Inspect the encoded labels of the training rows.
print(y_train)

[3 0 0 2 1 0 3 0 0 2 0 0 3 2 3 0 3 0 0 0 0 2 2 0 1 2 0 3 2 0 3 2 0 1 0 0 2
 0 0 0 2 1 3 0 2 1 1 2 1 2 0 2 2 0 2 2 0 0 0 0 0 0 1 1 1 3 3 2 0 0 0 0 0 0
 0 2 0 1 1 0]


In [19]:
# Inspect the encoded labels of the held-out test rows.
print(y_test)

[2 2 1 3 0 1 1 3 2 0 2 2 1 0 3 0 3 3 2 1]


## **Feature Scaling**

In [20]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns (index 2 onward); columns 0-1 are the
# one-hot Gender indicators and are left untouched.
# The scaler learns its statistics from the training rows only, and the same
# statistics are then applied to the test rows.
sc = StandardScaler()
sc.fit(X_train[:, 2:])
X_train[:, 2:] = sc.transform(X_train[:, 2:])
X_test[:, 2:] = sc.transform(X_test[:, 2:])

In [21]:
# Inspect the training features after scaling (columns 2 onward were rescaled).
print(X_train)

[[1.0 0.0 -0.4378126256108029 -2.0248107197311995 -1.833897162966501
  -1.3378351130848638]
 [1.0 0.0 -0.05713974201662717 -0.8291769485136596 -0.8094355774464029
  -0.4605661864718383]
 [0.0 1.0 0.15905223271727387 -0.5189570352217058 0.031426292443350747
  1.2939716667542127]
 [0.0 1.0 -1.7170805973105818 -0.20465620514972283 0.6466530934537317
  0.4167027401411872]
 [1.0 0.0 -0.10414197263020371 1.1927798148226565 1.1191030890536493
  0.4167027401411872]
 [0.0 1.0 1.2172315775487779 0.46178239738664034 -0.19451631801781152
  -1.3378351130848638]
 [0.0 1.0 0.3605317429921425 -2.298581452180816 -2.2798675042432714
  0.4167027401411872]
 [0.0 1.0 1.0176622746461286 -0.4329980167291335 -0.8543985831455204
  1.2939716667542127]
 [0.0 1.0 -0.012413171871170131 0.6777307226377101 -0.01811192031146815
  -0.4605661864718383]
 [0.0 1.0 1.298571071150558 1.366754014019247 0.031426292443350747
  1.2939716667542127]
 [1.0 0.0 -2.144208239584897 -1.1094955929969468 -0.18198699579572103
  -1.33783

In [22]:
# Inspect the test features after applying the scaler fitted on the training rows.
print(X_test)

[[0.0 1.0 0.8888889044091199 0.9567605353019406 0.3659150231884614
  -0.4605661864718383]
 [1.0 0.0 -0.04592119327265537 0.3969281662067303 0.33704182430081386
  1.2939716667542127]
 [0.0 1.0 -0.18954266036948098 1.0964871518949895 1.0793704192435916
  -1.3378351130848638]
 [1.0 0.0 -0.012413171871170131 -1.4047698783470621 -1.0236295714814185
  -0.4605661864718383]
 [1.0 0.0 -0.3499536227186637 -0.1940451476236738 -0.07716098929941598
  -0.4605661864718383]
 [1.0 0.0 -0.4054705712330085 0.8150733474939211 0.9369930587081738
  -0.4605661864718383]
 [0.0 1.0 -2.7893980280887996 0.8171082539046663 2.726704454242266
  -0.4605661864718383]
 [1.0 0.0 0.4135121694500802 -1.0454479630799014 -1.1746556417840681
  1.2939716667542127]
 [0.0 1.0 0.8151672360045795 0.7448049195342139 0.22178144339007405
  1.2939716667542127]
 [0.0 1.0 0.26198932440696004 0.08939808397697226 -0.09570358453413255
  1.2939716667542127]
 [0.0 1.0 0.07705945594887514 0.5105263593791457 0.3796290982999731
  -1.337835113