**Data Preparation**

Sumber: https://www.kaggle.com/datasets/anasfikrihanif/indonesian-food-and-drink-nutrition-dataset

In [1]:
import pandas as pd

In [3]:
# Load the dataset straight from the zip archive into the working frame.
df = pd.read_csv(filepath_or_buffer="/content/archive.zip")

# Preview the loaded frame.
df

Unnamed: 0,id,calories,proteins,fat,carbohydrate,name,image
0,1,280.0,9.2,28.4,0.0,Abon,https://img-cdn.medkomtek.com/PbrY9X3ignQ8sVuj...
1,2,513.0,23.7,37.0,21.3,Abon haruwan,https://img-global.cpcdn.com/recipes/cbf330fbd...
2,3,0.0,0.0,0.2,0.0,Agar-agar,https://res.cloudinary.com/dk0z4ums3/image/upl...
3,4,45.0,1.1,0.4,10.8,Akar tonjong segar,https://images.tokopedia.net/img/cache/200-squ...
4,5,37.0,4.4,0.5,3.8,Aletoge segar,https://nilaigizi.com/assets/images/produk/pro...
...,...,...,...,...,...,...,...
1341,1342,42.0,1.2,0.6,9.3,Wortel Segar,https://www.astronauts.id/blog/wp-content/uplo...
1342,1343,37.0,1.0,0.6,8.3,Wortel kukus,https://www.wikihow.com/images_en/thumb/b/bf/S...
1343,1344,28.0,0.7,0.5,6.3,Wortel rebus,https://asset-a.grid.id/crop/0x222:594x690/700...
1344,1345,254.0,3.0,1.1,58.1,Yangko,https://serikatnews.com/wp-content/uploads/202...


**Data Cleaning**

Missing Values

In [4]:
# Share of missing values per column, expressed as a percentage of all rows.
df.isna().sum().div(len(df)).mul(100)

Unnamed: 0,0
id,0.0
calories,0.0
proteins,0.0
fat,0.0
carbohydrate,0.0
name,0.0
image,0.0


In [5]:
# Same per-column null percentage, shown as a one-column table.
df.isna().sum().div(len(df)).mul(100).to_frame(name='Null Ratio %')

Unnamed: 0,Null Ratio %
id,0.0
calories,0.0
proteins,0.0
fat,0.0
carbohydrate,0.0
name,0.0
image,0.0


Duplicated Values

In [6]:
# Select the rows that pandas flags as duplicates (empty result = no duplicates).
df.loc[df.duplicated()]

Unnamed: 0,id,calories,proteins,fat,carbohydrate,name,image


Outliers

In [7]:
# Percentage of IQR-rule outliers for every int64/float64 column of df.
results = []

for name in df.select_dtypes(include=['float64', 'int64']).columns:
    series = df[name]

    # Quartiles and the 1.5 * IQR fences around them.
    first_q = series.quantile(0.25)
    third_q = series.quantile(0.75)
    spread = third_q - first_q
    fence_low = first_q - 1.5 * spread
    fence_high = third_q + 1.5 * spread

    # Count the rows lying strictly outside the fences.
    outside = (series < fence_low) | (series > fence_high)
    n_outside = int(outside.sum())

    results.append({
        'Kolom': name,
        'Persentase Outliers': (n_outside / len(df)) * 100,
    })

# Build the summary table from the collected records: column names become the
# row labels, and the columns axis is titled 'Kolom'.
results_df = (
    pd.DataFrame(results)
    .set_index('Kolom')
    .rename_axis(None, axis=0)
    .rename_axis('Kolom', axis=1)
)

# Show the summary table.
display(results_df)

Kolom,Persentase Outliers
id,0.0
calories,0.965825
proteins,4.754829
fat,12.035661
carbohydrate,1.263001


In [10]:
# Cap (winsorize) outliers in the numeric nutrition columns.
#
# `id` is a row identifier, not a measurement, so it is intentionally NOT
# clipped: capping an identifier has no statistical meaning, and writing
# float-bounded values back into an integer column risks dtype-upcast warnings.
columns_to_impute = ["calories", "proteins", "fat", "carbohydrate"]

for col in columns_to_impute:
    # Quartiles and the 1.5 * IQR fences for this column.
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Values outside the fences are replaced by the nearest fence value;
    # no rows are removed. The column is written back through .loc.
    df.loc[:, col] = df[col].clip(lower=lower_bound, upper=upper_bound)

In [11]:
# Display the frame again to inspect the values after the clipping step.
df

Unnamed: 0,id,calories,proteins,fat,carbohydrate,name,image
0,1,280.0,9.2,19.9375,0.0,Abon,https://img-cdn.medkomtek.com/PbrY9X3ignQ8sVuj...
1,2,513.0,23.7,19.9375,21.3,Abon haruwan,https://img-global.cpcdn.com/recipes/cbf330fbd...
2,3,0.0,0.0,0.2000,0.0,Agar-agar,https://res.cloudinary.com/dk0z4ums3/image/upl...
3,4,45.0,1.1,0.4000,10.8,Akar tonjong segar,https://images.tokopedia.net/img/cache/200-squ...
4,5,37.0,4.4,0.5000,3.8,Aletoge segar,https://nilaigizi.com/assets/images/produk/pro...
...,...,...,...,...,...,...,...
1341,1342,42.0,1.2,0.6000,9.3,Wortel Segar,https://www.astronauts.id/blog/wp-content/uplo...
1342,1343,37.0,1.0,0.6000,8.3,Wortel kukus,https://www.wikihow.com/images_en/thumb/b/bf/S...
1343,1344,28.0,0.7,0.5000,6.3,Wortel rebus,https://asset-a.grid.id/crop/0x222:594x690/700...
1344,1345,254.0,3.0,1.1000,58.1,Yangko,https://serikatnews.com/wp-content/uploads/202...


Inconsistent Values

In [12]:
# Display the frame for a visual scan of its values.
df

Unnamed: 0,id,calories,proteins,fat,carbohydrate,name,image
0,1,280.0,9.2,19.9375,0.0,Abon,https://img-cdn.medkomtek.com/PbrY9X3ignQ8sVuj...
1,2,513.0,23.7,19.9375,21.3,Abon haruwan,https://img-global.cpcdn.com/recipes/cbf330fbd...
2,3,0.0,0.0,0.2000,0.0,Agar-agar,https://res.cloudinary.com/dk0z4ums3/image/upl...
3,4,45.0,1.1,0.4000,10.8,Akar tonjong segar,https://images.tokopedia.net/img/cache/200-squ...
4,5,37.0,4.4,0.5000,3.8,Aletoge segar,https://nilaigizi.com/assets/images/produk/pro...
...,...,...,...,...,...,...,...
1341,1342,42.0,1.2,0.6000,9.3,Wortel Segar,https://www.astronauts.id/blog/wp-content/uplo...
1342,1343,37.0,1.0,0.6000,8.3,Wortel kukus,https://www.wikihow.com/images_en/thumb/b/bf/S...
1343,1344,28.0,0.7,0.5000,6.3,Wortel rebus,https://asset-a.grid.id/crop/0x222:594x690/700...
1344,1345,254.0,3.0,1.1000,58.1,Yangko,https://serikatnews.com/wp-content/uploads/202...


Di dalam data tidak ada nilai yang inkonsisten.



Construct Data

In [14]:

# Derived feature: element-wise sum of the `calories` and `fat` columns.
# NOTE(review): these two columns presumably use different units (kcal vs.
# grams) — confirm that adding them directly is the intended definition.
df["bad_nutrisi"] = df["calories"].add(df["fat"])
print(df.head())

   id  calories  proteins      fat  carbohydrate                name  \
0   1     280.0       9.2  19.9375           0.0                Abon   
1   2     513.0      23.7  19.9375          21.3        Abon haruwan   
2   3       0.0       0.0   0.2000           0.0           Agar-agar   
3   4      45.0       1.1   0.4000          10.8  Akar tonjong segar   
4   5      37.0       4.4   0.5000           3.8       Aletoge segar   

                                               image  bad_nutrisi  
0  https://img-cdn.medkomtek.com/PbrY9X3ignQ8sVuj...     299.9375  
1  https://img-global.cpcdn.com/recipes/cbf330fbd...     532.9375  
2  https://res.cloudinary.com/dk0z4ums3/image/upl...       0.2000  
3  https://images.tokopedia.net/img/cache/200-squ...      45.4000  
4  https://nilaigizi.com/assets/images/produk/pro...      37.5000  


Data Reduction

In [16]:
# Remove the `id` column from the working frame.
df = df.drop(columns='id')

In [17]:
# Display the frame after the `id` column has been dropped.
df

Unnamed: 0,calories,proteins,fat,carbohydrate,name,image,bad_nutrisi
0,280.0,9.2,19.9375,0.0,Abon,https://img-cdn.medkomtek.com/PbrY9X3ignQ8sVuj...,299.9375
1,513.0,23.7,19.9375,21.3,Abon haruwan,https://img-global.cpcdn.com/recipes/cbf330fbd...,532.9375
2,0.0,0.0,0.2000,0.0,Agar-agar,https://res.cloudinary.com/dk0z4ums3/image/upl...,0.2000
3,45.0,1.1,0.4000,10.8,Akar tonjong segar,https://images.tokopedia.net/img/cache/200-squ...,45.4000
4,37.0,4.4,0.5000,3.8,Aletoge segar,https://nilaigizi.com/assets/images/produk/pro...,37.5000
...,...,...,...,...,...,...,...
1341,42.0,1.2,0.6000,9.3,Wortel Segar,https://www.astronauts.id/blog/wp-content/uplo...,42.6000
1342,37.0,1.0,0.6000,8.3,Wortel kukus,https://www.wikihow.com/images_en/thumb/b/bf/S...,37.6000
1343,28.0,0.7,0.5000,6.3,Wortel rebus,https://asset-a.grid.id/crop/0x222:594x690/700...,28.5000
1344,254.0,3.0,1.1000,58.1,Yangko,https://serikatnews.com/wp-content/uploads/202...,255.1000
