In [2]:
from pathlib import Path

import pandas as pd

# Read the raw food table from disk
raw_csv = "../data/raw/food.csv"
df = pd.read_csv(raw_csv)

# Report the dimensions, then show the first rows
n_rows, n_cols = df.shape
print("Shape:", (n_rows, n_cols))
df.head()


Shape: (7413, 48)


Unnamed: 0,Category,Description,Nutrient Data Bank Number,Data.Alpha Carotene,Data.Ash,Data.Beta Carotene,Data.Beta Cryptoxanthin,Data.Carbohydrate,Data.Cholesterol,Data.Choline,...,Data.Major Minerals.Potassium,Data.Major Minerals.Sodium,Data.Major Minerals.Zinc,Data.Vitamins.Vitamin A - IU,Data.Vitamins.Vitamin A - RAE,Data.Vitamins.Vitamin B12,Data.Vitamins.Vitamin B6,Data.Vitamins.Vitamin C,Data.Vitamins.Vitamin E,Data.Vitamins.Vitamin K
0,BUTTER,"BUTTER,WITH SALT",1001,0,2.11,158,0,0.06,215,19,...,24,576,0.09,2499,684,0.17,0.003,0.0,2.32,7.0
1,BUTTER,"BUTTER,WHIPPED,WITH SALT",1002,0,2.11,158,0,0.06,219,19,...,26,827,0.05,2499,684,0.13,0.003,0.0,2.32,7.0
2,BUTTER OIL,"BUTTER OIL,ANHYDROUS",1003,0,0.0,193,0,0.0,256,22,...,5,2,0.01,3069,840,0.01,0.001,0.0,2.8,8.6
3,CHEESE,"CHEESE,BLUE",1004,0,5.11,74,0,2.34,75,15,...,256,1395,2.66,763,198,1.22,0.166,0.0,0.25,2.4
4,CHEESE,"CHEESE,BRICK",1005,0,3.18,76,0,2.79,94,15,...,136,560,2.6,1080,292,1.26,0.065,0.0,0.26,2.5


In [3]:
# Keep only the columns used for the diabetic-suitability analysis
columns_to_keep = [
    'Description',
    'Data.Sugar Total',
    'Data.Carbohydrate',
    'Data.Fiber',
    'Data.Kilocalories',
    'Data.Fat.Saturated Fat'
]

# .copy() makes df an independent frame instead of a column selection of the
# raw table, so assigning a new column to it later does not trigger pandas'
# SettingWithCopyWarning.
df = df[columns_to_keep].copy()
df.head()

Unnamed: 0,Description,Data.Sugar Total,Data.Carbohydrate,Data.Fiber,Data.Kilocalories,Data.Fat.Saturated Fat
0,"BUTTER,WITH SALT",0.06,0.06,0.0,717,51.368
1,"BUTTER,WHIPPED,WITH SALT",0.06,0.06,0.0,717,50.489
2,"BUTTER OIL,ANHYDROUS",0.0,0.0,0.0,876,61.924
3,"CHEESE,BLUE",0.5,2.34,0.0,353,18.669
4,"CHEESE,BRICK",0.51,2.79,0.0,371,18.764


In [4]:
# Count missing values per column
df.isna().sum()


Description               0
Data.Sugar Total          0
Data.Carbohydrate         0
Data.Fiber                0
Data.Kilocalories         0
Data.Fat.Saturated Fat    0
dtype: int64

In [5]:
# Save the cleaned dataset (before labels are added)
cleaned_path = Path("../data/cleaned/food_cleaned.csv")
# to_csv does not create missing directories, so make sure the target exists
cleaned_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(cleaned_path, index=False)



In [None]:
# Diabetic suitability based on 4 NHS references
def is_suitable(row, max_sugar=15, max_carbs=45, max_sat_fat=5):
    """Label one food row as suitable (1) or not suitable (0) for diabetics.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'Data.Sugar Total', 'Data.Carbohydrate' and
        'Data.Fat.Saturated Fat'.
    max_sugar, max_carbs, max_sat_fat : number, optional
        Upper limits for each nutrient; a value strictly greater than its
        limit marks the food as not suitable. Defaults are 15, 45 and 5.
        NOTE(review): units are presumably grams per 100 g of food, and the
        defaults are attributed to NHS guidance by the comment above --
        confirm both against the dataset documentation and the references.

    Returns
    -------
    int or None
        None if any of the three nutrients is missing, 0 if any nutrient
        exceeds its limit, otherwise 1.
    """
    sugar = row['Data.Sugar Total']
    carbs = row['Data.Carbohydrate']
    sat_fat = row['Data.Fat.Saturated Fat']

    # A label cannot be derived from incomplete data; leave it unset.
    if pd.isnull(sugar) or pd.isnull(carbs) or pd.isnull(sat_fat):
        return None

    # Exceeding any single limit is enough to rule the food out.
    if sugar > max_sugar or carbs > max_carbs or sat_fat > max_sat_fat:
        return 0  # Not suitable for diabetics
    return 1  # Suitable

# Label every row with the is_suitable rule
df['Diabetic_Suitability'] = df.apply(is_suitable, axis=1)

# Preview the nutrients that drive the label next to the label itself
preview_columns = [
    'Description',
    'Data.Sugar Total',
    'Data.Carbohydrate',
    'Data.Fat.Saturated Fat',
    'Diabetic_Suitability',
]
df[preview_columns].head()


Unnamed: 0,Description,Data.Sugar Total,Data.Carbohydrate,Data.Fat.Saturated Fat,Diabetic_Suitability
0,"BUTTER,WITH SALT",0.06,0.06,51.368,0
1,"BUTTER,WHIPPED,WITH SALT",0.06,0.06,50.489,0
2,"BUTTER OIL,ANHYDROUS",0.0,0.0,61.924,0
3,"CHEESE,BLUE",0.5,2.34,18.669,0
4,"CHEESE,BRICK",0.51,2.79,18.764,0


In [8]:
# Save the final cleaned and labelled dataset
labelled_path = Path("../data/cleaned/food_labelled.csv")
# to_csv does not create missing directories, so make sure the target exists
labelled_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(labelled_path, index=False)
