In [None]:
import pandas as pd
import numpy as np
import ast
import re

In [None]:
# 1) Load the raw snack table from disk.
#    - barcode is read as text so long numeric codes are not shown in
#      scientific notation
#    - malformed rows are skipped instead of aborting the load
df = pd.read_csv(
    "snacks_openfoodfacts.csv",
    engine="python",
    on_bad_lines="skip",
    keep_default_na=True,
    dtype={"barcode": "str"},
)


In [None]:
# quick diagnostics: shape, column names, dtypes, and the 30 columns with
# the highest number of missing values
null_counts = df.isna().sum().sort_values(ascending=False)
for summary in (df.shape, df.columns.tolist(), df.dtypes, null_counts.head(30)):
    print(summary)

In [None]:
# Preview the first three rows of the freshly loaded frame.
df.head(3)

In [None]:
# Header names, in file order, of the columns whose labels are off by one.
wrong_cols = [
    "fruits_vegetables_nuts_percent",
    "nutrient_level_fat",
    "nutrient_level_saturated_fat",
    "nutrient_level_sugars",
    "nutrient_level_salt",
    "additives",
    "packaging",
    "stores",
    "countries",
    "origins",
    "manufacturing_places",
    "ecoscore_grade",
    "ecoscore_score",
    "carbon_footprint_100g"
]

# Every listed column is relabelled with the NEXT name in the list; the final
# entry keeps its own name.
# NOTE(review): presumably the CSV header carries one more name than the data
# rows have fields - confirm against the raw file.
correct_cols = wrong_cols[1:] + [wrong_cols[-1]]

# Guard on the first misaligned name: it disappears once the fix has been
# applied, so re-running this cell is a no-op instead of shifting the labels a
# second time and dropping another column.
if wrong_cols[0] in df.columns:
    df = df.rename(columns=dict(zip(wrong_cols, correct_cols)))
    # Drop the trailing column of the frame.
    # NOTE(review): assumes "carbon_footprint_100g" is the last column, so the
    # column removed here is the left-over duplicate of that name - confirm.
    df = df.iloc[:, :-1]

In [None]:
# Re-inspect the first rows after the column-name correction.
df.head()

In [None]:
# (rows, columns) before de-duplicating on barcode.
df.shape

In [None]:
# Keep barcodes as text, but leave missing barcodes missing: a plain
# astype(str) can turn NaN into the literal string "nan".
barcode_present = df["barcode"].notna()
df["barcode"] = df["barcode"].astype(str).where(barcode_present)

# Remove repeated barcodes (first occurrence wins). Rows without a barcode
# cannot be matched to each other, so they are never treated as duplicates;
# previously all of them except the first were discarded.
is_repeat = barcode_present & df.duplicated(subset="barcode", keep="first")
df = df[~is_repeat].copy()

In [None]:
# (rows, columns) after de-duplicating on barcode.
df.shape

In [None]:
# 2) Treat empty strings and textual "unknown"/"none"/"nan" placeholders as
#    real missing values (exact, case-sensitive matches only)
missing_tokens = ["", "unknown", "Unknown", "NONE", "None", "nan", "NaN"]
df = df.replace(to_replace=missing_tokens, value=np.nan)

In [None]:
# Columns that are coerced to a numeric dtype further down.
numeric_cols = [
    # energy
    "energy_kj_100g",
    "energy_kcal_100g",
    # macronutrients and salt
    "fat_100g",
    "saturated_fat_100g",
    "carbohydrates_100g",
    "sugars_100g",
    "fiber_100g",
    "proteins_100g",
    "salt_100g",
    # environmental indicators
    "ecoscore_score",
    "carbon_footprint_100g",
]

In [None]:
# Store brand as a pandas categorical (saves memory compared with
# plain strings)
df = df.astype({"brand": "category"})

In [None]:
# Coerce every listed column that exists in the frame to a numeric dtype;
# values that cannot be parsed become NaN.
for col in [name for name in numeric_cols if name in df.columns]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

In [None]:
# Show every column when a DataFrame is displayed (no "..." truncation).
pd.options.display.max_columns = None

In [None]:
# Inspect the first rows again now that all columns are displayed.
df.head()

In [None]:
# Share of missing values per column (0.0 - 1.0), worst columns first.
missing = df.isna().mean().sort_values(ascending=False)
print(missing.head(30))

In [None]:
# Negative values: a per-100g nutrient or energy amount cannot be below zero,
# so such entries are treated as missing. Previously only sugars_100g was
# cleaned, which let negative values in the other nutrition columns through.
# NOTE(review): no upper-bound / outlier filtering is applied here - decide
# whether values such as > 100 g per 100 g should also be cleared.
non_negative_cols = [
    "energy_kj_100g", "energy_kcal_100g",
    "fat_100g", "saturated_fat_100g",
    "carbohydrates_100g", "sugars_100g",
    "fiber_100g", "proteins_100g",
    "salt_100g",
]

for col in non_negative_cols:
    # skip columns that are absent from this frame
    if col in df.columns:
        df.loc[df[col] < 0, col] = np.nan

# Encoding

In [None]:
# Binary flag: 1 where the trimmed, upper-cased text is exactly "TRUE",
# 0 for everything else (a stringified missing value such as "NAN" included).
df["contains_palm_oil"] = (
    df["contains_palm_oil"]
    .astype(str)
    .str.strip()
    .str.upper()
    .eq("TRUE")
    .astype(int)
)

In [None]:
# Ordinal encoding of the nutrient-level labels; labels outside the map
# (and missing values) end up as missing.
level_map = {
    "low": 0,
    "moderate": 1,
    "high": 2,
}

level_cols = (
    "nutrient_level_fat",
    "nutrient_level_saturated_fat",
    "nutrient_level_sugars",
    "nutrient_level_salt",
)

for col in level_cols:
    if col not in df.columns:
        continue
    # lower-case first so the lookup is case-insensitive
    lowered = df[col].astype("string").str.lower()
    df[col] = lowered.map(level_map)

In [None]:
def count_additives(val):
    """Return the number of additives listed in ``val``.

    Parameters
    ----------
    val : object
        One cell of the ``additives`` column. Supported forms are a string
        holding a Python list literal (e.g. ``"['en:e330', 'en:e322']"``,
        surrounding whitespace allowed) or an already-parsed list/tuple/set.

    Returns
    -------
    int or float
        The number of entries; ``0`` for missing values, non-string values
        and strings that are not a bracketed list; ``np.nan`` when a
        bracketed string cannot be parsed.
    """
    # Already-parsed containers can be counted directly.
    if isinstance(val, (list, tuple, set)):
        return len(val)

    if not isinstance(val, str):
        return 0

    # Tolerate padding around the literal, e.g. " ['a', 'b'] ".
    text = val.strip()
    if not text.startswith("["):
        return 0

    try:
        return len(ast.literal_eval(text))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors ast.literal_eval is documented to raise on
        # malformed input; the count is unknown rather than zero.
        return np.nan

# Derive the per-product additive count from the raw additives column.
if "additives" in df:
    df["additives_count"] = df["additives"].map(count_additives)


In [None]:
if "nutriscore_letter" in df.columns:
    # Trim and upper-case the label so "a", " A " and "A" are the same grade.
    df["nutriscore_letter"] = (
        df["nutriscore_letter"]
        .astype("string")
        .str.strip()
        .str.upper()
    )

    # Drop rows without a usable label: missing values AND any text that is
    # not one of the five grades. Checking only for NaN let other strings
    # through, which then turned into NaN labels when the letters were encoded.
    # The filter sits inside the guard so a frame without the column is left
    # untouched instead of raising KeyError.
    has_valid_label = df["nutriscore_letter"].isin(["A", "B", "C", "D", "E"])
    df = df[has_valid_label].copy()

In [None]:
# Ordinal-encode the NutriScore letter: A -> 1.0 ... E -> 5.0
# (anything else becomes NaN)
if "nutriscore_letter" in df.columns:
    mapping = {letter: rank for rank, letter in enumerate("ABCDE", start=1)}
    encoded_letters = df["nutriscore_letter"].map(mapping)
    df["nutriscore_letter"] = encoded_letters.astype("float")

In [None]:
# Binary-encode vegetarian status: "yes" -> 1.0, "no" -> 0.0,
# any other value -> NaN
if "vegetarian_status" in df:
    veg_map = {"yes": 1, "no": 0}
    vegetarian_codes = df["vegetarian_status"].map(veg_map)
    df["vegetarian_status"] = vegetarian_codes.astype("float")

In [None]:
# Binary-encode vegan status: "yes" -> 1.0, "no" -> 0.0,
# any other value -> NaN
if "vegan_status" in df:
    vegan_map = {"yes": 1, "no": 0}
    vegan_codes = df["vegan_status"].map(vegan_map)
    df["vegan_status"] = vegan_codes.astype("float")

In [None]:
# Encode EcoScore Grade (A-E) as 1.0-5.0; unrecognised or missing grades
# become NaN.
if "ecoscore_grade" in df.columns:
    eco_map = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5}
    df["ecoscore_grade"] = (
        df["ecoscore_grade"]
        .astype("string")
        # Trim and upper-case first, as is done for nutriscore_letter: the
        # map keys are upper-case, so a lower-case grade such as "a" would
        # otherwise silently become NaN.
        .str.strip()
        .str.upper()
        .map(eco_map)
        .astype("float")
    )

# Handling missing values

In [None]:
# Nutrition columns inspected when deciding whether a row carries any
# nutrition information at all.
nutr_cols = [
    "energy_kj_100g", "energy_kcal_100g",
    "fat_100g", "saturated_fat_100g",
    "carbohydrates_100g", "sugars_100g",
    "fiber_100g", "proteins_100g",
]

# A row has nutrition data when at least one of these values is neither
# missing nor zero.
has_nutrition = df[nutr_cols].fillna(0).ne(0).any(axis=1)

# rows in which every nutrition column is 0 or NaN
mask_all_empty = ~has_nutrition

# discard the all-empty rows
df = df.loc[has_nutrition].copy()

In [None]:
# --- NOVA group: numeric coercion, then fill gaps with the most common value ---
if "nova_group" in df.columns:
    # non-numeric entries turn into NaN
    nova_numeric = pd.to_numeric(df["nova_group"], errors="coerce")
    nova_mode = nova_numeric.mode(dropna=True)
    # with no valid value at all there is no mode, so nothing is filled
    if nova_mode.empty:
        df["nova_group"] = nova_numeric
    else:
        df["nova_group"] = nova_numeric.fillna(nova_mode.iloc[0])

In [None]:
# --- Energy: kcal is the main unit; recover it from kJ where only kJ is given ---
if {"energy_kcal_100g", "energy_kj_100g"}.issubset(df.columns):
    # rows with kJ but without kcal
    mask_missing_kcal = df["energy_kj_100g"].notna() & df["energy_kcal_100g"].isna()

    # kcal ≈ kJ / 4.184 for exactly those rows, all others keep their value
    kcal_from_kj = df["energy_kj_100g"] / 4.184
    df["energy_kcal_100g"] = df["energy_kcal_100g"].mask(mask_missing_kcal, kcal_from_kj)

    # kJ is redundant from here on
    df = df.drop(columns="energy_kj_100g")

In [None]:
# Nutrition columns whose remaining gaps are filled with the column median.
numeric_nut_cols = [
    "energy_kcal_100g",
    "fat_100g",
    "saturated_fat_100g",
    "carbohydrates_100g",
    "sugars_100g",
    "fiber_100g",
    "proteins_100g",
    "salt_100g",
]

# Only columns that are actually present in the frame are imputed.
for col in (name for name in numeric_nut_cols if name in df.columns):
    df[col] = df[col].fillna(df[col].median())

In [None]:
# --- additives_count: an unknown count is treated as zero additives ---
if "additives_count" in df:
    df = df.fillna({"additives_count": 0})