In [None]:
import pandas as pd
import numpy as np
import ast
import re

In [None]:
# 1) Load the raw CSV export into `df`
csv_path = "snacks_openfoodfacts.csv"
read_options = {
    # Read barcodes as text so they are never shown in scientific notation.
    "dtype": {"barcode": "str"},
    "keep_default_na": True,
    "engine": "python",
    # Rows the parser cannot handle are dropped without any message.
    "on_bad_lines": "skip",
}
df = pd.read_csv(csv_path, **read_options)


In [None]:
# Quick diagnostics: frame size, column labels, dtypes, and the 30 columns
# with the highest number of missing values.
missing_per_column = df.isna().sum().sort_values(ascending=False)
for report in (df.shape, df.columns.tolist(), df.dtypes, missing_per_column.head(30)):
    print(report)

In [None]:
# Preview the first three rows of the freshly loaded frame.
df.head(3)

In [None]:
# Column labels that are misaligned in the raw header, in order.
# The fix below relabels each of these columns with the name of the next entry
# in the list and then drops the frame's last column.
# NOTE(review): presumably the header carries one name too many, so the values
# sit one label to the left of where they belong — confirm against the raw CSV.
wrong_cols = [
    "fruits_vegetables_nuts_percent",
    "nutrient_level_fat",
    "nutrient_level_saturated_fat",
    "nutrient_level_sugars",
    "nutrient_level_salt",
    "additives",
    "packaging",
    "stores",
    "countries",
    "origins",
    "manufacturing_places",
    "ecoscore_grade",
    "ecoscore_score",
    "carbon_footprint_100g"
]

# Target labels: the same list shifted left by one. The final entry maps
# "carbon_footprint_100g" to itself so both lists have the same length.
correct_cols = wrong_cols[1:] + [wrong_cols[-1]]

# Apply the shift only while the first misaligned label is still present.
# That label disappears once the fix has run, so re-executing this cell is a
# no-op instead of shifting the labels again and dropping another column.
if wrong_cols[0] in df.columns:
    # Non-inplace rename: rebinding `df` keeps the mutation explicit.
    df = df.rename(columns=dict(zip(wrong_cols, correct_cols)))
    # The rename leaves two columns labelled "carbon_footprint_100g"; the
    # positional slice removes the frame's last column.
    # NOTE(review): assumes the misaligned columns are the final ones in the
    # file, so the last column is the leftover one — TODO confirm.
    df = df.iloc[:, :-1]

In [None]:
# Preview the first rows again to check the relabelled columns.
df.head()

In [None]:
# (rows, columns) before de-duplicating on barcode.
df.shape

In [None]:
# Keep barcodes as text without turning missing values into the literal
# string "nan": astype(str) stringifies NaN, so rows that had no barcode are
# masked back to NaN afterwards.
has_barcode = df["barcode"].notna()
df["barcode"] = df["barcode"].astype(str).where(has_barcode)

# Drop repeated barcodes, keeping the first occurrence. Rows without a barcode
# are exempt: they would otherwise all collide on the same missing key and
# every one but the first would be silently discarded.
is_repeat = has_barcode & df.duplicated(subset="barcode", keep="first")
df = df.loc[~is_repeat]

In [None]:
# (rows, columns) after de-duplicating on barcode.
df.shape

In [None]:
# 2) Turn empty strings and textual "missing" markers into real NaN.
# Only cells whose whole value equals one of these spellings are replaced.
missing_markers = ["", "unknown", "Unknown", "NONE", "None", "nan", "NaN"]
df = df.replace(to_replace=missing_markers, value=np.nan)

In [None]:
# Columns that should hold numbers; a later cell coerces each one that exists
# in the frame to a numeric dtype.
numeric_cols = [
    # per-100g measurements
    "energy_kj_100g",
    "energy_kcal_100g",
    "fat_100g",
    "saturated_fat_100g",
    "carbohydrates_100g",
    "sugars_100g",
    "fiber_100g",
    "proteins_100g",
    "salt_100g",
    # remaining numeric fields
    "ecoscore_score",
    "carbon_footprint_100g",
]

In [None]:
# Store the brand column with the category dtype (saves memory).
df = df.astype({"brand": "category"})

In [None]:
# Coerce every listed numeric column that exists in the frame; values that
# cannot be parsed as numbers become NaN (errors="coerce").
df = df.assign(
    **{
        name: pd.to_numeric(df[name], errors="coerce")
        for name in numeric_cols
        if name in df.columns
    }
)

In [None]:
# Remove the cap on how many columns pandas shows when displaying a frame.
pd.options.display.max_columns = None