# Feature Selection

In [1]:
import pandas as pd

In [2]:
# Tab-separated product dump (the file carries a .csv suffix but uses '\t').
DATA_PATH = "en.openfoodfacts.org.products.csv"

# Only the columns that later cells actually read. The recorded run of the
# unrestricted load ended in a KeyboardInterrupt, presumably because parsing
# every column of the dump is slow; restricting the parse avoids that work.
USED_COLUMNS = frozenset([
    "code", "product_name", "quantity", "packaging", "packaging_text",
    "packaging_tags", "brands", "brands_tags", "categories_tags",
    "categories_en", "origins_en", "labels_en", "stores", "serving_size",
    "image_url",
])

# A callable `usecols` silently skips names that are absent from the file,
# whereas a plain list would raise on the first missing column.
df = pd.read_csv(DATA_PATH, sep="\t", usecols=lambda column: column in USED_COLUMNS)

KeyboardInterrupt: 

In [None]:
# Random preview of up to 100 rows for eyeballing the data.
# A fixed seed makes Restart & Run All show the same rows every time, and the
# size is capped because sampling more rows than exist (without replacement)
# raises a ValueError.
head = df.sample(n=min(100, len(df)), random_state=0)

In [None]:
# Normalise the stores column: lower-case first, then trim surrounding whitespace.
stores_lowercase = df["stores"].str.lower()
df["stores"] = stores_lowercase.str.strip()

In [None]:
# Print every distinct value of the stores column, one per line.
distinct_stores = df["stores"].unique()
for store_name in distinct_stores:
    print(store_name)

In [None]:
# Keep only the rows whose brands_tags value is exactly one of these tags.
# NOTE(review): the cells above inspect the `stores` column, but this filter
# is applied to `brands_tags` - confirm that is the intended column.
SELECTED_BRAND_TAGS = ["lidl", "rewe", "aldi", "netto"]

# .copy() detaches the result from the original frame, so the column
# assignments made further down operate on an independent DataFrame instead of
# a slice (which is what triggers pandas' SettingWithCopyWarning).
df = df[df["brands_tags"].isin(SELECTED_BRAND_TAGS)].copy()

In [None]:
# Columns carried through to the exported dataset; later cells extend this list.
cols = [
    "code",
    "product_name",
    "quantity",
    "packaging",
    "packaging_text",
    "packaging_tags",
    "brands",
    "categories_tags",
    "categories_en",
    "origins_en",
    "labels_en",
    "stores",
    "serving_size",
    "image_url",
]

# Feature Engineering

In [None]:
# Print every distinct packaging_tags value, one per line, to see which
# spellings the material tag lists need to cover.
# The column is read straight from df: going through df[cols] first would copy
# every selected column only to throw all but one away.
for tag_string in df["packaging_tags"].unique():
    print(tag_string)

In [None]:
# Tag spellings that count as plastic packaging.
PLASTIC_TAGS = ["plastic", "plastique", "plastico", "kunststoff", "pet", "pp", "foil", "plastik"]


def _mentions_plastic(raw_tags):
    """Return True if any comma-separated entry of raw_tags equals a PLASTIC_TAGS entry."""
    # Non-string values are stringified first, exactly like string ones.
    entries = str(raw_tags).lower().split(",")
    return any(tag in entries for tag in PLASTIC_TAGS)


plastic = df["packaging_tags"].map(_mentions_plastic)

In [None]:
# Tag spellings that count as paper / carton packaging.
CARTON_TAGS = ["paper", "papier", "carton", "card", "karton", "tetra", "papel"]


def _mentions_carton(raw_tags):
    """Return True if any comma-separated entry of raw_tags equals a CARTON_TAGS entry."""
    # Non-string values are stringified first, exactly like string ones.
    entries = str(raw_tags).lower().split(",")
    return any(tag in entries for tag in CARTON_TAGS)


carton = df["packaging_tags"].map(_mentions_carton)

In [None]:
# Tag spellings that count as glass packaging.
GLASS_TAGS = ["glass", "glas"]


def _mentions_glass(raw_tags):
    """Return True if any comma-separated entry of raw_tags equals a GLASS_TAGS entry."""
    # Non-string values are stringified first, exactly like string ones.
    entries = str(raw_tags).lower().split(",")
    return any(tag in entries for tag in GLASS_TAGS)


glass = df["packaging_tags"].map(_mentions_glass)

In [None]:
# Tag spellings that count as metal packaging.
METAL_TAGS = ["metalique", "aluminium", "metal"]


def _mentions_metal(raw_tags):
    """Return True if any comma-separated entry of raw_tags equals a METAL_TAGS entry."""
    # Non-string values are stringified first, exactly like string ones.
    entries = str(raw_tags).lower().split(",")
    return any(tag in entries for tag in METAL_TAGS)


metal = df["packaging_tags"].map(_mentions_metal)

## Approximate area of packaging based on volume

In [None]:
from quantulum3 import parser

In [None]:
# Try the parser on one example quantity string and display what it returns.
example_quantity = "130 ml"
quants = parser.parse(example_quantity)
quants

In [None]:
# Grams represented by one unit of each recognised quantulum3 unit name.
# Volumes are converted as if 1 ml weighed 1 g. Prefixed names follow the
# same "<prefix><base unit>" spelling as "centilitre" and "kilogram".
_GRAMS_PER_UNIT = {
    "milligram": 0.001,
    "gram": 1.0,
    "kilogram": 1000.0,
    "ounce": 28.35,
    "pound-mass": 453.592,
    "millilitre": 1.0,
    "cubic centimetre": 1.0,
    "centilitre": 10.0,  # 1 cl = 10 ml (was previously scaled by 100)
    "decilitre": 100.0,
    "litre": 1000.0,
    "gallon": 3785.41,
}


def get_weight(s):
    """Convert a free-text quantity such as "130 ml" or "1.5 kg" to grams.

    Only the first quantity found by ``parser.parse`` is used.

    Parameters
    ----------
    s : str or None
        Quantity text. ``None``, NaN, and empty or whitespace-only strings
        are treated as missing.

    Returns
    -------
    float
        The quantity in grams, with volumes converted at 1 ml = 1 g.
        NaN when the input is missing, nothing can be parsed, the parsed
        value is negative, or the unit is not in ``_GRAMS_PER_UNIT``.
    """
    missing = float("nan")

    # Missing values may arrive as None or as a float NaN (NaN != NaN);
    # calling len() on those would raise instead of yielding NaN.
    if s is None or (isinstance(s, float) and s != s):
        return missing

    text = str(s)
    if not text.strip():
        return missing

    parsed = parser.parse(text)
    if len(parsed) == 0:
        return missing

    first = parsed[0]
    if first.value < 0:
        return missing

    grams_per_unit = _GRAMS_PER_UNIT.get(first.unit.name)
    if grams_per_unit is None:
        # Unit is not a mass or volume we know how to convert.
        return missing
    return float(first.value) * grams_per_unit

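For the sake of simplicity, we assume that volume and weight are interchangeable (i.e. 750 g = 0.75 l). Weights are given in grams. The packaging area is approximated as the surface of a cube holding that volume; because the volume is expressed in litres (dm³), the resulting area is in square decimetres (multiply by 100 to obtain square centimetres).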

In [None]:
# Convert every quantity string to a number via get_weight; missing
# quantities are replaced by "" before the conversion.
quantity_text = df["quantity"].fillna("")
weight = quantity_text.map(get_weight)

In [None]:
# Blank out implausible entries: anything above 50 * 1000 or below zero.
WEIGHT_UPPER_BOUND = 50 * 1000
out_of_range = (weight > WEIGHT_UPPER_BOUND) | (weight < 0)
weight[out_of_range] = float("nan")

In [None]:
def _cube_surface(amount):
    """Surface area of a cube whose volume is amount / 1000.

    With amount in grams and 1 g taken as 1 ml, the volume is in litres
    (dm^3), so the returned area is in dm^2 (multiply by 100 for cm^2).
    """
    edge = (amount / 1000.0) ** (1.0 / 3.0)
    return edge ** 2 * 6


packaging_area = weight.map(_cube_surface)

In [None]:
# Sanity check: largest computed packaging area (missing values are skipped).
packaging_area.max(skipna=True)

In [None]:
# Attach the derived features to the frame and register them for export.
# The membership check keeps this cell idempotent: running it a second time
# no longer appends the same names to cols again, which would otherwise
# duplicate those columns in df[cols].
for column_name, values in (("weight", weight), ("packaging area", packaging_area)):
    df[column_name] = values
    if column_name not in cols:
        cols.append(column_name)

In [None]:
# Give every row its own, initially empty, list in a new materials column.
df["materials"] = df["weight"].map(lambda _ignored: list())

In [None]:
# Store the four per-material flag series as columns of the frame.
material_flags = {"glass": glass, "carton": carton, "metal": metal, "plastic": plastic}
for material, flags in material_flags.items():
    df[material] = flags

In [None]:
# Register the material flag columns for export. Names already present are
# skipped so that re-running this cell does not list a column twice.
cols += [name for name in ("glass", "carton", "metal", "plastic") if name not in cols]

In [None]:
# Display the frame restricted to the columns selected for export.
df.loc[:, cols]

# Export data

In [None]:
# Write the selected columns to disk (the index is written too, as by default).
export_frame = df.loc[:, cols]
export_frame.to_csv("min_food_data.csv")