<a href="https://colab.research.google.com/github/maliozdemir1/dsa210-project-impact-of-climate-on-gastronomic-diversity/blob/main/DataGathering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re

# ==========
# INPUT / OUTPUT
# ==========
# Input CSV path (the mean-mapped kcal list); this is the file passed to pd.read_csv below.
MAPPED_PATH = "TurkPatent_ALL_kcal_list.csv"
# Output CSV path; the cleaned table is written here by to_csv at the end of the script.
OUT_MAPPED  = "mean_mapped_clean_en.csv"

# ==========
# Helpers
# ==========
# Turkish casing differs from str.lower()/str.upper() for the dotted and
# dotless "i", so those two letters are translated explicitly first.
TR_LOWER_MAP = str.maketrans({"I": "ı", "İ": "i"})
TR_UPPER_MAP = str.maketrans({"i": "İ", "ı": "I"})


def tr_title(s: str):
    """Return *s* in title case using Turkish rules for the letter "i".

    Missing values (anything ``pd.isna`` reports as missing) yield ``np.nan``.
    Otherwise the value is converted to ``str``, stripped, and runs of
    whitespace are collapsed to a single space; an empty result is returned
    unchanged. Each word is lower-cased and its first character upper-cased,
    with "I"/"İ" and "i"/"ı" mapped through the Turkish translation tables.
    """
    if pd.isna(s):
        return np.nan

    collapsed = re.sub(r"\s+", " ", str(s).strip())
    if collapsed == "":
        return collapsed

    # Translate before lower(): plain lower() would map "I" to a dotted "i".
    lowered = collapsed.translate(TR_LOWER_MAP).lower()
    return " ".join(
        word[0].translate(TR_UPPER_MAP).upper() + word[1:]
        for word in lowered.split(" ")
        if word
    )

def clean_text(s):
    """Normalize whitespace in *s*.

    Returns ``np.nan`` when ``pd.isna(s)`` is true. Otherwise the value is
    converted to ``str``, leading/trailing whitespace is stripped, and every
    run of whitespace is collapsed to a single space.
    """
    missing = pd.isna(s)
    return np.nan if missing else re.sub(r"\s+", " ", str(s).strip())

# Turkish product-group label -> English label (used for reporting).
# Lookups are exact string matches against the whitespace-cleaned
# "product_group_tr" column; a label that is not a key here becomes
# "Unknown" in "product_group_en".
GROUP_TR_TO_EN = {
    "Yemekler ve çorbalar": "Meals & soups",
    "Yiyecekler için çeşni / lezzet vericiler, soslar ve tuz": "Seasonings, sauces & salt",
    "Peynirler": "Cheese",
    "Peynirler ve tereyağı dışında kalan süt ürünleri": "Dairy (excluding cheese & butter)",
    "İşlenmiş ve işlenmemiş meyve ve sebzeler ile mantarlar": "Fruits, vegetables & mushrooms",
    "İşlenmiş ve işlenmemiş et ürünleri": "Meat products",
    "Fırıncılık ve pastacılık mamulleri, hamur işleri, tatlılar": "Bakery, pastry & desserts",
    "Dondurmalar ve yenilebilir buzlar": "Ice cream & edible ice",
    "Alkolsüz içecekler": "Non-alcoholic beverages",
    "Bal": "Honey",
    "Biralar ve diğer alkollü içkiler": "Alcoholic beverages",
    "Çikolata, şekerleme ve türevi ürünler": "Chocolate & confectionery",
    "Diğer ürünler": "Other products",
}

# ==========
# Load & rename
# ==========
# "utf-8-sig" strips a leading BOM if the CSV has one.
mapped = pd.read_csv(MAPPED_PATH, encoding="utf-8-sig")

# Expect columns: food_name, province, tp_group, kcal_100g, ...
# rename() already returns a new DataFrame, so no extra copy is needed.
mapped_clean = mapped.rename(columns={"tp_group": "product_group_tr"})

# ==========
# Clean text + types
# ==========
mapped_clean["province"] = mapped_clean["province"].apply(tr_title)
mapped_clean["food_name"] = mapped_clean["food_name"].apply(clean_text)
mapped_clean["product_group_tr"] = mapped_clean["product_group_tr"].apply(clean_text)
# Non-numeric kcal values become NaN instead of raising.
mapped_clean["kcal_100g"] = pd.to_numeric(mapped_clean["kcal_100g"], errors="coerce")

# Groups without an English translation are labelled "Unknown".
mapped_clean["product_group_en"] = (
    mapped_clean["product_group_tr"].map(GROUP_TR_TO_EN).fillna("Unknown")
)

# Keep only columns you need
mapped_clean = mapped_clean[
    ["province", "food_name", "product_group_tr", "product_group_en", "kcal_100g"]
].copy()

# Drop empty essentials, remove duplicates.
# The text cleaners turn whitespace-only cells into "" (not NaN), so a plain
# dropna() would keep them; treat both NaN and "" as missing.
essential_cols = ["province", "food_name", "product_group_tr"]
has_value = mapped_clean[essential_cols].notna() & mapped_clean[essential_cols].ne("")
mapped_clean = mapped_clean[has_value.all(axis=1)]
mapped_clean = mapped_clean.drop_duplicates(
    subset=["province", "food_name"], keep="first"
).reset_index(drop=True)

# ==========
# Save
# ==========
mapped_clean.to_csv(OUT_MAPPED, index=False, encoding="utf-8-sig")

print("Saved:", OUT_MAPPED)
#print("Rows:", len(mapped_clean))
#display(mapped_clean.head(10))



DENEME (trial / scratch cell)

In [None]:
# Scratch cell: binds the integer 10 to x.
x = 10