# Feature Engineering - Recipe Dataset

## Introduction
This notebook focuses on **feature engineering** for the recipe recommendation system. We will create and transform features that will be used in our content-based filtering and search functionalities.

**Dataset:** 7000+ International Cuisine Recipes (Kaggle)

**Objective:** Prepare features for recipe recommendation and search systems

**Author:** NGUYEN Ngoc Dang Nguyen - Final-year Student in Computer Science, Aix-Marseille University

**Feature Engineering steps:**
1. Load the cleaned dataset
2. Create time-based features
3. Process ingredients for vectorization
4. Create cuisine and diet categories
5. Engineer search-friendly features
6. Prepare features for similarity calculations
7. Save processed features 

## 1. Load Libraries and Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
import os

# Relative path to the cleaned recipe CSV, then read it into a DataFrame
cleaned_data_path = os.path.join("..", "data", "cleaned", "Food_Recipe_cleaned.csv")
df = pd.read_csv(cleaned_data_path)

# Sanity-check what was loaded: dimensions, column names, first rows
loaded_shape = df.shape
print(f"Dataset shape: {loaded_shape}")
print("Columns:", list(df.columns))
display(df.head())

## 2. Time-based Feature Engineering

In [None]:
# Create time categories for filtering (matching search.py logic)
def create_time_categories(df):
    """Return a copy of ``df`` with ``total_time`` and ``time_category`` added.

    ``total_time`` is prep time plus cook time when the cook-time column is
    present, otherwise prep time alone. ``time_category`` is the tightest
    bucket the total fits into: 'Under 30 Minutes' (<= 30),
    'Under 45 Minutes' (<= 45), 'Under 1 Hour' (<= 60), else 'Over 1 Hour'.
    Rows whose total is NaN fail every comparison and keep 'Over 1 Hour'.

    Args:
        df: DataFrame with a 'prep_time (in mins)' column and, optionally,
            a 'cook_time (in mins)' column.

    Returns:
        A new DataFrame; the input is not modified.
    """
    timed = df.copy()

    prep_minutes = timed['prep_time (in mins)']
    if 'cook_time (in mins)' in timed.columns:
        timed['total_time'] = prep_minutes + timed['cook_time (in mins)']
    else:
        timed['total_time'] = prep_minutes

    # Start from the catch-all label, then overwrite from the widest bucket to
    # the narrowest so the tightest matching bucket is the one that sticks.
    timed['time_category'] = 'Over 1 Hour'
    buckets = (
        (60, 'Under 1 Hour'),
        (45, 'Under 45 Minutes'),
        (30, 'Under 30 Minutes'),
    )
    for upper_bound, label in buckets:
        within_bound = timed['total_time'] <= upper_bound
        timed.loc[within_bound, 'time_category'] = label

    return timed

df = create_time_categories(df)

# Tally recipes per duration bucket once; reuse it for the text and the chart
time_category_counts = df['time_category'].value_counts()
print("Time category distribution:")
print(time_category_counts)

# Bar chart of the same tally
plt.figure(figsize=(10, 6))
time_category_counts.plot(kind='bar')
plt.title('Recipe Distribution by Time Category')
plt.xlabel('Time Category')
plt.ylabel('Number of Recipes')
plt.xticks(rotation=45)
plt.show()

## 3. Ingredients Processing for Vectorization

In [None]:
# Process ingredients for similarity calculations (matching RecipeRecommender logic)
def process_ingredients(df):
    """Return a copy of ``df`` with cleaned ingredient text and a count.

    Adds or overwrites three columns on the copy:

    * ``ingredients_name``: missing values replaced by an empty string.
    * ``ingredients_clean``: lower-cased ``ingredients_name`` with every
      character removed that is not a word character, whitespace, or comma.
    * ``ingredient_count``: number of non-blank comma-separated entries in
      ``ingredients_name``. Empty segments (from a trailing comma or a
      doubled comma) are not counted, so ``"a,,b,"`` yields 2.

    Args:
        df: DataFrame with an 'ingredients_name' column of comma-separated
            ingredient strings.

    Returns:
        A new DataFrame; the input is not modified.
    """
    df = df.copy()

    # Fill missing ingredients so the string operations below never see NaN
    df['ingredients_name'] = df['ingredients_name'].fillna("")

    # Raw string: '\w' and '\s' are regex classes, not Python string escapes
    df['ingredients_clean'] = (
        df['ingredients_name']
        .str.lower()
        .str.replace(r'[^\w\s,]', '', regex=True)
    )

    # Count only segments that contain something other than whitespace
    df['ingredient_count'] = df['ingredients_name'].apply(
        lambda names: sum(1 for item in str(names).split(',') if item.strip())
    )

    return df

df = process_ingredients(df)

# Summary statistics for the per-recipe ingredient count
ingredient_counts = df['ingredient_count']
print(f"Average number of ingredients per recipe: {ingredient_counts.mean():.2f}")
print("Ingredient count distribution:")
print(ingredient_counts.describe())

# Histogram of the same column
plt.figure(figsize=(10, 6))
plt.hist(ingredient_counts, bins=20, edgecolor='black', alpha=0.7)
plt.title('Distribution of Number of Ingredients per Recipe')
plt.xlabel('Number of Ingredients')
plt.ylabel('Frequency')
plt.show()

## 4. Cuisine and Diet Categories

In [None]:
# Ten most frequent cuisines before any grouping is applied
print("Original cuisine distribution (top 10):")
cuisine_frequencies = df['cuisine'].value_counts()
top_cuisines = cuisine_frequencies.iloc[:10]
print(top_cuisines)

# Group less common cuisines into 'Other'
def categorize_cuisine(df, min_recipes=50):
    """Return a copy of ``df`` with a ``cuisine_category`` column.

    A cuisine keeps its own name when it appears in at least ``min_recipes``
    rows; every other value, including missing ones, becomes 'Other'.

    Args:
        df: DataFrame with a 'cuisine' column.
        min_recipes: Minimum number of rows a cuisine needs to stay
            un-grouped.

    Returns:
        A new DataFrame; the input is not modified.
    """
    grouped = df.copy()

    frequencies = grouped['cuisine'].value_counts()
    frequent_cuisines = frequencies.loc[frequencies >= min_recipes].index

    # Keep the original label where it is frequent enough, else fall back
    is_frequent = grouped['cuisine'].isin(frequent_cuisines)
    grouped['cuisine_category'] = grouped['cuisine'].where(is_frequent, 'Other')

    return grouped

df = categorize_cuisine(df, min_recipes=30)

print(f"\nCuisine categories after grouping:")
print(df['cuisine_category'].value_counts())

# Diet and course get identical treatment when present: fill gaps with
# 'Not Specified', add a lower-cased/stripped '<column>_clean' copy, and
# print the ten most frequent cleaned values.
optional_columns = (
    ('diet', f"\nDiet distribution:", "No diet column found in dataset"),
    ('course', f"\nCourse distribution:", "No course column found in dataset"),
)
for column, heading, absent_message in optional_columns:
    if column not in df.columns:
        print(absent_message)
        continue

    clean_column = f'{column}_clean'
    df[column] = df[column].fillna('Not Specified')
    df[clean_column] = df[column].str.lower().str.strip()

    print(heading)
    print(df[clean_column].value_counts().head(10))

## 5. Search-Friendly Features

In [None]:
# Create features for ingredient-based search (matching search_by_ingredients logic)
def create_search_features(df):
    """Return a copy of ``df`` with search-oriented columns added.

    Columns added to the copy:

    * ``searchable_ingredients``: lower-cased ``ingredients_name``.
    * ``searchable_text``: the 'name', 'description' and 'ingredients_name'
      columns that exist, with missing values treated as empty strings,
      joined by single spaces and lower-cased.
    * ``is_vegetarian``, ``is_vegan``, ``is_gluten_free``, ``is_dairy_free``:
      0/1 flags, created only when a ``diet_clean`` column exists. A flag is
      1 when the diet label mentions the keyword and the mention is not
      negated by a leading "non" (so "non vegetarian" does not count as
      vegetarian). Hyphenated keywords also match their space-separated and
      joined spellings ("gluten free", "glutenfree").

    Args:
        df: DataFrame with an 'ingredients_name' column; 'name',
            'description' and 'diet_clean' are optional.

    Returns:
        A new DataFrame; the input is not modified.
    """
    df = df.copy()

    # Create searchable ingredient list (lowercase)
    df['searchable_ingredients'] = df['ingredients_name'].str.lower()

    # Create a combined text field for general search
    text_fields = ['name', 'description', 'ingredients_name']
    existing_fields = [field for field in text_fields if field in df.columns]

    if existing_fields:
        df['searchable_text'] = df[existing_fields].fillna('').astype(str).agg(' '.join, axis=1)
        df['searchable_text'] = df['searchable_text'].str.lower()

    # Create binary features for common dietary restrictions
    if 'diet_clean' in df.columns:
        common_diets = ['vegetarian', 'vegan', 'gluten-free', 'dairy-free']
        diet_labels = df['diet_clean']
        for diet in common_diets:
            # Accept "gluten-free", "gluten free" and "glutenfree" alike
            pattern = diet.replace('-', '[ -]?')
            mentioned = diet_labels.str.contains(pattern, regex=True, na=False)
            # A plain substring test would flag "non vegetarian" as
            # vegetarian; detect the negated form separately and exclude it.
            negated = diet_labels.str.contains(f'non[ -]?{pattern}', regex=True, na=False)
            df[f'is_{diet.replace("-", "_")}'] = (mentioned & ~negated).astype(int)

    return df

df = create_search_features(df)

# Preview the combined search text, truncated to its first 200 characters
print("Search features created:")
if 'searchable_text' in df.columns:
    print("Sample searchable text:")
    first_text = df['searchable_text'].iloc[0]
    print(first_text[:200] + "...")

# Report how many recipes carry each 'is_*' flag column
diet_cols = [name for name in df.columns if name.startswith('is_')]
if len(diet_cols) > 0:
    print(f"\nDietary restriction distribution:")
    for col in diet_cols:
        print(f"{col}: {df[col].sum()} recipes")

## 6. Prepare Features for Similarity Calculations

In [None]:
# Bag-of-words vectorizer over ingredient text: English stop words removed,
# vocabulary capped at the 1000 most frequent terms
# (configured to match the RecipeRecommender class, per the original note)
vectorizer = CountVectorizer(stop_words='english', max_features=1000)

# Fit vectorizer on ingredients for similarity calculations
if 'ingredients_clean' in df.columns and not df['ingredients_clean'].empty:
    try:
        # Only the fit can fail in an expected way: scikit-learn raises
        # ValueError when the documents yield an empty vocabulary. Anything
        # else is a genuine bug and is left to surface with its traceback.
        ingredient_matrix = vectorizer.fit_transform(df['ingredients_clean'])
    except ValueError as e:
        print(f"Error creating ingredient matrix: {e}")
    else:
        feature_names = vectorizer.get_feature_names_out()
        print(f"Ingredient matrix shape: {ingredient_matrix.shape}")
        print(f"Number of unique ingredient terms: {len(feature_names)}")

        # Column sums give each term's total occurrences across all recipes;
        # take the 20 largest, most frequent first
        ingredient_sums = np.array(ingredient_matrix.sum(axis=0)).flatten()
        top_ingredients_idx = ingredient_sums.argsort()[-20:][::-1]

        print("\nTop 20 ingredient terms for similarity:")
        for idx in top_ingredients_idx:
            print(f"{feature_names[idx]}: {ingredient_sums[idx]}")

## 7. Create Encoded Features

In [None]:
# Categorical columns to label-encode; the cleaned course column is optional
optional_categoricals = ['course_clean'] if 'course_clean' in df.columns else []
categorical_features = ['cuisine_category', *optional_categoricals]

# One fitted encoder per column, kept so codes can be mapped back to labels
label_encoders = {}
for feature in categorical_features:
    if feature not in df.columns:
        continue

    le = LabelEncoder()
    # Values are cast to str before fitting so the encoder sees one type
    df[f'{feature}_encoded'] = le.fit_transform(df[feature].astype(str))
    label_encoders[feature] = le

    print(f"\n{feature} encoding:")
    unique_values = df[feature].nunique()
    print(f"Number of unique values: {unique_values}")

# Numeric columns of interest; cook time is included only when present
numeric_features = ['prep_time (in mins)', 'ingredient_count', 'total_time']
if 'cook_time (in mins)' in df.columns:
    numeric_features += ['cook_time (in mins)']

existing_numeric = [name for name in numeric_features if name in df.columns]
if len(existing_numeric) > 0:
    print(f"\nNumeric features summary:")
    numeric_summary = df[existing_numeric].describe()
    display(numeric_summary)

## 8. Feature Summary and Quality Check

In [None]:
# Summary of engineered features
print("FEATURE ENGINEERING SUMMARY")
print("=" * 50)

# NOTE: despite its name this is the column count AFTER engineering; the
# variable name is kept so any other cell reading it keeps working.
original_features = df.shape[1]
print(f"Total features after engineering: {original_features}")

# Group columns by a substring of their name. These are name-based filters,
# so a column can land in more than one group (e.g. 'searchable_ingredients'
# matches both 'ingredient' and 'searchable'), and columns loaded from the CSV
# whose names contain the substring are included as well.
time_features = [col for col in df.columns if 'time' in col]
ingredient_features = [col for col in df.columns if 'ingredient' in col]
search_features = [col for col in df.columns if 'searchable' in col]
encoded_features = [col for col in df.columns if 'encoded' in col]
binary_features = [col for col in df.columns if col.startswith('is_')]

print(f"\nFeature categories:")
print(f"- Time-based features: {len(time_features)}")
print(f"- Ingredient features: {len(ingredient_features)}")
print(f"- Search features: {len(search_features)}")
print(f"- Encoded features: {len(encoded_features)}")
print(f"- Binary features: {len(binary_features)}")

# Check for missing values in new features.
# De-duplicate while preserving order: selecting the same column twice would
# report it twice and double-count its missing values in the total.
new_features = list(dict.fromkeys(
    time_features + ingredient_features + search_features + encoded_features + binary_features
))
if new_features:
    print(f"\nMissing values in engineered features:")
    missing_in_new = df[new_features].isnull().sum()
    if missing_in_new.sum() > 0:
        print(missing_in_new[missing_in_new > 0])
    else:
        print("No missing values in engineered features!")

# Display sample of final dataset
print(f"\nSample of engineered features:")
sample_cols = ['name', 'cuisine_category', 'time_category', 'ingredient_count', 'total_time']
existing_sample_cols = [col for col in sample_cols if col in df.columns]
if existing_sample_cols:
    display(df[existing_sample_cols].head())

## 9. Save Processed Dataset

In [None]:
import json

# Save the feature-engineered dataset
processed_data_path = os.path.join("..", "data", "processed", "Food_Recipe_featured.csv")

# Create processed directory if it doesn't exist
processed_dir = os.path.dirname(processed_data_path)
os.makedirs(processed_dir, exist_ok=True)

# Save dataset
df.to_csv(processed_data_path, index=False)
print(f"Feature-engineered dataset saved to: {processed_data_path}")
print(f"Final dataset shape: {df.shape}")

# Save feature information for reference
feature_info = {
    'total_features': int(df.shape[1]),
    'time_features': time_features,
    'ingredient_features': ingredient_features,
    'search_features': search_features,
    'encoded_features': encoded_features,
    'binary_features': binary_features,
    'numeric_features': existing_numeric
}

# Previously this dictionary was built but never written anywhere; persist it
# as JSON next to the CSV so the column groupings travel with the dataset.
feature_info_path = os.path.join(processed_dir, "Food_Recipe_feature_info.json")
with open(feature_info_path, "w", encoding="utf-8") as info_file:
    json.dump(feature_info, info_file, indent=2)
print(f"Feature information saved to: {feature_info_path}")

print(f"\nFeature engineering completed successfully!")
print(f"Dataset ready for recommendation system implementation.")

## Feature Engineering Conclusion
The feature engineering process has transformed the raw recipe data into a robust format for search and recommendation. Key achievements include:

- Creation of time-based categories and a total time (preparation plus cooking) for flexible filtering
- Cleaning and structuring ingredient data for vectorization and search
- Grouping cuisines and processing diet information for better categorization
- Building search-friendly and encoded features for similarity calculations
- Ensuring data completeness and compatibility with downstream models

These engineered features provide a strong foundation for both ingredient-based search and content-based recommendation, supporting the next steps in model development and deployment.