# Feature Engineering for ML

This notebook prepares the cleaned dataset for machine learning by:
1. Loading the cleaned Parquet data
2. Creating ML-ready features
3. Encoding categorical variables
4. Scaling numeric features
5. Splitting the data into train and test sets and saving the processed data ready for modeling

In [None]:
# Standard library
import warnings

# Third-party: data handling
import numpy as np
import pandas as pd

# Third-party: scikit-learn preprocessing and splitting utilities
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Installs a global "ignore" filter: every warning raised after this point
# in the session is hidden from the cell outputs.
warnings.filterwarnings('ignore')

print("Libraries loaded successfully!")

## 1. Load Cleaned Data

In [None]:
# Read the cleaned dataset from its Parquet file
df = pd.read_parquet('../data/processed/cleaned_spotify_data.parquet')

# Report the frame's dimensions and its in-memory footprint in megabytes
for summary_line in (
    f"Dataset loaded: {df.shape}",
    f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB",
):
    print(summary_line)

# Preview the first rows as the cell's displayed output
df.head()

## 2. Define Target and Features

In [None]:
# Column the models will predict:
#   'popularity'          -> regression
#   'popularity_category' -> classification
TARGET_VARIABLE = 'popularity'

# Columns that are never used as model inputs
EXCLUDE_FEATURES = [
    # Index column and identifiers
    'Unnamed: 0',
    'track_id',
    'track_name',
    # High-cardinality text columns
    'album_name',
    'artists',
    # Both representations of the target are listed, so whichever one is not
    # selected as TARGET_VARIABLE cannot remain among the features either.
    'popularity',
    'popularity_category',
]

print(f"Target variable: {TARGET_VARIABLE}")
print(f"Target distribution:")

# Summary statistics for the 'popularity' target, value counts for any other target
target_column = df[TARGET_VARIABLE]
target_summary = (
    target_column.describe()
    if TARGET_VARIABLE == 'popularity'
    else target_column.value_counts()
)
print(target_summary)

## 3. Feature Selection and Engineering

In [None]:
# Work on a copy so the loaded frame `df` is left untouched by feature engineering
df_ml = df.copy()

# Audio descriptors (described in this notebook as already normalized to 0-1)
audio_features = [
    'danceability', 'energy', 'valence', 'acousticness',
    'instrumentalness', 'speechiness', 'liveness',
]

# Numeric columns on other scales, to be standardized later
numeric_features = ['duration_ms', 'loudness', 'tempo', 'duration_min']

# Columns treated as categorical
categorical_features = [
    'explicit', 'key', 'mode', 'time_signature',
    'track_genre', 'mood_energy', 'energy_category', 'tempo_category',
]

# Report how many columns fall in each group
print(f"Audio features: {len(audio_features)}")
print(f"Numeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")

## 4. Create Additional Features

In [None]:
# Build every derived column in one step; `assign` appends them to the frame
# in the order the keywords are written.
df_ml = df_ml.assign(
    # Pairwise products of audio metrics
    energy_danceability=df_ml['energy'] * df_ml['danceability'],
    valence_energy=df_ml['valence'] * df_ml['energy'],
    acousticness_energy=df_ml['acousticness'] * df_ml['energy'],
    # Squared terms for the key audio metrics
    energy_squared=df_ml['energy'] ** 2,
    danceability_squared=df_ml['danceability'] ** 2,
    valence_squared=df_ml['valence'] ** 2,
    # Duration flags: shorter than 3 minutes / longer than 5 minutes
    is_short_track=(df_ml['duration_min'] < 3).astype(int),
    is_long_track=(df_ml['duration_min'] > 5).astype(int),
    # Mood flags combining energy and valence thresholds
    high_energy_happy=((df_ml['energy'] > 0.7) & (df_ml['valence'] > 0.7)).astype(int),
    low_energy_sad=((df_ml['energy'] < 0.3) & (df_ml['valence'] < 0.3)).astype(int),
)

# Names of the columns created above
interaction_features = [
    'energy_danceability', 'valence_energy', 'acousticness_energy',
    'energy_squared', 'danceability_squared', 'valence_squared',
    'is_short_track', 'is_long_track', 'high_energy_happy', 'low_energy_sad',
]

print(f"Created {len(interaction_features)} interaction features")
print(f"Total features before encoding: {len(df_ml.columns)}")

## 5. Encode Categorical Variables

In [None]:
# Cast the explicit flag to an integer column
df_ml['explicit'] = df_ml['explicit'].astype(int)

# Low-cardinality categoricals: replace each with one-hot indicator columns
low_cardinality = ['mode', 'time_signature', 'mood_energy', 'energy_category', 'tempo_category']

for col in low_cardinality:
    if col not in df_ml.columns:
        continue  # column absent from this dataset; nothing to encode
    # drop_first=True omits the indicator for the first category level
    dummies = pd.get_dummies(df_ml[col], prefix=col, drop_first=True)
    # Swap the source column for its indicators, appended at the end of the frame
    df_ml = pd.concat([df_ml.drop(columns=col), dummies], axis=1)

# track_genre (high cardinality): map each genre label to an integer code
if 'track_genre' in df_ml.columns:
    le_genre = LabelEncoder()
    genre_codes = le_genre.fit_transform(df_ml['track_genre'])
    df_ml = df_ml.assign(track_genre_encoded=genre_codes).drop(columns='track_genre')
    print(f"Encoded {len(le_genre.classes_)} unique genres")

# 'key' (musical key) is left exactly as loaded: it is already numeric (0-11).

print(f"Total features after encoding: {len(df_ml.columns)}")

## 6. Scale Numeric Features

In [None]:
# Features that need scaling (not already 0-1); keep only those present in df_ml
features_to_scale = ['duration_ms', 'loudness', 'tempo', 'duration_min', 'track_genre_encoded', 'key']
features_to_scale = [f for f in features_to_scale if f in df_ml.columns]

# Avoid data leakage: the scaler's mean/std must come from training rows only.
# The actual split is performed in the "Train-Test Split" section; calling
# train_test_split here on row positions with the SAME test_size, random_state
# and stratify labels selects the same training rows, because the shuffle
# depends only on the number of rows, the seed and the stratification labels.
# Keep these arguments in sync with that section.
train_positions, _ = train_test_split(
    np.arange(len(df_ml)),
    test_size=0.2,
    random_state=42,
    stratify=df[TARGET_VARIABLE] if TARGET_VARIABLE == 'popularity_category' else None,
)

# Initialize scaler
scaler = StandardScaler()

# Fit on the training rows, then apply the learned statistics to every row.
# Skipped when none of the candidate columns exist (StandardScaler rejects
# an input with zero features).
# Note: this cell rescales df_ml in place, so run it once per fresh df_ml.
if features_to_scale:
    scaler.fit(df_ml.iloc[train_positions][features_to_scale])
    df_ml[features_to_scale] = scaler.transform(df_ml[features_to_scale])

print(f"Scaled {len(features_to_scale)} features")
print(f"Scaled features: {features_to_scale}")

## 7. Prepare Final Dataset

In [None]:
# Drop every excluded column that is actually present in the frame
features_to_drop = [name for name in EXCLUDE_FEATURES if name in df_ml.columns]
df_ml = df_ml.drop(columns=features_to_drop)

# Split into feature matrix X and target y
if TARGET_VARIABLE not in df_ml.columns:
    # The target was removed together with the excluded columns,
    # so read it from the originally loaded frame instead.
    y = df[TARGET_VARIABLE]
    X = df_ml
else:
    y = df_ml[TARGET_VARIABLE]
    X = df_ml.drop(columns=TARGET_VARIABLE)

# Report the resulting shapes and the final list of feature columns
print(f"Features (X): {X.shape}")
print(f"Target (y): {y.shape}")
print(f"\nFeature columns ({len(X.columns)}):")
print(list(X.columns))

## 8. Train-Test Split

In [None]:
# Stratify on the labels only for the classification target; no stratification otherwise
stratify_labels = y if TARGET_VARIABLE == 'popularity_category' else None

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)

# Report the shape of each partition
print("Train-Test Split:")
print(f"  X_train: {X_train.shape}")
print(f"  X_test: {X_test.shape}")
print(f"  y_train: {y_train.shape}")
print(f"  y_test: {y_test.shape}")

## 9. Save Processed Data

In [None]:
# Make sure the output directory exists before writing anything
import os
os.makedirs('../data/processed', exist_ok=True)

# Each split paired with its destination file; targets are Series, so they
# are wrapped in a one-column frame before being written.
split_outputs = [
    (X_train, '../data/processed/X_train.parquet'),
    (X_test, '../data/processed/X_test.parquet'),
    (y_train.to_frame(), '../data/processed/y_train.parquet'),
    (y_test.to_frame(), '../data/processed/y_test.parquet'),
]
for frame, destination in split_outputs:
    frame.to_parquet(destination, index=False)

# Full processed dataset: features and target side by side in one file
df_ml_full = pd.concat([X, y], axis=1)
df_ml_full.to_parquet('../data/processed/ml_ready_data.parquet', index=False)

# Confirm what was written
print("✅ Saved processed data:")
print("  - X_train.parquet")
print("  - X_test.parquet")
print("  - y_train.parquet")
print("  - y_test.parquet")
print("  - ml_ready_data.parquet")

## 10. Feature Summary

In [None]:
# Build a per-feature summary table and save it next to the processed data.
# Mean/std are filled in only when the target is 'popularity'; otherwise the
# columns are all-NaN.
# NOTE(review): feature statistics do not depend on the target, so this
# gating may be unintentional — confirm before changing it.
include_stats = TARGET_VARIABLE == 'popularity'
nan_column = [np.nan] * len(X.columns)

# numeric_only=True keeps a non-numeric column in X from raising inside
# mean()/std(); reindexing to X.columns restores one entry per feature,
# with NaN for any column that has no numeric statistic.
feature_info = pd.DataFrame({
    'feature': X.columns,
    'dtype': X.dtypes.values,
    'missing': X.isnull().sum().values,
    'unique': X.nunique().values,
    'mean': X.mean(numeric_only=True).reindex(X.columns).values if include_stats else nan_column,
    'std': X.std(numeric_only=True).reindex(X.columns).values if include_stats else nan_column,
})

feature_info.to_csv('../data/processed/feature_info.csv', index=False)
print("✅ Saved feature_info.csv")

# Final run summary
print("\n" + "="*80)
print("FEATURE ENGINEERING COMPLETE")
print("="*80)
print(f"Total features: {len(X.columns)}")
print(f"Training samples: {len(X_train):,}")
print(f"Test samples: {len(X_test):,}")
print(f"Target variable: {TARGET_VARIABLE}")
print("\nData ready for machine learning!")

In [None]:
# Show the first 20 rows of the feature summary table as the cell output
feature_info.iloc[:20]