In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Air Quality Prediction - Feature Engineering\n",
    "\n",
    "## Tujuan\n",
    "Notebook ini bertujuan untuk membuat fitur-fitur baru yang dapat meningkatkan performa model.\n",
    "\n",
    "## Langkah-langkah:\n",
    "1. Membuat fitur interaksi\n",
    "2. Membuat fitur agregasi\n",
    "3. Membuat fitur domain-specific\n",
    "4. Membuat fitur lag dan rolling statistics (untuk time series)\n",
    "5. Membuat fitur polinomial (opsional)\n",
    "6. Seleksi fitur"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.feature_selection import SelectKBest, f_regression, RFE\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "import warnings\n",
    "\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the preprocessed dataset; the 'date' column becomes the (date-parsed) index\n",
    "processed_data = pd.read_csv(\n",
    "    'data/processed/air_quality_processed.csv',\n",
    "    index_col='date',\n",
    "    parse_dates=True,\n",
    ")\n",
    "print(\"Processed data shape:\", processed_data.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Work on an independent copy so that processed_data itself is never modified\n",
    "df = processed_data.copy(deep=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Interaction Features\n",
    "print(\"=== INTERACTION FEATURES ===\")\n",
    "\n",
    "# Pollutant x meteorology products: new column -> (left factor, right factor)\n",
    "product_specs = {\n",
    "    'PM_temp_interaction': ('PM2.5', 'Temperature'),\n",
    "    'NO2_humidity_interaction': ('NO2', 'Humidity'),\n",
    "    'O3_wind_interaction': ('O3', 'WindSpeed'),\n",
    "}\n",
    "for new_col, (left, right) in product_specs.items():\n",
    "    df[new_col] = df[left] * df[right]\n",
    "\n",
    "# Pollutant-to-pollutant ratios: new column -> (numerator, denominator).\n",
    "# The small offset added to the denominator avoids division by zero.\n",
    "ratio_specs = {\n",
    "    'PM2.5_PM10_ratio': ('PM2.5', 'PM10'),\n",
    "    'NO2_SO2_ratio': ('NO2', 'SO2'),\n",
    "}\n",
    "for new_col, (numerator, denominator) in ratio_specs.items():\n",
    "    df[new_col] = df[numerator] / (df[denominator] + 1e-5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. Aggregate Features\n",
    "print(\"=== AGGREGATE FEATURES ===\")\n",
    "\n",
    "# Combined particulate load: PM2.5 plus PM10\n",
    "df['total_particulate'] = df['PM2.5'].add(df['PM10'])\n",
    "\n",
    "# Row-wise mean across the six pollutant columns\n",
    "pollutants = ['PM2.5', 'PM10', 'NO2', 'CO', 'SO2', 'O3']\n",
    "df['avg_pollutant'] = df.loc[:, pollutants].mean(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. Domain-specific Features\n",
    "print(\"=== DOMAIN-SPECIFIC FEATURES ===\")\n",
    "\n",
    "# Inclusive upper bound of each AQI band and its label (US EPA AQI scale).\n",
    "# NOTE(review): notebook 01 is not visible from here - confirm these labels match it.\n",
    "AQI_BREAKPOINTS = [\n",
    "    (50, 'Good'),\n",
    "    (100, 'Moderate'),\n",
    "    (150, 'Unhealthy for Sensitive Groups'),\n",
    "    (200, 'Unhealthy'),\n",
    "    (300, 'Very Unhealthy'),\n",
    "]\n",
    "\n",
    "\n",
    "def categorize_aqi(aqi):\n",
    "    \"\"\"Map a numeric AQI value to its category label.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    aqi : float\n",
    "        AQI value; may be NaN.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str or float\n",
    "        Label of the first band whose upper bound is >= aqi, 'Hazardous'\n",
    "        above the last band, or NaN when aqi is missing.\n",
    "    \"\"\"\n",
    "    if pd.isna(aqi):\n",
    "        return np.nan\n",
    "    for upper_bound, label in AQI_BREAKPOINTS:\n",
    "        if aqi <= upper_bound:\n",
    "            return label\n",
    "    return 'Hazardous'\n",
    "\n",
    "\n",
    "df['AQI_Category'] = df['AQI'].apply(categorize_aqi)\n",
    "\n",
    "# AQI_Category is derived directly from the target (AQI). Keep it for analysis and\n",
    "# reporting only; using it (or a one-hot encoding of it) as a model input would leak the target.\n",
    "\n",
    "# Weather comfort index: sum of the absolute deviations of Temperature from 22 and\n",
    "# of Humidity from 50 (presumably degrees Celsius and percent - confirm units).\n",
    "df['comfort_index'] = np.abs(df['Temperature'] - 22) + np.abs(df['Humidity'] - 50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4. Time Series Features\n",
    "print(\"=== TIME SERIES FEATURES ===\")\n",
    "\n",
    "# shift() and rolling() operate on row position, so the rows must be in\n",
    "# chronological order before any lag or window feature is computed.\n",
    "df = df.sort_index()\n",
    "\n",
    "# Lag feature: the AQI value 24 rows earlier.\n",
    "# NOTE(review): this equals '24 hours earlier' only if the index is hourly with no gaps - confirm.\n",
    "df['AQI_lag24'] = df['AQI'].shift(24)\n",
    "\n",
    "# Mean of the 24 AQI values preceding each row. shift(1) keeps the current row out of\n",
    "# the window, so the feature never contains the target value it is used to predict.\n",
    "df['AQI_rolling_24h_mean'] = df['AQI'].shift(1).rolling(window=24).mean()\n",
    "\n",
    "# PM2.5 is a model input rather than the target, so its window may include the current row.\n",
    "df['PM2.5_rolling_24h_mean'] = df['PM2.5'].rolling(window=24).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5. Polynomial Features (optional)\n",
    "print(\"=== POLYNOMIAL FEATURES ===\")\n",
    "\n",
    "# Base columns whose pairwise products are generated\n",
    "important_features = ['PM2.5', 'NO2', 'Temperature', 'Humidity']\n",
    "poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n",
    "poly_features = poly.fit_transform(df[important_features])\n",
    "poly_feature_names = poly.get_feature_names_out(important_features)\n",
    "\n",
    "# Wrap the transformed array in a DataFrame aligned with df's index\n",
    "df_poly = pd.DataFrame(poly_features, columns=poly_feature_names, index=df.index)\n",
    "\n",
    "# The transformer output also contains the degree-1 terms, i.e. the input columns\n",
    "# themselves. Keep only columns that df does not already have, so the concat below\n",
    "# creates no duplicate column names and re-running this cell adds nothing twice.\n",
    "df_poly = df_poly.loc[:, ~df_poly.columns.isin(df.columns)]\n",
    "\n",
    "# Append the new interaction columns to the main frame\n",
    "df = pd.concat([df, df_poly], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature Selection\n",
    "print(\"=== FEATURE SELECTION ===\")\n",
    "\n",
    "TARGET = 'AQI'\n",
    "N_TOP_FEATURES = 20  # upper limit on the features kept by each selection method\n",
    "LEAKY_COLUMNS = ['AQI_Category']  # computed from the target itself, never a valid input\n",
    "\n",
    "# Build the modelling frame:\n",
    "# - drop target-derived columns and anything non-numeric (the estimators need numbers),\n",
    "# - drop columns that are entirely missing, then rows with any missing value\n",
    "#   (e.g. the warm-up rows of lag/rolling features), because f_regression rejects NaN,\n",
    "# - keep only the first occurrence of any duplicated column name.\n",
    "model_df = (\n",
    "    df.drop(columns=LEAKY_COLUMNS, errors='ignore')\n",
    "      .select_dtypes(include='number')\n",
    "      .dropna(axis=1, how='all')\n",
    "      .dropna()\n",
    ")\n",
    "model_df = model_df.loc[:, ~model_df.columns.duplicated()]\n",
    "\n",
    "# Split features and target\n",
    "X = model_df.drop(columns=[TARGET])\n",
    "y = model_df[TARGET]\n",
    "\n",
    "# Never ask for more features than exist\n",
    "n_selected = min(N_TOP_FEATURES, X.shape[1])\n",
    "\n",
    "# Method 1: univariate F-test (SelectKBest)\n",
    "selector = SelectKBest(score_func=f_regression, k=n_selected)\n",
    "X_selected = selector.fit_transform(X, y)\n",
    "\n",
    "# Names of the columns kept by the selector\n",
    "selected_features = X.columns[selector.get_support()].tolist()\n",
    "print(\"Selected features (KBest):\", selected_features)\n",
    "\n",
    "# Method 2: random-forest feature importances\n",
    "rf = RandomForestRegressor(n_estimators=100, random_state=42)\n",
    "rf.fit(X, y)\n",
    "\n",
    "# Rank features from most to least important\n",
    "feature_importances = pd.DataFrame({\n",
    "    'feature': X.columns,\n",
    "    'importance': rf.feature_importances_\n",
    "}).sort_values('importance', ascending=False)\n",
    "\n",
    "top_features = feature_importances.head(n_selected)['feature'].tolist()\n",
    "print(\"Selected features (Random Forest):\", top_features)\n",
    "\n",
    "# Union of both methods; dict.fromkeys de-duplicates while preserving order, so the\n",
    "# column order of the saved dataset is the same on every run.\n",
    "final_features = list(dict.fromkeys(selected_features + top_features))\n",
    "print(\"Final selected features:\", final_features)\n",
    "\n",
    "# Final dataset: selected features plus the target, restricted to the complete rows\n",
    "final_data = model_df[final_features + [TARGET]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the engineered dataset to disk (the index is written as the first CSV column)\n",
    "output_path = 'data/processed/air_quality_engineered.csv'\n",
    "final_data.to_csv(output_path)\n",
    "print(\"Engineered data saved to 'data/processed/air_quality_engineered.csv'\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}