In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Air Quality Prediction - Data Exploration\n",
    "\n",
    "## Tujuan\n",
    "Notebook ini bertujuan untuk melakukan eksplorasi awal terhadap dataset kualitas udara yang akan digunakan untuk membangun model prediksi Air Quality Index (AQI).\n",
    "\n",
    "## Dataset Overview\n",
    "Dataset yang digunakan berisi informasi tentang berbagai parameter kualitas udara seperti:\n",
    "- **PM2.5**: Particulate Matter dengan diameter < 2.5 μm\n",
    "- **PM10**: Particulate Matter dengan diameter < 10 μm\n",
    "- **NO2**: Nitrogen Dioxide\n",
    "- **CO**: Carbon Monoxide\n",
    "- **SO2**: Sulfur Dioxide\n",
    "- **O3**: Ozone\n",
    "- **Temperature**: Suhu udara\n",
    "- **Humidity**: Kelembaban udara\n",
    "- **WindSpeed**: Kecepatan angin\n",
    "- **Pressure**: Tekanan udara\n",
    "- **AQI**: Air Quality Index (target variable)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries yang diperlukan\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import warnings\n",
    "from datetime import datetime, timedelta\n",
    "import sys\n",
    "import os\n",
    "\n",
    "# Make the project's src/ directory (a sibling of the current working directory) importable.\n",
    "SRC_DIR = os.path.join(os.path.dirname(os.getcwd()), 'src')\n",
    "# Only append once so that re-running this cell does not pile up duplicate sys.path entries.\n",
    "if SRC_DIR not in sys.path:\n",
    "    sys.path.append(SRC_DIR)\n",
    "\n",
    "# Project-specific modules would be imported here once they exist:\n",
    "# from visualization.visualize import AirQualityVisualizer\n",
    "# from data.data_loader import AirQualityDataLoader\n",
    "\n",
    "# NOTE: this hides every warning for the whole session, including deprecation notices.\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# 'seaborn-v0_8' only exists in matplotlib >= 3.6; older releases ship the same sheet as 'seaborn'.\n",
    "PLOT_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'\n",
    "plt.style.use(PLOT_STYLE)\n",
    "sns.set_palette(\"husl\")\n",
    "\n",
    "print(\"Libraries imported successfully!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate synthetic sample data for demonstration purposes.\n",
    "# In a real project, replace this with a loader for the actual dataset.\n",
    "\n",
    "def generate_sample_air_quality_data(n_samples=2000, seed=42):\n",
    "    \"\"\"\n",
    "    Build a synthetic hourly air-quality dataset for demonstration.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    n_samples : int\n",
    "        Number of hourly rows to generate, starting at 2020-01-01 00:00.\n",
    "    seed : int\n",
    "        Value passed to ``np.random.seed``. This seeds NumPy's *global* RNG,\n",
    "        so any later code drawing from the global RNG is affected as well.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pd.DataFrame\n",
    "        Columns: date, PM2.5, PM10, NO2, CO, SO2, O3, Temperature, Humidity,\n",
    "        WindSpeed, Pressure and AQI (clipped to the range 0-500).\n",
    "    \"\"\"\n",
    "    np.random.seed(seed)\n",
    "\n",
    "    # One timestamp per hour\n",
    "    start_date = datetime(2020, 1, 1)\n",
    "    dates = [start_date + timedelta(hours=i) for i in range(n_samples)]\n",
    "\n",
    "    # Shared drivers: a common pollution signal plus two sinusoidal cycles.\n",
    "    # The order of the np.random calls below determines the generated values; do not reorder them.\n",
    "    hours = np.arange(n_samples)\n",
    "    base_pollution = np.random.normal(0, 1, n_samples)\n",
    "    seasonal_effect = np.sin(hours * 2 * np.pi / (24*30)) * 0.5  # 30-day period\n",
    "    daily_effect = np.sin(hours * 2 * np.pi / 24) * 0.3  # 24-hour period\n",
    "\n",
    "    data = {\n",
    "        'date': dates,\n",
    "        'PM2.5': np.maximum(0, 25 + 15 * base_pollution + 10 * seasonal_effect + np.random.normal(0, 5, n_samples)),\n",
    "        'PM10': np.maximum(0, 45 + 20 * base_pollution + 15 * seasonal_effect + np.random.normal(0, 8, n_samples)),\n",
    "        'NO2': np.maximum(0, 20 + 12 * base_pollution + 8 * daily_effect + np.random.normal(0, 6, n_samples)),\n",
    "        'CO': np.maximum(0, 0.8 + 0.4 * base_pollution + 0.2 * daily_effect + np.random.normal(0, 0.2, n_samples)),\n",
    "        'SO2': np.maximum(0, 6 + 4 * base_pollution + 2 * seasonal_effect + np.random.normal(0, 2, n_samples)),\n",
    "        # O3 uses the negated base signal, so it moves against the other pollutants\n",
    "        'O3': np.maximum(0, 35 + 20 * -base_pollution + 15 * daily_effect + np.random.normal(0, 10, n_samples)),\n",
    "        # Temperature adds a 365-day sinusoid on top of the daily cycle\n",
    "        'Temperature': 20 + 10 * np.sin(hours * 2 * np.pi / (24*365)) + 5 * daily_effect + np.random.normal(0, 3, n_samples),\n",
    "        'Humidity': np.clip(50 + 20 * np.random.normal(0, 1, n_samples) - 5 * seasonal_effect, 0, 100),\n",
    "        'WindSpeed': np.maximum(0, 8 + 5 * np.random.normal(0, 1, n_samples) + 3 * seasonal_effect),\n",
    "        'Pressure': 1013 + 10 * np.random.normal(0, 1, n_samples) + 5 * seasonal_effect\n",
    "    }\n",
    "\n",
    "    # Simplified AQI: weighted sum of pollutants plus noise, halved and clipped to 0-500\n",
    "    data['AQI'] = np.clip(\n",
    "        (data['PM2.5'] * 2.0 +\n",
    "         data['PM10'] * 1.0 +\n",
    "         data['NO2'] * 1.5 +\n",
    "         data['CO'] * 15 +\n",
    "         data['SO2'] * 1.2 -\n",
    "         data['O3'] * 0.1 +\n",
    "         np.random.normal(0, 8, n_samples)) / 2,\n",
    "        0, 500\n",
    "    )\n",
    "\n",
    "    return pd.DataFrame(data)\n",
    "\n",
    "# Build the demo dataset and report its basic layout.\n",
    "df = generate_sample_air_quality_data(2000)\n",
    "print(\n",
    "    \"Dataset loaded successfully!\",\n",
    "    f\"Shape: {df.shape}\",\n",
    "    f\"Columns: {list(df.columns)}\",\n",
    "    sep=\"\\n\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Basic facts about the dataset: size, memory footprint, time span and dtypes.\n",
    "memory_mb = df.memory_usage(deep=True).sum() / 1024**2\n",
    "first_ts, last_ts = df['date'].min(), df['date'].max()\n",
    "print(\n",
    "    \"=== DATASET INFO ===\",\n",
    "    f\"Dataset shape: {df.shape}\",\n",
    "    f\"Memory usage: {memory_mb:.2f} MB\",\n",
    "    f\"Date range: {first_ts} to {last_ts}\",\n",
    "    \"\\n=== DATA TYPES ===\",\n",
    "    sep=\"\\n\",\n",
    ")\n",
    "print(df.dtypes)\n",
    "\n",
    "# Preview both ends of the frame.\n",
    "for heading, preview in ((\"FIRST 5 ROWS\", df.head()), (\"LAST 5 ROWS\", df.tail())):\n",
    "    print(f\"\\n=== {heading} ===\")\n",
    "    display(preview)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Descriptive statistics for the numeric variables.\n",
    "numeric_columns = df.select_dtypes(include=[np.number]).columns\n",
    "print(\"=== DESCRIPTIVE STATISTICS ===\")\n",
    "summary_table = df[numeric_columns].describe().round(2)\n",
    "display(summary_table)\n",
    "\n",
    "# Headline numbers for the target variable.\n",
    "print(\"\\n=== TARGET VARIABLE (AQI) STATISTICS ===\")\n",
    "aqi_stats = df['AQI'].describe()\n",
    "for stat_label, stat_key in ((\"Mean AQI\", \"mean\"), (\"Median AQI\", \"50%\"), (\"Standard Deviation\", \"std\")):\n",
    "    print(f\"{stat_label}: {aqi_stats[stat_key]:.2f}\")\n",
    "print(\"Range: {:.2f} - {:.2f}\".format(aqi_stats['min'], aqi_stats['max']))\n",
    "\n",
    "# AQI categories based on EPA breakpoints\n",
    "def categorize_aqi(aqi):\n",
    "    \"\"\"\n",
    "    Map a numeric AQI value to its category label.\n",
    "\n",
    "    Upper bounds are inclusive: <=50 'Good', <=100 'Moderate',\n",
    "    <=150 'Unhealthy for Sensitive Groups', <=200 'Unhealthy',\n",
    "    <=300 'Very Unhealthy', anything above is 'Hazardous'.\n",
    "\n",
    "    Missing input (None or NaN) returns NaN rather than a label.\n",
    "    \"\"\"\n",
    "    # Every comparison against NaN is False, so without this guard a missing\n",
    "    # reading would fall through all branches and be labelled 'Hazardous'.\n",
    "    if pd.isna(aqi):\n",
    "        return np.nan\n",
    "\n",
    "    breakpoints = (\n",
    "        (50, 'Good'),\n",
    "        (100, 'Moderate'),\n",
    "        (150, 'Unhealthy for Sensitive Groups'),\n",
    "        (200, 'Unhealthy'),\n",
    "        (300, 'Very Unhealthy'),\n",
    "    )\n",
    "    for upper_bound, label in breakpoints:\n",
    "        if aqi <= upper_bound:\n",
    "            return label\n",
    "    return 'Hazardous'\n",
    "\n",
    "# Label every row, then report how often each label occurs.\n",
    "df['AQI_Category'] = df['AQI'].apply(categorize_aqi)\n",
    "print(\"\\n=== AQI CATEGORIES DISTRIBUTION ===\")\n",
    "aqi_dist = df['AQI_Category'].value_counts()\n",
    "total_rows = len(df)\n",
    "for category in aqi_dist.index:\n",
    "    count = aqi_dist[category]\n",
    "    share = (count / total_rows) * 100\n",
    "    print(f\"{category}: {count} ({share:.1f}%)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check for missing values: count and percentage per column.\n",
    "print(\"=== MISSING VALUES ANALYSIS ===\")\n",
    "missing_data = df.isnull().sum()\n",
    "missing_percentage = (missing_data / len(df)) * 100\n",
    "\n",
    "missing_df = pd.DataFrame({\n",
    "    'Missing Count': missing_data,\n",
    "    'Percentage': missing_percentage\n",
    "})\n",
    "\n",
    "# Keep only the columns that actually have gaps, worst first.\n",
    "missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values('Missing Count', ascending=False)\n",
    "\n",
    "if len(missing_df) > 0:\n",
    "    display(missing_df)\n",
    "\n",
    "    # Visualize where the gaps are. These lines must stay inside the `if` body:\n",
    "    # dedenting them detaches the `else` below and makes the cell a SyntaxError.\n",
    "    plt.figure(figsize=(12, 6))\n",
    "    sns.heatmap(df.isnull(), cbar=True, yticklabels=False, cmap='viridis')\n",
    "    plt.title('Missing Values Heatmap')\n",
    "    plt.show()\n",
    "else:\n",
    "    print(\"No missing values found in the dataset!\")\n",
    "\n",
    "# Count rows that are exact copies of an earlier row.\n",
    "duplicates = df.duplicated().sum()\n",
    "print(\"\\n=== DUPLICATE ROWS ===\")\n",
    "print(\"Number of duplicate rows: {}\".format(duplicates))\n",
    "if duplicates > 0:\n",
    "    duplicate_share = (duplicates/len(df))*100\n",
    "    print(f\"Percentage of duplicates: {duplicate_share:.2f}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# No dedicated visualizer module is available, so a small helper is defined here.\n",
    "\n",
    "def plot_feature_distributions(df, features):\n",
    "    \"\"\"\n",
    "    Draw a histogram with a KDE overlay for each requested column.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pd.DataFrame\n",
    "        Frame containing the columns named in ``features``.\n",
    "    features : sequence of str\n",
    "        Column names to plot, laid out two per row. An empty sequence\n",
    "        draws nothing.\n",
    "    \"\"\"\n",
    "    features = list(features)\n",
    "    if not features:\n",
    "        # An empty list would otherwise request a zero-row, zero-height figure.\n",
    "        return\n",
    "\n",
    "    n_cols = 2\n",
    "    # True ceiling division; the earlier `(len + 1) // n_cols` form is only correct for two columns.\n",
    "    n_rows = -(-len(features) // n_cols)\n",
    "    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)\n",
    "    flat_axes = axes.ravel()\n",
    "\n",
    "    for ax, feature in zip(flat_axes, features):\n",
    "        sns.histplot(df[feature], kde=True, ax=ax)\n",
    "        ax.set_title(f'Distribution of {feature}')\n",
    "        ax.grid(True, alpha=0.3)\n",
    "\n",
    "    # Hide grid cells left over when the feature count is not a multiple of n_cols.\n",
    "    for ax in flat_axes[len(features):]:\n",
    "        ax.set_visible(False)\n",
    "\n",
    "    fig.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "# Feature groups; later cells reuse `pollutant_features`.\n",
    "pollutant_features = ['PM2.5', 'PM10', 'NO2', 'CO', 'SO2', 'O3']\n",
    "weather_features = ['Temperature', 'Humidity', 'WindSpeed', 'Pressure']\n",
    "\n",
    "# One grid of histograms per feature group.\n",
    "feature_groups = (\n",
    "    (\"=== POLLUTANT DISTRIBUTIONS ===\", pollutant_features),\n",
    "    (\"\\n=== WEATHER FEATURES DISTRIBUTIONS ===\", weather_features),\n",
    ")\n",
    "for banner, group in feature_groups:\n",
    "    print(banner)\n",
    "    plot_feature_distributions(df, group)\n",
    "\n",
    "# Target variable: histogram on the left, box plot on the right.\n",
    "print(\"\\n=== AQI DISTRIBUTION ===\")\n",
    "fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))\n",
    "\n",
    "sns.histplot(df['AQI'], bins=30, kde=True, ax=hist_ax)\n",
    "hist_ax.set(title='AQI Distribution', xlabel='Air Quality Index', ylabel='Frequency')\n",
    "hist_ax.grid(True, alpha=0.3)\n",
    "\n",
    "sns.boxplot(y=df['AQI'], ax=box_ax)\n",
    "box_ax.set(title='AQI Box Plot', ylabel='Air Quality Index')\n",
    "box_ax.grid(True, alpha=0.3)\n",
    "\n",
    "fig.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# AQI category distribution\n",
    "# Colours are keyed by category name. Slicing a fixed colour list by position would\n",
    "# give 'green' to whichever category happens to be most frequent, because\n",
    "# value_counts() orders by count, not by severity.\n",
    "category_colors = {\n",
    "    'Good': 'green',\n",
    "    'Moderate': 'yellow',\n",
    "    'Unhealthy for Sensitive Groups': 'orange',\n",
    "    'Unhealthy': 'red',\n",
    "    'Very Unhealthy': 'purple',\n",
    "    'Hazardous': 'maroon',\n",
    "}\n",
    "plt.figure(figsize=(10, 6))\n",
    "aqi_counts = df['AQI_Category'].value_counts()\n",
    "# Present slices in severity order and skip categories that do not occur in the data.\n",
    "present_categories = [name for name in category_colors if name in aqi_counts.index]\n",
    "aqi_counts = aqi_counts.reindex(present_categories)\n",
    "colors = [category_colors[name] for name in aqi_counts.index]\n",
    "plt.pie(aqi_counts.values, labels=aqi_counts.index, autopct='%1.1f%%',\n",
    "        colors=colors, startangle=90)\n",
    "plt.title('Distribution of AQI Categories')\n",
    "plt.axis('equal')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Correlation analysis\n",
    "print(\"=== CORRELATION ANALYSIS ===\")\n",
    "\n",
    "def plot_correlation_matrix(df):\n",
    "    \"\"\"\n",
    "    Show a heatmap of the pairwise correlations of the numeric columns of ``df``.\n",
    "\n",
    "    The upper triangle (including the diagonal) is masked so each pair is shown once.\n",
    "    Non-numeric columns are ignored.\n",
    "    \"\"\"\n",
    "    # Select numeric columns explicitly: DataFrame.corr() raises on non-numeric\n",
    "    # columns in pandas >= 2.0, and this form also works on older releases.\n",
    "    corr = df.select_dtypes(include=[np.number]).corr()\n",
    "    mask = np.triu(np.ones_like(corr, dtype=bool))\n",
    "    plt.figure(figsize=(14, 10))\n",
    "    sns.heatmap(corr, mask=mask, annot=True, fmt=\".2f\", cmap='coolwarm',\n",
    "                cbar_kws={'shrink': .8}, vmin=-1, vmax=1)\n",
    "    plt.title('Correlation Matrix')\n",
    "    plt.show()\n",
    "\n",
    "plot_correlation_matrix(df[numeric_columns])\n",
    "\n",
    "# Top correlations with AQI: ranked by strength, printed with their sign so that\n",
    "# inversely related features are not reported as if they were positive.\n",
    "aqi_corr_signed = df[numeric_columns].corr()['AQI'].drop('AQI')\n",
    "strength_order = aqi_corr_signed.abs().sort_values(ascending=False).index\n",
    "aqi_correlations = aqi_corr_signed.reindex(strength_order)\n",
    "print(\"\\n=== TOP CORRELATIONS WITH AQI ===\")\n",
    "for feature, corr in aqi_correlations.items():\n",
    "    print(f\"{feature}: {corr:.3f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Time series analysis\n",
    "print(\"\\n=== TIME SERIES ANALYSIS ===\")\n",
    "\n",
    "# Use the timestamp as the index. The guard makes re-running this cell a no-op;\n",
    "# an unconditional set_index raises KeyError once 'date' has already become the index.\n",
    "if 'date' in df.columns:\n",
    "    df = df.set_index('date')\n",
    "\n",
    "# One panel per pollutant in a 3x2 grid, plotted against the datetime index.\n",
    "fig, axes = plt.subplots(3, 2, figsize=(14, 10))\n",
    "for ax, pollutant in zip(axes.ravel(), pollutant_features):\n",
    "    sns.lineplot(x=df.index, y=df[pollutant], alpha=0.7, ax=ax)\n",
    "    ax.set(title=f'{pollutant} Time Series', ylabel='Concentration', xlabel='Date')\n",
    "    plt.setp(ax.get_xticklabels(), rotation=45)\n",
    "fig.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# The target variable on its own, full width.\n",
    "fig, ax = plt.subplots(figsize=(14, 6))\n",
    "sns.lineplot(x=df.index, y=df['AQI'], alpha=0.7, ax=ax)\n",
    "ax.set(title='AQI Time Series', ylabel='AQI', xlabel='Date')\n",
    "plt.setp(ax.get_xticklabels(), rotation=45)\n",
    "ax.grid(True, alpha=0.3)\n",
    "plt.show()\n",
    "\n",
    "# Rolling means over 7 and 30 days; windows are counted in rows (24 hourly rows per day).\n",
    "for window_days in (7, 30):\n",
    "    df[f'AQI_{window_days}d_ma'] = df['AQI'].rolling(window=24 * window_days).mean()\n",
    "\n",
    "# Raw series drawn faintly underneath the two smoothed curves.\n",
    "fig, ax = plt.subplots(figsize=(14, 6))\n",
    "trend_layers = (\n",
    "    ('AQI', 'Hourly AQI', {'alpha': 0.3}),\n",
    "    ('AQI_7d_ma', '7-Day Moving Average', {}),\n",
    "    ('AQI_30d_ma', '30-Day Moving Average', {}),\n",
    ")\n",
    "for column, legend_label, line_options in trend_layers:\n",
    "    sns.lineplot(x=df.index, y=df[column], label=legend_label, ax=ax, **line_options)\n",
    "ax.set(title='AQI Trends with Moving Averages', ylabel='AQI', xlabel='Date')\n",
    "ax.legend()\n",
    "ax.grid(True, alpha=0.3)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seasonal and diurnal analysis\n",
    "print(\"\\n=== SEASONAL AND DIURNAL ANALYSIS ===\")\n",
    "\n",
    "# Derive calendar features from the datetime index (column name -> index attribute).\n",
    "calendar_features = (('hour', 'hour'), ('day_of_week', 'dayofweek'), ('month', 'month'))\n",
    "for column_name, index_attribute in calendar_features:\n",
    "    df[column_name] = getattr(df.index, index_attribute)\n",
    "\n",
    "# For each time unit: a 3x2 grid of pollutant box plots, followed by AQI on its own.\n",
    "# Tuple layout: (grouping column, word used in titles, x-axis label).\n",
    "time_views = (\n",
    "    ('hour', 'Diurnal', 'Hour of Day'),\n",
    "    ('month', 'Monthly', 'Month'),\n",
    ")\n",
    "for time_col, pattern_name, axis_label in time_views:\n",
    "    fig, axes = plt.subplots(3, 2, figsize=(14, 10))\n",
    "    for ax, pollutant in zip(axes.ravel(), pollutant_features):\n",
    "        sns.boxplot(x=time_col, y=pollutant, data=df, ax=ax)\n",
    "        ax.set(title=f'{pollutant} {pattern_name} Pattern', xlabel=axis_label, ylabel='Concentration')\n",
    "    fig.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(14, 6))\n",
    "    sns.boxplot(x=time_col, y='AQI', data=df, ax=ax)\n",
    "    ax.set(title=f'AQI {pattern_name} Pattern', xlabel=axis_label, ylabel='AQI')\n",
    "    ax.grid(True, alpha=0.3)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Multivariate analysis\n",
    "print(\"\\n=== MULTIVARIATE ANALYSIS ===\")\n",
    "\n",
    "# Pairplot for key pollutants and AQI on a random subset to keep rendering fast.\n",
    "# The sample size is capped at the frame length (sample() raises if asked for more rows\n",
    "# than exist) and the draw is seeded so the figure is the same on every run.\n",
    "pairplot_n = min(500, len(df))\n",
    "print(f\"Pairplot for pollutants and AQI (random sample of {pairplot_n} rows)\")\n",
    "pairplot_sample = df[['PM2.5', 'PM10', 'NO2', 'CO', 'SO2', 'O3', 'AQI']].sample(n=pairplot_n, random_state=42)\n",
    "sns.pairplot(pairplot_sample)\n",
    "plt.suptitle('Pairplot of Pollutants and AQI', y=1.02)\n",
    "plt.show()\n",
    "\n",
    "# Each pollutant against AQI with a fitted regression line (red), in a 3x2 grid.\n",
    "fig, axes = plt.subplots(3, 2, figsize=(14, 10))\n",
    "point_style = {'alpha': 0.3}\n",
    "fit_style = {'color': 'red'}\n",
    "for ax, pollutant in zip(axes.ravel(), pollutant_features):\n",
    "    sns.regplot(x=pollutant, y='AQI', data=df, scatter_kws=point_style, line_kws=fit_style, ax=ax)\n",
    "    ax.set(title=f'{pollutant} vs AQI', xlabel=pollutant, ylabel='AQI')\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Interaction with meteorological conditions\n",
    "print(\"\\n=== INTERACTION WITH METEOROLOGICAL CONDITIONS ===\")\n",
    "\n",
    "# One 3x2 grid of pollutant scatter plots per weather variable.\n",
    "# Tuple layout: (column name, name used in titles, x-axis label).\n",
    "weather_views = (\n",
    "    ('Temperature', 'Temperature', 'Temperature (°C)'),\n",
    "    ('WindSpeed', 'Wind Speed', 'Wind Speed (m/s)'),\n",
    ")\n",
    "for weather_col, title_name, axis_label in weather_views:\n",
    "    fig, axes = plt.subplots(3, 2, figsize=(14, 10))\n",
    "    for ax, pollutant in zip(axes.ravel(), pollutant_features):\n",
    "        sns.scatterplot(x=weather_col, y=pollutant, data=df, alpha=0.3, ax=ax)\n",
    "        ax.set(title=f'{title_name} vs {pollutant}', xlabel=axis_label, ylabel=pollutant)\n",
    "    fig.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data exploration conclusions, printed as one block.\n",
    "# NOTE(review): these statements are hard-coded text, not computed from `df` -\n",
    "# confirm they still match the figures above whenever the data changes.\n",
    "conclusion_lines = [\n",
    "    \"\\n=== DATA EXPLORATION CONCLUSIONS ===\",\n",
    "    \"Berdasarkan analisis eksplorasi data, beberapa temuan penting adalah:\",\n",
    "    \"1. Distribusi AQI menunjukkan bahwa sebagian besar sampel berada dalam kategori 'Moderate'\",\n",
    "    \"2. Terdapat korelasi kuat antara PM2.5, PM10, dan NO2 dengan AQI\",\n",
    "    \"3. Polutan menunjukkan pola diurnal yang jelas dengan peningkatan konsentrasi pada jam sibuk\",\n",
    "    \"4. Terdapat pola musiman dengan tingkat polusi yang lebih tinggi pada bulan-bulan tertentu\",\n",
    "    \"5. Kondisi meteorologi seperti suhu dan kecepatan angin mempengaruhi tingkat polusi\",\n",
    "    \"6. Beberapa polutan menunjukkan hubungan non-linear dengan AQI\",\n",
    "    \"\\nRekomendasi untuk preprocessing dan feature engineering:\",\n",
    "    \"- Menangani outlier dengan metode yang tepat\",\n",
    "    \"- Membuat fitur waktu (jam, hari, bulan)\",\n",
    "    \"- Membuat fitur interaksi antara polutan dan kondisi meteorologi\",\n",
    "    \"- Membuat fitur rolling statistics untuk menangkap pola temporal\",\n",
    "    \"- Transformasi logaritmik untuk fitur dengan distribusi miring\",\n",
    "]\n",
    "print(\"\\n\".join(conclusion_lines))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}