# Feature Engineering for Rainfall Forecasting

This notebook explores and implements feature engineering techniques for the Selangor rainfall forecasting project.

## Objectives:
- Load and explore raw data
- Create lag features
- Generate moving averages
- Add seasonal indicators
- Test feature combinations

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set plotting style: apply the matplotlib style sheet first, then set the
# seaborn colour palette on top of it.
# NOTE(review): the order presumably matters (the style sheet may also set
# the colour cycle) — confirm before reordering these two calls.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Simple confirmation that the cell ran to completion.
print("Libraries imported successfully!")

## 1. Load Raw Data

In [None]:
# Load the primary dataset and parse the date column
df = pd.read_csv("../data/raw/230731665812CCD_weekly1.csv")
df['Date'] = pd.to_datetime(df['Date'])

# The lag and rolling features built from this frame are positional
# (shift / rolling), so the rows must be in chronological order. A stable
# sort leaves an already-ordered file untouched.
df = df.sort_values('Date', kind='mergesort').reset_index(drop=True)

print(f"Dataset shape: {df.shape}")
print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")
print("\nFirst few rows:")
df.head()

In [None]:
# Basic statistics
print("Dataset Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
df.info()
print("\nDescriptive Statistics:")
df.describe()

## 2. Lag Features

Create lag features to capture temporal dependencies in the data.

In [None]:
# Create lag features
def create_lag_features(df, columns, lags=(1, 2, 3)):
    """
    Create lag features for specified columns.

    Each new column is named ``{col}_lag_{lag}`` and holds the value of
    ``col`` from ``lag`` rows earlier. The shift is positional, so the rows
    must already be ordered in time for the result to be a true lag. The
    first ``lag`` rows of every new column are NaN.

    Args:
        df: DataFrame (left unmodified; a copy is returned)
        columns: Column name, or list of column names, to create lags for
        lags: Iterable of lag periods, in rows

    Returns:
        DataFrame with lag features added

    Raises:
        KeyError: If a requested column is not present in ``df``
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]

    df_lagged = df.copy()

    for col in columns:
        for lag in lags:
            df_lagged[f'{col}_lag_{lag}'] = df_lagged[col].shift(lag)

    return df_lagged

# Build 1- and 2-row lags for the listed weather columns.
lag_columns = ['Precipitation_mm', 'Temp_avg', 'Relative_Humidity', 'Wind_kmh']
df_with_lags = create_lag_features(df, lag_columns, lags=[1, 2])

# Report how many columns were added, then list every column whose name
# contains "lag".
print(
    f"Original columns: {len(df.columns)}",
    f"With lag features: {len(df_with_lags.columns)}",
    "\nNew lag columns:",
    sep="\n",
)
lag_cols = [name for name in df_with_lags.columns if 'lag' in name]
print(lag_cols)

## 3. Moving Averages

Create moving averages to smooth out short-term fluctuations.

In [None]:
# Create moving average features
def create_moving_averages(df, columns, windows=(3, 4, 6)):
    """
    Create moving average features.

    Each new column is named ``{col}_ma_{window}`` and holds the mean of the
    current row and the ``window - 1`` rows before it. The first
    ``window - 1`` rows of every new column are NaN.

    Because the current row is part of the window, a moving average taken
    over the prediction target contains the target itself; shift such a
    column before using it as a model input.

    Args:
        df: DataFrame (left unmodified; a copy is returned)
        columns: Column name, or list of column names
        windows: Iterable of window sizes, in rows

    Returns:
        DataFrame with moving average features

    Raises:
        KeyError: If a requested column is not present in ``df``
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]

    df_ma = df.copy()

    for col in columns:
        for window in windows:
            df_ma[f'{col}_ma_{window}'] = df_ma[col].rolling(window=window).mean()

    return df_ma

# Apply moving averages
ma_columns = ['Precipitation_mm', 'Temp_avg', 'Relative_Humidity']
df_with_ma = create_moving_averages(df_with_lags, ma_columns, windows=[3, 4])

# A rolling mean includes the current row, so the moving averages of the
# target (Precipitation_mm) would contain the very value being forecast.
# Shift them by one row so they summarise only the preceding weeks.
target_ma_cols = [col for col in df_with_ma.columns if col.startswith('Precipitation_mm_ma_')]
df_with_ma[target_ma_cols] = df_with_ma[target_ma_cols].shift(1)

print(f"With moving averages: {len(df_with_ma.columns)}")
print("\nMoving average columns:")
ma_cols = [col for col in df_with_ma.columns if '_ma_' in col]
print(ma_cols)

## 4. Seasonal Features

Create seasonal indicators based on Malaysian climate patterns.

In [None]:
# Create seasonal features
def create_seasonal_features(df):
    """
    Add calendar-based seasonal features to a copy of ``df``.

    Reads the ``Date`` (datetime) and ``Week_Number`` columns and adds:
        Month: calendar month taken from ``Date``
        is_monsoon: 1 for October-December and April, otherwise 0
        is_dry_season: 1 for June-August, otherwise 0
        month_sin, month_cos: cyclical encoding of the month (period 12)
        week_sin, week_cos: cyclical encoding of the week number (period 52)

    The monsoon and dry-season month lists follow the notebook's stated
    Malaysian climate assumption.
    """
    month = df['Date'].dt.month

    # Angles on the unit circle, so that December/January and the last/first
    # weeks of the year end up next to each other.
    month_angle = 2 * np.pi * month / 12
    week_angle = 2 * np.pi * df['Week_Number'] / 52

    return df.copy().assign(
        Month=month,
        is_monsoon=month.isin([10, 11, 12, 4]).astype(int),
        is_dry_season=month.isin([6, 7, 8]).astype(int),
        month_sin=np.sin(month_angle),
        month_cos=np.cos(month_angle),
        week_sin=np.sin(week_angle),
        week_cos=np.cos(week_angle),
    )

# Names of the columns that create_seasonal_features adds.
seasonal_cols = [
    'Month', 'is_monsoon', 'is_dry_season',
    'month_sin', 'month_cos', 'week_sin', 'week_cos',
]

# Derive the seasonal features on top of the moving-average frame.
df_with_seasonal = create_seasonal_features(df_with_ma)

print(f"With seasonal features: {len(df_with_seasonal.columns)}")
print("\nSeasonal columns:", seasonal_cols, sep="\n")

## 5. Interaction Features

Create interaction features to capture relationships between variables.

In [None]:
# Create interaction features
def create_interaction_features(df):
    """
    Create interaction features between weather variables.

    Reads ``Temp_avg``, ``Relative_Humidity``, ``Wind_kmh`` and
    ``Precipitation_mm`` and returns a copy of ``df`` with these columns
    added:
        temp_humidity_interaction: Temp_avg * Relative_Humidity
        wind_precip_ratio: Wind_kmh / (Precipitation_mm + 1)
        temp_deviation: Temp_avg minus the mean Temp_avg of ``df``
        humidity_category: Relative_Humidity binned into Low (0, 60],
            Medium (60, 80] and High (80, 100]; values outside (0, 100]
            become NaN
        humidity_Low, humidity_Medium, humidity_High: integer 0/1 indicators
            of the category

    NOTE(review): wind_precip_ratio is computed from the same-row
    Precipitation_mm. When Precipitation_mm is the prediction target this
    feature leaks it and should be excluded from model inputs.
    """
    df_interaction = df.copy()

    # Temperature-Humidity interaction (heat index proxy)
    df_interaction['temp_humidity_interaction'] = (
        df_interaction['Temp_avg'] * df_interaction['Relative_Humidity']
    )

    # Wind-Precipitation ratio; the +1 keeps the denominator non-zero on dry weeks
    df_interaction['wind_precip_ratio'] = (
        df_interaction['Wind_kmh'] / (df_interaction['Precipitation_mm'] + 1)
    )

    # Temperature difference from mean
    temp_mean = df_interaction['Temp_avg'].mean()
    df_interaction['temp_deviation'] = df_interaction['Temp_avg'] - temp_mean

    # Humidity categories
    df_interaction['humidity_category'] = pd.cut(
        df_interaction['Relative_Humidity'],
        bins=[0, 60, 80, 100],
        labels=['Low', 'Medium', 'High']
    )

    # One-hot encode humidity categories. Force an integer dtype: recent
    # pandas versions default to bool, which numeric-only steps
    # (select_dtypes(include=[np.number]), correlation ranking) silently skip.
    humidity_dummies = pd.get_dummies(
        df_interaction['humidity_category'], prefix='humidity', dtype=int
    )
    df_interaction = pd.concat([df_interaction, humidity_dummies], axis=1)

    return df_interaction

# Names of the continuous interaction columns added by
# create_interaction_features (the humidity category columns are not listed).
interaction_cols = ['temp_humidity_interaction', 'wind_precip_ratio', 'temp_deviation']

# Last feature-building step: the result is the fully engineered frame.
df_engineered = create_interaction_features(df_with_seasonal)

print(f"Final feature count: {len(df_engineered.columns)}")
print("\nInteraction columns:", interaction_cols, sep="\n")

## 6. Feature Analysis and Visualization

In [None]:
# Rank every numeric column by the absolute value of its correlation with
# the target variable (Precipitation_mm), strongest first. The target itself
# is part of the ranking.
numeric_cols = df_engineered.select_dtypes(include=[np.number]).columns
correlations = (
    df_engineered[numeric_cols]
    .corr()['Precipitation_mm']
    .abs()
    .sort_values(ascending=False)
)

print("Top 15 features correlated with Precipitation:", correlations.head(15), sep="\n")

In [None]:
# Visualize feature importance through correlation
plt.figure(figsize=(12, 8))
# Exclude the target's self-correlation by name. Dropping "the first entry"
# instead would remove the wrong row whenever another column ties with the
# target at 1.0 and happens to sort ahead of it.
top_features = correlations.drop('Precipitation_mm').head(14).index
plt.barh(range(len(top_features)), correlations[top_features].values)
plt.yticks(range(len(top_features)), top_features)
plt.xlabel('Absolute Correlation with Precipitation')
plt.title('Top Features Correlated with Precipitation')
plt.tight_layout()
plt.show()

In [None]:
# Visualize seasonal patterns on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Monthly precipitation patterns
monthly_precip = df_engineered.groupby('Month')['Precipitation_mm'].mean()
axes[0, 0].bar(monthly_precip.index, monthly_precip.values)
axes[0, 0].set_title('Average Precipitation by Month')
axes[0, 0].set_xlabel('Month')
axes[0, 0].set_ylabel('Precipitation (mm)')

# Monsoon vs non-monsoon
monsoon_data = df_engineered.groupby('is_monsoon')['Precipitation_mm'].mean()
# Derive the bar labels from the group keys so each bar is labelled by its
# own flag, and the plot still works when only one of the groups is present.
monsoon_names = {0: 'Non-Monsoon', 1: 'Monsoon'}
monsoon_labels = [monsoon_names[flag] for flag in monsoon_data.index]
axes[0, 1].bar(monsoon_labels, monsoon_data.values)
axes[0, 1].set_title('Average Precipitation: Monsoon vs Non-Monsoon')
axes[0, 1].set_ylabel('Precipitation (mm)')

# Temperature-Humidity relationship
precip_scatter = axes[1, 0].scatter(
    df_engineered['Temp_avg'], df_engineered['Relative_Humidity'],
    c=df_engineered['Precipitation_mm'], cmap='viridis', alpha=0.6,
)
axes[1, 0].set_xlabel('Temperature (°C)')
axes[1, 0].set_ylabel('Relative Humidity (%)')
axes[1, 0].set_title('Temperature vs Humidity (colored by Precipitation)')
# Without a colorbar the colour encoding cannot be read off the plot.
fig.colorbar(precip_scatter, ax=axes[1, 0], label='Precipitation (mm)')

# Lag feature effectiveness
lag_corr = correlations[correlations.index.str.contains('lag')].head(5)
axes[1, 1].barh(range(len(lag_corr)), lag_corr.values)
axes[1, 1].set_yticks(range(len(lag_corr)))
axes[1, 1].set_yticklabels(lag_corr.index)
# `correlations` holds absolute values, so say so on the axis.
axes[1, 1].set_xlabel('Absolute Correlation with Precipitation')
axes[1, 1].set_title('Top Lag Features')

plt.tight_layout()
plt.show()

## 7. Feature Selection and Final Dataset

In [None]:
# Select top features for modeling
def select_features(df, target_col='Precipitation_mm', top_n=20):
    """
    Select top features based on correlation with target.

    Numeric columns are ranked by the absolute value of their correlation
    with ``target_col``. Columns whose correlation is undefined (NaN, e.g.
    constant columns) are never selected. The basic weather variables
    ``Temp_avg``, ``Relative_Humidity`` and ``Wind_kmh`` are appended when
    they exist in ``df`` and were not already selected, so the result can
    hold up to ``top_n + 3`` names.

    Args:
        df: DataFrame containing the target and candidate feature columns
        target_col: Name of the (numeric) target column
        top_n: Number of top-correlated features to keep

    Returns:
        List of selected column names; never contains ``target_col``

    Raises:
        KeyError: If ``target_col`` is not a numeric column of ``df``
    """
    # Calculate correlations
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    correlations = df[numeric_cols].corr()[target_col].abs()

    # Remove the target by name rather than by position: a feature that ties
    # with the target's self-correlation of 1.0 may sort ahead of it. The
    # stable sort keeps the column order for ties, making the result
    # deterministic.
    ranked = (
        correlations.drop(target_col)
        .dropna()
        .sort_values(ascending=False, kind='mergesort')
    )
    selected_features = ranked.head(top_n).index.tolist()

    # Always include basic weather variables (only those actually present,
    # so every returned name can be used to index df)
    essential_features = ['Temp_avg', 'Relative_Humidity', 'Wind_kmh']
    for feature in essential_features:
        if feature in df.columns and feature not in selected_features:
            selected_features.append(feature)

    return selected_features

# Keep the 15 columns most correlated with the target (plus whatever
# select_features adds on its own) and print them as a numbered list.
selected_features = select_features(df_engineered, top_n=15)

print(f"Selected {len(selected_features)} features:")
for i, feature in enumerate(selected_features, start=1):
    print(f"{i:2d}. {feature}")

In [None]:
# Create final dataset
from pathlib import Path

# Identifier columns first, then the selected features, then the target.
# select_features ranks every numeric column, so it can itself return
# 'Week_Number' (or 'Year'); de-duplicate while preserving order so the
# final frame and the CSV never carry the same column twice.
final_features = list(dict.fromkeys(
    ['Date', 'Year', 'Week_Number'] + selected_features + ['Precipitation_mm']
))
df_final = df_engineered[final_features].copy()

# Remove rows with NaN values (the leading rows of lag and moving-average
# features have no history to draw on)
df_final_clean = df_final.dropna()

print(f"Original dataset: {len(df_engineered)} rows")
print(f"After removing NaN: {len(df_final_clean)} rows")
print(f"Final feature count: {len(selected_features)}")

# Save the engineered dataset, creating the output folder on first use
output_path = Path('../data/processed/engineered_features.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_final_clean.to_csv(output_path, index=False)
print("\nEngineered dataset saved to data/processed/engineered_features.csv")

## 8. Summary and Next Steps

### Features Created:
1. **Lag Features**: Weather conditions from each of the previous two weeks (lags of 1 and 2 weeks)
2. **Moving Averages**: Smoothed trends over 3-week and 4-week windows
3. **Seasonal Indicators**: Monsoon and dry season flags
4. **Cyclical Encoding**: Month and week cyclical features
5. **Interaction Features**: Temperature-humidity product, wind-to-precipitation ratio, temperature deviation from the mean, and one-hot humidity categories

### Key Insights:
- Lag features show strong correlation with current precipitation
- Seasonal patterns are clearly visible in the data
- Temperature-humidity interactions provide additional predictive power

### Next Steps:
1. Proceed to model training with engineered features
2. Compare performance with and without feature engineering
3. Fine-tune feature selection based on model performance

In [None]:
# Final dataset summary. "Total features created" reports the column count
# of the fully engineered frame (original columns included).
print(
    "=== FEATURE ENGINEERING SUMMARY ===",
    f"Total features created: {len(df_engineered.columns)}",
    f"Selected features: {len(selected_features)}",
    f"Final dataset shape: {df_final_clean.shape}",
    f"Date range: {df_final_clean['Date'].min()} to {df_final_clean['Date'].max()}",
    "\nFeature engineering completed successfully!",
    sep="\n",
)