In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
import os
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np
from tensorflow.keras.layers import Input


# DISPLAY SETTING
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
import pandas as pd
import os

In [3]:
# Path to the folder containing the datasets
data_folder = "States Data"
output_folder_cleaned = "States Cleaned"
output_folder_predictions = "States Prediction"

# List of states
states = [
    "johor", "kedah", "kelantan", "malacca", "negerisembilan", "pahang", "perak",
    "perlis", "pulaupinang", "sabah", "sarawak", "selangor", "terengganu"
]

In [4]:
# Loop through each state
for state in states:
    print(f"Processing state: {state}")
    
    # Load dataset
    file_path = os.path.join(data_folder, f"realWeather_{state}.csv")
    weather_data = pd.read_csv(file_path, index_col="datetime")
    weather_data.index = pd.to_datetime(weather_data.index)
    weather_data.columns = weather_data.columns.str.lower()
    
    # Clean and preprocess data
    columns_to_keep = [
        'temp', 'feelslike', 'dew', 'humidity', 'precip',
        'windspeed', 'winddir', 'sealevelpressure', 'cloudcover',
        'visibility', 'solarradiation', 'uvindex'
    ]
    weather_data = weather_data[columns_to_keep].copy()
    weather_data = weather_data.apply(lambda col: col.fillna(col.median()) if col.dtype in ['float64', 'int64'] else col)
    

    cleaned_file_path = os.path.join(output_folder_cleaned, f"{state}_cleaned_weather.csv")
    weather_data.to_csv(cleaned_file_path, index=True)
    print(f"Cleaned dataset saved for {state}")

Processing state: johor
Cleaned dataset saved for johor
Processing state: kedah
Cleaned dataset saved for kedah
Processing state: kelantan
Cleaned dataset saved for kelantan
Processing state: malacca
Cleaned dataset saved for malacca
Processing state: negerisembilan
Cleaned dataset saved for negerisembilan
Processing state: pahang
Cleaned dataset saved for pahang
Processing state: perak
Cleaned dataset saved for perak
Processing state: perlis
Cleaned dataset saved for perlis
Processing state: pulaupinang
Cleaned dataset saved for pulaupinang
Processing state: sabah
Cleaned dataset saved for sabah
Processing state: sarawak
Cleaned dataset saved for sarawak
Processing state: selangor
Cleaned dataset saved for selangor
Processing state: terengganu
Cleaned dataset saved for terengganu


# THE START

In [5]:
def create_7_day_hourly_averages(data, target_date):
    """
    Create 7-day hourly averages for each feature, ending on the day before the target date.
    """
    start_date = target_date - pd.Timedelta(days=7)
    end_date = target_date - pd.Timedelta(days=1)
    
    # Filter data for the 7-day window
    seven_day_data = data.loc[start_date:end_date]
    
    # Calculate hourly averages
    hourly_averages = seven_day_data.groupby(seven_day_data.index.hour).mean()
    return hourly_averages

def predict_with_7_day_averages(target_variable, test_features):
    """
    Train and predict using 7-day hourly averages as features.
    """
    # Define features to exclude based on target variable
    exclude_features = ["temp", "feelslike"] if target_variable in ["temp", "feelslike"] else ["precip"]

    # Filter training data up to 30th November 2024
    train_data = weather_data.loc[:'2024-11-30 23:00:00']
    X = train_data.drop(columns=exclude_features)
    y = train_data[target_variable]

    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Cross-Validation Setup
    cv = 5  # Number of folds

    # Train and evaluate multiple models
    results = {}

    # Linear Regression
    lr_model = LinearRegression()
    lr_model.fit(X_scaled, y)
    results['Linear Regression'] = evaluate_model(lr_model, X_scaled, y, cv)

    # XGBoost
    xgb_model = XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
    xgb_model.fit(X_scaled, y)
    results['XGBoost'] = evaluate_model(xgb_model, X_scaled, y, cv)

    # Artificial Neural Network (ANN)
    ann_model = MLPRegressor(hidden_layer_sizes=(64, 64), activation='relu', solver='adam', max_iter=1000, random_state=42)
    ann_model.fit(X_scaled, y)
    results['ANN'] = evaluate_model(ann_model, X_scaled, y, cv)

    # Random Forest
    rf_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
    rf_model.fit(X_scaled, y)
    results['Random Forest'] = evaluate_model(rf_model, X_scaled, y, cv)

    # Map model names to their variables
    model_map = {
        'Linear Regression': lr_model,
        'XGBoost': xgb_model,
        'ANN': ann_model,
        'Random Forest': rf_model
    }

    # Choose the best model based on R²
    best_model_name = max(results, key=lambda k: results[k]['r2'])
    best_model = model_map[best_model_name]
    print(f"Chosen Model for {target_variable}: {best_model_name}")

    # Align test features with training features (excluding relevant columns)
    aligned_test_features = test_features.drop(columns=exclude_features)
    test_features_scaled = scaler.transform(aligned_test_features)
    return best_model.predict(test_features_scaled)

# Evaluation function for a model
def evaluate_model(model, X_scaled, y, cv):
    y_pred = model.predict(X_scaled)
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    cv_mse = -cross_val_score(model, X_scaled, y, cv=cv, scoring='neg_mean_squared_error').mean()
    return {'mse': mse, 'r2': r2, 'mae': mae, 'cv_mse': cv_mse}

In [6]:
# Main loop through each state
for state in states:
    print(f"Processing predictions for state: {state}")
    
    # Load cleaned dataset
    cleaned_file_path = os.path.join(output_folder_cleaned, f"{state}_cleaned_weather.csv")
    weather_data = pd.read_csv(cleaned_file_path, index_col="datetime")
    weather_data.index = pd.to_datetime(weather_data.index)
    
    # Predict for 2nd December 2024
    target_date = pd.Timestamp("2024-12-02")
    hourly_averages = create_7_day_hourly_averages(weather_data, target_date)

    predictions = pd.DataFrame(index=pd.date_range(target_date, periods=24, freq="H"))
    for variable in ["temp", "feelslike", "precip"]:
        # Ensure test features exclude relevant columns
        test_features = hourly_averages.copy()
        predictions[variable] = predict_with_7_day_averages(variable, test_features)
    
    # Save predictions
    prediction_file_path = os.path.join(output_folder_predictions, f"{state}_predicted_weather.csv")
    predictions.to_csv(prediction_file_path, index=True)
    print(f"Predictions saved for state: {state}")

Processing predictions for state: johor


  predictions = pd.DataFrame(index=pd.date_range(target_date, periods=24, freq="H"))


Chosen Model for temp: XGBoost
Chosen Model for feelslike: XGBoost
Chosen Model for precip: Random Forest
Predictions saved for state: johor
Processing predictions for state: kedah


  predictions = pd.DataFrame(index=pd.date_range(target_date, periods=24, freq="H"))


Chosen Model for temp: Random Forest
Chosen Model for feelslike: ANN
Chosen Model for precip: ANN
Predictions saved for state: kedah
Processing predictions for state: kelantan


  predictions = pd.DataFrame(index=pd.date_range(target_date, periods=24, freq="H"))


Chosen Model for temp: XGBoost
Chosen Model for feelslike: XGBoost
Chosen Model for precip: Random Forest
Predictions saved for state: kelantan
Processing predictions for state: malacca


  predictions = pd.DataFrame(index=pd.date_range(target_date, periods=24, freq="H"))


Chosen Model for temp: Random Forest
Chosen Model for feelslike: Random Forest
Chosen Model for precip: ANN
Predictions saved for state: malacca
Processing predictions for state: negerisembilan


  predictions = pd.DataFrame(index=pd.date_range(target_date, periods=24, freq="H"))


Chosen Model for temp: Random Forest
Chosen Model for feelslike: Random Forest
Chosen Model for precip: Random Forest
Predictions saved for state: negerisembilan
Processing predictions for state: pahang


  predictions = pd.DataFrame(index=pd.date_range(target_date, periods=24, freq="H"))


Chosen Model for temp: Random Forest
Chosen Model for feelslike: ANN
Chosen Model for precip: ANN
Predictions saved for state: pahang
Processing predictions for state: perak


  predictions = pd.DataFrame(index=pd.date_range(target_date, periods=24, freq="H"))


Chosen Model for temp: XGBoost
Chosen Model for feelslike: Random Forest
Chosen Model for precip: Random Forest
Predictions saved for state: perak
Processing predictions for state: perlis


  predictions = pd.DataFrame(index=pd.date_range(target_date, periods=24, freq="H"))


Chosen Model for temp: Random Forest
Chosen Model for feelslike: Random Forest
Chosen Model for precip: Random Forest
Predictions saved for state: perlis
Processing predictions for state: pulaupinang


  predictions = pd.DataFrame(index=pd.date_range(target_date, periods=24, freq="H"))


Chosen Model for temp: ANN
Chosen Model for feelslike: ANN
Chosen Model for precip: Random Forest
Predictions saved for state: pulaupinang
Processing predictions for state: sabah


  predictions = pd.DataFrame(index=pd.date_range(target_date, periods=24, freq="H"))


Chosen Model for temp: XGBoost
Chosen Model for feelslike: XGBoost
Chosen Model for precip: Random Forest
Predictions saved for state: sabah
Processing predictions for state: sarawak


  predictions = pd.DataFrame(index=pd.date_range(target_date, periods=24, freq="H"))


Chosen Model for temp: Random Forest
Chosen Model for feelslike: ANN
Chosen Model for precip: ANN
Predictions saved for state: sarawak
Processing predictions for state: selangor


  predictions = pd.DataFrame(index=pd.date_range(target_date, periods=24, freq="H"))


Chosen Model for temp: XGBoost
Chosen Model for feelslike: XGBoost
Chosen Model for precip: Random Forest
Predictions saved for state: selangor
Processing predictions for state: terengganu


  predictions = pd.DataFrame(index=pd.date_range(target_date, periods=24, freq="H"))


Chosen Model for temp: Random Forest
Chosen Model for feelslike: Random Forest
Chosen Model for precip: Random Forest
Predictions saved for state: terengganu
