In [1]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

In [2]:
# Suppress warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
%matplotlib inline

In [15]:
# Location of the rainfall CSV (raw file served from GitHub)
RAINFALL_CSV_URL = "https://raw.githubusercontent.com/FlipRoboTechnologies/ML_-Datasets/main/Rainfall%20Forecast/Rainfall.csv"

# Fetch the CSV over HTTP and parse it into the working DataFrame
df = pd.read_csv(RAINFALL_CSV_URL)

In [16]:
# Data preprocessing
# The modelling frame is built as a NEW object (`df_model`) and `df` is left
# exactly as loaded. The previous in-place version dropped columns from `df`
# itself, so running this cell a second time raised KeyError (the columns were
# already gone). This version can be re-run any number of times.
COLUMNS_TO_DROP = ["Date", "Location", "RainTomorrow"]
CATEGORICAL_COLUMNS = ["WindGustDir", "WindDir9am", "WindDir3pm", "RainToday"]

# NOTE(review): "RainToday" is presumably a same-day rain/no-rain flag derived
# from the "Rainfall" amount; if so it leaks the target into the features.
# Confirm against the dataset description before trusting the scores.
df_model = (
    df.drop(columns=COLUMNS_TO_DROP)  # remove unnecessary columns
    .dropna()  # discard every row that has a missing value
    .pipe(pd.get_dummies, columns=CATEGORICAL_COLUMNS)  # one-hot encode categoricals
)

# Fail early with a clear message instead of a less obvious error from the split below
assert not df_model.empty, "No complete rows left after dropping missing values"

# Define input (independent variables) and output (dependent variable)
X = df_model.drop(columns=["Rainfall"])
y = df_model["Rainfall"]

# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data: the scaler is fitted on the training split only and
# then applied unchanged to the test split, so no test statistics are used
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Model training and evaluation
# Fixed seed for the estimators that use randomness internally (MLP weight
# initialisation, random-forest bootstrapping); without it the reported metrics
# change on every run.
MODEL_SEED = 42

models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "MLP Regressor": MLPRegressor(max_iter=10000, random_state=MODEL_SEED),
    "KNeighbors Regressor": KNeighborsRegressor(),
    "Random Forest Regressor": RandomForestRegressor(random_state=MODEL_SEED),
    "SVR Linear Kernel": SVR(kernel="linear"),
    "SVR RBF Kernel": SVR(kernel="rbf"),
}

# One record per model, so the metrics can be compared as a table afterwards
# instead of only being printed.
results = []

for name, model in models.items():
    # Train the model
    model.fit(X_train_scaled, y_train)

    # Make predictions on the data the model was trained on
    y_pred_train = model.predict(X_train_scaled)

    # Evaluate model performance on training set
    mae_train = mean_absolute_error(y_train, y_pred_train)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))

    # Cross-validation RMSE. The scorer returns negated MSE per fold, hence the
    # sign flip before the square root. The folds are independent, so they are
    # run in parallel on all cores (n_jobs=-1); the scores are unaffected.
    # NOTE: X_train_scaled was standardized with statistics of the whole
    # training split, so each validation fold has influenced the scaler; wrap
    # scaler + model in a Pipeline if strictly leak-free CV estimates are needed.
    cv_scores = cross_val_score(
        model,
        X_train_scaled,
        y_train,
        scoring="neg_mean_squared_error",
        cv=10,
        n_jobs=-1,
    )
    cv_rmse_scores = np.sqrt(-cv_scores)
    mean_cv_rmse = cv_rmse_scores.mean()

    results.append(
        {
            "Model": name,
            "Training MAE": mae_train,
            "Training RMSE": rmse_train,
            "Mean CV RMSE": mean_cv_rmse,
        }
    )

    # Print results
    print(f"Model: {name}")
    print(f"Training MAE: {mae_train:.2f}")
    print(f"Training RMSE: {rmse_train:.2f}")
    print(f"Mean Cross-validation RMSE: {mean_cv_rmse:.2f}")
    print("=" * 40)

# Summary table, best (lowest) cross-validated RMSE first
results_df = pd.DataFrame(results).set_index("Model").sort_values("Mean CV RMSE")
results_df

Model: Linear Regression
Training MAE: 2.67
Training RMSE: 6.35
Mean Cross-validation RMSE: 6.21
Model: Ridge Regression
Training MAE: 2.67
Training RMSE: 6.35
Mean Cross-validation RMSE: 6.21


In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score


def calculate_accuracy(y_true, y_pred, tolerance):
    """Return the fraction of predictions within ``tolerance`` of the true value.

    Parameters
    ----------
    y_true : array-like
        Observed target values (list, NumPy array, pandas Series, ...).
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.
    tolerance : float
        Largest absolute error that still counts as a correct prediction
        (the comparison is inclusive).

    Returns
    -------
    numpy.floating
        Share of samples with ``|y_true - y_pred| <= tolerance``, between 0 and 1.
        Empty inputs yield ``nan`` (NumPy's mean of an empty array).

    Raises
    ------
    ValueError
        If the inputs cannot be converted to floats or have incompatible sizes.
    """
    # Work on flat float arrays so values are compared strictly by position:
    # - two pandas Series would otherwise be aligned on their index labels,
    #   turning mismatched labels into NaN (counted as misses);
    # - an (n, 1) prediction column would otherwise broadcast against (n,)
    #   targets into an (n, n) matrix;
    # - plain Python lists do not support subtraction at all.
    true_values = np.asarray(y_true, dtype=float).ravel()
    predicted_values = np.asarray(y_pred, dtype=float).ravel()

    within_tolerance = np.abs(true_values - predicted_values) <= tolerance
    return np.mean(within_tolerance)


# `grid_search` is not created by any earlier cell visible in this notebook, so
# on a fresh kernel the lookup below failed with NameError. If no search object
# exists yet, run a small hyper-parameter search here: Ridge is tuned because it
# tied for the lowest cross-validated RMSE in the recorded comparison output.
# An already existing `grid_search` is reused unchanged.
if "grid_search" not in globals():
    grid_search = GridSearchCV(
        Ridge(),
        param_grid={"alpha": [0.01, 0.1, 1.0, 10.0, 100.0]},
        scoring="neg_mean_squared_error",
        cv=10,
    )
    grid_search.fit(X_train_scaled, y_train)
    print(f"Best parameters: {grid_search.best_params_}")

# Evaluate the refitted best estimator on the held-out test split
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test_scaled)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
test_r2 = r2_score(y_test, y_pred_test)
print(f"Test RMSE with best model: {test_rmse:.2f}")
print(f"Test R² with best model: {test_r2:.2f}")


# Share of test predictions whose absolute error is at most `tolerance`
# (same units as the "Rainfall" target).
# NOTE(review): 16 is well above the ~6 RMSE shown in the recorded training
# output, so this "accuracy" will be close to 1 for almost any model; confirm
# the tolerance is intentional.
tolerance = 16
accuracy = calculate_accuracy(y_test, y_pred_test, tolerance)
print(f"Test Accuracy with tolerance {tolerance}: {accuracy:.2f}")