# Rainfall Prediction Using Machine Learning
## A Comparative Study of Regression and Classification Models

This project implements and compares multiple machine learning algorithms for rainfall prediction in Rangpur, Bangladesh using authentic meteorological data.

**Study Area:** Rangpur, Bangladesh (25.7439°N, 89.2752°E)  
**Dataset:** 3-year historical weather data (2022-2024)  
**Records:** 1,096 daily observations  
**Data Source:** Open-Meteo Weather API  
**Models Implemented:** 6 Regression + 6 Classification algorithms  

**Objectives:**
1. Predict rainfall amount (Regression)
2. Predict rain occurrence (Classification)
3. Compare model performance
4. Visualize results comprehensively

## 1. Library Imports
Import required Python libraries for data processing, visualization, and machine learning.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

# Confirmation message so the notebook output shows the import cell finished.
print("✅ Libraries imported successfully")

## 2. Data Loading
Load the preprocessed dataset from CSV file.

In [None]:
# Study-site metadata; in this cell it is only used for the printed report.
LOCATION = "Rangpur, Bangladesh"
LATITUDE = 25.7439
LONGITUDE = 89.2752

# Load the daily weather table and convert the 'date' column to datetimes so
# that min()/max() below report the actual start and end of the record.
df = pd.read_csv('data/rangpur_daily_weather.csv')
df['date'] = pd.to_datetime(df['date'])

print(f"Location: {LOCATION}")
print(f"Coordinates: {LATITUDE}°N, {LONGITUDE}°E")
print(f"Data Period: {df['date'].min().date()} to {df['date'].max().date()}")
print(f"Total Records: {len(df)}")

## 3. Exploratory Data Analysis
Examine dataset structure, statistics, and data quality.

In [None]:
# Quick textual overview of the loaded frame: size, a sample, summary
# statistics, and a per-column count of missing values.
print("Dataset Shape:", df.shape)

for heading, table in (
    ("\nFirst 5 Rows:", df.head()),
    ("\nStatistical Summary:", df.describe()),
):
    print(heading)
    display(table)

print("\nMissing Values:")
missing = df.isnull().sum()
# Show only the columns that actually contain gaps, or a short notice if none do.
print(missing[missing > 0] if missing.sum() > 0 else "No missing values found")

In [None]:
# Visualize data overview: six panels covering temperature, rainfall amount,
# rain frequency, and the monthly / seasonal rainfall totals.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('📊 Exploratory Data Analysis - Rangpur Weather Dataset', fontsize=16, fontweight='bold', y=1.00)

# 1. Temperature distribution
mean_temp = df['mean_temperature'].mean()
axes[0, 0].hist(df['mean_temperature'], bins=30, color='#FF6B6B', edgecolor='black', alpha=0.7)
axes[0, 0].set_title('Temperature Distribution', fontweight='bold')
axes[0, 0].set_xlabel('Mean Temperature (°C)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].axvline(mean_temp, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_temp:.1f}°C')
axes[0, 0].legend()
axes[0, 0].grid(alpha=0.3)

# 2. Rainfall distribution (days with rain_sum > 0 only; computed once and reused)
rainy_amounts = df.loc[df['rain_sum'] > 0, 'rain_sum']
mean_rain = rainy_amounts.mean()
axes[0, 1].hist(rainy_amounts, bins=30, color='#4ECDC4', edgecolor='black', alpha=0.7)
axes[0, 1].set_title('Rainfall Amount Distribution (Rainy Days Only)', fontweight='bold')
axes[0, 1].set_xlabel('Rainfall (mm)')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].axvline(mean_rain, color='blue', linestyle='--', linewidth=2, label=f'Mean: {mean_rain:.1f}mm')
axes[0, 1].legend()
axes[0, 1].grid(alpha=0.3)

# 3. Rainy vs Non-Rainy Days
# value_counts() orders by frequency, so fixed labels would be swapped whenever
# rainy days outnumber dry days. Sort by class value and derive each label and
# colour from the class itself (a truthy will_rain is treated as a rainy day,
# consistent with will_rain.sum() being reported as the rainy-day count).
rain_counts = df['will_rain'].value_counts().sort_index()
colors_pie = ['#FFD93D', '#4ECDC4']
pie_labels = ['Rain' if flag else 'No Rain' for flag in rain_counts.index]
pie_colors = [colors_pie[1] if flag else colors_pie[0] for flag in rain_counts.index]
axes[0, 2].pie(rain_counts, labels=pie_labels, autopct='%1.1f%%', colors=pie_colors, startangle=90, textprops={'fontsize': 11, 'fontweight': 'bold'})
axes[0, 2].set_title(f'Rainy vs Non-Rainy Days\n(Total: {len(df)} days)', fontweight='bold')

# 4. Monthly rainfall pattern
# Bars are positioned by the month values actually present, so a month with no
# rows no longer causes a length mismatch against a fixed range(1, 13).
monthly_rain = df.groupby('month')['rain_sum'].sum()
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
axes[1, 0].bar(monthly_rain.index, monthly_rain.values, color='#95E1D3', edgecolor='black', alpha=0.8)
axes[1, 0].set_title('Monthly Total Rainfall', fontweight='bold')
axes[1, 0].set_xlabel('Month')
axes[1, 0].set_ylabel('Total Rainfall (mm)')
axes[1, 0].set_xticks(range(1, 13))
axes[1, 0].set_xticklabels(months, rotation=45)
axes[1, 0].grid(alpha=0.3, axis='y')

# 5. Seasonal rainfall, wettest season first
season_rain = df.groupby('season')['rain_sum'].sum().sort_values(ascending=False)
colors_bar = ['#F38181', '#AA96DA', '#FCBAD3', '#FFFFD2']
axes[1, 1].barh(season_rain.index, season_rain.values, color=colors_bar, edgecolor='black', alpha=0.8)
axes[1, 1].set_title('Seasonal Total Rainfall', fontweight='bold')
axes[1, 1].set_xlabel('Total Rainfall (mm)')
axes[1, 1].grid(alpha=0.3, axis='x')
for i, v in enumerate(season_rain.values):
    # Value label placed just past the end of each bar.
    axes[1, 1].text(v + 50, i, f'{v:.0f}mm', va='center', fontweight='bold')

# 6. Temperature vs Rainfall scatter (point colour also encodes rainfall amount)
axes[1, 2].scatter(df['mean_temperature'], df['rain_sum'], alpha=0.4, c=df['rain_sum'], cmap='Blues', s=30, edgecolor='black', linewidth=0.5)
axes[1, 2].set_title('Temperature vs Rainfall Relationship', fontweight='bold')
axes[1, 2].set_xlabel('Mean Temperature (°C)')
axes[1, 2].set_ylabel('Rainfall (mm)')
axes[1, 2].grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("\n" + "="*60)
print("📈 EDA Visualization Complete")
print("="*60)

## 4. Data Preprocessing
Handle missing values and prepare dataset for analysis.

In [None]:
# Per-column missing counts before any cleaning; `missing_cols` and
# `missing_before` are kept so they can be reported or plotted afterwards.
missing_cols = df.isnull().sum()
missing_before = missing_cols.sum()
print(f"Missing values before: {missing_before}")
print("\nMissing values by column:")
print(missing_cols[missing_cols > 0])

# Handle missing values using multiple strategies
# 1. Drop rows lacking the regression target (rain_sum) or either of the
#    lagged-rainfall features. .copy() gives an independent frame so the
#    column assignments below never write into a view of the original.
df = df.dropna(subset=['previous_day_rainfall', 'previous_week_rainfall', 'rain_sum']).copy()

# 2. Fill remaining gaps in numerical columns with the column median.
#    Assign the result back instead of calling fillna(inplace=True) on df[col]:
#    the chained in-place form is deprecated and does not modify df under
#    pandas copy-on-write.
for col in df.select_dtypes(include=[np.number]).columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

missing_after = df.isnull().sum().sum()
print(f"\nMissing values after: {missing_after}")
print(f"Rows remaining: {len(df)}")

print(f"\nRainfall Statistics:")
print(f"Rainy days: {df['will_rain'].sum()} ({df['will_rain'].sum()/len(df)*100:.1f}%)")
print(f"Total rainfall: {df['rain_sum'].sum():.2f} mm")
print(f"Average daily: {df['rain_sum'].mean():.2f} mm")
print(f"Maximum daily: {df['rain_sum'].max():.2f} mm")

print(f"\nRainy days by season:")
print(df.groupby('season')['will_rain'].sum().sort_values(ascending=False))

In [None]:
# Visualize missing values and preprocessing results.
# Relies on `missing_cols` and `missing_before` computed before cleaning.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('🔧 Data Preprocessing Visualization', fontsize=14, fontweight='bold')

# 1. Missing values by column (before preprocessing)
missing_data = missing_cols[missing_cols > 0].sort_values(ascending=False)
if missing_data.empty:
    # Nothing was missing: say so explicitly instead of showing a blank chart.
    axes[0].text(0.5, 0.5, 'No missing values', ha='center', va='center',
                 transform=axes[0].transAxes, fontweight='bold')
    axes[0].set_yticks([])
else:
    axes[0].barh(range(len(missing_data)), missing_data.values, color='#FF6B6B', edgecolor='black', alpha=0.8)
    axes[0].set_yticks(range(len(missing_data)))
    axes[0].set_yticklabels(missing_data.index)
    for i, v in enumerate(missing_data.values):
        # Count label just past the end of each bar.
        axes[0].text(v + 1, i, str(v), va='center', fontweight='bold')
axes[0].set_xlabel('Number of Missing Values')
axes[0].set_title(f'Missing Values by Column\n(Total: {missing_before})', fontweight='bold')
axes[0].grid(alpha=0.3, axis='x')

# 2. Rainfall statistics: composition of the cleaned dataset
categories = ['Total\nDays', 'Rainy\nDays', 'Non-Rainy\nDays']
values = [len(df), df['will_rain'].sum(), len(df) - df['will_rain'].sum()]
colors = ['#95E1D3', '#4ECDC4', '#FFD93D']
bars = axes[1].bar(categories, values, color=colors, edgecolor='black', alpha=0.8, width=0.6)
axes[1].set_ylabel('Number of Days')
axes[1].set_title('Dataset Composition After Preprocessing', fontweight='bold')
axes[1].grid(alpha=0.3, axis='y')
for bar, val in zip(bars, values):
    height = bar.get_height()
    # Annotate each bar with its count and its share of all remaining days.
    axes[1].text(bar.get_x() + bar.get_width()/2., height + 15, f'{val}\n({val/len(df)*100:.1f}%)', 
                ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("\n" + "="*60)
print("🔧 Preprocessing Visualization Complete")
print("="*60)

## 5. Feature Engineering
Create derived features to enhance model performance.

In [None]:
# Derived features. Columns are created in a fixed order because the feature
# matrix built later inherits the column order of df.
df['temp_range'] = df['max_temperature'].sub(df['min_temperature'])
df['wind_variability'] = df['max_wind_gust'].sub(df['max_wind_speed'])

# One binary indicator per season of interest.
# NOTE(review): 'is_summer' is keyed on the label 'Spring' — confirm against the
# season labels actually present in the dataset.
for flag_name, season_label in (
    ('is_monsoon', 'Monsoon'),
    ('is_summer', 'Spring'),
    ('is_winter', 'Winter'),
):
    df[flag_name] = df['season'].eq(season_label).astype(int)

# Cyclical encoding of the month on a 12-step circle.
month_angle = 2 * np.pi * df['month'] / 12
df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

# 3600 converts seconds to hours — presumably sunshine_duration is in seconds; TODO confirm.
df['sunshine_hours'] = df['sunshine_duration'].div(3600)

# Add seeded Gaussian jitter (0.5% of each column's standard deviation) to a
# handful of raw measurements.
# NOTE(review): this perturbs the recorded observations before modelling —
# confirm the jitter is intentional.
np.random.seed(42)
jitter_columns = ['max_temperature', 'min_temperature', 'mean_temperature',
                  'max_wind_speed', 'evapotranspiration']
for col in jitter_columns:
    df[col] += np.random.normal(0, df[col].std() * 0.005, len(df))

# Discard any row that still contains a missing value in any column.
df = df.dropna()

print(f"Total features: {df.shape[1]}")
print(f"Total rows: {len(df)}")

In [None]:
# Visualize feature distributions: one histogram per derived feature, with the
# feature mean marked by a dashed line.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('🔬 Feature Engineering - Derived Features Distribution', fontsize=16, fontweight='bold', y=1.00)

features_to_plot = ['temp_range', 'wind_variability', 'sunshine_hours', 'month_sin', 'month_cos', 'is_monsoon']
colors = ['#FF6B6B', '#4ECDC4', '#95E1D3', '#FFD93D', '#AA96DA', '#F38181']

for ax, feature, color in zip(axes.flat, features_to_plot, colors):
    feature_mean = df[feature].mean()
    ax.hist(df[feature], bins=30, color=color, edgecolor='black', alpha=0.7)
    ax.set_title(f'{feature.replace("_", " ").title()}', fontweight='bold')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.axvline(feature_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {feature_mean:.2f}')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("\n" + "="*60)
print("🔬 Feature Engineering Visualization Complete")
print("="*60)

## 6. Data Splitting and Normalization
Split dataset into training and testing sets, then apply feature scaling.

In [None]:
# Columns excluded from the feature matrix: identifiers/raw fields listed in
# drop_cols plus both targets and precipitation_sum. errors='ignore' lets the
# drop succeed even if some of these columns are absent from df.
drop_cols = ['date', 'season', 'sunshine_duration', 'weathercode', 'location', 
             'previous_day_rainfall']
X = df.drop(columns=drop_cols + ['rain_sum', 'will_rain', 'precipitation_sum'], errors='ignore')
y_regression = df['rain_sum']
y_classification = df['will_rain']

print(f"Feature count: {X.shape[1]}")
print(f"Sample size: {X.shape[0]}")

# Split the features and BOTH targets in a single call so the regression and
# classification targets are guaranteed to share the same rows as X_train /
# X_test, rather than relying on two separate calls using the same random_state.
(X_train, X_test,
 y_reg_train, y_reg_test,
 y_clf_train, y_clf_test) = train_test_split(
    X, y_regression, y_classification, test_size=0.2, random_state=42)

print(f"\nTraining set: {len(X_train)} samples")
print(f"Testing set: {len(X_test)} samples")

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✅ Data preparation complete")

In [None]:
# Visualize train-test split: overall split ratio and the rain / no-rain
# balance inside each partition.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('📊 Train-Test Split Visualization', fontsize=14, fontweight='bold')

# 1. Split ratio
split_data = [len(X_train), len(X_test)]
labels = [f'Training Set\n({len(X_train)} samples)', f'Test Set\n({len(X_test)} samples)']
colors = ['#4ECDC4', '#FF6B6B']
explode = (0.05, 0.05)
axes[0].pie(split_data, labels=labels, autopct='%1.1f%%', colors=colors, explode=explode, 
           startangle=90, textprops={'fontsize': 11, 'fontweight': 'bold'})
axes[0].set_title(f'Data Split Ratio\n(Total: {len(X_train) + len(X_test)} samples)', fontweight='bold')

# 2. Target distribution in train vs test
# The sum of the classification target is used as the rainy-day count.
train_rain = y_clf_train.sum()
train_no_rain = len(y_clf_train) - train_rain
test_rain = y_clf_test.sum()
test_no_rain = len(y_clf_test) - test_rain

x = np.arange(2)
width = 0.35
bars1 = axes[1].bar(x - width/2, [train_rain, test_rain], width, label='Rain', color='#4ECDC4', edgecolor='black', alpha=0.8)
bars2 = axes[1].bar(x + width/2, [train_no_rain, test_no_rain], width, label='No Rain', color='#FFD93D', edgecolor='black', alpha=0.8)

axes[1].set_ylabel('Number of Samples')
axes[1].set_title('Target Distribution: Train vs Test', fontweight='bold')
axes[1].set_xticks(x)
axes[1].set_xticklabels(['Training Set', 'Test Set'])
axes[1].legend()
axes[1].grid(alpha=0.3, axis='y')

# Write the sample count above every bar.
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        axes[1].text(bar.get_x() + bar.get_width()/2., height + 5, f'{int(height)}', 
                    ha='center', va='bottom', fontweight='bold', fontsize=9)

plt.tight_layout()
plt.show()

print("\n" + "="*60)
print("📊 Train-Test Split Visualization Complete")
print("="*60)

## 7. Regression Model Training
Train multiple regression models to predict continuous rainfall values.

In [None]:
# One metrics dict (Model, RMSE, MAE, R2_Score) is appended per regressor.
regression_results = []

# Candidate regressors, all trained on the same scaled training features.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=10.0),
    'Decision Tree': DecisionTreeRegressor(max_depth=8, min_samples_split=10, random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_split=5, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=5, learning_rate=0.05, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.05, random_state=42, verbosity=0)
}

print("Training Regression Models...\n")

for name, model in models.items():
    model.fit(X_train_scaled, y_reg_train)
    y_pred = model.predict(X_test_scaled)
    
    # All metrics are computed on the held-out test split.
    rmse = np.sqrt(mean_squared_error(y_reg_test, y_pred))
    mae = mean_absolute_error(y_reg_test, y_pred)
    r2 = r2_score(y_reg_test, y_pred)
    
    regression_results.append({
        'Model': name,
        'RMSE': rmse,
        'MAE': mae,
        'R2_Score': r2
    })
    
    print(f"{name:20} | R²: {r2:.3f} | RMSE: {rmse:.3f}")

print(f"\n✅ {len(models)} regression models trained successfully")

In [None]:
# Visualize regression results: one horizontal bar chart per metric, with the
# models in the order they were trained.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('📈 Regression Models Performance', fontsize=16, fontweight='bold', y=1.00)

model_names = [result['Model'] for result in regression_results]
colors_grad = plt.cm.viridis(np.linspace(0.3, 0.9, len(model_names)))

# 1. R² Score comparison
r2_scores = [result['R2_Score'] for result in regression_results]
axes[0].barh(model_names, r2_scores, color=colors_grad, edgecolor='black', alpha=0.8)
axes[0].set_xlabel('R² Score')
axes[0].set_title('R² Score by Model', fontweight='bold')
axes[0].grid(alpha=0.3, axis='x')
for i, score in enumerate(r2_scores):
    axes[0].text(score + 0.01, i, f'{score:.3f}', va='center', fontweight='bold')

# 2. RMSE comparison
rmse_scores = [result['RMSE'] for result in regression_results]
axes[1].barh(model_names, rmse_scores, color=colors_grad, edgecolor='black', alpha=0.8)
axes[1].set_xlabel('RMSE (mm)')
axes[1].set_title('RMSE by Model (Lower is Better)', fontweight='bold')
axes[1].grid(alpha=0.3, axis='x')
for i, score in enumerate(rmse_scores):
    axes[1].text(score + 0.2, i, f'{score:.2f}', va='center', fontweight='bold')

# 3. MAE comparison
mae_scores = [result['MAE'] for result in regression_results]
axes[2].barh(model_names, mae_scores, color=colors_grad, edgecolor='black', alpha=0.8)
axes[2].set_xlabel('MAE (mm)')
axes[2].set_title('MAE by Model (Lower is Better)', fontweight='bold')
axes[2].grid(alpha=0.3, axis='x')
for i, score in enumerate(mae_scores):
    axes[2].text(score + 0.1, i, f'{score:.2f}', va='center', fontweight='bold')

plt.tight_layout()
plt.show()

print("\n" + "="*60)
print("📈 Regression Visualization Complete")
print("="*60)

## 8. Classification Model Training
Train multiple classification models to predict rain occurrence (binary outcome).

In [None]:
# One metrics dict (Model, Accuracy, Precision, Recall, F1_Score) is appended
# per classifier.
classification_results = []

# Candidate classifiers. After the loop below, every estimator in this dict
# has been fitted on the scaled training features.
models = {
    'Logistic Regression': LogisticRegression(max_iter=500, C=0.5, random_state=42),
    'Decision Tree': DecisionTreeClassifier(max_depth=6, min_samples_split=15, min_samples_leaf=8, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=80, max_depth=8, min_samples_split=12, min_samples_leaf=6, max_features='sqrt', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=60, max_depth=4, learning_rate=0.08, subsample=0.8, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=60, max_depth=5, learning_rate=0.08, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1, random_state=42, verbosity=0),
    'Naive Bayes': GaussianNB()
}

print("Training Classification Models...\n")

for name, model in models.items():
    model.fit(X_train_scaled, y_clf_train)
    y_pred = model.predict(X_test_scaled)
    
    # Metrics on the held-out test split; zero_division=0 returns 0 instead of
    # raising a warning when a metric's denominator is zero.
    acc = accuracy_score(y_clf_test, y_pred)
    precision = precision_score(y_clf_test, y_pred, zero_division=0)
    recall = recall_score(y_clf_test, y_pred, zero_division=0)
    f1 = f1_score(y_clf_test, y_pred, zero_division=0)
    
    classification_results.append({
        'Model': name,
        'Accuracy': acc,
        'Precision': precision,
        'Recall': recall,
        'F1_Score': f1
    })
    
    print(f"{name:20} | Accuracy: {acc:.3f} | F1: {f1:.3f}")

print(f"\n✅ {len(models)} classification models trained successfully")

In [None]:
# Visualize classification results: one horizontal bar chart per metric.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('🎯 Classification Models Performance', fontsize=16, fontweight='bold', y=0.995)

clf_model_names = [result['Model'] for result in classification_results]
colors_clf = plt.cm.plasma(np.linspace(0.2, 0.9, len(clf_model_names)))

# (axis, key in classification_results, x-axis label, panel title)
metric_panels = [
    (axes[0, 0], 'Accuracy', 'Accuracy (%)', 'Test Accuracy by Model'),
    (axes[0, 1], 'F1_Score', 'F1-Score (%)', 'F1-Score by Model'),
    (axes[1, 0], 'Precision', 'Precision (%)', 'Precision by Model'),
    (axes[1, 1], 'Recall', 'Recall (%)', 'Recall by Model'),
]

for ax, metric_key, xlabel, title in metric_panels:
    # The stored metrics are fractions in [0, 1]; the axis labels and the value
    # annotations are percentages, so convert before plotting.
    pct_scores = [result[metric_key] * 100 for result in classification_results]
    ax.barh(clf_model_names, pct_scores, color=colors_clf, edgecolor='black', alpha=0.8)
    ax.set_xlabel(xlabel)
    ax.set_title(title, fontweight='bold')
    # Leave room to the right of a 100% bar for its text label.
    ax.set_xlim(0, 110)
    ax.grid(alpha=0.3, axis='x')
    for i, score in enumerate(pct_scores):
        ax.text(score + 0.5, i, f'{score:.1f}%', va='center', fontweight='bold')

plt.tight_layout()
plt.show()

print("\n" + "="*60)
print("🎯 Classification Visualization Complete")
print("="*60)

## 9. Regression Results Analysis
Tabular comparison of regression model performance metrics.

In [None]:
# Rank the regressors by R² (highest first); row 0 is then the best model.
df_reg = pd.DataFrame(regression_results).sort_values('R2_Score', ascending=False)
display(df_reg)

best = df_reg.iloc[0]
print(f"\n🏆 Best Model: {best['Model']}")
print(f"R² Score: {best['R2_Score']:.4f}")
print(f"RMSE: {best['RMSE']:.4f} mm")

## 10. Classification Results Analysis
Tabular comparison of classification model performance metrics.

In [None]:
# Rank the classifiers by test accuracy (highest first); row 0 is then
# reported as the best model.
df_clf = pd.DataFrame(classification_results).sort_values('Accuracy', ascending=False)
display(df_clf)

best = df_clf.iloc[0]
print(f"\n🏆 Best Model: {best['Model']}")
print(f"Accuracy: {best['Accuracy']:.4f}")
print(f"F1-Score: {best['F1_Score']:.4f}")

## 11. Model Performance Visualization
Comprehensive visual comparison of all regression and classification models.

In [None]:
# Six-panel dashboard: three regression metrics followed by three
# classification views, using the ranked tables df_reg and df_clf.
fig = plt.figure(figsize=(18, 12))

# Axis limits are derived from the data so that a model scoring below the
# usual range (negative R², or a classification score under 0.85) is still
# visible instead of being clipped off the plot.
r2_floor = min(0.0, df_reg['R2_Score'].min() - 0.05)
acc_floor = max(0.0, min(0.85, df_clf['Accuracy'].min() - 0.05))
f1_floor = max(0.0, min(0.85, df_clf['F1_Score'].min() - 0.05))
pr_floor = max(0.0, min(0.85, df_clf[['Precision', 'Recall']].min().min() - 0.05))

# Regression Models - R² Score
ax1 = plt.subplot(3, 2, 1)
bars1 = ax1.barh(df_reg['Model'], df_reg['R2_Score'], color='steelblue', edgecolor='navy', alpha=0.7)
ax1.set_xlabel('R² Score', fontsize=11, fontweight='bold')
ax1.set_title('Regression: R² Score Comparison', fontsize=12, fontweight='bold')
ax1.grid(axis='x', alpha=0.3, linestyle='--')
ax1.set_xlim(r2_floor, 1)
for bar in bars1:
    bar_len = bar.get_width()
    ax1.text(bar_len + 0.02, bar.get_y() + bar.get_height()/2, f'{bar_len:.3f}', 
             ha='left', va='center', fontsize=9, fontweight='bold')

# Regression Models - RMSE
ax2 = plt.subplot(3, 2, 2)
bars2 = ax2.barh(df_reg['Model'], df_reg['RMSE'], color='coral', edgecolor='darkred', alpha=0.7)
ax2.set_xlabel('RMSE (mm)', fontsize=11, fontweight='bold')
ax2.set_title('Regression: RMSE Comparison (Lower is Better)', fontsize=12, fontweight='bold')
ax2.grid(axis='x', alpha=0.3, linestyle='--')
for bar in bars2:
    bar_len = bar.get_width()
    ax2.text(bar_len + 0.3, bar.get_y() + bar.get_height()/2, f'{bar_len:.2f}', 
             ha='left', va='center', fontsize=9, fontweight='bold')

# Regression Models - MAE
ax3 = plt.subplot(3, 2, 3)
bars3 = ax3.barh(df_reg['Model'], df_reg['MAE'], color='lightgreen', edgecolor='darkgreen', alpha=0.7)
ax3.set_xlabel('MAE (mm)', fontsize=11, fontweight='bold')
ax3.set_title('Regression: Mean Absolute Error', fontsize=12, fontweight='bold')
ax3.grid(axis='x', alpha=0.3, linestyle='--')
for bar in bars3:
    bar_len = bar.get_width()
    ax3.text(bar_len + 0.2, bar.get_y() + bar.get_height()/2, f'{bar_len:.2f}', 
             ha='left', va='center', fontsize=9, fontweight='bold')

# Classification Models - Accuracy
ax4 = plt.subplot(3, 2, 4)
bars4 = ax4.barh(df_clf['Model'], df_clf['Accuracy'], color='mediumpurple', edgecolor='indigo', alpha=0.7)
ax4.set_xlabel('Accuracy', fontsize=11, fontweight='bold')
ax4.set_title('Classification: Accuracy Comparison', fontsize=12, fontweight='bold')
ax4.grid(axis='x', alpha=0.3, linestyle='--')
ax4.set_xlim(acc_floor, 1.01)
for bar in bars4:
    bar_len = bar.get_width()
    ax4.text(bar_len + 0.003, bar.get_y() + bar.get_height()/2, f'{bar_len:.3f}', 
             ha='left', va='center', fontsize=9, fontweight='bold')

# Classification Models - F1 Score
ax5 = plt.subplot(3, 2, 5)
bars5 = ax5.barh(df_clf['Model'], df_clf['F1_Score'], color='gold', edgecolor='orange', alpha=0.7)
ax5.set_xlabel('F1-Score', fontsize=11, fontweight='bold')
ax5.set_title('Classification: F1-Score Comparison', fontsize=12, fontweight='bold')
ax5.grid(axis='x', alpha=0.3, linestyle='--')
ax5.set_xlim(f1_floor, 1.01)
for bar in bars5:
    bar_len = bar.get_width()
    ax5.text(bar_len + 0.003, bar.get_y() + bar.get_height()/2, f'{bar_len:.3f}', 
             ha='left', va='center', fontsize=9, fontweight='bold')

# Classification Models - Precision vs Recall (grouped bars per model)
ax6 = plt.subplot(3, 2, 6)
x_pos = np.arange(len(df_clf['Model']))
width = 0.35
bars6a = ax6.bar(x_pos - width/2, df_clf['Precision'], width, label='Precision', 
                  color='skyblue', edgecolor='blue', alpha=0.7)
bars6b = ax6.bar(x_pos + width/2, df_clf['Recall'], width, label='Recall', 
                  color='salmon', edgecolor='red', alpha=0.7)
ax6.set_xlabel('Models', fontsize=11, fontweight='bold')
ax6.set_ylabel('Score', fontsize=11, fontweight='bold')
ax6.set_title('Classification: Precision vs Recall', fontsize=12, fontweight='bold')
ax6.set_xticks(x_pos)
ax6.set_xticklabels(df_clf['Model'], rotation=45, ha='right', fontsize=9)
ax6.legend(loc='lower right', fontsize=10)
ax6.grid(axis='y', alpha=0.3, linestyle='--')
ax6.set_ylim(pr_floor, 1.05)

plt.tight_layout()
plt.show()

print("📊 All model performance metrics visualized successfully!")

## 12. Confusion Matrix Analysis
Detailed evaluation of the best classification model using confusion matrix.

In [None]:
# Evaluate whichever classifier ranked first in df_clf, instead of assuming it
# is the Random Forest. `models` is the classifier dict from the classification
# training cell; its estimators are already fitted on X_train_scaled, so no
# retraining is needed here.
# NOTE(review): if the regression training cell is re-run after the
# classification cell, `models` will hold regressors — re-run the
# classification training cell before this one.
best_clf_name = df_clf.iloc[0]['Model']
best_clf_model = models[best_clf_name]
y_pred = best_clf_model.predict(X_test_scaled)

cm = confusion_matrix(y_clf_test, y_pred)

class_names = ['No Rain', 'Rain']
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title(f'Confusion Matrix - {best_clf_name}')
plt.show()

print("\nClassification Report:")
print(classification_report(y_clf_test, y_pred, target_names=class_names))

## 13. Conclusion and Summary
Final overview of the project results and best performing models.

In [None]:
# Final report: dataset facts followed by the top-ranked model of each task
# (row 0 of the sorted result tables df_reg and df_clf).
print("="*70)
print("PROJECT SUMMARY: RAINFALL PREDICTION USING MACHINE LEARNING")
print("="*70)

print(f"\n📍 Study Area: {LOCATION}")
print(f"📅 Data Period: {df['date'].min().date()} to {df['date'].max().date()}")
print(f"📊 Total Records: {len(df)} daily observations")
print(f"🔢 Features Used: {X.shape[1]}")
print(f"🌧️ Rainy Days: {df['will_rain'].sum()} ({df['will_rain'].sum()/len(df)*100:.1f}%)")

print(f"\n{'REGRESSION TASK (Rainfall Amount Prediction)':^70}")
print(f"{'-'*70}")
print(f"🏆 Best Model: {df_reg.iloc[0]['Model']}")
print(f"   R² Score: {df_reg.iloc[0]['R2_Score']:.4f}")
print(f"   RMSE: {df_reg.iloc[0]['RMSE']:.2f} mm")
print(f"   MAE: {df_reg.iloc[0]['MAE']:.2f} mm")

print(f"\n{'CLASSIFICATION TASK (Rain Occurrence Prediction)':^70}")
print(f"{'-'*70}")
print(f"🏆 Best Model: {df_clf.iloc[0]['Model']}")
print(f"   Accuracy: {df_clf.iloc[0]['Accuracy']:.4f} ({df_clf.iloc[0]['Accuracy']*100:.2f}%)")
print(f"   Precision: {df_clf.iloc[0]['Precision']:.4f}")
print(f"   Recall: {df_clf.iloc[0]['Recall']:.4f}")
print(f"   F1-Score: {df_clf.iloc[0]['F1_Score']:.4f}")

print("\n" + "="*70)
print("✅ Project completed successfully")
print("✅ All models evaluated on real-world weather data")
print("="*70)