In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [4]:
# Notebook-wide plot appearance: seaborn white-grid theme and a wide default figure
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))

In [7]:
# Load the cleaned daily weather table.
# index_col=0: the CSV was written together with its row index, which otherwise
# comes back as a stray 'Unnamed: 0' data column (visible in the preview below).
df = pd.read_csv(r'../data/cleaned/weather_data_cleaned.csv', index_col=0)
df.head()

Unnamed: 0,datetime,tempmax,tempmin,temp_avg,humidity,rainfall,wind_speed,pressure,cloudcover,dew,...,temp_range,gdd_base10,gdd_cumsum,is_rainy_day,is_heavy_rain,is_hot_day,has_rain,has_partly_cloudy,has_overcast,has_clear
0,2014-01-08,33.8,22.3,27.2,77.1,0.1,10.8,1009.7,49.5,22.1,...,11.5,18.05,138.7,0,0,1,1,1,0,0
1,2014-01-09,33.3,24.0,27.8,77.2,0.5,10.8,1009.8,53.8,22.7,...,9.3,18.65,157.35,0,0,1,1,1,0,0
2,2014-01-10,33.8,23.6,27.7,76.3,0.1,11.2,1010.5,54.2,22.4,...,10.2,18.7,176.05,0,0,1,1,1,0,0
3,2014-01-11,34.6,23.7,27.9,75.1,0.0,11.2,1011.0,52.1,21.9,...,10.9,19.15,195.2,0,0,1,0,1,0,0
4,2014-01-12,33.0,24.3,27.8,77.4,0.9,10.8,1012.1,67.2,22.9,...,8.7,18.65,213.85,0,0,0,1,1,0,0


In [8]:
# Convert the 'datetime' column from text to real timestamps so it can be
# used for min/max and as a time axis, then report the size and span of the data.
df['datetime'] = df['datetime'].pipe(pd.to_datetime)

print(f"✅ Loaded {len(df)} records")
print(f"   Date range: {df['datetime'].min()} to {df['datetime'].max()}")

✅ Loaded 4346 records
   Date range: 2014-01-08 00:00:00 to 2025-12-01 00:00:00


# Plot Distributions

In [26]:
# Histogram grid: one panel per weather variable, each with a dashed line at its mean.
# Spec columns: (data column, bar colour, panel title, x-axis label, unit suffix for the mean label)
hist_specs = [
    ('temp_avg', 'orange', 'Average Temperature Distribution', 'Temperature (°C)', '°C'),
    ('rainfall', 'blue', 'Rainfall Distribution', 'Rainfall (mm)', 'mm'),
    ('humidity', 'cyan', 'Humidity Distribution', 'Humidity (%)', '%'),
    ('wind_speed', 'green', 'Wind Speed Distribution', 'Wind Speed (km/h)', ' km/h'),
    ('gdd_base10', 'brown', 'Growing Degree Days (GDD) Distribution', 'GDD (base 10°C)', ''),
    ('cloudcover', 'gray', 'Cloud Cover Distribution', 'Cloud Cover (%)', '%'),
]

fig, axes = plt.subplots(2, 3, figsize=(16, 8))
fig.suptitle('Distribution of Weather Variables (Aba, 2014-2025)', fontsize=16, fontweight='bold')

# axes.flat walks the 2x3 grid row by row, matching the order of hist_specs.
for ax, (col, colour, title, xlabel, unit) in zip(axes.flat, hist_specs):
    mean_val = df[col].mean()
    ax.hist(df[col], bins=50, color=colour, edgecolor='black', alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.axvline(mean_val, color='red', linestyle='--', label=f'Mean: {mean_val:.1f}{unit}')
    ax.legend()

# Write the figure to disk and close it so it is not also rendered inline.
plt.tight_layout()
plt.savefig('../outputs/eda_distributions.png', dpi=300, bbox_inches='tight')
print("   ✅ Saved: outputs/eda_distributions.png")
plt.close()

   ✅ Saved: outputs/eda_distributions.png


## Plotting Rainfall Analysis

In [29]:
# Four-panel rainfall summary: rainy/dry split, monthly mean, intensity classes, annual totals.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Rainfall Analysis (Aba, 2014-2025)', fontsize=16, fontweight='bold')

# 1. Rainy days vs dry days (a day counts as rainy only above 1 mm)
rainy_days = (df['rainfall'] > 1.0).sum()
dry_days = (df['rainfall'] <= 1.0).sum()

axes[0, 0].bar(['Rainy Days\n(>1mm)', 'Dry Days\n(≤1mm)'], [rainy_days, dry_days], color=['blue', 'orange'])
axes[0, 0].set_title('Rainy vs Dry Days')
axes[0, 0].set_ylabel('Number of Days')
# Annotate each bar with its count and its share of all records.
for i, v in enumerate([rainy_days, dry_days]):
    axes[0, 0].text(i, v + 50, f'{v}\n({v/len(df)*100:.1f}%)', ha='center', fontweight='bold')

# 2. Rainfall by month
monthly_rain = df.groupby('month')['rainfall'].mean()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# NOTE(review): plotting against range(1, 13) assumes all 12 months are present in the data.
axes[0, 1].bar(range(1, 13), monthly_rain.values, color='blue', alpha=0.7)
axes[0, 1].set_title('Average Daily Rainfall by Month')
axes[0, 1].set_xlabel('Month')
axes[0, 1].set_ylabel('Average Rainfall (mm)')
axes[0, 1].set_xticks(range(1, 13))
axes[0, 1].set_xticklabels(month_names, rotation=45)
axes[0, 1].grid(axis='y', alpha=0.3)

# 3. Rainfall intensity categories
# include_lowest=True closes the first bin on the left ([0, 1] instead of (0, 1]);
# without it every day with exactly 0.0 mm is dropped as NaN and the "No rain"
# bar no longer agrees with the dry-day count of panel 1.
# The open-ended np.inf top edge keeps the bin edges strictly increasing even
# when the largest observation is 50 mm or less (max() as the edge would raise).
rainfall_categories = pd.cut(
    df['rainfall'],
    bins=[0, 1, 10, 25, 50, np.inf],
    labels=['No rain\n(0-1mm)', 'Light\n(1-10mm)', 'Moderate\n(10-25mm)',
            'Heavy\n(25-50mm)', 'Very Heavy\n(>50mm)'],
    include_lowest=True,
)
# sort_index() restores the category order (value_counts sorts by frequency).
category_counts = rainfall_categories.value_counts().sort_index()

axes[1, 0].bar(range(len(category_counts)), category_counts.values,
               color=['lightgray', 'lightblue', 'blue', 'darkblue', 'navy'])
axes[1, 0].set_title('Rainfall Intensity Distribution')
axes[1, 0].set_ylabel('Number of Days')
axes[1, 0].set_xticks(range(len(category_counts)))
axes[1, 0].set_xticklabels(category_counts.index, rotation=45)
for i, v in enumerate(category_counts.values):
    axes[1, 0].text(i, v + 20, str(v), ha='center', fontweight='bold')

# 4. Total rainfall per calendar year
# NOTE(review): the printed date range (2014-01-08 to 2025-12-01) means the first
# and last years are partial, so their totals are slightly understated.
yearly_rain = df.groupby('year')['rainfall'].sum()

axes[1, 1].plot(yearly_rain.index, yearly_rain.values, marker='o', linewidth=2, markersize=8, color='blue')
axes[1, 1].set_title('Total Annual Rainfall')
axes[1, 1].set_xlabel('Year')
axes[1, 1].set_ylabel('Total Rainfall (mm)')
axes[1, 1].grid(True, alpha=0.3)
axes[1, 1].axhline(yearly_rain.mean(), color='red', linestyle='--', label=f'Average: {yearly_rain.mean():.0f}mm')
axes[1, 1].legend()

# Write the figure to disk and close it so it is not also rendered inline.
plt.tight_layout()
plt.savefig('../outputs/eda_rainfall_analysis.png', dpi=300, bbox_inches='tight')
print("   ✅ Saved: outputs/eda_rainfall_analysis.png")
plt.close()


   ✅ Saved: outputs/eda_rainfall_analysis.png


## Time Series Patterns

In [35]:
# Three stacked time-series panels sharing the same date axis variable.
fig, axes = plt.subplots(3, 1, figsize=(15, 12))
fig.suptitle('Time Series Patterns (Aba, 2014-2025)', fontsize=16, fontweight='bold')
ax_temp, ax_rain, ax_gdd = axes

dates = df['datetime']
# 30-row rolling means; the first 29 values are NaN and are simply not drawn.
temp_smooth = df['temp_avg'].rolling(30).mean()
rain_smooth = df['rainfall'].rolling(30).mean()

# Panel 1: daily temperature (thin) under its 30-day moving average (bold)
ax_temp.plot(dates, df['temp_avg'], linewidth=0.5, alpha=0.6, color='orange')
ax_temp.plot(dates, temp_smooth, linewidth=2, color='red', label='30-day avg')
ax_temp.set_title('Temperature Over Time (with 30-day moving average)')
ax_temp.set_ylabel('Temperature (°C)')
ax_temp.legend()
ax_temp.grid(True, alpha=0.3)

# Panel 2: daily rainfall bars under the 30-day moving average
ax_rain.bar(dates, df['rainfall'], width=1, color='blue', alpha=0.5)
ax_rain.plot(dates, rain_smooth, linewidth=2, color='darkblue', label='30-day avg')
ax_rain.set_title('Daily Rainfall (with 30-day moving average)')
ax_rain.set_ylabel('Rainfall (mm)')
ax_rain.legend()
ax_rain.grid(True, alpha=0.3)

# Panel 3: the precomputed cumulative GDD column
ax_gdd.plot(dates, df['gdd_cumsum'], linewidth=1, color='brown')
ax_gdd.set_title('Cumulative Growing Degree Days (GDD)')
ax_gdd.set_ylabel('Cumulative GDD')
ax_gdd.set_xlabel('Date')
ax_gdd.grid(True, alpha=0.3)

# Write the figure to disk and close it so it is not also rendered inline.
plt.tight_layout()
plt.savefig('../outputs/eda_time_series.png', dpi=300, bbox_inches='tight')
print("   ✅ Saved: outputs/eda_time_series.png")
plt.close()

   ✅ Saved: outputs/eda_time_series.png


## Plotting Seasonal Patterns

In [36]:
# Four-panel view of how temperature, rainfall, humidity and GDD vary by month.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. Box plot: Temperature by month
df.boxplot(column='temp_avg', by='month', ax=axes[0, 0])
axes[0, 0].set_title('Temperature Distribution by Month')
axes[0, 0].set_xlabel('Month')
axes[0, 0].set_ylabel('Temperature (°C)')

# 2. Box plot: Rainfall by month
df.boxplot(column='rainfall', by='month', ax=axes[0, 1])
axes[0, 1].set_title('Rainfall Distribution by Month')
axes[0, 1].set_xlabel('Month')
axes[0, 1].set_ylabel('Rainfall (mm)')
axes[0, 1].set_ylim(0, 100)  # Limit to see patterns better

# df.boxplot(by=...) writes its own figure-level title, so the real title must be
# set AFTER both grouped boxplots. Setting it first and then blanking the
# automatic one (as this cell used to do) leaves the saved figure without a title.
fig.suptitle('Seasonal Patterns by Month', fontsize=16, fontweight='bold')

# 3. Humidity by month
# NOTE(review): plotting against range(1, 13) assumes all 12 months are present.
monthly_humidity = df.groupby('month')['humidity'].mean()
axes[1, 0].plot(range(1, 13), monthly_humidity.values, marker='o', linewidth=2, markersize=8, color='cyan')
axes[1, 0].fill_between(range(1, 13), monthly_humidity.values, alpha=0.3, color='cyan')
axes[1, 0].set_title('Average Humidity by Month')
axes[1, 0].set_xlabel('Month')
axes[1, 0].set_ylabel('Humidity (%)')
axes[1, 0].set_xticks(range(1, 13))
axes[1, 0].grid(True, alpha=0.3)

# 4. GDD by month
monthly_gdd = df.groupby('month')['gdd_base10'].mean()
axes[1, 1].bar(range(1, 13), monthly_gdd.values, color='brown', alpha=0.7)
axes[1, 1].set_title('Average Daily GDD by Month')
axes[1, 1].set_xlabel('Month')
axes[1, 1].set_ylabel('GDD (base 10°C)')
axes[1, 1].set_xticks(range(1, 13))
axes[1, 1].grid(axis='y', alpha=0.3)

# Write the figure to disk and close it so it is not also rendered inline.
plt.tight_layout()
plt.savefig('../outputs/eda_seasonal_patterns.png', dpi=300, bbox_inches='tight')
print("   ✅ Saved: outputs/eda_seasonal_patterns.png")
plt.close()

   ✅ Saved: outputs/eda_seasonal_patterns.png


## Correlational Analysis

In [45]:
# Columns included in the correlation study (raw measurements plus derived features).
numeric_features = [
    'tempmax', 'tempmin', 'temp_avg', 'humidity', 'rainfall',
    'wind_speed', 'pressure', 'cloudcover', 'dew',
    'temp_range', 'gdd_base10', 'rainfall_7day_avg',
    'temp_avg_7day', 'humidity_7day_avg'
]
# Pairwise correlation matrix (DataFrame.corr default method)
corr_matrix = df[numeric_features].corr()

# Left panel: full heatmap. Right panel: each feature's correlation with rainfall.
fig, axes = plt.subplots(1, 2, figsize=(18, 8))
fig.suptitle('Feature Correlations', fontsize=16, fontweight='bold')

# 1. Full correlation heatmap
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, ax=axes[0], cbar_kws={'label': 'Correlation'})
axes[0].set_title('Correlation Matrix (All Features)')

# 2. Correlations with rainfall (target variable)
# Drop rainfall's correlation with itself: it is always 1.0, carries no
# information, and would otherwise appear as the top-ranked "feature".
rainfall_corr = corr_matrix['rainfall'].drop('rainfall').sort_values(ascending=False)

# Green bars for positive correlations, red for zero or negative ones.
colors = ['green' if x > 0 else 'red' for x in rainfall_corr.values]
axes[1].barh(range(len(rainfall_corr)), rainfall_corr.values, color=colors, alpha=0.7)
axes[1].set_yticks(range(len(rainfall_corr)))
axes[1].set_yticklabels(rainfall_corr.index)
axes[1].set_xlabel('Correlation with Rainfall')
axes[1].set_title('Feature Importance for Rainfall Prediction')
axes[1].axvline(x=0, color='black', linestyle='-', linewidth=0.5)
axes[1].grid(axis='x', alpha=0.3)

# Add correlation values just beyond the end of each bar
# (to the right of positive bars, to the left of the others).
for i, v in enumerate(rainfall_corr.values):
    axes[1].text(v + 0.02 if v > 0 else v - 0.02, i, f'{v:.3f}',
                 va='center', ha='left' if v > 0 else 'right', fontweight='bold')

# Write the figure to disk and close it so it is not also rendered inline.
plt.tight_layout()
plt.savefig('../outputs/eda_correlations.png', dpi=300, bbox_inches='tight')
print("   ✅ Saved: outputs/eda_correlations.png")
plt.close()

   ✅ Saved: outputs/eda_correlations.png


## Scatter Plots Showing Relationships with Rainfall

In [47]:
# Scatter grid: same-day rainfall (y) against six candidate predictors (x).
# Spec columns: (x column, marker colour, x label, y label, panel title, x-limits or None)
scatter_specs = [
    ('humidity', 'blue', 'Humidity (%)', 'Rainfall (mm)', 'Humidity vs Rainfall', None),
    ('cloudcover', 'gray', 'Cloud Cover (%)', 'Rainfall (mm)', 'Cloud Cover vs Rainfall', None),
    ('temp_avg', 'orange', 'Temperature (°C)', 'Rainfall (mm)', 'Temperature vs Rainfall', None),
    ('pressure', 'purple', 'Pressure (hPa)', 'Rainfall (mm)', 'Pressure vs Rainfall', None),
    ('rainfall_lag_1', 'green', 'Yesterday Rainfall (mm)', 'Today Rainfall (mm)',
     'Rainfall Persistence (Lag-1)', (0, 100)),
    ('rainfall_7day_avg', 'brown', '7-Day Avg Rainfall (mm)', 'Today Rainfall (mm)',
     'Recent Rainfall Trend', (0, 30)),
]

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Relationships with Rainfall', fontsize=16, fontweight='bold')

# axes.ravel() walks the 2x3 grid row by row, matching the order of scatter_specs.
for ax, (x_col, colour, x_label, y_label, title, x_limits) in zip(axes.ravel(), scatter_specs):
    ax.scatter(df[x_col], df['rainfall'], alpha=0.3, s=10, color=colour)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    if x_limits is not None:
        ax.set_xlim(*x_limits)
    # Every panel caps the y-axis at 100 mm for visibility; larger values are not shown.
    ax.set_ylim(0, 100)

# Write the figure to disk and close it so it is not also rendered inline.
plt.tight_layout()
plt.savefig('../outputs/eda_scatter_plots.png', dpi=300, bbox_inches='tight')
print("   ✅ Saved: outputs/eda_scatter_plots.png")
plt.close()


   ✅ Saved: outputs/eda_scatter_plots.png
