In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Read the raw weather observations from disk
data = pd.read_csv('data/weather_data.csv')

# Force the calendar columns to plain integers. Entries that cannot be parsed
# as numbers (or are missing) are coerced to NaN and then replaced by 0.
for _date_part in ('YEAR', 'MO', 'DY'):
    _as_number = pd.to_numeric(data[_date_part], errors='coerce')
    data[_date_part] = _as_number.fillna(0).astype(int)

# Function to create a date
def create_date(row):
    """Build a ``pd.Timestamp`` from a row's 'YEAR', 'MO' and 'DY' entries.

    Args:
        row: Any object indexable by the keys 'YEAR', 'MO' and 'DY'
            (for example a DataFrame row or a dict).

    Returns:
        The matching ``pd.Timestamp``, or ``pd.NaT`` when the components
        cannot be converted to integers or do not form a valid date.

    Raises:
        KeyError: If one of the three keys is missing from *row*; a missing
            column is a structural problem and is deliberately not hidden.
    """
    try:
        return pd.Timestamp(
            year=int(row['YEAR']),
            month=int(row['MO']),
            day=int(row['DY']),
        )
    except (ValueError, TypeError, OverflowError):
        # ValueError: NaN, non-numeric text, or an impossible date (month 0,
        #             30 February, year outside the supported range).
        # TypeError: a component such as None that int() cannot convert.
        # OverflowError: an infinite component.
        return pd.NaT

# Build one timestamp per observation by running create_date over every row
data['Date'] = data.apply(create_date, axis=1)

# Assemble the feature frame: the coordinates, the date itself, and the
# date broken out into separate Year / Month / Day columns.
_dates = data['Date']
X = data[['LAT', 'LON', 'Date']].assign(
    Year=_dates.dt.year,
    Month=_dates.dt.month,
    Day=_dates.dt.day,
)

# Columns holding the four quantities to be modelled
_TARGET_COLUMNS = [
    'Temperature',
    'Wind(m/s)',
    'Precipitation(mm/day)',
    'Humidity(g/kg)',
]
_targets = data[_TARGET_COLUMNS]

# Drop NaN values.
# A row is kept only when it is complete in the features AND in every target.
# Filtering on the features alone would leave NaN targets in place, and
# RandomForestRegressor.fit rejects a y that contains NaN. When no target is
# missing this selects exactly the rows that X.dropna() would keep.
# NOTE(review): a row missing a single target is dropped for all four models,
# because the models share one feature frame.
_complete = X.notna().all(axis=1) & _targets.notna().all(axis=1)
X = X[_complete]

# Define target variables, aligned to the surviving rows by index label
y_temp = _targets.loc[X.index, 'Temperature']
y_wind = _targets.loc[X.index, 'Wind(m/s)']
y_rainfall = _targets.loc[X.index, 'Precipitation(mm/day)']
y_humidity = _targets.loc[X.index, 'Humidity(g/kg)']

# Split the dataset into training and testing sets.
# Every target is split against the same feature columns with the same
# test fraction and seed, so the shared settings live in one helper.
_FEATURE_COLUMNS = ['LAT', 'LON', 'Year', 'Month', 'Day']


def _split_for(target):
    """Return the 80/20 train/test split of the model features against *target*."""
    return train_test_split(
        X[_FEATURE_COLUMNS], target, test_size=0.2, random_state=42
    )


X_train, X_test, y_train_temp, y_test_temp = _split_for(y_temp)
X_train_wind, X_test_wind, y_train_wind, y_test_wind = _split_for(y_wind)
X_train_rain, X_test_rain, y_train_rain, y_test_rain = _split_for(y_rainfall)
X_train_hum, X_test_hum, y_train_hum, y_test_hum = _split_for(y_humidity)

# Train one Random Forest per target: 100 trees each, with a fixed seed.
# fit() returns the estimator itself, so each model is built and trained
# in a single expression.
model_temp_rf = RandomForestRegressor(
    n_estimators=100, random_state=42
).fit(X_train, y_train_temp)

model_wind_rf = RandomForestRegressor(
    n_estimators=100, random_state=42
).fit(X_train_wind, y_train_wind)

model_rain_rf = RandomForestRegressor(
    n_estimators=100, random_state=42
).fit(X_train_rain, y_train_rain)

model_hum_rf = RandomForestRegressor(
    n_estimators=100, random_state=42
).fit(X_train_hum, y_train_hum)

# Predict each target on its held-out feature rows
y_pred_temp_rf, y_pred_wind_rf, y_pred_rain_rf, y_pred_hum_rf = (
    fitted.predict(held_out)
    for fitted, held_out in (
        (model_temp_rf, X_test),
        (model_wind_rf, X_test_wind),
        (model_rain_rf, X_test_rain),
        (model_hum_rf, X_test_hum),
    )
)

# Score every model by the mean squared error between truth and prediction
mse_temp_rf, mse_wind_rf, mse_rain_rf, mse_hum_rf = (
    mean_squared_error(truth, guess)
    for truth, guess in (
        (y_test_temp, y_pred_temp_rf),
        (y_test_wind, y_pred_wind_rf),
        (y_test_rain, y_pred_rain_rf),
        (y_test_hum, y_pred_hum_rf),
    )
)

# Report the scores
print(f'Mean Squared Error for Temperature (Random Forest): {mse_temp_rf}')
print(f'Mean Squared Error for Wind (Random Forest): {mse_wind_rf}')
print(f'Mean Squared Error for Rainfall (Random Forest): {mse_rain_rf}')
print(f'Mean Squared Error for Humidity (Random Forest): {mse_hum_rf}')

# Visualize predicted data vs actual data using histograms
def plot_histogram(actual, predicted, title, xlabel, bins=20):
    """Overlay histograms of actual and predicted values in one figure.

    Both series are binned on the same edges, computed over their combined
    range. Binning each series separately would give the two histograms
    different bin positions and widths, so their bars could not be compared.

    Args:
        actual: Array-like of observed values.
        predicted: Array-like of model predictions.
        title: Figure title.
        xlabel: Label for the x axis.
        bins: Number of bins (or any bin specification accepted by
            ``np.histogram_bin_edges``). Defaults to 20.
    """
    # One set of edges spanning both series keeps the bars aligned
    combined = np.concatenate(
        [np.asarray(actual).ravel(), np.asarray(predicted).ravel()]
    )
    edges = np.histogram_bin_edges(combined, bins=bins)

    plt.figure(figsize=(10, 6))
    plt.hist(actual, bins=edges, alpha=0.5, label='Actual', color='blue')
    plt.hist(predicted, bins=edges, alpha=0.5, label='Predicted', color='orange')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()

# One actual-vs-predicted histogram per target, in a fixed order
for _actual, _predicted, _title, _xlabel in (
    (y_test_temp, y_pred_temp_rf, 'Temperature: Actual vs Predicted', 'Temperature (°C)'),
    (y_test_wind, y_pred_wind_rf, 'Wind Speed: Actual vs Predicted', 'Wind Speed (m/s)'),
    (y_test_rain, y_pred_rain_rf, 'Rainfall: Actual vs Predicted', 'Rainfall (mm/day)'),
    (y_test_hum, y_pred_hum_rf, 'Humidity: Actual vs Predicted', 'Humidity (g/kg)'),
):
    plot_histogram(_actual, _predicted, _title, _xlabel)

# Function to visualize geographic data (plot latitude and longitude on a map)
def plot_on_map(latitudes, longitudes, values, title, color_label):
    """Show *values* as coloured points at the given coordinates on a map.

    Args:
        latitudes: Array-like of point latitudes.
        longitudes: Array-like of point longitudes.
        values: Array-like of the quantity that drives the point colour.
        title: Figure title.
        color_label: Text shown for the colour dimension (colour-bar title
            and hover label), e.g. the quantity name with its unit.
    """
    fig = px.scatter_geo(
        lat=latitudes,
        lon=longitudes,
        color=values,
        color_continuous_scale='Viridis',
        title=title,
        # `labels` is keyed by column name. With array inputs and no
        # data_frame the colour column is named 'color', so keying on the
        # display text itself would leave color_label unused.
        labels={'color': color_label},
    )
    fig.update_layout(
        geo_scope='world',  # Focus on global scale, modify to 'usa', 'europe' for regional
        # Set the colour-bar title explicitly so it does not depend on how
        # Plotly Express names the colour column.
        coloraxis_colorbar_title_text=color_label,
    )
    fig.show()

# Plot actual vs predicted data on a map for every target: Temperature,
# Wind Speed, Rainfall and Humidity, each as an "Actual" figure followed
# by its "Predicted" counterpart at the same test-set coordinates.
for _coords, _values, _title, _label in (
    (X_test, y_test_temp, "Actual Temperature", "Temperature (°C)"),
    (X_test, y_pred_temp_rf, "Predicted Temperature", "Temperature (°C)"),
    (X_test_wind, y_test_wind, "Actual Wind Speed", "Wind Speed (m/s)"),
    (X_test_wind, y_pred_wind_rf, "Predicted Wind Speed", "Wind Speed (m/s)"),
    (X_test_rain, y_test_rain, "Actual Rainfall", "Rainfall (mm/day)"),
    (X_test_rain, y_pred_rain_rf, "Predicted Rainfall", "Rainfall (mm/day)"),
    (X_test_hum, y_test_hum, "Actual Humidity", "Humidity (g/kg)"),
    (X_test_hum, y_pred_hum_rf, "Predicted Humidity", "Humidity (g/kg)"),
):
    plot_on_map(_coords['LAT'], _coords['LON'], _values, _title, _label)
