In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
import os

# === Configuration ===
# Columns the models are trained to predict; the order here is the order
# used when predictions are zipped back to names.
TARGET_COLUMNS = [
    'Temperature_C',
    'Humidity_pct',
    'Precipitation_mm',
    'Wind_Speed_kmh',
]

# === Load and Clean Data ===
# Module-level path to the weather CSV.
filepath = "/content/weather_data.csv"
def load_and_clean_data(filepath):
    """Load the weather CSV, fill missing values and derive calendar features.

    Missing numeric values are replaced by the column mean and missing
    non-numeric values by the column's most frequent value. ``Date_Time`` is
    parsed to datetimes; rows whose timestamp cannot be parsed are dropped so
    that the derived ``day_of_year`` and ``month`` features never contain NaN.

    Args:
        filepath: Location of the CSV, passed straight to ``pandas.read_csv``.

    Returns:
        A DataFrame containing ``Location``, ``Date_Time``, every column in
        ``TARGET_COLUMNS`` and the derived ``day_of_year`` / ``month`` columns.

    Raises:
        ValueError: If the header consists only of digit strings but the
            column count differs from the expected one, or if any expected
            column is missing.
    """
    print("Loading and cleaning data...")
    # Malformed lines are skipped instead of aborting the whole load.
    df = pd.read_csv(filepath, encoding='utf-8', on_bad_lines='skip')
    df.columns = df.columns.str.strip()

    expected_columns = ['Location', 'Date_Time'] + TARGET_COLUMNS
    # An all-digit header is treated as "this CSV has no real header row".
    if all(col.isdigit() for col in df.columns):
        if len(df.columns) != len(expected_columns):
            raise ValueError(f"CSV column count mismatch. Expected: {expected_columns}")
        print("Warning: Assigning expected column names as CSV lacks header.")
        df.columns = expected_columns

    missing_columns = [col for col in expected_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing columns: {missing_columns}")

    numeric_columns = df.select_dtypes(include=[np.number]).columns
    if not numeric_columns.empty:
        df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

    non_numeric_columns = df.select_dtypes(exclude=[np.number]).columns
    modes = df[non_numeric_columns].mode()
    # mode() has no rows when the frame is empty (or there is nothing
    # non-numeric to summarise); indexing its first row would raise IndexError.
    if not modes.empty:
        df[non_numeric_columns] = df[non_numeric_columns].fillna(modes.iloc[0])

    df['Date_Time'] = pd.to_datetime(df['Date_Time'], errors='coerce')
    # errors='coerce' turns unparseable timestamps into NaT *after* the fill
    # step above, so remove those rows before deriving features from them.
    unparsed = df['Date_Time'].isna()
    if unparsed.any():
        print(f"Warning: Dropping {int(unparsed.sum())} row(s) with unparseable Date_Time.")
        df = df.loc[~unparsed].copy()

    df['day_of_year'] = df['Date_Time'].dt.dayofyear
    df['month'] = df['Date_Time'].dt.month

    return df

# === Train Model ===
def train_model_for_location(df, location):
    """Train, persist and evaluate a multi-output weather model for one location.

    The rows of ``df`` whose ``Location`` equals ``location`` are split 80/20,
    the features are standardised, and a ``MultiOutputRegressor`` wrapping a
    ``RandomForestRegressor`` is fitted on ``TARGET_COLUMNS``.

    Side effects: writes ``weather_model_<location>.pkl``,
    ``scaler_<location>.pkl``, ``label_encoders_<location>.pkl``,
    ``feature_columns_<location>.pkl`` and ``last_data_<location>.pkl`` using
    relative paths, and prints RMSE / MAE per target for the held-out split.

    Args:
        df: Cleaned frame containing ``Location``, ``Date_Time`` and the
            target columns; every remaining column is used as a feature.
        location: Value of the ``Location`` column to train for.

    Returns:
        None. Returns early (after printing a message) when the location has
        no rows or too few rows to split.
    """
    df_loc = df[df['Location'] == location].copy()
    if df_loc.empty:
        print(f"No data found for {location}")
        return
    if len(df_loc) < 2:
        # With a single row the 80/20 split leaves an empty training set and
        # train_test_split raises, which would abort training for every
        # remaining location as well.
        print(f"Not enough data to train a model for {location}")
        return

    # Drop columns for features and targets
    X = df_loc.drop(columns=TARGET_COLUMNS + ['Date_Time', 'Location'])
    y = df_loc[TARGET_COLUMNS]

    # Encode categorical columns
    categorical_cols = X.select_dtypes(include=['object']).columns
    label_encoders = {}
    for col in categorical_cols:
        encoder = LabelEncoder()
        X[col] = encoder.fit_transform(X[col])
        label_encoders[col] = encoder

    # Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale (statistics come from the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train multi-output model
    base_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
    model = MultiOutputRegressor(base_model)
    model.fit(X_train_scaled, y_train)

    # Save everything
    joblib.dump(model, f'weather_model_{location}.pkl')
    joblib.dump(scaler, f'scaler_{location}.pkl')
    joblib.dump(label_encoders, f'label_encoders_{location}.pkl')
    joblib.dump(X.columns.tolist(), f'feature_columns_{location}.pkl')
    # Rows are not guaranteed to be in chronological order, so select the
    # most recent timestamp explicitly. NaT sorts first and the sort is
    # stable, so without any valid timestamp this is still the last row.
    latest_row = df_loc.sort_values('Date_Time', na_position='first', kind='stable').iloc[-1]
    joblib.dump(latest_row.to_dict(), f'last_data_{location}.pkl')

    # Evaluation
    y_pred = model.predict(X_test_scaled)
    print(f"\n📍 {location} Results:")
    for i, target in enumerate(TARGET_COLUMNS):
        mse = mean_squared_error(y_test.iloc[:, i], y_pred[:, i])
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test.iloc[:, i], y_pred[:, i])
        print(f" - {target} | RMSE: {rmse:.2f} | MAE: {mae:.2f}")

# === Predict ===
def predict_next_day(location, input_data):
    """Predict every target in ``TARGET_COLUMNS`` for one feature record.

    Loads the model, scaler, label encoders and feature-column list that were
    saved for ``location``, applies the encoders and the scaler to
    ``input_data`` and runs the model on the result.

    Args:
        location: Location whose saved ``.pkl`` artefacts are loaded.
        input_data: Mapping of feature name to value for a single sample; it
            must provide every saved feature column.

    Returns:
        Dict mapping each name in ``TARGET_COLUMNS`` to its predicted value.
    """
    regressor = joblib.load(f'weather_model_{location}.pkl')
    feature_scaler = joblib.load(f'scaler_{location}.pkl')
    encoders_by_column = joblib.load(f'label_encoders_{location}.pkl')
    ordered_features = joblib.load(f'feature_columns_{location}.pkl')

    # One-row frame so the encoders and scaler receive tabular input.
    sample = pd.DataFrame([input_data])
    for column_name in encoders_by_column:
        fitted_encoder = encoders_by_column[column_name]
        sample[column_name] = fitted_encoder.transform(sample[column_name])

    # Select the columns in the order stored at training time.
    scaled_sample = feature_scaler.transform(sample[ordered_features])
    predicted_row = regressor.predict(scaled_sample)[0]

    return {target: value for target, value in zip(TARGET_COLUMNS, predicted_row)}

# === MAIN ===
if __name__ == "__main__":
    # Script entry point: load the CSV, train one model per location, then
    # run an example next-day prediction for the first location found.
    csv_file_path = "/content/weather_data.csv"  # Update if needed
    df = load_and_clean_data(csv_file_path)
    print(df.head())

    locations = df['Location'].unique()
    print(f"Found locations: {locations}")

    for loc in locations:
        train_model_for_location(df, loc)

    # Example Prediction
    if len(locations) == 0:
        # Nothing was trained, so indexing locations[0] would raise IndexError.
        print("No locations found; skipping example prediction.")
    else:
        test_location = locations[0]
        last_data = joblib.load(f'last_data_{test_location}.pkl')
        non_feature_keys = set(TARGET_COLUMNS) | {'Date_Time', 'Location'}
        input_data = {k: v for k, v in last_data.items() if k not in non_feature_keys}

        # The stored row describes the day it was observed. To predict for
        # "tomorrow", advance the calendar features by one day; this also
        # handles the year rollover. Skipped when no valid timestamp is stored.
        last_seen = last_data.get('Date_Time')
        if pd.notna(last_seen):
            next_day = pd.Timestamp(last_seen) + pd.Timedelta(days=1)
            input_data['day_of_year'] = next_day.dayofyear
            input_data['month'] = next_day.month

        print(f"\nPredicting for tomorrow in {test_location}...")
        prediction = predict_next_day(test_location, input_data)
        print("Prediction Results:")
        for k, v in prediction.items():
            print(f" - {k}: {v:.2f}")


Loading and cleaning data...
       Location           Date_Time  Temperature_C  Humidity_pct  \
0     San Diego 2024-01-14 21:12:46      10.683001     41.195754   
1     San Diego 2024-05-17 15:22:10       8.734140     58.319107   
2     San Diego 2024-05-11 09:30:59      11.632436     38.820175   
3  Philadelphia 2024-02-26 17:32:39      -8.628976     54.074474   
4   San Antonio 2024-04-29 13:23:51      39.808213     72.899908   

   Precipitation_mm  Wind_Speed_kmh  day_of_year  month  
0          4.020119        8.233540           14      1  
1          9.111623       27.715161          138      5  
2          4.607511       28.732951          132      5  
3          3.183720       26.367303           57      2  
4          9.598282       29.898622          120      4  
Found locations: ['San Diego' 'Philadelphia' 'San Antonio' 'San Jose' 'New York' 'Houston'
 'Dallas' 'Chicago' 'Los Angeles' 'Phoenix']

📍 San Diego Results:
 - Temperature_C | RMSE: 14.43 | MAE: 12.49
 - Humidity_