<a href="https://colab.research.google.com/github/mayank5515/major_project/blob/main/Copy_of_major_randomforest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Setup ---
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import json
from datetime import datetime, timedelta


In [None]:

# --- Step 1: Mount Google Drive and Load Data ---
from google.colab import drive

# Expose the user's Drive inside the Colab runtime so the CSV below is readable.
drive.mount('/content/drive')

# Location of the air-quality export inside the mounted Drive.
file_path = '/content/drive/MyDrive/major_project/anand-vihar_delhi-air-quality.csv'
df = pd.read_csv(file_path)

# Preview: status line followed by the first five rows.
print("✅ CSV Loaded:", df.head(), sep="\n")




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ CSV Loaded:
       date  pm25  pm10   o3  no2  so2   co
0  2025/5/1   165    90   11   12    2   14
1  2025/5/2   130   134   10   15    3   22
2  2025/5/3   170   152    8   15    3   23
3  2025/5/4   162    94   10   13    2   13
4  2025/5/5   123    99   15   13    2   12


In [None]:
# --- Step 2: Preprocess ---
# Measurement columns that must end up numeric and non-null.
pollutant_cols = ['pm25', 'pm10', 'o3', 'no2', 'so2', 'co']
# Strings that stand for "no reading" and are turned into missing values.
placeholder_tokens = ['--', 'NA', 'NaN', '', ' ']

# Trim and lower-case the headers so the column names used below match.
df = df.rename(columns=lambda name: name.strip().lower())

# Parse dates; anything that does not fit YYYY/MM/DD becomes NaT.
df = df.assign(date=pd.to_datetime(df['date'], format='%Y/%m/%d', errors='coerce'))

# Blank out the placeholder strings before the numeric conversion.
df = df.replace(placeholder_tokens, pd.NA)

# Coerce each measurement column to a number; unparseable cells become NaN.
numeric_cols = {name: pd.to_numeric(df[name], errors='coerce') for name in pollutant_cols}

# Keep only rows where every measurement and the date are present.
df = df.assign(**numeric_cols).dropna(subset=pollutant_cols + ['date'])

In [None]:
# --- Step 3: Prepare Features ---
# Order rows chronologically so the final row is the most recent reading.
df = df.sort_values(by='date')

# Predictors are the other pollutant columns; the target is PM2.5.
features = ['pm10', 'o3', 'no2', 'so2', 'co']
X, y = df[features], df['pm25']


In [None]:
# --- Step 4: Train/Test Split ---
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

In [None]:
# --- Step 5: Train Model ---
from sklearn.ensemble import RandomForestRegressor

# 100 trees with a fixed seed so repeated runs build the same forest.
rf_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**rf_params)

# Fit on the training split only; the fitted estimator is the cell's output.
model.fit(X_train, y_train)


In [None]:
# --- Step 6: Forecast Next 10 Days ---
from datetime import timedelta

FORECAST_DAYS = 10
HOURS_PER_DAY = 24


def build_forecast(model, last_features, base_time,
                   days=FORECAST_DAYS, hours_per_day=HOURS_PER_DAY):
    """Build an hourly forecast list from the latest feature row.

    The model maps same-row pollutant readings to PM2.5, and no future values
    of those pollutants are available here. The feature row is therefore held
    fixed for the whole horizon (a persistence forecast) and the model is
    queried once. Feeding the prediction back into the feature vector would
    put a PM2.5 value into a pollutant column the model was not trained to
    read it from, so that is deliberately not done.

    Args:
        model: Fitted estimator exposing ``predict``.
        last_features: One-row DataFrame whose columns are the training
            features, in training order.
        base_time: Timestamp of the first forecast entry.
        days: Number of days to cover.
        hours_per_day: Number of entries emitted per day.

    Returns:
        A list of ``days * hours_per_day`` dicts with keys ``"timestamp"``
        (``%Y-%m-%dT%H:%M:%S`` string) and ``"aqi"`` (the rounded prediction
        as a plain ``int``, so it serialises to JSON).
    """
    # float() strips the NumPy scalar type so round() yields a built-in int.
    predicted = float(model.predict(last_features)[0])
    aqi = round(predicted)
    return [
        {
            "timestamp": (base_time + timedelta(hours=step)).strftime('%Y-%m-%dT%H:%M:%S'),
            "aqi": aqi,
        }
        for step in range(days * hours_per_day)
    ]


if X.empty:
    raise ValueError("No rows left after preprocessing; cannot build a forecast.")

# The forecast starts the day after the most recent observation.
base_time = df['date'].max() + timedelta(days=1)

# Double brackets keep a one-row DataFrame, so the feature names and column
# order match what the model was fitted on.
last_known = X.iloc[[-1]]

# Flat list covering FORECAST_DAYS * HOURS_PER_DAY consecutive hours.
forecast = build_forecast(model, last_known, base_time)




In [None]:
from google.colab import files
import json

# Serialise the forecast list and write it to the Colab filesystem.
output_path = '/content/forecast_10_days.json'
with open(output_path, 'w') as f:
    f.write(json.dumps(forecast, indent=2))

# Hand the saved file to the browser as a download.
files.download(output_path)

print("✅ Done! forecast_10_days.json is downloading...")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Done! forecast_10_days.json is downloading...
