In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [6]:
# Location of the processed measurements CSV (absolute path on the author's machine).
measurement_route = "/Users/nathanjones/Downloads/NUWE/Hackathons/Schneider_DataScience/hackathon-schneider-pollution/data/processed/measurements.csv"

# Read the measurements, parsing "Measurement date" as datetimes so the `.dt`
# accessor is available when calendar features are derived from it.
df = pd.read_csv(
    measurement_route,
    parse_dates=["Measurement date"],
)

In [7]:
# Derive calendar features from the parsed timestamp, then drop the raw column.
#
# The previous version mutated `df` in place, so running this cell a second
# time raised KeyError because "Measurement date" had already been dropped.
# The guard below makes the cell safe to re-run; the `elif` branch still fails
# loudly when the frame has neither the timestamp nor the derived columns.
_date_col = "Measurement date"
_calendar_cols = ["Year", "Month", "Day", "Weekday"]

if _date_col in df.columns:
    _ts = df[_date_col].dt
    df = df.assign(
        Year=_ts.year,
        Month=_ts.month,
        Day=_ts.day,
        Weekday=_ts.weekday,  # pandas convention: Monday=0 ... Sunday=6
    ).drop(columns=[_date_col])
elif not set(_calendar_cols).issubset(df.columns):
    raise KeyError(
        f"{_date_col!r} is missing and the derived columns "
        f"{_calendar_cols} are not all present in df"
    )



In [8]:
# Print a concise summary of `df` (column dtypes, non-null counts, memory usage)
# to check the frame after the date-derived columns were added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 544296 entries, 0 to 544295
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Station code  544296 non-null  int64  
 1   SO2           544296 non-null  float64
 2   NO2           544296 non-null  float64
 3   O3            544296 non-null  float64
 4   CO            544296 non-null  float64
 5   PM10          544296 non-null  float64
 6   PM2.5         544296 non-null  float64
 7   Season        544296 non-null  int64  
 8   Hour          544296 non-null  int64  
 9   Year          544296 non-null  int32  
 10  Month         544296 non-null  int32  
 11  Day           544296 non-null  int32  
 12  Weekday       544296 non-null  int32  
dtypes: float64(6), int32(4), int64(3)
memory usage: 45.7 MB


In [12]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Define features and pollutants
features = ["Station code", "Hour", "Month", "Day", "Weekday"]
pollutants = ["SO2", "NO2", "O3", "CO", "PM10", "PM2.5"]

# Fitted regressor and hold-out MAE per pollutant, keyed by pollutant name.
models = {}
results = {}

# The feature matrix and the train/test row assignment do not depend on the
# pollutant, so they are built once instead of once per loop iteration.
# Splitting X together with the full target frame under the same random_state
# yields the same row partition the per-pollutant split produced.
X = df[features]
X_train, X_test, targets_train, targets_test = train_test_split(
    X, df[pollutants], test_size=0.15, random_state=42
)

# NOTE(review): this is a random row split over time-indexed measurements, so
# rows from the same station/day can land on both sides. The MAE below may be
# optimistic for forecasting unseen future periods - consider a chronological
# hold-out if that is the goal.
for pollutant in pollutants:
    print(f"Training model for {pollutant}...")

    y_train = targets_train[pollutant]
    y_test = targets_test[pollutant]

    # One independent forest per pollutant; fixed seed for reproducibility.
    model = RandomForestRegressor(n_estimators=50, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)

    models[pollutant] = model
    results[pollutant] = mae

print("Model Performance:", results)


Training model for SO2...
Training model for NO2...
Training model for O3...
Training model for CO...
Training model for PM10...
Training model for PM2.5...
Model Performance: {'SO2': 0.0006101725763978173, 'NO2': 0.00402240798579215, 'O3': 0.004203404250107172, 'CO': 0.06730001837222119, 'PM10': 7.97794966011391, 'PM2.5': 6.080516381897238}


In [13]:
import joblib

# Persist every fitted regressor as its own pickle file, one per pollutant.
for pollutant in models:
    model = models[pollutant]
    model_path = f"/Users/nathanjones/Downloads/NUWE/Hackathons/Schneider_DataScience/hackathon-schneider-pollution/models/task_2/{pollutant}_model_task_2.pkl"
    joblib.dump(model, model_path)




In [16]:
import json

# Forecast windows: one (station, pollutant) pair per calendar month, Jul-Dec 2023.
# Each entry is expanded into a dict with the keys listed in `_request_fields`.
_request_fields = ("station_code", "pollutant", "start", "end")
prediction_requests = [
    dict(zip(_request_fields, row))
    for row in [
        (206, "SO2", "2023-07-01 00:00:00", "2023-07-31 23:00:00"),
        (211, "NO2", "2023-08-01 00:00:00", "2023-08-31 23:00:00"),
        (217, "O3", "2023-09-01 00:00:00", "2023-09-30 23:00:00"),
        (219, "CO", "2023-10-01 00:00:00", "2023-10-31 23:00:00"),
        (225, "PM10", "2023-11-01 00:00:00", "2023-11-30 23:00:00"),
        (228, "PM2.5", "2023-12-01 00:00:00", "2023-12-31 23:00:00"),
    ]
]

# Output payload: {"target": {station_code: {timestamp: prediction}}}
predictions_task_2_json = {"target": {}}
station_forecasts = predictions_task_2_json["target"]

for request in prediction_requests:
    station_code = request["station_code"]
    pollutant = request["pollutant"]

    # Every hour in the requested window, both endpoints included.
    date_range = pd.date_range(start=request["start"], end=request["end"], freq="h")

    # Feature frame with the same columns, in the same order, used for training.
    df_pred = pd.DataFrame(
        {
            "Station code": station_code,
            "Hour": date_range.hour,
            "Month": date_range.month,
            "Day": date_range.day,
            "Weekday": date_range.weekday,
        }
    )

    # Use the in-memory model fitted earlier for this pollutant.
    model = models[pollutant]
    predictions = model.predict(df_pred)

    # NOTE(review): rounding to 2 decimals is coarse compared with the hold-out
    # MAE printed for SO2/NO2/O3 (roughly 0.0006-0.004), so those series are
    # heavily quantised - confirm the precision the submission format requires.
    station_forecasts[str(station_code)] = dict(
        zip(
            map(str, date_range),
            (round(value, 2) for value in predictions),
        )
    )


# Write the nested predictions to disk as indented JSON.
with open("/Users/nathanjones/Downloads/NUWE/Hackathons/Schneider_DataScience/hackathon-schneider-pollution/predictions/predictions_task_2.json", "w") as f:
    json.dump(predictions_task_2_json, f, indent=2)

