# Extreme Gradient Boosting 
(for NO2)

In [2]:
!pip install xgboost


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
!conda install -c conda-forge xgboost -y

Channels:
 - conda-forge
 - defaults
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.



In [4]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load dataset
file_path = "/Users/shrutikute/Downloads/updated_air_quality_dataset.csv"
df = pd.read_csv(file_path)

# Map county names to city names
county_to_city = {
    "Cook": "Chicago",
    "Los Angeles": "Los Angeles"
}

df["City"] = df["County Name"].map(county_to_city)  # Create a 'City' column

# Define features and target variable
features = ['temperature_2m (°C)', 'relative_humidity_2m (%)', 
            'precipitation (mm)', 'wind_speed_100m (km/h)', 'Day', 'Hour']
target = 'NO2'  # Updating the target variable to NO2

# Drop rows with missing target values
df = df.dropna(subset=[target])

# Loop through each city (Chicago, Los Angeles) and train a model
for city in ["Chicago", "Los Angeles"]:
    print(f"\nTraining Model for {city}")
    
    # Filter data for the specific city
    city_df = df[df["City"] == city]
    
    # Check if there's enough data
    if city_df.shape[0] < 50:  # Adjust threshold as needed
        print(f"Not enough data for {city}. Skipping...")
        continue
    
    # Extract features and target
    X = city_df[features]
    y = city_df[target]

    # Standardizing features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Split data into training (80%) and testing (20%)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Train XGBoost model with optimized hyperparameters
    xgb_regressor = xgb.XGBRegressor(
        objective='reg:squarederror', 
        n_estimators=500, 
        learning_rate=0.05, 
        max_depth=8, 
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    xgb_regressor.fit(X_train, y_train)

    # Make predictions
    y_pred = xgb_regressor.predict(X_test)

    # Evaluate model performance
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{city} Model Performance:")
    print(f"   • Mean Absolute Error (MAE): {mae:.4f}")
    print(f"   • Mean Squared Error (MSE): {mse:.4f}")
    print(f"   • R-squared Score (R²): {r2:.4f}")



Training Model for Chicago
Chicago Model Performance:
   • Mean Absolute Error (MAE): 3.1177
   • Mean Squared Error (MSE): 17.5033
   • R-squared Score (R²): 0.7164

Training Model for Los Angeles
Los Angeles Model Performance:
   • Mean Absolute Error (MAE): 6.5823
   • Mean Squared Error (MSE): 70.5231
   • R-squared Score (R²): 0.4636


In [11]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

# Load dataset
file_path = "/Users/shrutikute/Downloads/updated_air_quality_dataset.csv"
df = pd.read_csv(file_path)

# Fix column names to remove whitespace
df.columns = df.columns.str.replace(" ", "_")

# Map County Names to City Names (only for Chicago & LA)
county_to_city = {
    "Cook": "Chicago",
    "Los Angeles": "Los Angeles"
}
df["City"] = df["County_Name"].map(county_to_city)

# Feature Engineering - Convert 'Day' to Cyclic Encoding
df['Day_sin'] = np.sin(2 * np.pi * df['Day'] / 7)
df['Day_cos'] = np.cos(2 * np.pi * df['Day'] / 7)

# Feature Engineering - Rolling Average for NO2 (3-hour trend)
df["NO2_Rolling_Avg"] = df["NO2"].rolling(window=3, min_periods=1).mean()

# Define updated features
features = [
    'temperature_2m_(°C)', 'relative_humidity_2m_(%)', 
    'precipitation_(mm)', 'wind_speed_100m_(km/h)',
    'Day_sin', 'Day_cos', 'NO2_Rolling_Avg', 'Hour'
]
target = 'NO2'  # Updating the target variable to NO2

# Drop rows with missing values in features and target
df = df.dropna(subset=features + [target])

# Loop through each city (Chicago, Los Angeles) and train a model
for city in ["Chicago", "Los Angeles"]:
    print(f"\nTraining Model for {city}")
    
    # Filter data for the specific city
    city_df = df[df["City"] == city]
    
    # Check if there's enough data
    if city_df.shape[0] < 50:
        print(f"Not enough data for {city}. Skipping...")
        continue
    
    # Extract features and target
    X = city_df[features]
    y = city_df[target]

    # Standardizing features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Split data into training (80%) and testing (20%)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Define parameter grid for Grid Search
    param_grid = {
        'n_estimators': [300, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [6, 8],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'reg_alpha': [0.01, 0.1],  # L1 Regularization
        'reg_lambda': [1, 10, 50]  # L2 Regularization
    }

    # Initialize XGBoost model
    xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

    # Perform Grid Search CV
    tuner = GridSearchCV(
        xgb_regressor, param_grid, scoring='r2', cv=3, verbose=0, n_jobs=-1
    )

    tuner.fit(X_train, y_train)

    # Best parameters
    best_params = tuner.best_params_
    print(f"Best Hyperparameters for {city}: {best_params}")

    # Train model with best parameters found through grid search
    xgb_best = xgb.XGBRegressor(**best_params, objective='reg:squarederror', random_state=42)
    xgb_best.fit(X_train, y_train)

    # Make predictions
    y_pred = xgb_best.predict(X_test)

    # Evaluate model performance
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{city} Model Performance:")
    print(f"   • Mean Absolute Error (MAE): {mae:.4f}")
    print(f"   • Mean Squared Error (MSE): {mse:.4f}")
    print(f"   • R-squared Score (R²): {r2:.4f}")



Training Model for Chicago
Best Hyperparameters for Chicago: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 300, 'reg_alpha': 0.01, 'reg_lambda': 50, 'subsample': 0.8}
Chicago Model Performance:
   • Mean Absolute Error (MAE): 1.7118
   • Mean Squared Error (MSE): 6.0558
   • R-squared Score (R²): 0.9019

Training Model for Los Angeles
Best Hyperparameters for Los Angeles: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 300, 'reg_alpha': 0.1, 'reg_lambda': 50, 'subsample': 0.8}
Los Angeles Model Performance:
   • Mean Absolute Error (MAE): 2.3266
   • Mean Squared Error (MSE): 11.2268
   • R-squared Score (R²): 0.9146
