In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from math import sqrt
from joblib import dump
import numpy as np

def removeMissingRows(df, column_names):
    """Return ``df`` without the rows that have a missing value in ``column_names``.

    Args:
        df: DataFrame to filter; the caller's object is not modified.
        column_names: Column label(s), forwarded unchanged to
            ``DataFrame.dropna`` as its ``subset`` argument.

    Returns:
        The new DataFrame produced by ``dropna``.
    """
    return df.dropna(subset=column_names)

# Train, evaluate and persist one gradient-boosting AQI model per city.
# Each city is read from its own merged CSV under ../csv/final_merged_data/.
cities = ['Bakersfield', 'Los_Angeles', 'New_York', 'Phoenix', 'Reno', 'Visalia', 'Denver', 'Boston']

# Loop invariants: every city uses the same input columns and the same target.
numerical_features = ['temp', 'visibility', 'dew_point', 'feels_like', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'wind_gust', 'rain_1h', 'snow_1h', 'clouds_all', 'weather_id', 'year', 'month', 'day']
target_column = 'aqi'

for city in cities:
    # Parse only the columns the model consumes; the rest of the file is never used.
    # NOTE(review): the recorded run shows a warning raised on the read_csv line.
    # If it is a dtype warning caused by unused columns, `usecols` should also
    # silence it -- confirm on the next run.
    df = pd.read_csv(
        f'../csv/final_merged_data/merged_data_{city}.csv',
        usecols=numerical_features + [target_column],
    )
    # Rows without a target value cannot be used for training or scoring.
    df = removeMissingRows(df, [target_column])

    X = df[numerical_features]
    y = df[target_column]
    # 80/20 split with a fixed seed so the reported metrics are reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Imputation and scaling are pipeline steps, so they are fitted on the
    # training split only and re-applied unchanged at prediction time.
    gbm_pipeline = Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
        ('scaler', StandardScaler()),
        ('regressor', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42))
    ])

    gbm_pipeline.fit(X_train, y_train)
    y_pred = gbm_pipeline.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Build the report once so the console output and the results file cannot drift apart.
    metric_lines = [
        f"Mean Squared Error (MSE): {mse}",
        f"Root Mean Squared Error (RMSE): {rmse}",
        f"R^2 Score: {r2}",
    ]
    print(f"City: {city}")
    for line in metric_lines:
        print(line)

    # Persist the whole fitted pipeline (preprocessing + regressor) and its metrics
    # in the current working directory.
    dump(gbm_pipeline, f'gbm_model_{city}.joblib')
    with open(f'gbm_results_{city}.txt', 'w', encoding='utf-8') as file:
        file.writelines(f"{line}\n" for line in metric_lines)



  df = pd.read_csv(f'../csv/final_merged_data/merged_data_{city}.csv')


City: Bakersfield
Mean Squared Error (MSE): 100.87158575412798
Root Mean Squared Error (RMSE): 10.043484741568934
R^2 Score: 0.3290117346567467


  df = pd.read_csv(f'../csv/final_merged_data/merged_data_{city}.csv')


City: Los_Angeles
Mean Squared Error (MSE): 18.333852894066194
Root Mean Squared Error (RMSE): 4.281804864080823
R^2 Score: 0.266643013141344


  df = pd.read_csv(f'../csv/final_merged_data/merged_data_{city}.csv')


City: New_York
Mean Squared Error (MSE): 205.29664312711571
Root Mean Squared Error (RMSE): 14.32817654578264
R^2 Score: 0.5051640880233066


  df = pd.read_csv(f'../csv/final_merged_data/merged_data_{city}.csv')


City: Phoenix
Mean Squared Error (MSE): 6.999820482395419
Root Mean Squared Error (RMSE): 2.6457173852086733
R^2 Score: 0.3622906895730027


  df = pd.read_csv(f'../csv/final_merged_data/merged_data_{city}.csv')


City: Reno
Mean Squared Error (MSE): 0.4686344727831356
Root Mean Squared Error (RMSE): 0.6845688225322094
R^2 Score: 0.27675083487495244
City: Visalia
Mean Squared Error (MSE): 17.851036827906274
Root Mean Squared Error (RMSE): 4.2250487367492315
R^2 Score: 0.3177792195106931


  df = pd.read_csv(f'../csv/final_merged_data/merged_data_{city}.csv')


City: Denver
Mean Squared Error (MSE): 99.63022756824611
Root Mean Squared Error (RMSE): 9.981494255282929
R^2 Score: 0.3137043693693553


  df = pd.read_csv(f'../csv/final_merged_data/merged_data_{city}.csv')


City: Boston
Mean Squared Error (MSE): 154.27060541388516
Root Mean Squared Error (RMSE): 12.420571863400056
R^2 Score: 0.36668780249688526
