In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from math import sqrt
import numpy as np
from joblib import dump

def removeMissingRows(df: pd.DataFrame, column_names) -> pd.DataFrame:
    """Return a copy of *df* without the rows that have NaN in the given columns.

    Args:
        df: Frame to filter. It is not modified; ``dropna`` returns a new frame.
        column_names: A single column label or a list-like of labels to check
            for missing values. ``None`` is forwarded unchanged, which makes
            ``dropna`` consider every column.

    Returns:
        A new DataFrame containing only the rows where none of the checked
        columns is missing.

    Raises:
        KeyError: If a requested column is not present in *df*.
    """
    # Wrap a bare label (e.g. 'aqi') in a list so the call does not rely on
    # the installed pandas version accepting a scalar `subset`.
    if column_names is not None and not pd.api.types.is_list_like(column_names):
        column_names = [column_names]
    return df.dropna(subset=column_names)

# Cities to model; one merged CSV per city is read from ../csv/final_merged_data/.
cities = ['Bakersfield', 'Los_Angeles', 'New_York', 'Phoenix', 'Reno', 'Visalia', 'Denver', 'Boston']

# Feature columns used as model inputs. The list is the same for every city,
# so it is defined once instead of being rebuilt on each loop iteration.
numerical_features = ['temp', 'visibility', 'dew_point', 'feels_like', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'wind_gust', 'rain_1h', 'snow_1h', 'clouds_all', 'weather_id', 'year', 'month', 'day']

# Train, evaluate and persist one ElasticNet pipeline per city.
for city in cities:
    # Load the dataset for this city.
    # NOTE(review): the recorded cell output shows a warning attributed to this
    # read_csv call for most cities. If it is a mixed-dtype warning, declare
    # `dtype=` for the offending columns (or pass low_memory=False) -- confirm.
    df = pd.read_csv(f'../csv/final_merged_data/merged_data_{city}.csv')

    # Rows without a target value cannot be used for training or scoring.
    # The column is passed as a list so the call works regardless of whether
    # the underlying dropna accepts a bare label.
    df = removeMissingRows(df, ['aqi'])

    # Define features and target.
    X = df[numerical_features]
    y = df['aqi']

    # Hold out 20% of the rows for evaluation; the fixed seed keeps the split
    # reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Preprocessing + regression in one estimator: fill NaN features with the
    # training-set column mean, standardize, then fit the ElasticNet model.
    elastic_net_pipeline = Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
        ('scaler', StandardScaler()),
        ('regressor', ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)),
    ])

    # Train the model.
    elastic_net_pipeline.fit(X_train, y_train)

    # Predict on the held-out rows.
    y_pred = elastic_net_pipeline.predict(X_test)

    # Calculate performance metrics.
    mse = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Format the metrics once so the console output and the results file
    # cannot drift apart.
    metric_lines = [
        f"Mean Squared Error (MSE): {mse}",
        f"Root Mean Squared Error (RMSE): {rmse}",
        f"R^2 Score: {r2}",
    ]

    # Output the performance metrics.
    print(f"City: {city}")
    print("\n".join(metric_lines))

    # Save the trained model and results (written to the working directory).
    dump(elastic_net_pipeline, f'elastic_net_model_{city}.joblib')
    with open(f'elastic_net_results_{city}.txt', 'w', encoding='utf-8') as file:
        file.write("\n".join(metric_lines) + "\n")



  df = pd.read_csv(f'../csv/final_merged_data/merged_data_{city}.csv')


City: Bakersfield
Mean Squared Error (MSE): 121.4790890229168
Root Mean Squared Error (RMSE): 11.021755260525286
R^2 Score: 0.19193256842767636


  df = pd.read_csv(f'../csv/final_merged_data/merged_data_{city}.csv')


City: Los_Angeles
Mean Squared Error (MSE): 22.2496544449254
Root Mean Squared Error (RMSE): 4.716953937121435
R^2 Score: 0.1100103378893248


  df = pd.read_csv(f'../csv/final_merged_data/merged_data_{city}.csv')


City: New_York
Mean Squared Error (MSE): 279.6010888008414
Root Mean Squared Error (RMSE): 16.721276530242584
R^2 Score: 0.32606467568603614


  df = pd.read_csv(f'../csv/final_merged_data/merged_data_{city}.csv')


City: Phoenix
Mean Squared Error (MSE): 8.827580212998674
Root Mean Squared Error (RMSE): 2.9711244021411614
R^2 Score: 0.19577507672826278


  df = pd.read_csv(f'../csv/final_merged_data/merged_data_{city}.csv')


City: Reno
Mean Squared Error (MSE): 0.605740757111703
Root Mean Squared Error (RMSE): 0.7782934903439082
R^2 Score: 0.06515307279584526
City: Visalia
Mean Squared Error (MSE): 24.744701278700887
Root Mean Squared Error (RMSE): 4.974404615499315
R^2 Score: 0.05432106929835623


  df = pd.read_csv(f'../csv/final_merged_data/merged_data_{city}.csv')


City: Denver
Mean Squared Error (MSE): 111.69162812876387
Root Mean Squared Error (RMSE): 10.568426000534037
R^2 Score: 0.23062028228043263


  df = pd.read_csv(f'../csv/final_merged_data/merged_data_{city}.csv')


City: Boston
Mean Squared Error (MSE): 185.99268069303412
Root Mean Squared Error (RMSE): 13.637913355533321
R^2 Score: 0.23646223457032778
