In [80]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

## Preprocessing the Data

In [81]:
# Load the data
file_path = Path('AirQualityAndWeatherDataMerged.csv')
aqi_df = pd.read_csv(file_path)
aqi_df

Unnamed: 0.1,Unnamed: 0,state,county,city,date_local,parameter,sample_duration,units_of_measure,arithmetic_mean,aqi,...,Precipitation,Snow,Snow Depth,Wind Speed,Wind Direction,Wind Gust,Visibility,Cloud Cover,Relative Humidity,Conditions
0,0,Tennessee,Sumner,Hendersonville,2018-01-01,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),6.2,26.0,...,0.00,0.0,0.0,13.4,292.13,19.7,9.9,32.2,58.99,Partially cloudy
1,1,Tennessee,Davidson,Nashville,2018-01-01,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),5.1,21.0,...,0.00,0.0,0.0,13.9,278.04,21.9,9.9,32.1,57.18,Partially cloudy
2,2,Tennessee,Davidson,Nashville,2018-01-02,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),3.8,16.0,...,0.00,0.0,0.0,8.8,153.88,,9.9,24.8,50.71,Clear
3,3,Tennessee,Sumner,Hendersonville,2018-01-02,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),5.6,23.0,...,0.00,0.0,0.0,7.9,166.08,,9.9,18.0,52.93,Clear
4,4,Tennessee,Sumner,Hendersonville,2018-01-03,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),5.3,22.0,...,0.01,0.0,0.0,11.9,252.83,24.2,9.9,60.1,50.06,"Rain, Partially cloudy"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1667,1667,Tennessee,Davidson,Nashville,2020-12-30,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),8.7,36.0,...,0.12,0.0,0.0,23.8,170.63,38.1,9.3,68.3,63.09,"Rain, Partially cloudy"
1668,1668,Tennessee,Sumner,Hendersonville,2020-12-30,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),7.3,30.0,...,0.25,0.0,0.0,22.1,185.75,36.1,9.3,67.1,64.16,"Rain, Partially cloudy"
1669,1669,Tennessee,Sumner,Hendersonville,2020-12-31,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),1.0,4.0,...,0.90,0.0,0.0,16.8,127.13,21.8,7.7,97.0,91.89,"Rain, Overcast"
1670,1670,Tennessee,Davidson,Nashville,2020-12-31,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),2.2,9.0,...,0.86,0.0,0.0,17.4,128.83,25.0,7.6,98.8,90.33,"Rain, Overcast"


In [82]:
aqi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1672 entries, 0 to 1671
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           1672 non-null   int64  
 1   state                1672 non-null   object 
 2   county               1672 non-null   object 
 3   city                 1672 non-null   object 
 4   date_local           1672 non-null   object 
 5   parameter            1672 non-null   object 
 6   sample_duration      1672 non-null   object 
 7   units_of_measure     1672 non-null   object 
 8   arithmetic_mean      1672 non-null   float64
 9   aqi                  1672 non-null   float64
 10  City                 1672 non-null   object 
 11  State                1672 non-null   object 
 12  Country              1672 non-null   object 
 13  Date time            1672 non-null   object 
 14  Maximum Temperature  1672 non-null   float64
 15  Minimum Temperature  1672 non-null   f

In [83]:
# Binning aqi values
aqi_bins = [0,50, 100, 300, 500]
group_names = ['Good', 'Moderate', 'Unhealthy', "Hazardous"]

aqi_df["AQI_Range"] = pd.cut(aqi_df["aqi"], aqi_bins, labels=group_names)
aqi_df

Unnamed: 0.1,Unnamed: 0,state,county,city,date_local,parameter,sample_duration,units_of_measure,arithmetic_mean,aqi,...,Snow,Snow Depth,Wind Speed,Wind Direction,Wind Gust,Visibility,Cloud Cover,Relative Humidity,Conditions,AQI_Range
0,0,Tennessee,Sumner,Hendersonville,2018-01-01,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),6.2,26.0,...,0.0,0.0,13.4,292.13,19.7,9.9,32.2,58.99,Partially cloudy,Good
1,1,Tennessee,Davidson,Nashville,2018-01-01,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),5.1,21.0,...,0.0,0.0,13.9,278.04,21.9,9.9,32.1,57.18,Partially cloudy,Good
2,2,Tennessee,Davidson,Nashville,2018-01-02,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),3.8,16.0,...,0.0,0.0,8.8,153.88,,9.9,24.8,50.71,Clear,Good
3,3,Tennessee,Sumner,Hendersonville,2018-01-02,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),5.6,23.0,...,0.0,0.0,7.9,166.08,,9.9,18.0,52.93,Clear,Good
4,4,Tennessee,Sumner,Hendersonville,2018-01-03,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),5.3,22.0,...,0.0,0.0,11.9,252.83,24.2,9.9,60.1,50.06,"Rain, Partially cloudy",Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1667,1667,Tennessee,Davidson,Nashville,2020-12-30,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),8.7,36.0,...,0.0,0.0,23.8,170.63,38.1,9.3,68.3,63.09,"Rain, Partially cloudy",Good
1668,1668,Tennessee,Sumner,Hendersonville,2020-12-30,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),7.3,30.0,...,0.0,0.0,22.1,185.75,36.1,9.3,67.1,64.16,"Rain, Partially cloudy",Good
1669,1669,Tennessee,Sumner,Hendersonville,2020-12-31,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),1.0,4.0,...,0.0,0.0,16.8,127.13,21.8,7.7,97.0,91.89,"Rain, Overcast",Good
1670,1670,Tennessee,Davidson,Nashville,2020-12-31,PM2.5 - Local Conditions,24-HR BLK AVG,Micrograms/cubic meter (LC),2.2,9.0,...,0.0,0.0,17.4,128.83,25.0,7.6,98.8,90.33,"Rain, Overcast",Good


In [84]:
aqi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1672 entries, 0 to 1671
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   Unnamed: 0           1672 non-null   int64   
 1   state                1672 non-null   object  
 2   county               1672 non-null   object  
 3   city                 1672 non-null   object  
 4   date_local           1672 non-null   object  
 5   parameter            1672 non-null   object  
 6   sample_duration      1672 non-null   object  
 7   units_of_measure     1672 non-null   object  
 8   arithmetic_mean      1672 non-null   float64 
 9   aqi                  1672 non-null   float64 
 10  City                 1672 non-null   object  
 11  State                1672 non-null   object  
 12  Country              1672 non-null   object  
 13  Date time            1672 non-null   object  
 14  Maximum Temperature  1672 non-null   float64 
 15  Minimum Temperature  

In [78]:
aqi_df = aqi_df.drop(["Unnamed: 0", "state", "county", "city", "date_local", "parameter", "sample_duration", "units_of_measure", "City", "State", "Country", "Date time"],axis = 1)
aqi_df

KeyError: "['Unnamed: 0' 'state' 'county' 'city' 'date_local' 'parameter'\n 'sample_duration' 'units_of_measure' 'City' 'State' 'Country' 'Date time'] not found in axis"

Unnamed: 0,arithmetic_mean,aqi,Temperature,Visibility,Cloud Cover,Relative Humidity,Conditions,AQI_Range
0,6.2,26.0,13.3,9.9,32.2,58.99,Partially cloudy,Good
1,5.1,21.0,13.6,9.9,32.1,57.18,Partially cloudy,Good
2,3.8,16.0,14.6,9.9,24.8,50.71,Clear,Good
3,5.6,23.0,13.9,9.9,18.0,52.93,Clear,Good
4,5.3,22.0,24.5,9.9,60.1,50.06,"Rain, Partially cloudy",Good
...,...,...,...,...,...,...,...,...
1667,8.7,36.0,56.0,9.3,68.3,63.09,"Rain, Partially cloudy",Good
1668,7.3,30.0,55.5,9.3,67.1,64.16,"Rain, Partially cloudy",Good
1669,1.0,4.0,41.0,7.7,97.0,91.89,"Rain, Overcast",Good
1670,2.2,9.0,41.6,7.6,98.8,90.33,"Rain, Overcast",Good
