In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px


In [2]:
# Load the raw daily per-city air-quality records; this frame is kept untouched
# so later cells can always derive from the original data.
city_day_data = pd.read_csv(filepath_or_buffer='../data/comparison_city_day.csv')

In [3]:
# Build the working frame from the raw data in a single chain (raw frame is not modified):
#   1. parse 'Date' into datetimes,
#   2. keep only rows that have an AQI value,
#   3. order chronologically, then by city, and renumber the index from 0.
city_df = (
    city_day_data
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .dropna(subset=['AQI'])
    .sort_values(by=['Date', 'City'])
    .reset_index(drop=True)
)
city_df.head(5)

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,AQI,AQI_Bucket
0,Delhi,2015-01-01,313.22,607.98,69.16,36.39,110.59,33.85,15.2,9.25,41.68,14.36,24.86,472.0,Severe
1,Delhi,2015-01-02,186.18,269.55,62.09,32.87,88.14,31.83,9.54,6.65,29.97,10.55,20.09,454.0,Severe
2,Delhi,2015-01-03,87.18,131.9,25.73,30.31,47.95,69.55,10.61,2.65,19.71,3.91,10.23,143.0,Moderate
3,Delhi,2015-01-04,151.84,241.84,25.01,36.91,48.62,130.36,11.54,4.63,25.36,4.26,9.71,319.0,Very Poor
4,Delhi,2015-01-05,146.6,219.13,14.01,34.92,38.25,122.88,9.2,3.33,23.2,2.8,6.21,325.0,Very Poor


In [4]:
# The full correlation matrix of the data is as follows.
# Only numeric columns are correlated: City, Date and AQI_Bucket are excluded explicitly,
# because since pandas 2.0 DataFrame.corr() no longer drops non-numeric columns on its own
# and raises on string data.
city_df.select_dtypes(include='number').corr()
# Here we observe that almost all pairs of columns have a positive correlation.
# NOx has a very high correlation with NO and NO2, something which is expected.

# PM2.5 and PM10 have a high correlation. PM2.5 is the subset of PM10 particles
# that have aerodynamic diameters less than or equal to 2.5 μm.

# Benzene and toluene also have a high correlation. This is due to the fact that
# both these chemicals are very similar physically and chemically and thus are
# expected to exist under similar conditions.

# Apart from these, no other pair of pollutants has a particularly high correlation.


Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,AQI
PM2.5,1.0,0.871494,0.469176,0.406199,0.475995,0.31653,0.108662,0.147238,0.181193,0.027294,0.140691,0.702553
PM10,0.871494,1.0,0.519079,0.523979,0.556444,0.394602,0.13269,0.253499,0.258167,0.026689,0.197992,0.84401
NO,0.469176,0.519079,1.0,0.500006,0.819155,0.220422,0.209814,0.183167,0.005071,0.041333,0.168642,0.462666
NO2,0.406199,0.523979,0.500006,1.0,0.664503,0.29958,0.347848,0.399479,0.296289,0.027447,0.261889,0.548254
NOx,0.475995,0.556444,0.819155,0.664503,1.0,0.228318,0.23868,0.255367,0.097633,0.043805,0.208113,0.503932
NH3,0.31653,0.394602,0.220422,0.29958,0.228318,1.0,0.121293,-0.024005,0.109037,-0.00261,0.034683,0.294658
CO,0.108662,0.13269,0.209814,0.347848,0.23868,0.121293,1.0,0.495671,0.051548,0.043718,0.237389,0.654849
SO2,0.147238,0.253499,0.183167,0.399479,0.255367,-0.024005,0.495671,1.0,0.182535,0.031671,0.271102,0.465364
O3,0.181193,0.258167,0.005071,0.296289,0.097633,0.109037,0.051548,0.182535,1.0,0.013854,0.108078,0.217084
Benzene,0.027294,0.026689,0.041333,0.027447,0.043805,-0.00261,0.043718,0.031671,0.013854,1.0,0.774409,0.043167


In [5]:
# Let's see the correlation of each numeric column with AQI specifically.
# Numeric columns are selected explicitly so this also runs on pandas >= 2.0,
# where DataFrame.corr() raises on the string columns instead of skipping them.
city_df.select_dtypes(include='number').corr()['AQI']


PM2.5      0.702553
PM10       0.844010
NO         0.462666
NO2        0.548254
NOx        0.503932
NH3        0.294658
CO         0.654849
SO2        0.465364
O3         0.217084
Benzene    0.043167
Toluene    0.257159
AQI        1.000000
Name: AQI, dtype: float64