In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

import aqi_calculator as aqic

plt.style.use('seaborn')

In [2]:
city_df = pd.read_csv('../data/comparison_city_day.csv')
city_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39047 entries, 0 to 39046
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        39047 non-null  object 
 1   Date        39047 non-null  object 
 2   PM2.5       33649 non-null  float64
 3   PM10        27443 non-null  float64
 4   NO          34764 non-null  float64
 5   NO2         34912 non-null  float64
 6   NOx         34128 non-null  float64
 7   NH3         27752 non-null  float64
 8   CO          36398 non-null  float64
 9   SO2         34751 non-null  float64
 10  O3          34157 non-null  float64
 11  Benzene     30407 non-null  float64
 12  Toluene     27321 non-null  float64
 13  AQI         33830 non-null  float64
 14  AQI_Bucket  33830 non-null  object 
dtypes: float64(12), object(3)
memory usage: 4.5+ MB


In [3]:
city_max_df = city_df.drop(columns=["AQI", "AQI_Bucket"])

# This gives us the particle for which there is max concentration in each city for each day
city_max_df["Max_particle"] = city_max_df[['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene']].idxmax(axis=1)
city_max_df.Max_particle.value_counts()

PM10       24070
PM2.5       5277
O3          2348
NOx         1985
NH3         1309
NO2          691
CO           582
SO2          364
Benzene      241
NO           223
Toluene      108
Name: Max_particle, dtype: int64

In [4]:
# Calculating the 24 hour average and 8 hour maximum for finding subindex
city_subindex_df = city_df
city_subindex_df["PM10_24hr_avg"] = city_subindex_df.groupby("City")["PM10"].rolling(window = 24, min_periods = 16).mean().values
city_subindex_df["PM2.5_24hr_avg"] = city_subindex_df.groupby("City")["PM2.5"].rolling(window = 24, min_periods = 16).mean().values
city_subindex_df["SO2_24hr_avg"] = city_subindex_df.groupby("City")["SO2"].rolling(window = 24, min_periods = 16).mean().values
city_subindex_df["NOx_24hr_avg"] = city_subindex_df.groupby("City")["NOx"].rolling(window = 24, min_periods = 16).mean().values
city_subindex_df["NH3_24hr_avg"] = city_subindex_df.groupby("City")["NH3"].rolling(window = 24, min_periods = 16).mean().values
city_subindex_df["CO_8hr_max"] = city_subindex_df.groupby("City")["CO"].rolling(window = 8, min_periods = 1).mean().values
city_subindex_df["O3_8hr_max"] = city_subindex_df.groupby("City")["O3"].rolling(window = 8, min_periods = 1).mean().values

city_subindex_df = city_subindex_df.drop(columns = ["PM10", "PM2.5", "SO2", "NOx", "NH3", "CO", "O3", "NO", "NO2", "Benzene", "Toluene", "AQI", "AQI_Bucket"])
city_subindex_df

Unnamed: 0,City,Date,PM10_24hr_avg,PM2.5_24hr_avg,SO2_24hr_avg,NOx_24hr_avg,NH3_24hr_avg,CO_8hr_max,O3_8hr_max
0,Ahmedabad,2015-01-01,,,,,,0.92000,133.36000
1,Ahmedabad,2015-01-02,,,,,,0.94500,83.71000
2,Ahmedabad,2015-01-03,,,,,,6.43000,66.04000
3,Ahmedabad,2015-01-04,,,,,,5.24750,58.55000
4,Ahmedabad,2015-01-05,,,,,,8.61800,54.70200
...,...,...,...,...,...,...,...,...,...
39042,Visakhapatnam,2021-06-27,99.677917,32.797083,9.484583,30.636667,18.128750,0.62875,8.34500
39043,Visakhapatnam,2021-06-28,98.625000,32.306667,9.358750,30.405833,18.336667,0.62000,8.57750
39044,Visakhapatnam,2021-06-29,99.997083,32.862083,9.297917,30.512917,18.585417,0.60875,8.34875
39045,Visakhapatnam,2021-06-30,102.282917,33.646250,9.118333,30.699167,18.791667,0.62125,8.29500


In [5]:
# Calculating subindexes.
city_subindex_df["PM2.5_subindex"] = city_subindex_df["PM2.5_24hr_avg"].apply(lambda x: aqic.PM25_subindex(x))
city_subindex_df["PM10_subindex"] = city_subindex_df["PM10_24hr_avg"].apply(lambda x: aqic.PM10_subindex(x))
city_subindex_df["SO2_subindex"] = city_subindex_df["SO2_24hr_avg"].apply(lambda x: aqic.SO2_subindex(x))
city_subindex_df["NOx_subindex"] = city_subindex_df["NOx_24hr_avg"].apply(lambda x: aqic.NOx_subindex(x))
city_subindex_df["NH3_subindex"] = city_subindex_df["NH3_24hr_avg"].apply(lambda x: aqic.NH3_subindex(x))
city_subindex_df["CO_subindex"] = city_subindex_df["CO_8hr_max"].apply(lambda x: aqic.CO_subindex(x))
city_subindex_df["O3_subindex"] = city_subindex_df["O3_8hr_max"].apply(lambda x: aqic.O3_subindex(x))

city_subindex_df = city_subindex_df.drop(columns=["PM2.5_24hr_avg", "PM10_24hr_avg", "SO2_24hr_avg", "NOx_24hr_avg", "NH3_24hr_avg", "CO_8hr_max", "O3_8hr_max"])
city_subindex_df

Unnamed: 0,City,Date,PM2.5_subindex,PM10_subindex,SO2_subindex,NOx_subindex,NH3_subindex,CO_subindex,O3_subindex
0,Ahmedabad,2015-01-01,0.000000,0.000000,0.000000,0.000000,0.000000,46.00000,149.058824
1,Ahmedabad,2015-01-02,0.000000,0.000000,0.000000,0.000000,0.000000,47.25000,83.710000
2,Ahmedabad,2015-01-03,0.000000,0.000000,0.000000,0.000000,0.000000,155.37500,66.040000
3,Ahmedabad,2015-01-04,0.000000,0.000000,0.000000,0.000000,0.000000,140.59375,58.550000
4,Ahmedabad,2015-01-05,0.000000,0.000000,0.000000,0.000000,0.000000,182.72500,54.702000
...,...,...,...,...,...,...,...,...,...
39042,Visakhapatnam,2021-06-27,54.661806,99.677917,11.855729,38.295833,4.532188,31.43750,8.345000
39043,Visakhapatnam,2021-06-28,53.844444,98.625000,11.698437,38.007292,4.584167,31.00000,8.577500
39044,Visakhapatnam,2021-06-29,54.770139,99.997083,11.622396,38.141146,4.646354,30.43750,8.348750
39045,Visakhapatnam,2021-06-30,56.077083,101.521944,11.397917,38.373958,4.697917,31.06250,8.295000


In [6]:
# This gives us the number of dates and cities for each particle when its concentration (or subindex to be precise) was the highest.
city_subindex_df["Max_particle_subindex"] = city_subindex_df[['PM2.5_subindex', 'PM10_subindex', 'SO2_subindex', 'NOx_subindex', 'NH3_subindex', 'CO_subindex', 'O3_subindex']].idxmax(axis=1)
city_subindex_df.Max_particle_subindex.value_counts()

PM10_subindex     15780
PM2.5_subindex    14957
CO_subindex        5530
NOx_subindex       1505
O3_subindex        1232
SO2_subindex         41
NH3_subindex          2
Name: Max_particle_subindex, dtype: int64

In [7]:
# This is done so that the dates for which the data is not present is not included in the average.
city_subindex_df = city_subindex_df.replace(0, np.NaN)
city_subindex_df

Unnamed: 0,City,Date,PM2.5_subindex,PM10_subindex,SO2_subindex,NOx_subindex,NH3_subindex,CO_subindex,O3_subindex,Max_particle_subindex
0,Ahmedabad,2015-01-01,,,,,,46.00000,149.058824,O3_subindex
1,Ahmedabad,2015-01-02,,,,,,47.25000,83.710000,O3_subindex
2,Ahmedabad,2015-01-03,,,,,,155.37500,66.040000,CO_subindex
3,Ahmedabad,2015-01-04,,,,,,140.59375,58.550000,CO_subindex
4,Ahmedabad,2015-01-05,,,,,,182.72500,54.702000,CO_subindex
...,...,...,...,...,...,...,...,...,...,...
39042,Visakhapatnam,2021-06-27,54.661806,99.677917,11.855729,38.295833,4.532188,31.43750,8.345000,PM10_subindex
39043,Visakhapatnam,2021-06-28,53.844444,98.625000,11.698437,38.007292,4.584167,31.00000,8.577500,PM10_subindex
39044,Visakhapatnam,2021-06-29,54.770139,99.997083,11.622396,38.141146,4.646354,30.43750,8.348750,PM10_subindex
39045,Visakhapatnam,2021-06-30,56.077083,101.521944,11.397917,38.373958,4.697917,31.06250,8.295000,PM10_subindex


In [8]:
# This founds the average concentration for each particle for each city over all dates.
city_average_particle_df = city_subindex_df.groupby('City').mean().reset_index()
city_average_particle_df.columns = city_average_particle_df.columns.str.rstrip('_subindex')
city_average_particle_df

Unnamed: 0,City,PM2.5,PM10,SO2,NO,NH3,CO,O3
0,Ahmedabad,128.468878,113.710743,57.130584,56.955292,5.002766,251.468705,39.005262
1,Aizawl,33.536874,36.580698,11.534159,15.770856,5.57344,13.796269,4.794175
2,Amaravati,64.93477,69.824496,18.071335,17.023703,3.025559,30.311475,37.779837
3,Amritsar,101.398753,105.317588,11.039125,43.425751,4.339964,32.214346,22.892324
4,Bengaluru,59.013795,78.795204,7.082393,24.821189,4.919549,59.124247,32.342772
5,Bhopal,88.80594,106.985076,19.726153,25.031848,3.996324,44.122038,62.460692
6,Brajrajnagar,114.190638,103.964006,15.0591,29.610287,7.715336,67.358997,21.048313
7,Chandigarh,70.95666,87.845688,12.366598,21.917705,11.566563,29.036478,24.175383
8,Chennai,82.119196,60.479832,10.066303,21.37163,14.436969,46.783781,32.195856
9,Coimbatore,50.077443,38.500621,11.563998,33.563673,2.115408,39.809443,22.447313


In [9]:
fig = px.treemap(pd.melt(city_average_particle_df, id_vars = 'City'), path=['City','variable'], values=pd.melt(city_average_particle_df, id_vars = 'City')['value'], title = 'Cities and the proportion of pollution in each', width=1400, height=700)
fig.show()