In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

plt.style.use('seaborn')

In [2]:
city_df = pd.read_csv('../data/comparison_city_day.csv')
city_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        29531 non-null  object 
 1   Date        29531 non-null  object 
 2   PM2.5       24933 non-null  float64
 3   PM10        18391 non-null  float64
 4   NO          25949 non-null  float64
 5   NO2         25946 non-null  float64
 6   NOx         25346 non-null  float64
 7   NH3         19203 non-null  float64
 8   CO          27472 non-null  float64
 9   SO2         25677 non-null  float64
 10  O3          25509 non-null  float64
 11  Benzene     23908 non-null  float64
 12  Toluene     21490 non-null  float64
 13  Xylene      11422 non-null  float64
 14  AQI         24850 non-null  float64
 15  AQI_Bucket  24850 non-null  object 
dtypes: float64(13), object(3)
memory usage: 3.6+ MB


In [3]:
city_max_df = city_df.drop(columns=["AQI", "AQI_Bucket"])

# This gives us the particle for which there is max concentration in each city for each day
city_max_df["Max_particle"] = city_max_df[['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene']].idxmax(axis=1)
city_max_df.Max_particle.value_counts()

PM10       16124
PM2.5       5234
O3          1870
NOx         1746
NH3         1083
NO2          654
CO           582
SO2          321
Benzene      225
NO           171
Toluene       90
Xylene         8
Name: Max_particle, dtype: int64

In [4]:
# Calculating the 24 hour average and 8 hour maximum for finding subindex
city_subindex_df = city_df
city_subindex_df["PM10_24hr_avg"] = city_subindex_df.groupby("City")["PM10"].rolling(window = 24, min_periods = 16).mean().values
city_subindex_df["PM2.5_24hr_avg"] = city_subindex_df.groupby("City")["PM2.5"].rolling(window = 24, min_periods = 16).mean().values
city_subindex_df["SO2_24hr_avg"] = city_subindex_df.groupby("City")["SO2"].rolling(window = 24, min_periods = 16).mean().values
city_subindex_df["NOx_24hr_avg"] = city_subindex_df.groupby("City")["NOx"].rolling(window = 24, min_periods = 16).mean().values
city_subindex_df["NH3_24hr_avg"] = city_subindex_df.groupby("City")["NH3"].rolling(window = 24, min_periods = 16).mean().values
city_subindex_df["CO_8hr_max"] = city_subindex_df.groupby("City")["CO"].rolling(window = 8, min_periods = 1).mean().values
city_subindex_df["O3_8hr_max"] = city_subindex_df.groupby("City")["O3"].rolling(window = 8, min_periods = 1).mean().values

city_subindex_df = city_subindex_df.drop(columns = ["PM10", "PM2.5", "SO2", "NOx", "NH3", "CO", "O3", "NO", "NO2", "Benzene", "Toluene", "Xylene", "AQI", "AQI_Bucket"])
city_subindex_df

Unnamed: 0,City,Date,PM10_24hr_avg,PM2.5_24hr_avg,SO2_24hr_avg,NOx_24hr_avg,NH3_24hr_avg,CO_8hr_max,O3_8hr_max
0,Ahmedabad,2015-01-01,,,,,,0.92000,133.36000
1,Ahmedabad,2015-01-02,,,,,,0.94500,83.71000
2,Ahmedabad,2015-01-03,,,,,,6.43000,66.04000
3,Ahmedabad,2015-01-04,,,,,,5.24750,58.55000
4,Ahmedabad,2015-01-05,,,,,,8.61800,54.70200
...,...,...,...,...,...,...,...,...,...
29526,Visakhapatnam,2020-06-27,83.888750,24.799583,8.877083,23.936667,10.276667,0.57750,33.25750
29527,Visakhapatnam,2020-06-28,83.765000,25.017917,9.107500,23.701250,10.408750,0.55000,31.77875
29528,Visakhapatnam,2020-06-29,81.815000,24.734583,9.015000,23.436250,10.417083,0.52125,30.86500
29529,Visakhapatnam,2020-06-30,80.773333,24.525417,9.132500,23.279583,10.475417,0.49500,30.04625


In [5]:
# Functions for calculating subindex for each particle
def PM25_subindex(x):
    if x <= 30:
        return x * 50 / 30
    elif x <= 60:
        return 50 + (x - 30) * 50 / 30
    elif x <= 90:
        return 100 + (x - 60) * 100 / 30
    elif x <= 120:
        return 200 + (x - 90) * 100 / 30
    elif x <= 250:
        return 300 + (x - 120) * 100 / 130
    elif x > 250:
        return 400 + (x - 250) * 100 / 130
    else:
        return 0

def PM10_subindex(x):
    if x <= 50:
        return x
    elif x <= 100:
        return x
    elif x <= 250:
        return 100 + (x - 100) * 100 / 150
    elif x <= 350:
        return 200 + (x - 250)
    elif x <= 430:
        return 300 + (x - 350) * 100 / 80
    elif x > 430:
        return 400 + (x - 430) * 100 / 80
    else:
        return 0

def SO2_subindex(x):
    if x <= 40:
        return x * 50 / 40
    elif x <= 80:
        return 50 + (x - 40) * 50 / 40
    elif x <= 380:
        return 100 + (x - 80) * 100 / 300
    elif x <= 800:
        return 200 + (x - 380) * 100 / 420
    elif x <= 1600:
        return 300 + (x - 800) * 100 / 800
    elif x > 1600:
        return 400 + (x - 1600) * 100 / 800
    else:
        return 0

def NOx_subindex(x):
    if x <= 40:
        return x * 50 / 40
    elif x <= 80:
        return 50 + (x - 40) * 50 / 40
    elif x <= 180:
        return 100 + (x - 80) * 100 / 100
    elif x <= 280:
        return 200 + (x - 180) * 100 / 100
    elif x <= 400:
        return 300 + (x - 280) * 100 / 120
    elif x > 400:
        return 400 + (x - 400) * 100 / 120
    else:
        return 0

def NH3_subindex(x):
    if x <= 200:
        return x * 50 / 200
    elif x <= 400:
        return 50 + (x - 200) * 50 / 200
    elif x <= 800:
        return 100 + (x - 400) * 100 / 400
    elif x <= 1200:
        return 200 + (x - 800) * 100 / 400
    elif x <= 1800:
        return 300 + (x - 1200) * 100 / 600
    elif x > 1800:
        return 400 + (x - 1800) * 100 / 600
    else:
        return 0

def CO_subindex(x):
    if x <= 1:
        return x * 50 / 1
    elif x <= 2:
        return 50 + (x - 1) * 50 / 1
    elif x <= 10:
        return 100 + (x - 2) * 100 / 8
    elif x <= 17:
        return 200 + (x - 10) * 100 / 7
    elif x <= 34:
        return 300 + (x - 17) * 100 / 17
    elif x > 34:
        return 400 + (x - 34) * 100 / 17
    else:
        return 0

def O3_subindex(x):
    if x <= 50:
        return x * 50 / 50
    elif x <= 100:
        return 50 + (x - 50) * 50 / 50
    elif x <= 168:
        return 100 + (x - 100) * 100 / 68
    elif x <= 208:
        return 200 + (x - 168) * 100 / 40
    elif x <= 748:
        return 300 + (x - 208) * 100 / 539
    elif x > 748:
        return 400 + (x - 400) * 100 / 539
    else:
        return 0

In [6]:
# Calculating subindexes.
city_subindex_df["PM2.5_subindex"] = city_subindex_df["PM2.5_24hr_avg"].apply(lambda x: PM25_subindex(x))
city_subindex_df["PM10_subindex"] = city_subindex_df["PM10_24hr_avg"].apply(lambda x: PM10_subindex(x))
city_subindex_df["SO2_subindex"] = city_subindex_df["SO2_24hr_avg"].apply(lambda x: SO2_subindex(x))
city_subindex_df["NOx_subindex"] = city_subindex_df["NOx_24hr_avg"].apply(lambda x: NOx_subindex(x))
city_subindex_df["NH3_subindex"] = city_subindex_df["NH3_24hr_avg"].apply(lambda x: NH3_subindex(x))
city_subindex_df["CO_subindex"] = city_subindex_df["CO_8hr_max"].apply(lambda x: CO_subindex(x))
city_subindex_df["O3_subindex"] = city_subindex_df["O3_8hr_max"].apply(lambda x: O3_subindex(x))

city_subindex_df = city_subindex_df.drop(columns=["PM2.5_24hr_avg", "PM10_24hr_avg", "SO2_24hr_avg", "NOx_24hr_avg", "NH3_24hr_avg", "CO_8hr_max", "O3_8hr_max"])
city_subindex_df

Unnamed: 0,City,Date,PM2.5_subindex,PM10_subindex,SO2_subindex,NOx_subindex,NH3_subindex,CO_subindex,O3_subindex
0,Ahmedabad,2015-01-01,0.000000,0.000000,0.000000,0.000000,0.000000,46.00000,149.058824
1,Ahmedabad,2015-01-02,0.000000,0.000000,0.000000,0.000000,0.000000,47.25000,83.710000
2,Ahmedabad,2015-01-03,0.000000,0.000000,0.000000,0.000000,0.000000,155.37500,66.040000
3,Ahmedabad,2015-01-04,0.000000,0.000000,0.000000,0.000000,0.000000,140.59375,58.550000
4,Ahmedabad,2015-01-05,0.000000,0.000000,0.000000,0.000000,0.000000,182.72500,54.702000
...,...,...,...,...,...,...,...,...,...
29526,Visakhapatnam,2020-06-27,41.332639,83.888750,11.096354,29.920833,2.569167,28.87500,33.257500
29527,Visakhapatnam,2020-06-28,41.696528,83.765000,11.384375,29.626562,2.602187,27.50000,31.778750
29528,Visakhapatnam,2020-06-29,41.224306,81.815000,11.268750,29.295313,2.604271,26.06250,30.865000
29529,Visakhapatnam,2020-06-30,40.875694,80.773333,11.415625,29.099479,2.618854,24.75000,30.046250


In [7]:
# This gives us the number of dates and cities for each particle when its concentration (or subindex to be precise) was the highest.
city_subindex_df["Max_particle_subindex"] = city_subindex_df[['PM2.5_subindex', 'PM10_subindex', 'SO2_subindex', 'NOx_subindex', 'NH3_subindex', 'CO_subindex', 'O3_subindex']].idxmax(axis=1)
city_subindex_df.Max_particle_subindex.value_counts()

PM2.5_subindex    12302
PM10_subindex     10305
CO_subindex        4750
NOx_subindex       1278
O3_subindex         887
SO2_subindex          7
NH3_subindex          2
Name: Max_particle_subindex, dtype: int64

In [8]:
# This is done so that the dates for which the data is not present is not included in the average.
city_subindex_df = city_subindex_df.replace(0, np.NaN)
city_subindex_df

Unnamed: 0,City,Date,PM2.5_subindex,PM10_subindex,SO2_subindex,NOx_subindex,NH3_subindex,CO_subindex,O3_subindex,Max_particle_subindex
0,Ahmedabad,2015-01-01,,,,,,46.00000,149.058824,O3_subindex
1,Ahmedabad,2015-01-02,,,,,,47.25000,83.710000,O3_subindex
2,Ahmedabad,2015-01-03,,,,,,155.37500,66.040000,CO_subindex
3,Ahmedabad,2015-01-04,,,,,,140.59375,58.550000,CO_subindex
4,Ahmedabad,2015-01-05,,,,,,182.72500,54.702000,CO_subindex
...,...,...,...,...,...,...,...,...,...,...
29526,Visakhapatnam,2020-06-27,41.332639,83.888750,11.096354,29.920833,2.569167,28.87500,33.257500,PM10_subindex
29527,Visakhapatnam,2020-06-28,41.696528,83.765000,11.384375,29.626562,2.602187,27.50000,31.778750,PM10_subindex
29528,Visakhapatnam,2020-06-29,41.224306,81.815000,11.268750,29.295313,2.604271,26.06250,30.865000,PM10_subindex
29529,Visakhapatnam,2020-06-30,40.875694,80.773333,11.415625,29.099479,2.618854,24.75000,30.046250,PM10_subindex


In [9]:
# This founds the average concentration for each particle for each city over all dates.
city_average_particle_df = city_subindex_df.groupby('City').mean().reset_index()
city_average_particle_df.columns = city_average_particle_df.columns.str.rstrip('_subindex')
city_average_particle_df

Unnamed: 0,City,PM2.5,PM10,SO2,NO,NH3,CO,O3
0,Ahmedabad,136.935873,108.653549,65.011997,58.84131,,300.74248,38.714193
1,Aizawl,29.911447,24.00208,10.011316,15.499584,5.450005,14.98406,3.62909
2,Amaravati,69.061587,74.95911,17.746205,19.479023,3.001702,31.881565,38.41683
3,Amritsar,103.990658,106.30566,10.120169,44.751654,3.605634,33.345575,22.548262
4,Bengaluru,60.738522,81.771561,6.879077,24.649548,5.465053,63.810767,33.005703
5,Bhopal,100.813526,113.815867,16.5096,29.266535,4.739708,43.947959,59.621734
6,Brajrajnagar,124.290688,110.093918,12.304882,29.905415,9.373907,74.951612,17.086066
7,Chandigarh,77.011131,84.165376,12.655348,19.566363,7.818767,31.781054,20.013563
8,Chennai,88.19818,61.450951,9.832248,22.077443,15.912975,48.308304,32.31498
9,Coimbatore,49.109768,38.372411,10.812434,39.148939,2.696128,47.696617,28.943355


In [10]:
fig = px.treemap(pd.melt(city_average_particle_df, id_vars = 'City'), path=['City','variable'], values=pd.melt(city_average_particle_df, id_vars = 'City')['value'], title = 'Cities and the proportion of pollution in each', width=1400, height=700)
fig.show()