In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
city_day_data = pd.read_csv('../data/comparison_city_day.csv').sort_values(by=['City', 'Date'])

In [42]:
city_df = city_day_data.copy()
city_df = city_df[['City', 'Date', 'AQI']]
city_df['Date'] = pd.to_datetime(city_df['Date'])
city_df

Unnamed: 0,City,Date,AQI
0,Ahmedabad,2015-01-01,
1,Ahmedabad,2015-01-02,
2,Ahmedabad,2015-01-03,
3,Ahmedabad,2015-01-04,
4,Ahmedabad,2015-01-05,
...,...,...,...
29526,Visakhapatnam,2020-06-27,41.0
29527,Visakhapatnam,2020-06-28,70.0
29528,Visakhapatnam,2020-06-29,68.0
29529,Visakhapatnam,2020-06-30,54.0


In [43]:
city_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29531 entries, 0 to 29530
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   City    29531 non-null  object        
 1   Date    29531 non-null  datetime64[ns]
 2   AQI     24850 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 922.8+ KB


In [44]:
# Having rows with null values of AQI won't help us. Setting them as zero would bias our findings
city_df = city_df[city_df['AQI'].notna()].reset_index()
city_df

Unnamed: 0,index,City,Date,AQI
0,28,Ahmedabad,2015-01-29,209.0
1,29,Ahmedabad,2015-01-30,328.0
2,30,Ahmedabad,2015-01-31,514.0
3,31,Ahmedabad,2015-02-01,782.0
4,32,Ahmedabad,2015-02-02,914.0
...,...,...,...,...
24845,29526,Visakhapatnam,2020-06-27,41.0
24846,29527,Visakhapatnam,2020-06-28,70.0
24847,29528,Visakhapatnam,2020-06-29,68.0
24848,29529,Visakhapatnam,2020-06-30,54.0


In [45]:
# Analysis would be easier if we look at things month wise
city_df['Month/Year'] = city_df['Date'].dt.strftime("%m/%Y")
city_df['Month'] = city_df['Month/Year'].apply(lambda x: x.split('/')[0])
city_df['Year'] = city_df['Month/Year'].apply(lambda x: x.split('/')[1])

city_df = city_df.drop(['Date'], axis=1)
city_df

Unnamed: 0,index,City,AQI,Month/Year,Month,Year
0,28,Ahmedabad,209.0,01/2015,01,2015
1,29,Ahmedabad,328.0,01/2015,01,2015
2,30,Ahmedabad,514.0,01/2015,01,2015
3,31,Ahmedabad,782.0,02/2015,02,2015
4,32,Ahmedabad,914.0,02/2015,02,2015
...,...,...,...,...,...,...
24845,29526,Visakhapatnam,41.0,06/2020,06,2020
24846,29527,Visakhapatnam,70.0,06/2020,06,2020
24847,29528,Visakhapatnam,68.0,06/2020,06,2020
24848,29529,Visakhapatnam,54.0,06/2020,06,2020


In [47]:
# Let us look at the mean of all 26 cities first
city_avg = city_df.groupby(['Month/Year', 'Month', 'Year']).agg({'AQI' : np.mean}).reset_index()
city_avg

Unnamed: 0,Month/Year,Month,Year,AQI
0,01/2015,01,2015,343.000000
1,01/2016,01,2016,280.555556
2,01/2017,01,2017,241.483471
3,01/2018,01,2018,257.365617
4,01/2019,01,2019,265.248201
...,...,...,...,...
62,12/2015,12,2015,251.338798
63,12/2016,12,2016,263.555556
64,12/2017,12,2017,228.321256
65,12/2018,12,2018,254.296367


In [52]:
city_avg = city_avg.sort_values(['Year', 'Month'])
# month_to_season_mapping = {
#     '12': 'Winter',
#     '01': 'Winter',
#     '02': 'Spring',
#     '03': 'Spring',
#     '04': 'Summer',
#     '05': 'Summer',
#     '06': 'Summer',
#     '07': 'Monsoon',
#     '08': 'Monsoon',
#     '09': 'Monsoon',
#     '10': 'Autumn',
#     '11': 'Autumn',
# }
# city_avg['Season'] = city_avg['Month'].apply(lambda x: month_to_season_mapping[x])
city_avg

Unnamed: 0,Month/Year,Month,Year,AQI,Season
0,01/2015,01,2015,343.000000,Winter
6,02/2015,02,2015,418.830189,Spring
12,03/2015,03,2015,298.160920,Spring
18,04/2015,04,2015,192.224852,Summer
24,05/2015,05,2015,193.178378,Summer
...,...,...,...,...,...
17,03/2020,03,2020,110.177267,Spring
23,04/2020,04,2020,86.718056,Summer
29,05/2020,05,2020,87.446292,Summer
35,06/2020,06,2020,76.214674,Summer


In [59]:
# Make a line plot of the above
fig = px.line(city_avg, x='Month/Year', y='AQI', height=700, title='Monthly Analysis of AQI')
fig.show()

In [63]:
# See if any city shows abnormality in the above trend
city_wise = city_df.groupby(['City','Month/Year', 'Month', 'Year']).agg({'AQI': np.mean}).reset_index()
city_wise = city_wise.sort_values(['Year', 'Month'])
city_wise

Unnamed: 0,City,Month/Year,Month,Year,AQI
0,Ahmedabad,01/2015,01,2015,350.333333
324,Delhi,01/2015,01,2015,342.290323
4,Ahmedabad,02/2015,02,2015,520.640000
330,Delhi,02/2015,02,2015,327.928571
8,Ahmedabad,03/2015,03,2015,418.571429
...,...,...,...,...,...
766,Patna,07/2020,07,2020,98.000000
794,Shillong,07/2020,07,2020,24.000000
818,Talcher,07/2020,07,2020,101.000000
851,Thiruvananthapuram,07/2020,07,2020,41.000000


In [65]:
for city in city_wise['City'].unique():
    this_city = city_wise[city_wise['City'] == city]
    fig = px.line(this_city, x='Month/Year', y='AQI', height=500, title='Monthly Analysis of AQI of ' + city)
    fig.show()