In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and 3.8 removed
# the old 'seaborn' alias, so prefer the new name and fall back only on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')


In [3]:
# Load only the three columns this analysis needs. They are selected by header name
# rather than by position so the read cannot silently pick up a different column if
# the CSV layout ever changes (a missing name raises instead).
city_hour_df = pd.read_csv('../data/comparison_city_hour.csv', usecols=['City', 'Datetime', 'AQI'])
# Summarise dtypes and non-null counts to see how many AQI readings are missing.
city_hour_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 707875 entries, 0 to 707874
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   City      707875 non-null  object 
 1   Datetime  707875 non-null  object 
 2   AQI       578795 non-null  float64
dtypes: float64(1), object(2)
memory usage: 16.2+ MB


In [4]:
# Readings without an AQI value cannot contribute to the averages, and imputing
# them with any substitute value could bias the results, so they are discarded.
city_hour_df = city_hour_df.dropna(subset=['AQI']).reset_index(drop=True)
city_hour_df

Unnamed: 0,City,Datetime,AQI
0,Ahmedabad,2015-01-29 09:00:00,288.0
1,Ahmedabad,2015-01-29 10:00:00,181.0
2,Ahmedabad,2015-01-29 11:00:00,182.0
3,Ahmedabad,2015-01-29 12:00:00,182.0
4,Ahmedabad,2015-01-29 13:00:00,179.0
...,...,...,...
578790,Visakhapatnam,2020-06-30 20:00:00,51.0
578791,Visakhapatnam,2020-06-30 21:00:00,51.0
578792,Visakhapatnam,2020-06-30 22:00:00,50.0
578793,Visakhapatnam,2020-06-30 23:00:00,50.0


In [5]:
# The analysis is per hour of day, so the full timestamp is reduced to its
# time-of-day component and the original 'Datetime' column is discarded.
city_hour_df = (
    city_hour_df
    .drop(columns=['Datetime'])
    .assign(Time=pd.to_datetime(city_hour_df['Datetime']).dt.time)
)
city_hour_df

Unnamed: 0,City,AQI,Time
0,Ahmedabad,288.0,09:00:00
1,Ahmedabad,181.0,10:00:00
2,Ahmedabad,182.0,11:00:00
3,Ahmedabad,182.0,12:00:00
4,Ahmedabad,179.0,13:00:00
...,...,...,...
578790,Visakhapatnam,51.0,20:00:00
578791,Visakhapatnam,51.0,21:00:00
578792,Visakhapatnam,50.0,22:00:00
578793,Visakhapatnam,50.0,23:00:00


In [6]:
# Mean AQI for each hour of the day, taken over the readings of all 26 cities together.
city_hour_avg = city_hour_df.groupby('Time', as_index=False)['AQI'].mean()
city_hour_avg

Unnamed: 0,Time,AQI
0,00:00:00,176.218518
1,01:00:00,175.797356
2,02:00:00,175.16512
3,03:00:00,175.306664
4,04:00:00,174.593521
5,05:00:00,172.429719
6,06:00:00,169.036355
7,07:00:00,165.273535
8,08:00:00,162.214873
9,09:00:00,159.361069


In [7]:
# Line chart (with point markers) of the hourly mean AQI held in city_hour_avg.
fig = px.line(
    data_frame=city_hour_avg,
    x='Time',
    y='AQI',
    markers=True,
    title='Average AQI for each hour of the day',
)
fig.show()

In [8]:
# Mean AQI for every (city, hour-of-day) pair, ordered by city and then by hour.
city_wise = (
    city_hour_df
    .groupby(['City', 'Time'], as_index=False)['AQI']
    .mean()
    .sort_values(by=['City', 'Time'])
)
city_wise

Unnamed: 0,City,Time,AQI
0,Ahmedabad,00:00:00,641.205975
1,Ahmedabad,01:00:00,649.441732
2,Ahmedabad,02:00:00,650.274725
3,Ahmedabad,03:00:00,652.148322
4,Ahmedabad,04:00:00,646.931518
...,...,...,...
619,Visakhapatnam,19:00:00,116.795826
620,Visakhapatnam,20:00:00,116.870000
621,Visakhapatnam,21:00:00,116.868062
622,Visakhapatnam,22:00:00,116.753874


In [9]:
# Draw one hourly-AQI line chart per city.
# groupby hands back each city's rows in a single pass instead of re-filtering the
# whole frame once per city; sort=False keeps the cities in their existing row order.
for city, this_city in city_wise.groupby('City', sort=False):
    # Title typo fixed: it previously read "Hourly Analysis is of AQI of ...".
    fig = px.line(this_city, x='Time', y='AQI', height=500, title='Hourly Analysis of AQI of ' + city)
    fig.show()

In [10]:
# Keep the readings of the 25 cities other than Ahmedabad.
# NOTE(review): presumably Ahmedabad is excluded as an outlier - its per-city hourly
# means shown earlier (~640-650) are far above the pooled hourly means (~160-176).
city_hour_no_ahmedabad_df = (
    city_hour_df
    .loc[lambda frame: frame['City'].ne('Ahmedabad')]
    .reset_index(drop=True)
)
city_hour_no_ahmedabad_df

Unnamed: 0,City,AQI,Time
0,Aizawl,54.0,00:00:00
1,Aizawl,53.0,01:00:00
2,Aizawl,53.0,02:00:00
3,Aizawl,53.0,03:00:00
4,Aizawl,53.0,04:00:00
...,...,...,...
547869,Visakhapatnam,51.0,20:00:00
547870,Visakhapatnam,51.0,21:00:00
547871,Visakhapatnam,50.0,22:00:00
547872,Visakhapatnam,50.0,23:00:00


In [11]:
# Mean AQI for each hour of the day over the 25 cities left after excluding Ahmedabad.
# NOTE: this rebinds `city_hour_avg`, replacing the all-city averages computed earlier.
city_hour_avg = city_hour_no_ahmedabad_df.groupby('Time', as_index=False)['AQI'].mean()
city_hour_avg

Unnamed: 0,Time,AQI
0,00:00:00,150.155944
1,01:00:00,149.307645
2,02:00:00,148.560151
3,03:00:00,148.520391
4,04:00:00,147.998379
5,05:00:00,147.406175
6,06:00:00,146.920465
7,07:00:00,146.632055
8,08:00:00,146.544728
9,09:00:00,146.733561


In [12]:
# Line chart (with point markers) of the hourly mean AQI currently held in city_hour_avg.
fig = px.line(
    data_frame=city_hour_avg,
    x='Time',
    y='AQI',
    markers=True,
    title='Average AQI for each hour of the day of 25 cities',
)
fig.show()