In [1]:
import glob                        # working with OS pathnames
import os

import kagglehub
import pandas as pd

### Data

In [2]:
# Fetch the most recent published version of the Kaggle dataset.
# `path` is the location that the following cells read the CSV files from.
path = kagglehub.dataset_download(
    "abhisheksjha/time-series-air-quality-data-of-india-2010-2023"
)

In [3]:
# Load the station metadata table and keep it without the
# agency, station_location and start_month columns.
df_states = (
    pd.read_csv(f'{path}/stations_info.csv')
    .drop(columns=['agency', 'station_location', 'start_month'])
)
df_states.head()

Unnamed: 0,file_name,state,city,start_month_num,start_year
0,AP001,Andhra Pradesh,Tirupati,7,2016
1,AP002,Andhra Pradesh,Vijayawada,5,2017
2,AP003,Andhra Pradesh,Visakhapatnam,7,2017
3,AP004,Andhra Pradesh,Rajamahendravaram,9,2017
4,AP005,Andhra Pradesh,Amaravati,11,2017


In [4]:
# Distinct values of the `state` column of the station metadata.
unique_states = pd.unique(df_states['state'])
unique_states

array(['Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar',
       'Chhattisgarh', 'Chandigarh', 'Delhi', 'Gujarat',
       'Himachal Pradesh', 'Haryana', 'Jharkhand', 'Jammu and Kashmir',
       'Karnataka', 'Kerala', 'Maharashtra', 'Meghalaya', 'Manipur',
       'Madhya Pradesh', 'Mizoram', 'Nagaland', 'Odisha', 'Punjab',
       'Puducherry', 'Rajasthan', 'Sikkim', 'Telangana', 'Tamil Nadu',
       'Tripura', 'Uttarakhand', 'Uttar Pradesh', 'West Bengal'],
      dtype=object)

In [5]:
# Metadata rows of every station whose city is Ghaziabad.
df_states.loc[df_states['city'].eq('Ghaziabad')]

Unnamed: 0,file_name,state,city,start_month_num,start_year
390,UP009,Uttar Pradesh,Ghaziabad,9,2017
398,UP017,Uttar Pradesh,Ghaziabad,2,2019
399,UP018,Uttar Pradesh,Ghaziabad,2,2019
400,UP019,Uttar Pradesh,Ghaziabad,2,2019


In [6]:
def combine_city_df(state_name):
    '''
    Combine all station files of one state into a single dataframe,
    attaching each station's city as a ``city`` column.

    The state is resolved through the notebook-level ``df_states`` table:
    ``state_name`` is matched against the ``city`` column first (so passing
    a city such as 'Ghaziabad' selects every station of that city's state),
    and against the ``state`` column if no city matches. The first two
    characters of the matching ``file_name`` are used as the state code, and
    every ``<state code>*.csv`` file under the notebook-level ``path`` is read.

    Parameters
    ----------
        state_name (str): The name of a city or of a state listed in ``df_states``

    Return
    ------
        df (DataFrame): The combined dataframe from all files of a specific state.
            Rows are concatenated as-is, so each file keeps its own row index.

    Raises
    ------
        ValueError: If ``state_name`` matches neither a city nor a state, or if
            no station file is found for the resolved state code.
    '''

    # Resolve the name: city first (the original behaviour), then state.
    matches = df_states[df_states['city'] == state_name]
    if matches.empty:
        matches = df_states[df_states['state'] == state_name]
    if matches.empty:
        raise ValueError(f'No city or state named {state_name!r} in the station metadata.')

    state_code = matches['file_name'].iloc[0][:2]

    # Build the pattern with os.path so it works with either path separator;
    # escape the directory so glob metacharacters in it are taken literally.
    pattern = os.path.join(glob.escape(str(path)), f'{state_code}*.csv')
    # glob order depends on the filesystem; sort so the row order is reproducible.
    state_files = sorted(glob.glob(pattern))
    print(f'Combining a total of {len(state_files)} files...\n')

    if not state_files:
        raise ValueError(f'No files matching {state_code}*.csv were found in {path!r}.')

    # One lookup table instead of re-filtering df_states for every file.
    # drop_duplicates keeps the first row per station, like `.values[0]` did.
    city_by_station = df_states.drop_duplicates('file_name').set_index('file_name')['city']

    combined_df = []

    for state_file in state_files:
        # Station code = file name without directory and extension.
        file_name = os.path.splitext(os.path.basename(state_file))[0]
        print(f'State file : {file_name}')
        file_df = pd.read_csv(state_file)
        file_df['city'] = city_by_station[file_name]
        file_df['city'] = file_df['city'].astype('string')
        combined_df.append(file_df)

    return pd.concat(combined_df)

In [7]:
# 'Ghaziabad' only identifies the state: the helper loads every station
# file that shares the state code of that city.
df = combine_city_df(state_name='Ghaziabad')
df.info()

Combining a total of 57 files...

State file : UP001
State file : UP002
State file : UP003
State file : UP004
State file : UP005
State file : UP006
State file : UP007
State file : UP008
State file : UP009
State file : UP010
State file : UP011
State file : UP012
State file : UP013
State file : UP014
State file : UP015
State file : UP016
State file : UP017
State file : UP018
State file : UP019
State file : UP020
State file : UP021
State file : UP022
State file : UP023
State file : UP024
State file : UP025
State file : UP026
State file : UP027
State file : UP028
State file : UP029
State file : UP030
State file : UP031
State file : UP032
State file : UP033
State file : UP034
State file : UP035
State file : UP036
State file : UP037
State file : UP038
State file : UP039
State file : UP040
State file : UP041
State file : UP042
State file : UP043
State file : UP044
State file : UP045
State file : UP046
State file : UP047
State file : UP048
State file : UP049
State file : UP050
State file : UP0

In [8]:
# Remove the column display limit so wide frames are shown in full.
pd.options.display.max_columns = None

# Summary statistics for every column, numeric or not.
df.describe(include='all')

Unnamed: 0,From Date,To Date,PM2.5 (ug/m3),PM10 (ug/m3),NO (ug/m3),NO2 (ug/m3),NOx (ppb),NH3 (ug/m3),SO2 (ug/m3),CO (mg/m3),Ozone (ug/m3),Benzene (ug/m3),Temp (degree C),RH (%),WS (m/s),WD (deg),SR (W/mt2),BP (mmHg),VWS (m/s),CH4 (ug/m3),NMHC (ug/m3),THC (),city,Toluene (ug/m3),Eth-Benzene (ug/m3),MP-Xylene (ug/m3),O Xylene (ug/m3),AT (degree C),Xylene (ug/m3),RF (mm),Gust (km/hr),Variance (n),Power (W),CO2 (mg/m3),Ozone (),WD (degree),NOx (ug/m3),Toluene (),Eth-Benzene (),MP-Xylene (),Temp (),RH (),WS (),WD (),SR (),BP (),Xylene (),AT (),RF (),WD (degree C),NOx (ppm),AT (degree),NH3 (ppb)
count,1894072,1894072,1389495.0,1133801.0,1538971.0,1546692.0,1448169.0,1030634.0,1488239.0,1420776.0,1452556.0,1238963.0,588258.0,1421313.0,1454653.0,667412.0,1351009.0,1373303.0,508684.0,75362.0,0.0,31132.0,1894072,824907.0,315718.0,295298.0,80400.0,1191124.0,379526.0,797866.0,0.0,0.0,0.0,0.0,13622.0,735514.0,73443.0,39890.0,0.0,0.0,94713.0,40126.0,40126.0,40079.0,40126.0,39423.0,0.0,40055.0,98321.0,60183.0,15333.0,11034.0,7055.0
unique,116112,116112,,,,,,,,,,,,,,,,,,,,,20,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
top,2023-03-31 23:00:00,2023-04-01 00:00:00,,,,,,,,,,,,,,,,,,,,,Lucknow,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
freq,57,57,,,,,,,,,,,,,,,,,,,,,450305,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
mean,,,86.16326,171.3119,14.94694,33.7203,33.49649,34.05125,14.15639,1.22184,32.16363,2.66554,29.132288,61.87481,1.41361,175.721671,117.2956,784.795,0.183312,703.710487,,999.925524,,9.553509,3.858045,4.860887,3.547714,25.69331,2.816828,0.217065,,,,,42.260955,165.932801,26.043816,10.770728,,,31.279824,54.317894,2.107994,200.199853,78.032429,747.363535,,24.333764,0.046954,159.551739,4.857546,23.553779,2.571899
std,,,91.63072,138.5747,36.10672,35.24212,44.54708,28.769,15.30661,1.761888,34.58978,13.42346,5.606374,22.72315,2.417777,90.850892,176.2911,95.3509,1.319539,525.4472,,577.899137,,20.410991,16.569314,20.496821,14.161007,8.544265,10.326478,2.635613,,,,,34.062818,75.933788,31.144275,17.521597,,,2.390164,13.682851,1.853498,85.677479,102.60721,31.727791,,7.169538,0.682262,103.134745,2.671251,8.495671,2.331417
min,,,0.01,0.04,0.01,0.01,0.0,0.01,0.01,0.0,0.01,0.0,0.01,0.01,0.01,0.01,0.0,700.0,-18.53,0.0,,0.0,,0.0,0.01,0.01,0.01,0.1,0.0,0.0,,,,,0.5,0.02,0.0,0.0,,,0.12,19.75,0.3,1.0,6.0,702.33,,3.62,0.0,1.0,0.0,4.04,0.13
25%,,,30.0,71.56,2.26,11.63,10.67,15.25,5.26,0.48,8.3,0.12,27.47,46.0,0.38,106.0,7.75,740.43,-0.2,295.7775,,544.93,,0.88,0.14,0.15,0.43,19.38,0.1,0.0,,,,,16.43,106.0,10.18,2.08,,,29.95,44.0,0.55,124.5,7.0,738.0,,18.87,0.0,54.5,4.71,16.3525,0.35
50%,,,57.07,134.36,4.67,23.21,19.97,29.11,9.88,0.88,19.35,0.6,29.9,63.75,0.85,177.5,23.56,743.48,0.0,673.605,,893.21,,3.03,0.69,0.63,1.66,26.8,0.41,0.0,,,,,35.15,164.06,15.7,5.05,,,31.38,55.25,1.6,210.5,9.75,741.0,,25.48,0.0,149.5,4.79,24.34,3.48
75%,,,109.33,229.75,10.99,43.2,37.43,44.68,18.23,1.43,43.7225,1.9,32.0,80.0,1.7,248.75,167.64,750.0,0.26,900.28,,1450.2575,,8.5,2.47,2.03,2.53,31.73,1.8,0.0,,,,,57.6,226.0,29.69,12.5,,,32.8,65.25,3.05,271.5,136.0,743.0,,29.42,0.0,262.29,4.87,30.1975,3.52
