In [1]:
import pandas as pd
import os

CITIES = ["mumbai", "delhi", "chennai", "bangalore"]

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
all_city_dataframes = []

for city in CITIES:
    print(f"\nProcessing: {city.upper()}")
    aqi_file = f"{city}_raw_pm25.csv"
    weather_file = f"{city}_raw_weather.csv"

    if not os.path.exists(aqi_file) or not os.path.exists(weather_file):
        print(f"Warning: Raw files for {city} not found. Skipping.")
        print(f"  (Looking for: '{aqi_file}' and '{weather_file}')")
        continue
        
    try:
        aqi_df = pd.read_csv(aqi_file, parse_dates=['time'])
        weather_df = pd.read_csv(weather_file, parse_dates=['time'])
    except Exception as e:
        print(f"Error loading files for {city}: {e}")
        continue
    print(f"Merging AQI and Weather for {city}...")
    city_df = pd.merge(aqi_df, weather_df, on='time', how='outer')
    city_df['city'] = city
    city_df.set_index('time', inplace=True)
    print(f"Cleaning {city_df['pm2_5'].isnull().sum()} nulls in 'pm2_5' for {city}...")
    
    city_df['pm2_5'].interpolate(method='linear', inplace=True)
    city_df.bfill(inplace=True) 
    city_df.ffill(inplace=True) 
    all_city_dataframes.append(city_df)
if all_city_dataframes:
    df = pd.concat(all_city_dataframes)
    
    output_filename = "final_dataset.csv"
    df.to_csv(output_filename)
df.head()


Processing: MUMBAI
Merging AQI and Weather for mumbai...
Cleaning 5165 nulls in 'pm2_5' for mumbai...

Processing: DELHI
Merging AQI and Weather for delhi...
Cleaning 5165 nulls in 'pm2_5' for delhi...

Processing: CHENNAI
Merging AQI and Weather for chennai...
Cleaning 5165 nulls in 'pm2_5' for chennai...

Processing: BANGALORE


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  city_df['pm2_5'].interpolate(method='linear', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  city_df['pm2_5'].interpolate(method='linear', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on w

Merging AQI and Weather for bangalore...
Cleaning 5165 nulls in 'pm2_5' for bangalore...


Unnamed: 0_level_0,pm2_5,temperature_2m,relativehumidity_2m,precipitation,windspeed_10m,city
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-01-01 00:00:00,17.6,22.6,74,0.0,7.4,mumbai
2022-01-01 01:00:00,17.6,21.7,78,0.0,7.1,mumbai
2022-01-01 02:00:00,17.6,21.5,77,0.0,10.5,mumbai
2022-01-01 03:00:00,17.6,21.4,73,0.0,10.6,mumbai
2022-01-01 04:00:00,17.6,20.9,73,0.0,9.8,mumbai


In [3]:
df['city'].value_counts()

city
mumbai       33432
delhi        33432
chennai      33432
bangalore    33432
Name: count, dtype: int64

In [4]:
print(f"Total Null Values in the Dataset {df.isna().sum().sum()}")

Total Null Values in the Dataset 0
