# Data Preprocessing for AirAware

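This notebook preprocesses the Delhi Air Quality dataset from Kaggle: it loads the raw CSV, inspects its structure and missing values, cleans it, and saves the result to `../data/cleaned/delhi_air_quality_cleaned.csv`.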

In [6]:
import warnings
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation notices from the
# imported libraries — consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

# IPython magic: render matplotlib figures inline, below the cell that draws them.
%matplotlib inline

In [7]:
# Read the raw air-quality CSV into a DataFrame.
# The relative path is resolved against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='final_dataset.csv')
# Preview the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,Date,Month,Year,Holidays_Count,Days,PM2.5,PM10,NO2,SO2,CO,Ozone,AQI
0,1,1,2021,0,5,408.8,442.42,160.61,12.95,2.77,43.19,462
1,2,1,2021,0,6,404.04,561.95,52.85,5.18,2.6,16.43,482
2,3,1,2021,1,7,225.07,239.04,170.95,10.93,1.4,44.29,263
3,4,1,2021,0,1,89.55,132.08,153.98,10.42,1.01,49.19,207
4,5,1,2021,0,2,54.06,55.54,122.66,9.7,0.64,48.88,149


In [8]:
# Check the shape and basic info
print(f"Dataset shape: {df.shape}")
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

Dataset shape: (1461, 12)

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            1461 non-null   int64  
 1   Month           1461 non-null   int64  
 2   Year            1461 non-null   int64  
 3   Holidays_Count  1461 non-null   int64  
 4   Days            1461 non-null   int64  
 5   PM2.5           1461 non-null   float64
 6   PM10            1461 non-null   float64
 7   NO2             1461 non-null   float64
 8   SO2             1461 non-null   float64
 9   CO              1461 non-null   float64
 10  Ozone           1461 non-null   float64
 11  AQI             1461 non-null   int64  
dtypes: float64(6), int64(6)
memory usage: 137.1 KB
None


In [9]:
# Report how many null entries each column contains (header line, then the counts)
print("Missing values per column:", df.isna().sum(), sep="\n")

Missing values per column:
Date              0
Month             0
Year              0
Holidays_Count    0
Days              0
PM2.5             0
PM10              0
NO2               0
SO2               0
CO                0
Ozone             0
AQI               0
dtype: int64


In [11]:
# Data cleaning steps
# 1. Build a real calendar date from the Year / Month / Date columns
# 2. Handle missing values
# 3. Remove outliers
# 4. Normalize data if needed
# Only steps 1 and 2 are carried out in this cell.

# 'Date' holds the day of the month as an integer (see the df.head() preview).
# Passing that column straight to pd.to_datetime() would read each value as
# nanoseconds since the Unix epoch and turn every row into a 1970-01-01
# timestamp, so the date is assembled from its year / month / day parts instead.
# The dtype guard keeps the cell safe to re-run after 'Date' has been converted.
if not pd.api.types.is_datetime64_any_dtype(df['Date']):
    date_parts = df[['Year', 'Month', 'Date']].rename(
        columns={'Year': 'year', 'Month': 'month', 'Date': 'day'}
    )
    # Raises ValueError on an impossible calendar date rather than hiding it.
    df['Date'] = pd.to_datetime(date_parts)

# Drop any row that still contains a missing value
df = df.dropna()

# Display cleaned data info
print(f"Cleaned dataset shape: {df.shape}")
print("\nCleaned data info:")
# info() prints its own report and returns None; do not wrap it in print().
df.info()

Cleaned dataset shape: (1461, 12)

Cleaned data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            1461 non-null   datetime64[ns]
 1   Month           1461 non-null   int64         
 2   Year            1461 non-null   int64         
 3   Holidays_Count  1461 non-null   int64         
 4   Days            1461 non-null   int64         
 5   PM2.5           1461 non-null   float64       
 6   PM10            1461 non-null   float64       
 7   NO2             1461 non-null   float64       
 8   SO2             1461 non-null   float64       
 9   CO              1461 non-null   float64       
 10  Ozone           1461 non-null   float64       
 11  AQI             1461 non-null   int64         
dtypes: datetime64[ns](1), float64(6), int64(5)
memory usage: 137.1 KB
None


In [12]:
# Save cleaned data
output_path = Path('../data/cleaned/delhi_air_quality_cleaned.csv')
# to_csv() does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the positional row index out of the saved file
df.to_csv(output_path, index=False)
print("Cleaned data saved successfully!")

Cleaned data saved successfully!
