In [1]:
# Notebook setup: every dependency is imported here, grouped as
# standard library -> third-party -> Colab runtime.

# Standard library
import os
import warnings

# Data handling
import numpy as np
import pandas as pd

# Visualisation
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

# Modelling and evaluation
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Silence every warning category for the rest of the session. This is installed
# before the Colab import below so warnings raised from here on are hidden.
warnings.filterwarnings('ignore')

# Colab-only: mount Google Drive at /content/drive so the project files are reachable
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Make the project folder on the mounted Drive the kernel's working directory.
# `%cd` is an IPython line magic (not plain Python), so this cell only runs
# inside an IPython/Colab kernel; it echoes the new directory as its output.
%cd /content/drive/MyDrive/CMP7005 Air Quality Analysis

/content/drive/MyDrive/CMP7005 Air Quality Analysis


# Data Loading and Merging

In [5]:
# Load the per-city datasets and merge them into a single frame.

# Folder on the mounted Drive that holds one "<City>_data.csv" file per city.
# Defined once so the location can be changed in a single place.
DATASETS_DIR = '/content/drive/MyDrive/CMP7005 Air Quality Analysis/Datasets'


def load_city_data(city, datasets_dir=DATASETS_DIR):
    """Read one city's CSV file into a DataFrame.

    Parameters
    ----------
    city : str
        City name exactly as it is spelled in the file name
        (e.g. ``'Ahmedabad'`` for ``Ahmedabad_data.csv``).
    datasets_dir : str, optional
        Directory containing the ``<City>_data.csv`` files.
        Defaults to ``DATASETS_DIR``.

    Returns
    -------
    pandas.DataFrame
        The file's contents parsed with ``pd.read_csv`` default settings.

    Raises
    ------
    Whatever ``pd.read_csv`` raises, e.g. ``FileNotFoundError`` when the
    file does not exist.
    """
    return pd.read_csv(os.path.join(datasets_dir, f'{city}_data.csv'))


# One DataFrame per city; the variable names are kept so that any cell
# referring to an individual city frame continues to work.
ahmedabad_data = load_city_data('Ahmedabad')
aizawl_data = load_city_data('Aizawl')
amaravati_data = load_city_data('Amaravati')
amritsar_data = load_city_data('Amritsar')
bengaluru_data = load_city_data('Bengaluru')
bhopal_data = load_city_data('Bhopal')
brajrajnagar_data = load_city_data('Brajrajnagar')
chandigarh_data = load_city_data('Chandigarh')
chennai_data = load_city_data('Chennai')
coimbatore_data = load_city_data('Coimbatore')
delhi_data = load_city_data('Delhi')
ernakulam_data = load_city_data('Ernakulam')
gurugram_data = load_city_data('Gurugram')
guwahati_data = load_city_data('Guwahati')
hyderabad_data = load_city_data('Hyderabad')
jaipur_data = load_city_data('Jaipur')
jorapokhar_data = load_city_data('Jorapokhar')
kochi_data = load_city_data('Kochi')
kolkata_data = load_city_data('Kolkata')
lucknow_data = load_city_data('Lucknow')
mumbai_data = load_city_data('Mumbai')
patna_data = load_city_data('Patna')
shillong_data = load_city_data('Shillong')
talcher_data = load_city_data('Talcher')
thiruvananthapuram_data = load_city_data('Thiruvananthapuram')
visakhapatnam_data = load_city_data('Visakhapatnam')

# Add all the city dataframes to list (alphabetical by city, same order as loaded)
cities_data = [ahmedabad_data, aizawl_data, amaravati_data, amritsar_data, bengaluru_data, bhopal_data, brajrajnagar_data,
               chandigarh_data, chennai_data, coimbatore_data, delhi_data, ernakulam_data, gurugram_data, guwahati_data,
               hyderabad_data, jaipur_data, jorapokhar_data, kochi_data, kolkata_data, lucknow_data, mumbai_data, patna_data,
               shillong_data, talcher_data, thiruvananthapuram_data, visakhapatnam_data]

# Stack the city frames row-wise; ignore_index=True discards each file's own
# row labels and gives the merged frame a fresh 0..n-1 index.
combined_data = pd.concat(cities_data, ignore_index=True)

In [6]:
# Dimensions of the merged frame as a (rows, columns) tuple
combined_data.shape

(29531, 16)

In [7]:
# Number of distinct cities in the merged frame (missing values are not counted)
combined_data['City'].nunique(dropna=True)

26

# Initial Data Inspection

In [8]:
# Total number of rows (observations) in the merged frame
len(combined_data)

29531

In [9]:
# Total number of columns (variables) in the merged frame
len(combined_data.columns)

16

In [10]:
# Column labels of the merged frame as a plain Python list
list(combined_data.columns)

['City',
 'Date',
 'PM2.5',
 'PM10',
 'NO',
 'NO2',
 'NOx',
 'NH3',
 'CO',
 'SO2',
 'O3',
 'Benzene',
 'Toluene',
 'Xylene',
 'AQI',
 'AQI_Bucket']

In [11]:
# Data type pandas inferred for each column.
# Note: `Date` is reported as `object`, i.e. it has not been parsed to a datetime dtype.
combined_data.dtypes

Unnamed: 0,0
City,object
Date,object
PM2.5,float64
PM10,float64
NO,float64
NO2,float64
NOx,float64
NH3,float64
CO,float64
SO2,float64


In [12]:
# Preview the first five rows of the merged frame
combined_data.head(n=5)

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,01/01/2015,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,02/01/2015,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,03/01/2015,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,04/01/2015,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,05/01/2015,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,


In [14]:
# Number of missing (NaN) entries in each column
combined_data.isna().sum()

Unnamed: 0,0
City,0
Date,0
PM2.5,4598
PM10,11140
NO,3582
NO2,3585
NOx,4185
NH3,10328
CO,2059
SO2,3854
