In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from google.colab import files

In [8]:

"""Upload the air-quality CSV and keep only the NO2-related columns"""
# Number of leading rows kept from the uploaded file.
# NOTE(review): presumably everything after this row is empty padding — confirm against the CSV.
N_DATA_ROWS = 9357

uploaded = files.upload()
if not uploaded:
    # Fail with a readable message instead of an IndexError on an empty upload.
    raise ValueError("No file was uploaded; re-run this cell and select the CSV file.")
filename = next(iter(uploaded))

# read_csv is told that "," is both the field separator and the decimal mark.
raw = pd.read_csv(filename, sep=",", decimal=",", quotechar='"')
# Drop the auto-named "Unnamed: ..." columns.
raw = raw.loc[:, ~raw.columns.str.contains("^Unnamed")]

raw_no2 = (
    raw
    .loc[:, ["Date", "Time", "NO2(GT)", "PT08.S4(NO2)", "T", "RH", "AH"]]
    .rename(columns={
        "T": "Temperature (C)",
        "RH": "Relative Humidity",
        "AH": "Absolute Humidity"
    })
    .iloc[:N_DATA_ROWS]
    # Materialise an independent frame so that later cells which assign
    # columns do not write into a slice (SettingWithCopyWarning).
    .copy()
)

raw_no2.head()


Saving AirQualityUCSDCSV.csv to AirQualityUCSDCSV (1).csv


Unnamed: 0,Date,Time,NOx(GT),PT08.S3(NOx),Temperature (C),Relative Humidity,Absolute Humidity
0,10/03/2004,18.00.00,166,1056,13.6,48.9,0.7578
1,10/03/2004,19.00.00,103,1174,13.3,47.7,0.7255
2,10/03/2004,20.00.00,131,1140,11.9,54.0,0.7502
3,10/03/2004,21.00.00,172,1092,11.0,60.0,0.7867
4,10/03/2004,22.00.00,131,1205,11.2,59.6,0.7888


In [9]:
"""C statistics"""
raw_no2.describe(include=[np.number])


Unnamed: 0,NO2(GT),PT08.S4(NO2),Temperature (C),Relative Humidity,Absolute Humidity
count,9357.0,9357.0,9357.0,9357.0,9357.0
mean,58.148873,1391.479641,9.778305,39.48538,-6.837604
std,126.940455,467.210125,43.203623,51.216145,38.97667
min,-200.0,-200.0,-200.0,-200.0,-200.0
25%,53.0,1185.0,10.9,34.1,0.6923
50%,96.0,1446.0,17.2,48.6,0.9768
75%,133.0,1662.0,24.1,61.9,1.2962
max,340.0,2775.0,44.6,88.7,2.231


In [12]:
"""Check how missing values behave for NO2"""
subset = raw_no2[raw_no2["NO2(GT)"] == -200].select_dtypes(include=[np.number])

subset.eq(-200).all(axis=1).value_counts()


Unnamed: 0,count
False,1598
True,44


In [14]:
"""Replace missing values using historical average for No2"""

interpolated = raw_no2

numeric_cols = interpolated.select_dtypes(include=[np.number]).columns
interpolated[numeric_cols] = interpolated[numeric_cols].replace(-200, np.nan)

dt = pd.to_datetime(
    interpolated["Date"].astype(str) + " " + interpolated["Time"].astype(str).str.replace(".", ":", regex=False),
    dayfirst=True,
    errors="coerce"
)
hour = dt.dt.hour

hourly_mean = interpolated.groupby(hour)["NO2(GT)"].mean()
overall_mean = interpolated["NO2(GT)"].mean()

fill_vals = hour.map(hourly_mean).fillna(overall_mean)
interpolated["NO2(GT)"] = interpolated["NO2(GT)"].fillna(fill_vals)

other_cols = [c for c in numeric_cols if c != "NO2(GT)"]
interpolated[other_cols] = interpolated[other_cols].interpolate(limit_direction="both")

interpolated.head()


Unnamed: 0,Date,Time,NO2(GT),PT08.S4(NO2),Temperature (C),Relative Humidity,Absolute Humidity
0,10/03/2004,18.00.00,113.0,1692.0,13.6,48.9,0.7578
1,10/03/2004,19.00.00,92.0,1559.0,13.3,47.7,0.7255
2,10/03/2004,20.00.00,114.0,1555.0,11.9,54.0,0.7502
3,10/03/2004,21.00.00,122.0,1584.0,11.0,60.0,0.7867
4,10/03/2004,22.00.00,116.0,1490.0,11.2,59.6,0.7888


In [19]:
"""Normalize the data in each row"""
normalized = interpolated.copy()
numeric_cols = normalized.select_dtypes(include=[np.number]).columns

for column in numeric_cols:
    minimum = normalized[column].min()
    maximum = normalized[column].max()
    if maximum != minimum:
        normalized[column] = (normalized[column] - minimum) / (maximum - minimum)

normalized.head()


Unnamed: 0,Date,Time,NO2(GT),PT08.S4(NO2),Temperature (C),Relative Humidity,Absolute Humidity
0,10/03/2004,18.00.00,0.328402,0.51304,0.333333,0.499371,0.280066
1,10/03/2004,19.00.00,0.266272,0.453237,0.326882,0.484277,0.264282
2,10/03/2004,20.00.00,0.331361,0.451439,0.296774,0.563522,0.276352
3,10/03/2004,21.00.00,0.35503,0.464478,0.277419,0.638994,0.29419
4,10/03/2004,22.00.00,0.337278,0.422212,0.28172,0.633962,0.295216


In [20]:
" Date and time formatting and data type"
nox = normalized.copy()
nox["Time"] = nox["Time"].astype(str).str.replace(".", ":", regex=False)

nox["Datetime"] = pd.to_datetime(
    nox[["Date", "Time"]]
    .apply(lambda x: " ".join(x.values.astype(str)), axis=1),
    dayfirst=True,
    errors="coerce"
)

nox["Date"] = nox["Datetime"].dt.date
nox["Time"] = nox["Datetime"].dt.time

nox.dtypes


Unnamed: 0,0
Date,object
Time,object
NO2(GT),float64
PT08.S4(NO2),float64
Temperature (C),float64
Relative Humidity,float64
Absolute Humidity,float64
Datetime,datetime64[ns]


In [21]:
"""Save the cleaned data for future use"""
from pathlib import Path

path_to_save = Path("NOx.csv")
nox.to_csv(path_to_save, index=False)
