In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from google.colab import files

In [3]:

# Ask Colab for a file upload and use the first uploaded file name as the CSV.
uploaded = files.upload()
filename = list(uploaded)[0]

# The file is comma-separated AND uses a comma as the decimal mark.
# NOTE(review): presumably decimal values are wrapped in double quotes in the
# file, otherwise the two commas would clash — confirm against the raw CSV.
raw = pd.read_csv(filename, sep=",", decimal=",", quotechar='"')

# Discard every column whose header starts with "Unnamed".
is_unnamed = raw.columns.str.contains("^Unnamed")
raw = raw.loc[:, ~is_unnamed]

# Columns kept for the NOx analysis, and readable names for the short ones.
nox_source_columns = ["Date", "Time", "NOx(GT)", "PT08.S3(NOx)", "T", "RH", "AH"]
readable_names = {
    "T": "Temperature (C)",
    "RH": "Relative Humidity",
    "AH": "Absolute Humidity",
}

raw_nox = raw[nox_source_columns].rename(columns=readable_names)

# Keep only the first 9357 rows.
# NOTE(review): 9357 looks like the number of real observations in the file,
# with anything after it being padding — confirm.
raw_nox = raw_nox.iloc[:9357].copy()

raw_nox.head()


Saving AirQualityUCSDCSV.csv to AirQualityUCSDCSV.csv


Unnamed: 0,Date,Time,NOx(GT),PT08.S3(NOx),Temperature (C),Relative Humidity,Absolute Humidity
0,10/03/2004,18.00.00,166,1056,13.6,48.9,0.7578
1,10/03/2004,19.00.00,103,1174,13.3,47.7,0.7255
2,10/03/2004,20.00.00,131,1140,11.9,54.0,0.7502
3,10/03/2004,21.00.00,172,1092,11.0,60.0,0.7867
4,10/03/2004,22.00.00,131,1205,11.2,59.6,0.7888


In [4]:
"""Summary statistics for the numeric columns of the raw NOx frame"""
# Select the numeric columns first, then describe them.
raw_nox.select_dtypes(include=[np.number]).describe()


Unnamed: 0,NOx(GT),PT08.S3(NOx),Temperature (C),Relative Humidity,Absolute Humidity
count,9357.0,9357.0,9357.0,9357.0,9357.0
mean,168.616971,794.990168,9.778305,39.48538,-6.837604
std,257.433866,321.993552,43.203623,51.216145,38.97667
min,-200.0,-200.0,-200.0,-200.0,-200.0
25%,50.0,637.0,10.9,34.1,0.6923
50%,141.0,794.0,17.2,48.6,0.9768
75%,284.0,960.0,24.1,61.9,1.2962
max,1479.0,2683.0,44.6,88.7,2.231


In [5]:
"""Show the rows where the PT08.S3(NOx) sensor reads -200"""
# Boolean mask of rows whose sensor reading equals -200.
sensor_is_sentinel = raw_nox["PT08.S3(NOx)"].eq(-200)

# Display every column from the third up to (not including) the last one.
raw_nox.loc[sensor_is_sentinel, raw_nox.columns[2:-1]]


Unnamed: 0,NOx(GT),PT08.S3(NOx),Temperature (C),Relative Humidity
524,99,-200,-200.0,-200.0
525,108,-200,-200.0,-200.0
526,131,-200,-200.0,-200.0
701,129,-200,-200.0,-200.0
702,154,-200,-200.0,-200.0
...,...,...,...,...
8111,1227,-200,-200.0,-200.0
8112,1061,-200,-200.0,-200.0
8113,1075,-200,-200.0,-200.0
8114,641,-200,-200.0,-200.0


In [6]:
"""For rows where NOx(GT) is -200, count how often every numeric column is -200 too"""
nox_is_sentinel = raw_nox["NOx(GT)"].eq(-200)
subset = raw_nox.loc[nox_is_sentinel].select_dtypes(include=[np.number])

# True  -> every numeric value in the row equals -200
# False -> at least one numeric value in the row differs from -200
(subset == -200).all(axis=1).value_counts()


Unnamed: 0,count
False,1595
True,44


In [8]:
"""Fill NOx(GT) gaps with the per-hour mean; interpolate the other numeric columns"""

interpolated = raw_nox.copy()

# Turn every -200 in the numeric columns into NaN so pandas treats it as missing.
numeric_cols = interpolated.select_dtypes(include=[np.number]).columns
interpolated[numeric_cols] = interpolated[numeric_cols].replace(-200, np.nan)

# Build "<date> <time>" strings; dots in the time are swapped for colons.
date_text = interpolated["Date"].astype(str)
time_text = interpolated["Time"].astype(str).str.replace(".", ":", regex=False)
# Day-first parsing; anything unparseable becomes NaT instead of raising.
dt = pd.to_datetime(date_text + " " + time_text, dayfirst=True, errors="coerce")
hour = dt.dt.hour

# Mean NOx(GT) per hour of day, plus the overall mean as a fallback.
hourly_mean = interpolated["NOx(GT)"].groupby(hour).mean()
overall_mean = interpolated["NOx(GT)"].mean()

# Candidate fill value for every row: its hour's mean, or the overall mean
# when no hourly mean is available for that row.
fill_vals = hour.map(hourly_mean).fillna(overall_mean)
interpolated["NOx(GT)"] = interpolated["NOx(GT)"].fillna(fill_vals)

# Every other numeric column is interpolated, filling in both directions so
# leading and trailing gaps are covered as well.
other_cols = numeric_cols[numeric_cols != "NOx(GT)"].tolist()
interpolated[other_cols] = interpolated[other_cols].interpolate(limit_direction="both")

interpolated.head()


Unnamed: 0,Date,Time,NOx(GT),PT08.S3(NOx),Temperature (C),Relative Humidity,Absolute Humidity
0,10/03/2004,18.00.00,166.0,1056.0,13.6,48.9,0.7578
1,10/03/2004,19.00.00,103.0,1174.0,13.3,47.7,0.7255
2,10/03/2004,20.00.00,131.0,1140.0,11.9,54.0,0.7502
3,10/03/2004,21.00.00,172.0,1092.0,11.0,60.0,0.7867
4,10/03/2004,22.00.00,131.0,1205.0,11.2,59.6,0.7888


In [9]:
"""Min-max normalize each numeric column to the [0, 1] range"""
normalized = interpolated.copy()

for column in numeric_cols:
    values = interpolated[column]
    minimum = values.min()
    maximum = values.max()
    value_range = maximum - minimum

    if value_range == 0:
        # Constant column: dividing by a zero range would turn every value
        # into NaN, so map the column to 0.0 instead (existing NaNs stay NaN).
        normalized[column] = (values - minimum).astype(float)
    else:
        # Scaling is per column: (x - min) / (max - min).
        normalized[column] = (values - minimum) / value_range

normalized.head()


Unnamed: 0,Date,Time,NOx(GT),PT08.S3(NOx),Temperature (C),Relative Humidity,Absolute Humidity
0,10/03/2004,18.00.00,0.111036,0.310885,0.333333,0.499371,0.280066
1,10/03/2004,19.00.00,0.068382,0.360864,0.326882,0.484277,0.264282
2,10/03/2004,20.00.00,0.087339,0.346463,0.296774,0.563522,0.276352
3,10/03/2004,21.00.00,0.115098,0.326133,0.277419,0.638994,0.29419
4,10/03/2004,22.00.00,0.087339,0.373994,0.28172,0.633962,0.295216


In [10]:
"""Fix Date and time formatting and data type"""
nox = normalized.copy()

# Times are written with dots (e.g. 18.00.00); swap them for colons so the
# combined string parses as a timestamp.
nox["Time"] = nox["Time"].str.replace(".", ":", regex=False)

# Vectorised string concatenation replaces the former row-wise apply/lambda:
# same "<date> <time>" strings, without a Python-level call per row.
nox["Datetime"] = pd.to_datetime(
    nox["Date"].astype(str) + " " + nox["Time"].astype(str),
    dayfirst=True
)

# Re-derive Date and Time from the parsed timestamp; both end up as
# object-dtype columns.
nox["Date"] = nox["Datetime"].dt.date
nox["Time"] = nox["Datetime"].dt.time

nox.dtypes


Unnamed: 0,0
Date,object
Time,object
NOx(GT),float64
PT08.S3(NOx),float64
Temperature (C),float64
Relative Humidity,float64
Absolute Humidity,float64
Datetime,datetime64[ns]


In [11]:
"""Save the cleaned data for future use"""
# Path is already imported in the notebook's first (imports) cell, so it is
# not re-imported here.
path_to_save = Path("NOx.csv")

# Written without the index, so the CSV holds only the data columns.
nox.to_csv(path_to_save, index=False)
