In [34]:
import polars as pl

In [35]:
# Read the three raw CSV inputs eagerly into polars DataFrames.
pollutant, measurement, instrument = map(
    pl.read_csv,
    (
        "../../data/raw/pollutant_data.csv",
        "../../data/raw/measurement_data.csv",
        "../../data/raw/instrument_data.csv",
    ),
)

In [36]:
# Answer accumulator: each question stores its result under
# questions["target"]["Q<n>"]; the whole dict is dumped to JSON at the end.
questions = {"target": {}}

In [37]:
# Attach the instrument readings (item code, average value, status) to every
# measurement row sharing the same timestamp and station.
#
# The join result is stored back into `measurement`, so the cell is guarded:
# without the check, running it a second time would join the already-joined
# frame again, adding "*_right" duplicate columns and potentially multiplying
# rows, which silently changes every count computed further down.
if "Instrument status" not in measurement.columns:
    measurement = measurement.join(
        instrument,
        on=["Measurement date", "Station code"],
        how="left",
    )

# Rows whose instrument status is 0.
# NOTE(review): 0 is taken to mean "normal" from the variable name — confirm
# against the dataset's status legend.
normal = measurement.filter(pl.col("Instrument status") == 0)

In [38]:
# Spot check of the joined frame: station 204, item code 0 only.
measurement.filter(
    (pl.col("Station code") == 204) & (pl.col("Item code") == 0)
)

Measurement date,Station code,Latitude,Longitude,SO2,NO2,O3,CO,PM10,PM2.5,Item code,Average value,Instrument status
str,i64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,i64
"""2021-01-01 00:00:00""",204,37.572016,127.005008,0.004,0.059,0.002,1.2,73.0,57.0,0,0.004,0
"""2021-01-01 01:00:00""",204,37.572016,127.005008,0.004,0.058,0.002,1.2,71.0,59.0,0,0.004,0
"""2021-01-01 02:00:00""",204,37.572016,127.005008,0.004,0.056,0.002,1.2,70.0,59.0,0,0.004,0
"""2021-01-01 03:00:00""",204,37.572016,127.005008,0.004,0.056,0.002,1.2,70.0,58.0,0,0.004,0
"""2021-01-01 04:00:00""",204,37.572016,127.005008,0.003,0.051,0.002,1.2,69.0,61.0,0,0.003,0
…,…,…,…,…,…,…,…,…,…,…,…,…
"""2023-12-31 19:00:00""",204,37.572016,127.005008,0.003,0.026,0.014,0.5,26.0,19.0,0,0.003,0
"""2023-12-31 20:00:00""",204,37.572016,127.005008,0.003,0.024,0.014,0.4,27.0,19.0,0,0.003,0
"""2023-12-31 21:00:00""",204,37.572016,127.005008,0.003,0.014,0.021,0.4,23.0,18.0,0,0.003,0
"""2023-12-31 22:00:00""",204,37.572016,127.005008,0.003,0.022,0.014,0.5,20.0,19.0,0,0.003,0


## Question 1

In [39]:
# Parse the timestamp strings into datetimes, then derive calendar parts.
# The two steps must stay separate: the `.dt` accessors below need the column
# to already be a datetime, and expressions inside a single `with_columns`
# all read the frame as it was before that call.
normal = normal.with_columns(
    pl.col("Measurement date").str.to_datetime()
).with_columns(
    day=pl.col("Measurement date").dt.day(),
    month=pl.col("Measurement date").dt.month(),
    year=pl.col("Measurement date").dt.year(),
)

In [40]:
# Q1: mean SO2 over all rows of `normal`, stored rounded to 5 decimals.
# NOTE(review): `normal` is built from the measurement/instrument join, so a
# given timestamp may appear once per item code — confirm this weighting is
# what the question intends.
q1 = normal.select(pl.col("SO2").mean())
response_q1 = q1["SO2"][0]
questions["target"]["Q1"] = round(response_q1, 5)

## Question 2

In [41]:
def get_season(date):
    """Return the season code (1-4) for the month of ``date``.

    Months 12, 1, 2 map to 1; months 3-5 map to 2; months 6-8 map to 3;
    every other month value maps to 4.

    Args:
        date: Any object exposing a ``month`` attribute (e.g. a datetime).

    Returns:
        int: The season code.
    """
    season_by_month = {
        12: 1, 1: 1, 2: 1,
        3: 2, 4: 2, 5: 2,
        6: 3, 7: 3, 8: 3,
    }
    # Anything not listed above falls through to season 4.
    return season_by_month.get(date.month, 4)

In [42]:
# Q2: mean CO per season for station 209.
#
# The season code is computed with a native polars expression rather than a
# per-row Python callback through `map_elements`, which polars documents as
# considerably slower than expression-based code. (month % 12) // 3 maps
# Dec/Jan/Feb -> 0, Mar-May -> 1, Jun-Aug -> 2, Sep-Nov -> 3; adding 1 gives
# the same 1..4 codes that `get_season` returns. Null timestamps stay null,
# and the cast keeps the column Int64 as before.
# Requires "Measurement date" to already be a datetime column.
q2 = (
    normal
    .filter(pl.col("Station code") == 209)
    .with_columns(
        ((pl.col("Measurement date").dt.month() % 12) // 3 + 1)
        .cast(pl.Int64)
        .alias("season")
    )
    .group_by("season")
    .agg(pl.col("CO").mean())
)

# `q2` has exactly two columns (season, CO), so its rows form a lookup table;
# this replaces four separate filter passes. A season missing from the data
# raises KeyError here.
co_by_season = dict(q2.iter_rows())
season_1, season_2, season_3, season_4 = (
    co_by_season[code] for code in (1, 2, 3, 4)
)
questions["target"]["Q2"] = {
    "1" : round(season_1, 5),
    "2" : round(season_2, 5),
    "3" : round(season_3, 5),
    "4" : round(season_4, 5)
}

## Question 3

In [43]:
# Q3: hour of the day whose O3 values have the largest standard deviation.

# Add (or overwrite) the hour-of-day column on the normal-status rows.
normal = normal.with_columns(hour=pl.col("Measurement date").dt.hour())

# One row per hour with its O3 standard deviation; keep only the top one.
std_per_hour = (
    normal
    .group_by("hour")
    .agg(std_O3=pl.col("O3").std())
    .sort("std_O3", descending=True)
    .head(1)
)

response_3 = std_per_hour["hour"][0]
questions["target"]["Q3"] = round(response_3, 5)

## Question 4

In [44]:
# Q4: station code with the most rows whose instrument status equals 2.
# NOTE(review): status 2 is taken to mean "abnormal" from the variable name —
# confirm against the dataset's status legend that this is the intended code.
abnormal = measurement.filter(pl.col("Instrument status") == 2)

# Count rows per station and keep the station with the highest count.
q4 = (
    abnormal
    .group_by("Station code")
    .agg(n_measurements=pl.len())
    .sort("n_measurements", descending=True)
    .head(1)
)
response_4 = q4["Station code"][0]
questions["target"]["Q4"] = round(response_4, 5)

## Question 5

In [45]:
# Q5: station code with the most rows whose instrument status is not 0.
# Despite its name, `not_abnormal` holds every row with a status other than 0
# (i.e. the rows excluded from `normal`), not the rows that are "not abnormal".
not_abnormal = measurement.filter(pl.col("Instrument status") != 0)

# Count rows per station and keep the station with the highest count.
q5 = (
    not_abnormal
    .group_by("Station code")
    .agg(n_measurements=pl.len())
    .sort("n_measurements", descending=True)
    .head(1)
)
response_5 = q5["Station code"][0]
questions["target"]["Q5"] = round(response_5, 5)

## Question 6

In [46]:
def obtain_label(measurement: float, very_bad: float, bad: float, normal: float) -> str:
    """Classify a measurement against three lower-bound thresholds.

    Thresholds are checked in the order ``very_bad``, ``bad``, ``normal``;
    the first one the measurement reaches (``>=``) decides the label.

    Args:
        measurement: The value to classify.
        very_bad: Lower bound for the "Very bad" label.
        bad: Lower bound for the "Bad" label.
        normal: Lower bound for the "Normal" label.

    Returns:
        "Very bad", "Bad", "Normal", or "Good" when no threshold is reached.
    """
    thresholds = (
        (very_bad, "Very bad"),
        (bad, "Bad"),
        (normal, "Normal"),
    )
    for lower_bound, label in thresholds:
        if measurement >= lower_bound:
            return label
    return "Good"
# Q6: number of rows in `normal` per PM2.5 quality label.
# NOTE(review): `normal` is built from the measurement/instrument join, so a
# given timestamp may appear once per item code — confirm the counts are meant
# to include those repeats.

# Thresholds for PM2.5, taken from the first matching row of the pollutant table.
aux = pollutant.filter(pl.col("Item name") == "PM2.5")
very_bad = aux["Very bad"].to_list()[0]
bad = aux["Bad"].to_list()[0]
normal_m = aux["Normal"].to_list()[0]

# Label each row, then count rows per label. `map_elements` is kept on purpose:
# it leaves null PM2.5 values unlabeled and applies Python comparison
# semantics inside `obtain_label`.
q6 = (
    normal
    .with_columns(
        pl.col("PM2.5")
        .map_elements(lambda x: obtain_label(x, very_bad, bad, normal_m), return_dtype=pl.String)
        .alias("Label_PM2.5")
    )
    .group_by("Label_PM2.5")
    .agg(pl.len().alias("N_measurements_PM2.5"))
)

# `q6` has exactly two columns (label, count), so its rows form a lookup table.
# A label that never occurs is reported as 0 instead of raising IndexError,
# and the frame is read once rather than filtered four times.
label_counts = dict(q6.iter_rows())
questions["target"]["Q6"] = {
    label: label_counts.get(label, 0)
    for label in ("Normal", "Bad", "Very bad", "Good")
}

## Write the answers

In [47]:
import json

# Serialise every collected answer into the predictions file (overwritten on each run).
with open("../../predictions/questions.json", "w") as answers_file:
    answers_file.write(json.dumps(questions))