## Imports

In [64]:
from data_classes import LAQNData, HealthData, MetData, IncomeData
import numpy as np
import pandas as pd
from os import path, listdir
import wandb
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

## Config

In [8]:
# Run configuration: the model type, the train/test split fraction, and the
# identifiers that select which dataset artifacts and met variables are used.
config = dict(
    architecture="linear_regressor",
    train_size=0.8,  # fraction of rows (taken from the start) used for training
    species="NO2",
    spatial_resolution="regional",
    temporal_resolution="daily",
    input_artifacts=["laqn-regional", "met-resample", "income-regional"],
    met_variables=["temperature"],
)

## Code

In [75]:
# To be moved to model_classes script when finished developing.

class HealthModel():
    """Prepare normalised train/test datasets for a mortality model and log them to W&B.

    The constructor only stores the run description; all work happens in
    ``preprocess_and_log``.
    """

    def __init__(self, species, spatial_resolution, temporal_resolution, input_artifacts, met_variables):
        """Store the settings describing which data to use.

        Args:
            species: Pollutant identifier; recorded in the artifact metadata.
            spatial_resolution: Recorded in the artifact metadata.
            temporal_resolution: Recorded in the artifact metadata.
            input_artifacts: Names of the W&B artifacts to load as input features.
            met_variables: Variable names to read from the ``"met-resample"``
                artifact (one ``<variable>.npz`` file each).
        """
        self.species = species
        self.spatial_resolution = spatial_resolution
        self.temporal_resolution = temporal_resolution
        self.input_artifacts = input_artifacts
        self.met_variables = met_variables

    @staticmethod
    def _load_column(folder, file, column, scale=None):
        """Read ``folder/file`` (.npz with ``x`` and ``y`` arrays) as a one-column DataFrame.

        ``x`` becomes the DatetimeIndex and ``y`` the values of ``column``;
        ``y`` is multiplied by ``scale`` when one is given.
        """
        # NOTE(review): allow_pickle=True can execute arbitrary code when the
        # file is not trusted - confirm the artifacts come from a trusted source.
        # The context manager closes the underlying .npz file handle.
        with np.load(path.join(folder, file), allow_pickle=True) as data:
            values = data["y"] if scale is None else data["y"] * scale
            return pd.DataFrame(index=pd.DatetimeIndex(data["x"]), data=values, columns=[column])

    @staticmethod
    def _join_column(df, column):
        """Return ``column`` itself if ``df`` is empty, else ``df`` left-joined with it."""
        return column if df.empty else df.join(column)

    @staticmethod
    def _split_index(index, train_size):
        """Split ``index`` in order: the first ``train_size`` fraction is train, the rest test."""
        cut = int(len(index) * train_size)
        return {"train": index[:cut], "test": index[cut:]}

    def preprocess_and_log(self, train_size):
        """Assemble features and targets, split and normalise them, and log both subsets.

        Downloads every artifact in ``self.input_artifacts`` plus the
        ``mortality-scaled`` target artifact, joins them on their datetime
        index, drops rows with missing values, splits the rows in order into
        train and test sets, min-max scales the features using statistics from
        the train rows only, and logs ``xy_train`` / ``xy_test`` artifacts.

        Args:
            train_size: Fraction of rows (taken from the start) used for training.
        """
        with wandb.init(project="AQmortality", job_type="split-normalise-data") as run:
            df = pd.DataFrame()
            # use dataset artifacts
            for artifact in self.input_artifacts:
                data_folder = run.use_artifact(f"{artifact}:latest").download()
                if artifact == "met-resample":
                    # This artifact holds one file per meteorological variable.
                    for variable in self.met_variables:
                        column = self._load_column(data_folder, f"{variable}.npz", variable)
                        df = self._join_column(df, column)
                else:
                    # Other artifacts are read from their first file; sort the
                    # listing so the choice does not depend on OS directory order.
                    file = sorted(listdir(data_folder))[0]
                    column = self._load_column(data_folder, file, file.replace(".npz", ""))
                    df = self._join_column(df, column)

            target_folder = run.use_artifact("mortality-scaled:latest").download()
            deaths = self._load_column(target_folder, "deaths.npz", "deaths", scale=100000)
            df = df.join(deaths).dropna(axis=0)

            # make new train and test artifacts for regional scale data
            index = self._split_index(df.index, train_size)
            # Fit the scaler on the training rows only so that no test-set
            # statistics leak into the normalisation.
            x_scaler = MinMaxScaler().fit(df.loc[index["train"]].drop("deaths", axis=1))
            for subset in ["train", "test"]:
                rows = df.loc[index[subset]]
                x = x_scaler.transform(rows.drop("deaths", axis=1))
                y = rows["deaths"].values
                subset_data = wandb.Artifact(
                    f"xy_{subset}", type="dataset",
                    description=f"Input features (normalised) and targets for {subset}ing set.",
                    metadata={"input_shape": x.shape,
                              "target_shape": y.shape,
                              "species": self.species,
                              "spatial_resolution": self.spatial_resolution,
                              "temporal_resolution": self.temporal_resolution,
                              "input_artifacts": self.input_artifacts,
                              "met_variables": self.met_variables})
                with subset_data.new_file(subset + ".npz", mode="wb") as file:
                    np.savez(file, x=x, y=y)
                run.log_artifact(subset_data)
    
#     def train_and_log():
#         with wandb.init(project="AQmortality", job_type="train-regional-model") as run:
    # train linear regression model
    # log model training metrics
    # log trained model artifact – include input features description
    
    
#     def test_and_log():
#         with wandb.init(project="AQmortality", job_type="test-regional-model") as run:
    # use regional test data artifacts
    # normalise the inputs using the scaler fitted to the train set
    # use trained model artifact
    # test linear regression model
    # log model test metrics

In [76]:
# Instantiate the model, taking every constructor argument from the config entry of the same name.
model = HealthModel(**{
    key: config[key]
    for key in (
        "species",
        "spatial_resolution",
        "temporal_resolution",
        "input_artifacts",
        "met_variables",
    )
})

In [77]:
# Run preprocessing: build the train/test datasets and log them as W&B
# artifacts, using the configured fraction of rows for training.
model.preprocess_and_log(train_size=config["train_size"])

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…