In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold
from sklearn.compose import make_column_transformer, ColumnTransformer

import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
CATEGORICAL_COLUMNS: list[str] = ["city", "country"]
DROP_COLUMNS: list[str] = ["id", "date", "pm2_5"]
Y: str = "target"
GROUP: str = "site_id"

In [3]:
data: pd.DataFrame = pd.read_csv(os.getenv("TRAIN_PATH"))
data.head()

Unnamed: 0,id,site_id,site_latitude,site_longitude,city,country,date,hour,sulphurdioxide_so2_column_number_density,sulphurdioxide_so2_column_number_density_amf,...,cloud_cloud_top_height,cloud_cloud_base_pressure,cloud_cloud_base_height,cloud_cloud_optical_depth,cloud_surface_albedo,cloud_sensor_azimuth_angle,cloud_sensor_zenith_angle,cloud_solar_azimuth_angle,cloud_solar_zenith_angle,pm2_5
0,id_vjcx08sz91,6531a46a89b3300013914a36,6.53257,3.39936,Lagos,Nigeria,2023-10-25,13,,,...,,,,,,,,,,12.015
1,id_bkg215syli,6531a46a89b3300013914a36,6.53257,3.39936,Lagos,Nigeria,2023-11-02,12,,,...,,,,,,,,,,42.2672
2,id_oui2pot3qd,6531a46a89b3300013914a36,6.53257,3.39936,Lagos,Nigeria,2023-11-03,13,,,...,6791.682888,51171.802486,5791.682829,11.816715,0.192757,-96.41189,61.045123,-121.307414,41.898269,39.450741
3,id_9aandqzy4n,6531a46a89b3300013914a36,6.53257,3.39936,Lagos,Nigeria,2023-11-08,14,,,...,,,,,,,,,,10.5376
4,id_ali5x2m4iw,6531a46a89b3300013914a36,6.53257,3.39936,Lagos,Nigeria,2023-11-09,13,0.000267,0.774656,...,1451.050659,96215.90625,451.050598,10.521009,0.153114,-97.811241,49.513439,-126.064453,40.167355,19.431731


In [4]:
def create_target(x: pd.DataFrame) -> int:
    if 0. < x < 100:
        return 0
    return 1

data["target"] = data["pm2_5"].apply(create_target)

In [5]:
data = data.drop(columns=DROP_COLUMNS).drop(columns=[col for col in data.columns if "uvaerosollayerheight" in col]).dropna().reset_index(drop=True)

In [6]:
sgkf: GroupKFold = GroupKFold(n_splits=5)

train_index, test_index = next(sgkf.split(data.drop(columns=[Y, GROUP]), data[Y], groups=data[GROUP]))

In [7]:
X_train, y_train = data.loc[train_index].drop(columns=[Y, GROUP]), data.loc[train_index][Y]
X_test, y_test = data.loc[test_index].drop(columns=[Y, GROUP]), data.loc[test_index][Y]

In [8]:
column_transformer: ColumnTransformer = make_column_transformer(
    (OneHotEncoder(), CATEGORICAL_COLUMNS),
    remainder="passthrough",
    n_jobs=-1,
    verbose=True
)

classifier: RandomForestClassifier = RandomForestClassifier(
    n_jobs=-1,
    random_state = 666,
    verbose = 1,
    warm_start = False,
)

pipeline: Pipeline = Pipeline(
    steps=[
        ("column_transformer", column_transformer),
        ("classifier", classifier)
    ],
    verbose=True
)

pipeline.fit(X_train, y_train)

[Pipeline]  (step 1 of 2) Processing column_transformer, total=   0.7s
[Pipeline] ........ (step 2 of 2) Processing classifier, total=   0.2s


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished


In [9]:
calibrated_classifier: CalibratedClassifierCV = CalibratedClassifierCV(estimator=pipeline, n_jobs=-1, cv="prefit")
calibrated_classifier.fit(X_test, y_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished


In [11]:
from skl2onnx import to_onnx

onx = to_onnx(calibrated_classifier, X_train[:1], options={RandomForestClassifier: {"zipmap": False}})
with open("classifier.onnx", "wb") as f:
    f.write(onx.SerializeToString())

RuntimeError: For operator SklearnCalibratedClassifierCV (type: SklearnCalibratedClassifierCV), at most 1 input(s) is(are) supported but we got 69 input(s) which are ['site_latitude', 'site_longitude', 'city', 'country', 'hour', 'sulphurdioxide_so2_column_number_density', 'sulphurdioxide_so2_column_number_density_amf', 'sulphurdioxide_so2_slant_column_number_density', 'sulphurdioxide_cloud_fraction', 'sulphurdioxide_sensor_azimuth_angle', 'sulphurdioxide_sensor_zenith_angle', 'sulphurdioxide_solar_azimuth_angle', 'sulphurdioxide_solar_zenith_angle', 'sulphurdioxide_so2_column_number_density_15km', 'month', 'carbonmonoxide_co_column_number_density', 'carbonmonoxide_h2o_column_number_density', 'carbonmonoxide_cloud_height', 'carbonmonoxide_sensor_altitude', 'carbonmonoxide_sensor_azimuth_angle', 'carbonmonoxide_sensor_zenith_angle', 'carbonmonoxide_solar_azimuth_angle', 'carbonmonoxide_solar_zenith_angle', 'nitrogendioxide_no2_column_number_density', 'nitrogendioxide_tropospheric_no2_column_number_density', 'nitrogendioxide_stratospheric_no2_column_number_density', 'nitrogendioxide_no2_slant_column_number_density', 'nitrogendioxide_tropopause_pressure', 'nitrogendioxide_absorbing_aerosol_index', 'nitrogendioxide_cloud_fraction', 'nitrogendioxide_sensor_altitude', 'nitrogendioxide_sensor_azimuth_angle', 'nitrogendioxide_sensor_zenith_angle', 'nitrogendioxide_solar_azimuth_angle', 'nitrogendioxide_solar_zenith_angle', 'formaldehyde_tropospheric_hcho_column_number_density', 'formaldehyde_tropospheric_hcho_column_number_density_amf', 'formaldehyde_hcho_slant_column_number_density', 'formaldehyde_cloud_fraction', 'formaldehyde_solar_zenith_angle', 'formaldehyde_solar_azimuth_angle', 'formaldehyde_sensor_zenith_angle', 'formaldehyde_sensor_azimuth_angle', 'uvaerosolindex_absorbing_aerosol_index', 'uvaerosolindex_sensor_altitude', 'uvaerosolindex_sensor_azimuth_angle', 'uvaerosolindex_sensor_zenith_angle', 'uvaerosolindex_solar_azimuth_angle', 'uvaerosolindex_solar_zenith_angle', 'ozone_o3_column_number_density', 'ozone_o3_column_number_density_amf', 'ozone_o3_slant_column_number_density', 'ozone_o3_effective_temperature', 'ozone_cloud_fraction', 'ozone_sensor_azimuth_angle', 'ozone_sensor_zenith_angle', 'ozone_solar_azimuth_angle', 'ozone_solar_zenith_angle', 'cloud_cloud_fraction', 'cloud_cloud_top_pressure', 'cloud_cloud_top_height', 'cloud_cloud_base_pressure', 'cloud_cloud_base_height', 'cloud_cloud_optical_depth', 'cloud_surface_albedo', 'cloud_sensor_azimuth_angle', 'cloud_sensor_zenith_angle', 'cloud_solar_azimuth_angle', 'cloud_solar_zenith_angle']

In [None]:
import onnxruntime as rt

sess = rt.InferenceSession("rf_iris.onnx", providers=["CPUExecutionProvider"])
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
pred_onx = sess.run([label_name], {input_name: X_test.astype(np.float32)})[0]
