In [1]:
import csv
import gc
import json
import pickle
import os

import pandas as pd
import numpy as np
import sklearn as sk

from sklearn.tree import DecisionTreeClassifier
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

## Откроем датасет https://www.kaggle.com/datasets/jsphyg/weather-dataset-rattle-package

In [2]:
with open('datasets/weatherAUS.csv') as f:
    reader = csv.reader(f)
    _data = list(reader)
    df = pd.DataFrame(_data[1:], columns=_data[0])

In [3]:
df

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44,W,...,71,22,1007.7,1007.1,8,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0,,,WNW,44,NNW,...,44,25,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0,,,WSW,46,W,...,38,30,1007.6,1008.7,,2,21,23.2,No,No
3,2008-12-04,Albury,9.2,28,0,,,NE,24,SE,...,45,16,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1,,,W,41,ENE,...,82,33,1010.8,1006,7,8,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,2.8,23.4,0,,,E,31,SE,...,51,24,1024.6,1020.3,,,10.1,22.4,No,No
145456,2017-06-22,Uluru,3.6,25.3,0,,,NNW,22,SE,...,56,21,1023.5,1019.1,,,10.9,24.5,No,No
145457,2017-06-23,Uluru,5.4,26.9,0,,,N,37,SE,...,53,24,1021,1016.8,,,12.5,26.1,No,No
145458,2017-06-24,Uluru,7.8,27,0,,,SE,28,SSE,...,51,24,1019.4,1016.5,3,2,15.1,26,No,No


In [5]:
binary_cls_model = DecisionTreeClassifier(max_depth=100)

In [6]:
# Берем только числовые колонки
columns = ['RainToday', 'MinTemp', 'MaxTemp', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm']
# Убираем пустые значения
df = df[df[columns]!='NA'].dropna(subset=columns)
# Делим на X, Y
X, Y = df.loc[:, df.columns!='RainToday'], df.loc[:, df.columns=='RainToday']
df

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,,,13.4,22.9,,,,,,,...,71,22,1007.7,1007.1,,,16.9,21.8,No,
1,,,7.4,25.1,,,,,,,...,44,25,1010.6,1007.8,,,17.2,24.3,No,
2,,,12.9,25.7,,,,,,,...,38,30,1007.6,1008.7,,,21,23.2,No,
3,,,9.2,28,,,,,,,...,45,16,1017.6,1012.8,,,18.1,26.5,No,
4,,,17.5,32.3,,,,,,,...,82,33,1010.8,1006,,,17.8,29.7,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145454,,,3.5,21.8,,,,,,,...,59,27,1024.7,1021.2,,,9.4,20.9,No,
145455,,,2.8,23.4,,,,,,,...,51,24,1024.6,1020.3,,,10.1,22.4,No,
145456,,,3.6,25.3,,,,,,,...,56,21,1023.5,1019.1,,,10.9,24.5,No,
145457,,,5.4,26.9,,,,,,,...,53,24,1021,1016.8,,,12.5,26.1,No,


In [7]:
binary_cls_model.fit(X[columns[1:]], Y)

In [8]:
binary_cls_model.predict(X.loc[:10, columns[1:]])

array(['No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No'],
      dtype=object)

In [9]:
# Сохраняем модель

In [29]:
with open('models/scikit-learn-binary_cls_model.pckl', 'wb') as f:
    pickle.dump(binary_cls_model, f)

initial_type = [('input', FloatTensorType([None, 8]))]
onx = convert_sklearn(binary_cls_model, initial_types=initial_type)
with open("models/scikit-learn-binary_cls_model.onnx", "wb") as f:
    f.write(onx.SerializeToString())

In [32]:
# Проверяем onnx модель
import onnxruntime as rt

sess = rt.InferenceSession("models/scikit-learn-binary_cls_model.onnx")
input_name = sess.get_inputs()[0].name
pred_onx = sess.run(None, {input_name: np.array([[i for i in range(8)]], dtype=np.float32)})
print(pred_onx)

[array(['No'], dtype=object), [{'No': 1.0, 'Yes': 0.0}]]


## Columns
* 'MinTemp'
* 'MaxTemp'
* 'Humidity9am'
* 'Humidity3pm'
* 'Pressure9am'
* 'Pressure3pm'
* 'Temp9am'
* 'Temp3pm'

## Requirements
* scikit-learn
* pandas