In [1]:
import settings

import os

import pandas as pd

from sklearn.ensemble import HistGradientBoostingClassifier

# Base data directory, taken from the project's `settings` module.
DATA_DIR = settings.DATA_DIR

[INFO] Settings: Start initializing.
[INFO] Settings: Folder already exists in Dir /home/loe/Documents/Beijing-Internship/work/project/demo_re/src/../config/, skipping creation.
[INFO] Settings: Reading from file.
[INFO] Settings: Performing integrity check.
[INFO] Settings: New config file generated.
[INFO] Settings: Apply environment variable.
[INFO] Settings: Folder already exists in Dir /home/loe/Documents/Beijing-Internship/work/project/demo_re/src/../data/, skipping creation.
[INFO] Settings: Folder already exists in Dir /home/loe/Documents/Beijing-Internship/work/project/demo_re/src/../raw/, skipping creation.
[INFO] Settings: Folder already exists in Dir /home/loe/Documents/Beijing-Internship/work/project/demo_re/src/../config/, skipping creation.
[INFO] Settings: Folder already exists in Dir /home/loe/Documents/Beijing-Internship/work/project/demo_re/src/../model/, skipping creation.
[INFO] Settings: Folder already exists in Dir /home/loe/Documents/Beijing-Internship/work/proj

In [2]:
# CSV file read by the loading cell, located inside DATA_DIR.
DATA_PATH = os.path.join(DATA_DIR, "processed.csv")

# Fraction of rows that ends up in the training split.
DATA_SPLIT_RATIO = 0.7

# Columns used as model inputs (X).
_INPUT_COLUMNS = [
    "inlet flow",
    "inlet COD",
    "inlet ammonia nitrogen",
    "inlet total nitrogen",
    "inlet phosphorus",
    "outlet COD",
    "outlet ammonia nitrogen",
    "outlet total nitrogen",
    "outlet phosphorus",
    "line 1 nitrate nitrogen",
    "line 2 nitrate nitrogen",
]

# The four pump-speed columns (Y).
_PUMP_COLUMNS = [
    "line 1 pump speed",
    "line 2 pump speed",
    "PAC pump 1 speed",
    "PAC pump 2 speed",
]

# Full ordered column list: inputs first, pump speeds last.
RAW_COLUMNS = _INPUT_COLUMNS + _PUMP_COLUMNS

# Independent copies, so editing one list never affects the others.
X_COLUMNS = list(_INPUT_COLUMNS)
Y_COLUMNS = list(_PUMP_COLUMNS)

# Single column name used as the classification target.
TGT_COLUMNS = "line 1 pump speed discrete"

In [3]:
def load_data(data_path, split_ratio=None) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Read the CSV at ``data_path`` and split it into train and validation frames.

    The validation frame is one contiguous block of rows cut out of the middle
    of the file; the training frame is the rows before that block concatenated
    with the rows after it. Row order is preserved in both frames.

    Parameters
    ----------
    data_path : str or path-like
        CSV file whose first column is named ``timestamp``; it is parsed as
        dates and used as the index.
    split_ratio : float, optional
        Fraction of rows assigned to the training frame. When ``None``
        (the default) the module-level ``DATA_SPLIT_RATIO`` is used.

    Returns
    -------
    (train_data, val_data) : tuple of pandas.DataFrame

    Raises
    ------
    ValueError
        If ``split_ratio`` is outside ``[0, 1]``.
    """
    if split_ratio is None:
        split_ratio = DATA_SPLIT_RATIO
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(f"split_ratio must be within [0, 1], got {split_ratio!r}")

    data = pd.read_csv(
        data_path,
        low_memory=False,
        index_col=0,
        parse_dates=["timestamp"],
    )

    n_rows = data.shape[0]
    train_size = int(n_rows * split_ratio)
    val_size = n_rows - train_size

    # The validation block starts after the first half of the training rows.
    val_start = train_size // 2
    val_stop = val_start + val_size

    # .iloc makes it explicit that the slices are positional, not label-based.
    val_data = data.iloc[val_start:val_stop]
    train_data = pd.concat([data.iloc[:val_start], data.iloc[val_stop:]])
    return train_data, val_data

# Load the CSV at DATA_PATH and unpack the two returned frames.
train_data, val_data = load_data(DATA_PATH)

In [4]:
# Index range, dtypes and non-null counts of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 45493 entries, 2021-01-25 17:08:00 to 2022-09-01 17:10:00
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   inlet flow                  36920 non-null  float64
 1   inlet COD                   45490 non-null  float64
 2   inlet ammonia nitrogen      45490 non-null  float64
 3   inlet total nitrogen        45102 non-null  float64
 4   inlet phosphorus            45193 non-null  float64
 5   outlet COD                  44956 non-null  float64
 6   outlet ammonia nitrogen     37037 non-null  float64
 7   outlet total nitrogen       45387 non-null  float64
 8   outlet phosphorus           45385 non-null  float64
 9   line 1 nitrate nitrogen     45493 non-null  float64
 10  line 2 nitrate nitrogen     45493 non-null  float64
 11  line 1 pump speed           45493 non-null  float64
 12  line 2 pump speed           45493 non-null  float64
 

In [5]:
# Index range, dtypes and non-null counts of the validation frame.
val_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 19497 entries, 2021-11-09 11:20:00 to 2022-03-26 19:00:00
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   inlet flow                  19456 non-null  float64
 1   inlet COD                   19492 non-null  float64
 2   inlet ammonia nitrogen      19497 non-null  float64
 3   inlet total nitrogen        18749 non-null  float64
 4   inlet phosphorus            18857 non-null  float64
 5   outlet COD                  19449 non-null  float64
 6   outlet ammonia nitrogen     13364 non-null  float64
 7   outlet total nitrogen       19394 non-null  float64
 8   outlet phosphorus           19452 non-null  float64
 9   line 1 nitrate nitrogen     19497 non-null  float64
 10  line 2 nitrate nitrogen     19497 non-null  float64
 11  line 1 pump speed           19497 non-null  float64
 12  line 2 pump speed           19497 non-null  float64
 

In [17]:
# Fixed seed so the fit is reproducible. With more than 10,000 training rows
# the estimator's default early_stopping='auto' holds out a random internal
# validation split, which makes the result vary between runs when unseeded.
RANDOM_STATE = 42

hgbc = HistGradientBoostingClassifier(
    max_iter=200,
    learning_rate=0.01,
    random_state=RANDOM_STATE,
)

# Fit on the training frame: X_COLUMNS as inputs, TGT_COLUMNS as the label.
train_data_X = train_data[X_COLUMNS].to_numpy()
train_data_y = train_data[TGT_COLUMNS].to_numpy()
hgbc.fit(train_data_X, train_data_y)

# Predict on the validation frame.
val_data_X = val_data[X_COLUMNS].to_numpy()
val_data_y = val_data[TGT_COLUMNS].to_numpy()
prediction = hgbc.predict(val_data_X)

# Accuracy = fraction of validation rows whose predicted label matches.
# NOTE(review): the previously recorded (unseeded) run scored about 0.083;
# worth comparing against a majority-class baseline before relying on it.
accuracy = (prediction == val_data_y).mean()
print(accuracy)

0.08283325639842026
