In [None]:
!pip install xgboost

In [None]:
import os
import pandas as pd

# Walk RUNWAY_DATA_PATH recursively and read every file into its own frame.
# Only .csv and .parquet are accepted; any other file aborts the load.
dfs = []
for root, _subdirs, names in os.walk(RUNWAY_DATA_PATH):
    for name in names:
        path = os.path.join(root, name)
        if name.endswith(".csv"):
            dfs.append(pd.read_csv(path))
        elif name.endswith(".parquet"):
            dfs.append(pd.read_parquet(path))
        else:
            raise ValueError("Not valid file type")

# Stack all frames row-wise (original row labels are kept) and normalise the
# column names to lower case so later cells can refer to them uniformly.
df = pd.concat(dfs)
df.columns = df.columns.map(str.lower)

In [None]:
# Target column the regressor is trained to predict.
y_column = "activepower"

# Feature columns fed to the model (lower-cased to match df.columns).
# NOTE(review): "ambienttemperatue" is kept exactly as written; presumably it
# mirrors the spelling of the column in the source data - confirm before renaming.
X_columns = [
    "ambienttemperatue",
    "bearingshafttemperature",
    "blade1pitchangle",
    "blade2pitchangle",
    "blade3pitchangle",
    "controlboxtemperature",
    "gearboxbearingtemperature",
    "gearboxoiltemperature",
    "generatorrpm",
    "generatorwinding1temperature",
    "generatorwinding2temperature",
    "hubtemperature",
    "mainboxtemperature",
    "nacelleposition",
    "reactivepower",
    "rotorrpm",
    "turbinestatus",
    "winddirection",
    "windspeed",
]

# Feature matrix and target vector selected from the loaded frame.
X_df = df.loc[:, X_columns]
y_df = df.loc[:, y_column]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a validation set; the fixed random_state makes
# the shuffle (and therefore the split) repeatable across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_df,
    y_df,
    test_size=0.2,
    random_state=2024,
)

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Hyperparameters for the regressor. LEARNING_RATE, MAX_DEPTH, ALPHA and
# N_ESTIMATORS are not defined in this notebook's visible cells and are
# expected to be provided from outside (TODO confirm where they are set).
# The same dict is logged to mlflow later, so it is the single source of truth.
params = {
    "objective": "reg:squarederror",
    "learning_rate": LEARNING_RATE,
    "max_depth": MAX_DEPTH,
    "alpha": ALPHA,
    "n_estimators": N_ESTIMATORS,
}

# Every key in `params` is a keyword argument of the constructor, so the dict
# is unpacked directly instead of repeating each entry by hand.
regr = xgb.XGBRegressor(**params)

In [None]:
# Train the regressor; the validation split is passed as eval_set so it is
# evaluated alongside training.
regr.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

# Score the validation split. scikit-learn metrics take the ground truth first
# and the predictions second; MAE/MSE are symmetric so the values are the same
# either way, but the documented order keeps the calls correct if an
# asymmetric metric or sample weights are added later.
y_pred = regr.predict(X_valid)
mae = mean_absolute_error(y_valid, y_pred)
mse = mean_squared_error(y_valid, y_pred)
mse  # last expression of the cell: displays the validation MSE

In [None]:
import mlflow
import pandas as pd


class RunwayModel(mlflow.pyfunc.PythonModel):
    """mlflow pyfunc wrapper around a fitted regressor.

    The wrapped model's predictions are returned together with the caller's
    row identifiers so each prediction can be traced back to its input row.
    """

    def __init__(self, xgb_regressor):
        """Store the regressor whose ``predict`` method will be called."""
        self._regr = xgb_regressor

    def predict(self, context, X):
        """Predict ``activepower`` for every row of ``X``.

        Args:
            context: Unused by this implementation.
            X: DataFrame holding the feature columns plus a ``uuid`` column.

        Returns:
            DataFrame with an ``activepower`` column (the predictions) and a
            ``pred_uuid`` column (the input ``uuid`` values), on a fresh
            0-based index.
        """
        features = X.drop(columns=["uuid"])
        # Reset the index so the ids line up positionally with the freshly
        # built prediction frame when the two are concatenated column-wise.
        row_ids = X[["uuid"]].reset_index(drop=True)
        row_ids = row_ids.rename(columns={"uuid": "pred_uuid"})
        predictions = pd.DataFrame({"activepower": self._regr.predict(features)})
        return pd.concat([predictions, row_ids], axis=1)

In [None]:
import mlflow
import runway

with mlflow.start_run():
    # Wrap the fitted regressor so it can be logged as a pyfunc-style model.
    runway_model = RunwayModel(regr)

    # One randomly drawn row (features + uuid) serves as the input example
    # handed to runway.log_model for the "predict" method.
    input_columns = [*X_columns, "uuid"]
    input_sample = df[input_columns].sample(1)

    # Record the hyperparameters and the validation metrics on this run.
    mlflow.log_params(params)
    for metric_name, metric_value in (("valid_mae", mae), ("valid_mse", mse)):
        mlflow.log_metric(metric_name, metric_value)

    runway.log_model(
        model=runway_model,
        model_name="my-xgboost-regressor",
        input_samples={"predict": input_sample},
    )