In [None]:
# !pip install pandas scikit-learn

In [None]:
import os
import pandas as pd

# Collect every CSV/Parquet file found under TRAIN_DATA into one DataFrame.
# os.walk yields directories and files in arbitrary, filesystem-dependent
# order, so both are sorted here to make the row order of `df` reproducible
# (a later cell keeps rows by position with tail()).
dfs = []
for dirname, subdirs, filenames in os.walk(TRAIN_DATA):
    subdirs.sort()  # in-place sort controls the order os.walk descends in
    for filename in sorted(filenames):
        path = os.path.join(dirname, filename)
        if filename.endswith(".csv"):
            d = pd.read_csv(path)
        elif filename.endswith(".parquet"):
            d = pd.read_parquet(path)
        else:
            # Any other file type under TRAIN_DATA is treated as an error.
            raise ValueError("Not valid file type")
        dfs.append(d)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when no data files were found.
if not dfs:
    raise ValueError(f"No .csv or .parquet files found under {TRAIN_DATA!r}")
df = pd.concat(dfs)

In [None]:
# Use the "datetime" column as the index, discard the "id" column, and keep
# only the last 1000 rows of the combined frame.
proc_df = (
    df
    .set_index("datetime")
    .drop(columns=["id"])
    .tail(1000)
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. A fixed random_state makes the
# shuffle, and therefore every downstream metric, reproducible across
# kernel restarts.
train, valid = train_test_split(proc_df, test_size=0.2, random_state=42)

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


class PCADetector:
    """Anomaly detector that scores rows by PCA reconstruction error.

    Inputs are standardized with a ``StandardScaler``, projected with ``PCA``
    and mapped back to the standardized space. The mean squared difference
    between a standardized row and its reconstruction is its anomaly score.
    """

    def __init__(self, n_components):
        """Create an unfitted detector.

        Args:
            n_components: Passed unchanged to ``PCA(n_components=...)``.
        """
        # Ellipsis placeholder; replaced by the training columns in fit().
        self._use_columns = ...
        self._scaler = StandardScaler()
        self._pca = PCA(n_components=n_components)

    def fit(self, X):
        """Fit the scaler and the PCA on ``X`` and remember its columns.

        Args:
            X: DataFrame of training features.

        Returns:
            The detector itself, so calls can be chained.
        """
        self._use_columns = X.columns
        X_scaled = self._scaler.fit_transform(X)
        self._pca.fit(X_scaled)
        return self

    def predict(self, X):
        """Score every row of ``X``.

        Args:
            X: DataFrame containing at least the columns seen in ``fit``.

        Returns:
            DataFrame with a single ``"anomaly_score"`` column, indexed like
            ``X``, holding the per-row mean squared reconstruction error
            computed in the standardized space.
        """
        X = X[self._use_columns]
        X_scaled = self._scaler.transform(X)
        recon = self._recon(X_scaled)
        # Average the squared error over the feature axis: one score per row.
        recon_err = ((X_scaled - recon) ** 2).mean(1)
        recon_err_df = pd.DataFrame(recon_err, columns=["anomaly_score"], index=X.index)
        return recon_err_df

    def _recon(self, X):
        """Project already-scaled data through the PCA and back again."""
        z = self._pca.transform(X)
        recon = self._pca.inverse_transform(z)
        return recon

    def reconstruct(self, X):
        """Return the PCA reconstruction of ``X`` in the original units.

        Args:
            X: DataFrame containing at least the columns seen in ``fit``.

        Returns:
            DataFrame indexed like ``X`` whose columns are the training
            columns, holding the reconstructed values after undoing the
            standardization.
        """
        # Select and order the training columns exactly as predict() does, so
        # extra or reordered columns in X neither reach the scaler nor
        # mislabel the output.
        X = X[self._use_columns]
        X_scaled = self._scaler.transform(X)
        recon_scaled = self._recon(X_scaled)
        recon = self._scaler.inverse_transform(recon_scaled)
        recon_df = pd.DataFrame(recon, index=X.index, columns=self._use_columns)
        return recon_df

In [None]:
import runway

# start run
runway.start_run()

# log param
parameters = {"n_components": N_COMPONENTS}

runway.log_parameters(parameters)

# Fit on the training split, then score both splits.
detector = PCADetector(n_components=parameters["n_components"])
detector.fit(train)

train_pred = detector.predict(train)
valid_pred = detector.predict(valid)

# log metric
# predict() returns a one-column DataFrame, and DataFrame.mean() on it yields
# a Series rather than a number. Reduce the "anomaly_score" column to a plain
# float so a scalar value is logged.
mean_train_recon_err = float(train_pred["anomaly_score"].mean())
mean_valid_recon_err = float(valid_pred["anomaly_score"].mean())

runway.log_metric("mean_train_recon_err", mean_train_recon_err)
runway.log_metric("mean_valid_recon_err", mean_valid_recon_err)

In [None]:
import runway

# log model
# One randomly drawn row of proc_df is passed as the example input for "predict".
input_sample = proc_df.sample(n=1)
runway.log_model(
    model_name="pca-model",
    model=detector,
    input_samples={"predict": input_sample},
)

# stop run
runway.stop_run()