In [17]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import log_loss

class SurrogateLossLearner:
    """Fit an energy-based classifier against labels produced by a time prior.

    Pipeline implemented by :meth:`predict`:

    1. ``_prior`` writes a 0/1 ``preds_time`` column derived only from the
       ``Time`` column and the household index.
    2. ``_cv`` scores a ``DecisionTreeClassifier`` on the single ``Energy``
       feature for several tree depths, using ``_surrogate_loss`` on each of
       five shuffled folds.
    3. The depth with the lowest mean loss is selected and its model (refit
       on all rows) labels every row as ``preds_new``.

    The input frame must provide ``Time`` and ``Energy`` columns.
    """

    # household_idx -> inclusive (start, end) range of ``Time`` that the prior
    # labels 0; every other row is labelled 1.
    # NOTE(review): the unit of ``Time`` is not visible in this class - confirm.
    _PRIOR_ZERO_WINDOWS = {1: (24, 28), 2: (18, 40), 3: (24, 28)}

    # Tree depths compared during model selection.  The original list started
    # at 0, which scikit-learn rejects (max_depth must be >= 1 or None).
    # NOTE(review): (1, 2, 3) keeps three candidates - adjust if a different
    # search range was intended.
    _CANDIDATE_DEPTHS = (1, 2, 3)

    def __init__(self):
        pass

    def predict(self, df, household_idx):
        """Label every row of *df* with the best cross-validated tree.

        Args:
            df: DataFrame with ``Time`` and ``Energy`` columns.  It is
                modified in place: ``preds_time`` and ``preds_new`` are added.
            household_idx: Household identifier selecting the time prior.

        Returns:
            The same DataFrame with the ``preds_time`` and ``preds_new``
            columns set.

        Raises:
            ValueError: If no prior exists for *household_idx* (and *df* does
                not already carry a ``preds_time`` column).
        """
        # Line1: prior labels from time of day.
        df = self._prior(df, household_idx)
        if "preds_time" not in df.columns:
            raise ValueError(
                f"no time prior is defined for household_idx={household_idx!r}"
            )

        # Line2: cross-validate one model per candidate depth.
        scores = []
        models = []
        for depth in self._CANDIDATE_DEPTHS:
            score, model = self._cv(df, depth)
            scores.append(score)
            models.append(model)

        # Line3: keep the candidate with the smallest mean surrogate loss.
        best_model = models[int(np.argmin(scores))]

        # Line4, 5: relabel all rows with the selected model.
        df["preds_new"] = best_model.predict(df.Energy.values.reshape(-1, 1))
        return df

    def _cv(self, df, param):
        """Cross-validate a depth-*param* tree on ``Energy`` -> ``preds_time``.

        Args:
            df: DataFrame with ``Energy`` and ``preds_time`` columns.
            param: ``max_depth`` passed to ``DecisionTreeClassifier``.

        Returns:
            ``(mean_loss, model)``: the surrogate loss averaged over the five
            folds, and a tree of the same depth refit on all rows of *df* so
            the caller can use it for prediction.
        """
        # TODO: whether or not use Energy only?
        # scikit-learn expects a 2-D feature matrix, hence the reshape.
        features = df.Energy.values.reshape(-1, 1)
        labels = df.preds_time.values

        kf = KFold(n_splits=5, random_state=0, shuffle=True)
        losses = []
        for train_idx, test_idx in kf.split(features):
            model_energy = DecisionTreeClassifier(max_depth=param)
            model_energy.fit(features[train_idx], labels[train_idx])
            preds = model_energy.predict_proba(features[test_idx])
            losses.append(self._surrogate_loss(labels[test_idx], preds))

        final_model = DecisionTreeClassifier(max_depth=param)
        final_model.fit(features, labels)
        return np.mean(losses), final_model

    def _surrogate_loss(self, y, preds):
        """Noise-corrected log loss of *preds* against the prior labels *y*.

        The result is
        ``((1 - rate_a) * log_loss(y) - rate_b * log_loss(1 - y)) / det``
        where the rates and ``det`` are built from how often the hard
        predictions (arg-max of *preds*) agree or disagree with *y*.
        NOTE(review): the rate definitions are carried over unchanged from the
        original code; confirm them against the intended noisy-label
        estimator.

        Args:
            y: 1-D array of 0/1 labels.
            preds: ``(n_samples, 2)`` class-probability matrix whose columns
                are ordered as classes ``[0, 1]``.

        Returns:
            The corrected loss as a float, or ``inf`` when the denominator is
            zero so that such a candidate is never selected.

        Raises:
            ValueError: If *preds* does not have exactly two columns (e.g. a
                training fold contained a single class).
        """
        y = np.asarray(y)
        preds = np.asarray(preds)
        if preds.ndim != 2 or preds.shape[1] != 2:
            raise ValueError(
                "preds must have shape (n_samples, 2); got "
                f"{preds.shape}. A training fold may contain only one class."
            )

        # The rates below are about hard decisions, so collapse the
        # probabilities first.  Comparing the probability matrix itself with
        # 0/1 (as the original did) yields per-column arrays, not scalars.
        hard = preds.argmax(axis=1)

        # Fractions of all samples where prediction and label disagree/agree.
        raw_neg = np.mean((y == 0) & (hard == 1))
        raw_pos = np.mean((y == 1) & (hard == 0))
        det = 1 - raw_pos - raw_neg
        if np.isclose(det, 0.0):
            # Every prediction disagrees with its label: the ratio is
            # undefined, and a nan would corrupt np.argmin in the caller.
            return np.inf

        agree_pos = np.mean((y == 1) & (hard == 1))
        agree_neg = np.mean((y == 0) & (hard == 0))
        conditional_noise_rate_neg = (1 - agree_pos + agree_neg) / 2
        conditional_noise_rate = (1 - raw_pos + raw_neg) / 2

        # labels=[0, 1] keeps log_loss defined when a fold holds one class.
        loss_pos = log_loss(y, preds, labels=[0, 1])
        # Loss against the flipped labels.  The original used abs(-y), which
        # equals y for 0/1 labels and therefore never flipped anything.
        loss_neg = log_loss(1 - y, preds, labels=[0, 1])

        num = (
            (1 - conditional_noise_rate_neg) * loss_pos
            - conditional_noise_rate * loss_neg
        )
        return num / det

    def _prior(self, df, household_idx=1):
        """Add the time-of-day prior labels to *df* as ``preds_time``.

        Rows whose ``Time`` lies inside the household's inclusive window get
        label 0, all other rows get label 1.  Household 2 uses ``[18, 40]``;
        households 1 and 3 use ``[24, 28]``.

        Args:
            df: DataFrame with a ``Time`` column; modified in place.
            household_idx: Household identifier selecting the window.

        Returns:
            *df* with ``preds_time`` set, or *df* unchanged when no window is
            defined for *household_idx*.
        """
        window = self._PRIOR_ZERO_WINDOWS.get(household_idx)
        if window is None:
            return df

        start, end = window
        time_of_day = df["Time"]
        inside = (time_of_day >= start) & (time_of_day <= end)
        # Vectorised form of "0 inside the window, 1 outside".
        df["preds_time"] = (~inside).astype(int)
        return df

In [18]:
import pandas as pd
# Household whose CSV files are loaded; the same index is later passed to the
# learner to select its time prior.
target_idx = 3
# Value of the ``Season`` column to keep.
# NOTE(review): the name suggests 0 encodes summer - confirm against the data.
summer_idx = 0

target_X = pd.read_csv(f"./deep_occupancy_detection/data/{target_idx}_X_train.csv")
# Compute the row filter once so features and labels are cut identically.
season_mask = target_X.Season == summer_idx
target_y_task = pd.read_csv(f"./deep_occupancy_detection/data/{target_idx}_Y_train.csv")[season_mask]
# .copy() makes the filtered frame independent of the full one, so adding a
# column below is a plain assignment rather than a write into a slice
# (which triggers pandas' SettingWithCopyWarning).
target_X = target_X[season_mask].copy()

# Repeat the slot numbers 12..43 once per block of 32 rows.  As before, this
# raises a length-mismatch error if the row count is not a multiple of 32.
# NOTE(review): assumes rows are ordered in consecutive blocks of 32 - confirm.
target_X["Time"] = list(range(12, 44)) * (target_X.shape[0] // 32)

In [19]:
# Build the learner and run it on the season-filtered frame for household
# `target_idx`.  predict() reads an `Energy` column from the frame; the
# traceback shown below indicates that the loaded data does not contain one.
surrogate_leaner = SurrogateLossLearner()
# Last expression of the cell, so the returned DataFrame is displayed.
surrogate_leaner.predict(df=target_X, household_idx=target_idx)

AttributeError: 'DataFrame' object has no attribute 'Energy'