# In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# FIXME: breakdown handling: sliding_window_with_breakdown returns unencoded strings
# TODO: label handling: is this best practice? Can I transform back, and do I need to?
# TODO: train test split
# TODO: run prepare_test_data at constructor
# TODO: testing
class DataPreparation:
    """Scale a tabular time series and cut it into sliding-window samples.

    On construction the dataset is one-hot encoded (``handle_labels``),
    min-max scaled (``feature_scaling``) and windowed; the results are stored
    in ``self.X`` and ``self.y``.  The last ``horizon`` rows of the series
    (or of every breakdown group) are withheld from the windows and collected
    in ``self.validate``.

    Attributes:
        X_scaler, Y_scaler: ``MinMaxScaler`` instances fitted on the feature
            columns and on the target column of the training dataset.
        validate: DataFrame of the withheld (already scaled) rows.
        dataset, history_window, horizon, target, breakdown: the constructor
            arguments, stored unchanged (``breakdown`` as a list).
        X, y: arrays produced by the windowing step.
    """

    def __init__(self, dataset, history_window, horizon=1, target_column=None, breakdown_columns=None):
        """Store the configuration and immediately prepare ``self.X`` / ``self.y``.

        Args:
            dataset: DataFrame holding features, the target column and any
                breakdown columns.
            history_window: number of consecutive rows in one input window.
            horizon: number of target values per sample.
            target_column: name of the column to predict.  ``feature_scaling``
                looks this column up by name, so it has to exist in ``dataset``.
            breakdown_columns: columns whose distinct value combinations split
                the dataset into independent series.  ``None`` or empty means
                the dataset is a single series.
        """
        self.X_scaler = MinMaxScaler()
        self.Y_scaler = MinMaxScaler()
        self.validate = pd.DataFrame()
        self.dataset = dataset
        self.history_window = history_window
        self.horizon = horizon
        self.target = target_column
        # A fresh list per instance; a shared mutable default would leak state.
        self.breakdown = list(breakdown_columns) if breakdown_columns is not None else []

        self.prepare_data()

    def prepare_data(self):
        """Encode, scale (fitting the scalers) and window ``self.dataset``.

        Sets ``self.X`` and ``self.y`` and adds the withheld rows to
        ``self.validate``.
        """
        if self.breakdown:
            encoded = self.handle_labels(self.dataset, self.breakdown)
            scaled = self.feature_scaling(encoded, self.target, self.breakdown)
            self.X, self.y = self.sliding_window_with_breakdown(
                scaled, self.history_window, self.horizon, self.target, self.breakdown)
        else:
            encoded = self.handle_labels(self.dataset)
            scaled = self.feature_scaling(encoded, self.target)
            self.X, self.y = self.sliding_window_preprocessing(
                scaled, self.history_window, self.horizon, self.target)

    def prepare_test_data(self, data):
        """Window ``data`` using the scalers fitted on the training dataset.

        Args:
            data: DataFrame with the same columns as the training dataset.

        Returns:
            ``(X_test, y_test)`` where ``X_test`` is scaled and ``y_test`` has
            been mapped back to the original target scale.  No rows are
            withheld for validation.
        """
        if self.breakdown:
            encoded = self.handle_labels(data, self.breakdown)
            scaled = self.feature_scaling(encoded, self.target, self.breakdown, test=True)
            X_test, y_test = self.sliding_window_with_breakdown(
                scaled, self.history_window, self.horizon, self.target, self.breakdown, test=True)
        else:
            encoded = self.handle_labels(data)
            # test=True: reuse the fitted scalers instead of refitting on test data.
            scaled = self.feature_scaling(encoded, self.target, test=True)
            X_test, y_test = self.sliding_window_preprocessing(
                scaled, self.history_window, self.horizon, self.target, test=True)
        return X_test, self.inverse_scaling(y_test)

    def _hold_out(self, rows):
        """Add ``rows`` to ``self.validate``."""
        if self.validate.empty:
            self.validate = rows.copy()
        else:
            self.validate = pd.concat([self.validate, rows])

    def sliding_window_preprocessing(self, sequences, history_window, horizon=1, target_col=None, test=False):
        """Cut one series into overlapping ``(X, y)`` samples.

        The last column is treated as the target; when ``target_col`` is given
        that column is moved to the last position first.  Sample ``i`` takes
        rows ``i .. i+history_window-1`` of all other columns as input and the
        target from the last input row through ``horizon - 1`` rows after it
        as output.  The caller's frame is not modified.

        Args:
            sequences: DataFrame (or 2-D array when ``target_col`` is ``None``
                and ``test`` is true) with one row per time step.
            history_window: rows per input window (at least 1).
            horizon: target values per sample (at least 1).
            target_col: name of the target column, or ``None`` to use the
                last column as it is.
            test: when false, the last ``horizon`` rows are moved to
                ``self.validate`` before windowing.

        Returns:
            ``(X, y)`` arrays of shape ``(n, history_window, n_features)`` and
            ``(n, horizon)``; both are empty 1-D arrays when the series is too
            short for a single sample.

        Raises:
            ValueError: if ``history_window`` or ``horizon`` is below 1.
            KeyError: if ``target_col`` is not a column of ``sequences``.
        """
        if history_window < 1 or horizon < 1:
            raise ValueError("history_window and horizon must both be at least 1")

        if target_col is not None:
            # Reorder on a new frame so the caller's column order is untouched.
            ordered = [col for col in sequences.columns if col != target_col]
            ordered.append(target_col)
            sequences = sequences[ordered]

        if not test:
            self._hold_out(sequences.tail(horizon))
            sequences = sequences.iloc[:max(len(sequences) - horizon, 0)]

        values = np.array(sequences)
        # Rows spanned by one sample: the input window plus the extra
        # horizon - 1 target rows that follow it.
        span = history_window + horizon - 1

        X, y = [], []
        for start in range(len(values) - span + 1):
            end_ix = start + history_window
            X.append(values[start:end_ix, :-1])
            y.append(values[end_ix - 1:start + span, -1])
        return np.array(X), np.array(y)

    def sliding_window_with_breakdown(self, sequences, history_window, horizon=1, target_col=None, breakdown_columns=None, test=False):
        """Window every breakdown group separately and stack the samples.

        Rows are grouped by the distinct value combinations of
        ``breakdown_columns`` (groups in order of first appearance) and each
        group is windowed on its own, so no sample crosses a group boundary.
        Groups with fewer than ``history_window + horizon`` rows are skipped;
        when ``test`` is false that check is applied both before and after the
        last ``horizon`` rows of the group are moved to ``self.validate``.

        NOTE: the breakdown columns stay in the frame handed to the windowing
        step, so they end up in ``X`` unencoded.

        Args:
            sequences: DataFrame containing ``breakdown_columns``.
            history_window: rows per input window.
            horizon: target values per sample.
            target_col: name of the target column.
            breakdown_columns: non-empty list of grouping columns.
            test: when true, no rows are withheld for validation.

        Returns:
            ``(X, y)`` with the samples of all groups concatenated along the
            first axis, or two empty 1-D arrays when no group is long enough.

        Raises:
            ValueError: if ``breakdown_columns`` is ``None`` or empty.
        """
        if not breakdown_columns:
            raise ValueError("breakdown_columns must name at least one column")
        breakdown_columns = list(breakdown_columns)
        min_rows = history_window + horizon

        X_parts, y_parts = [], []
        # groupby matches keys by value, so numpy scalars, quotes inside
        # strings and column names with spaces all work (a hand-built query
        # string does not handle them).
        for _, group in sequences.groupby(breakdown_columns, sort=False):
            if not test:
                if len(group) < min_rows:
                    continue
                self._hold_out(group.tail(horizon))
                group = group.iloc[:max(len(group) - horizon, 0)]

            if len(group) < min_rows:
                continue

            # test=True: the hold-out for this group has already been handled.
            X_group, y_group = self.sliding_window_preprocessing(
                group, history_window, horizon, target_col, test=True)
            X_parts.append(X_group)
            y_parts.append(y_group)

        if not X_parts:
            return np.array([]), np.array([])
        return np.concatenate(X_parts), np.concatenate(y_parts)

    def handle_labels(self, sequences, ignore_cols=None):
        """One-hot encode the ``object``-dtype columns of ``sequences``.

        Encoded columns are named ``OneHot_<value>``, the first category of
        each column is dropped, and the new columns are placed after the
        remaining original ones.

        NOTE: the encoding is derived from the frame passed in, so a frame
        with a different set of category values produces different columns.

        Args:
            sequences: DataFrame to encode.
            ignore_cols: columns to leave untouched even if they are of
                ``object`` dtype.

        Returns:
            A new DataFrame, or ``sequences`` itself when there is nothing to
            encode.
        """
        skipped = set(ignore_cols) if ignore_cols is not None else set()
        label_columns = [
            col
            for dtype, col in zip(sequences.dtypes, sequences.columns)
            if dtype == "object" and col not in skipped
        ]
        if not label_columns:
            return sequences
        dummies = pd.get_dummies(sequences[label_columns], prefix='OneHot', drop_first=True)
        return pd.concat([sequences.drop(label_columns, axis=1), dummies], axis=1)

    def feature_scaling(self, sequences, target_column, ignore_cols=None, test=False):
        """Min-max scale the feature columns and the target column.

        Every column except ``target_column`` and ``ignore_cols`` is treated
        as a feature.  Works on a copy, so the caller's frame keeps its
        original values.

        Args:
            sequences: DataFrame to scale.
            target_column: name of the target column.
            ignore_cols: columns to pass through unscaled.
            test: when false the scalers are (re)fitted on ``sequences``;
                when true the previously fitted scalers are reused.

        Returns:
            A scaled copy of ``sequences``.
        """
        ignore_cols = list(ignore_cols) if ignore_cols is not None else []
        sequences = sequences.copy()
        feature_columns = sequences.drop([target_column] + ignore_cols, axis=1).columns.tolist()
        # The scaler expects a 2-D input, hence the single-column reshape.
        target_values = sequences[target_column].values.reshape(-1, 1)

        if not test:
            self.X_scaler.fit(sequences[feature_columns])
            self.Y_scaler.fit(target_values)

        scaled_features = self.X_scaler.transform(sequences[feature_columns])
        scaled_target = self.Y_scaler.transform(target_values)

        sequences[feature_columns] = scaled_features
        sequences[target_column] = scaled_target.ravel()
        return sequences

    def inverse_scaling(self, y):
        """Map scaled target values back to the original target scale.

        Args:
            y: array of scaled target values of any shape.

        Returns:
            Float array with the same shape as ``y``.
        """
        values = np.asarray(y, dtype=float)
        if values.size == 0:
            return values
        # Y_scaler was fitted on a single column, so feed it one column and
        # restore the caller's shape afterwards.
        restored = self.Y_scaler.inverse_transform(values.reshape(-1, 1))
        return restored.reshape(values.shape)