In [None]:
# import libraries
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.metrics import (r2_score, mean_squared_error, median_absolute_error, 
mean_absolute_error, mean_absolute_percentage_error)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV


Code for the TimeBasedCV class adapted from:
https://towardsdatascience.com/time-based-cross-validation-d259b13d42b8

In [None]:
import pandas as pd
import datetime
from datetime import datetime as dt
from dateutil.relativedelta import *


class TimeBasedCV(object):
    '''
    Time-based cross-validation splitter with an expanding training window.

    Adapted from https://towardsdatascience.com/time-based-cross-validation-d259b13d42b8

    Every fold trains on the records dated from the earliest date in the data up to
    (but not including) the fold's end-of-train date. Only the first test window is
    sized with ``freq``/``test_period``; from the second fold onwards the schedule
    held in the class attributes below is used instead.

    Parameters 
    ----------
    train_period: int
        number of time units between the earliest date in the data and the first
        split date; only used when split() is called without validation_split_date
        default is 30
    test_period: int
        number of time units to include in the first test set 
        default is 7
    freq: string
        frequency of input parameters. possible values are: days, months, years, weeks, hours, minutes, seconds
        possible values designed to be used by dateutil.relativedelta class
        default is weeks
    '''

    # Weeks between the starts of two consecutive test windows (second fold onwards).
    FOLD_STEP_WEEKS = 9
    # Length, in weeks, of every test window from the second fold onwards.
    LATER_TEST_WEEKS = 8
    # Fold number whose test window is stretched to the last date in the data.
    FINAL_FOLD = 3

    def __init__(self, train_period=30, test_period=7, freq='weeks'):
        self.train_period = train_period
        self.test_period = test_period
        self.freq = freq
        # Number of folds produced by the most recent split() call (0 until then).
        self.n_splits = 0

    def split(self, data, validation_split_date=None, date_column='transaction_date', gap=0):
        '''
        Generate indices to split data into training and test set
        
        Parameters 
        ----------
        data: pandas DataFrame
            your data, contain one datetime column for the record date 
        validation_split_date: datetime.date
            first date to perform the splitting on. datetime.datetime / pandas
            Timestamp values are reduced to their calendar date.
            if not provided will set to be the minimum date in the data plus
            train_period units of freq
        date_column: string, default='transaction_date'
            date of each record
        gap: int, default=0
            currently NOT used by the splitting logic; kept so existing calls that
            pass it keep working

        Returns 
        -------
        list of tuples (train index, test index) similar to sklearn model selection.
        The values are index *labels* of ``data``.

        Raises
        ------
        KeyError
            if date_column cannot be looked up in data
        TypeError
            if freq is not a keyword accepted by dateutil.relativedelta

        Notes
        -----
        * Train windows use ``date < end_train`` and the test window starts the day
          after the split date, so records dated exactly on a fold's end-of-train
          date fall in neither set.
        * For fold FINAL_FOLD the test window ends at the last date in the data and
          the end bound is exclusive, so records on that last date are not part of
          that test set. NOTE(review): confirm this exclusion is intended.
        '''
        # Local imports: other notebook cells bind the global name `datetime` to both
        # the module and the class, so relying on it depends on cell execution order.
        from datetime import datetime as _datetime, timedelta
        from dateutil.relativedelta import relativedelta

        # check that date_column exist in the data (callers expect a KeyError):
        try:
            dates = data[date_column]
        except Exception as err:
            raise KeyError(date_column) from err

        first_date = dates.min().date()
        last_date = dates.max().date()
        # Calendar date of every record, computed once instead of once per comparison.
        record_dates = dates.dt.date

        if validation_split_date is None:
            # Keyword expansion replaces eval() on a string-built expression.
            validation_split_date = first_date + relativedelta(**{self.freq: self.train_period})
        elif isinstance(validation_split_date, _datetime):
            validation_split_date = validation_split_date.date()

        one_day = timedelta(days=1)

        # initialise start/end of train/test periods
        start_train = first_date
        end_train = validation_split_date
        start_test = validation_split_date + one_day
        end_test = start_test + relativedelta(**{self.freq: self.test_period})

        index_output = []
        fold_number = 1

        while end_test < (last_date + one_day):
            in_train = (record_dates >= start_train) & (record_dates < end_train)
            in_test = (record_dates >= start_test) & (record_dates < end_test)

            cur_train_indices = list(data.index[in_train.to_numpy()])
            cur_test_indices = list(data.index[in_test.to_numpy()])

            print("Train period:",start_train,"-" , end_train, ", Test period", start_test, "-", end_test,
                  "# train records", len(cur_train_indices), ", # test records", len(cur_test_indices))

            # mimic sklearn output: one (train, test) tuple per fold
            index_output.append((cur_train_indices, cur_test_indices))

            fold_number += 1

            # update dates: the train window keeps starting at the first date and
            # grows; the test window moves forward by the fixed step.
            start_test = start_test + timedelta(weeks=self.FOLD_STEP_WEEKS)
            end_train = start_test - one_day
            if fold_number == self.FINAL_FOLD:
                end_test = last_date
            else:
                end_test = start_test + timedelta(weeks=self.LATER_TEST_WEEKS)

        self.n_splits = len(index_output)

        return index_output

    def get_n_splits(self):
        """Returns the number of splitting iterations in the cross-validator
        Returns
        -------
        n_splits : int
            Number of folds produced by the most recent call to split()
            (0 if split() has not been called yet).
        """
        return self.n_splits 

In [None]:
# Load the transactions CSV from the interim data folder; the path is relative,
# so it resolves against the notebook's working directory.
sales = pd.read_csv("../data/interim/transactions_sd_knits_resampled_engin_synth_gt_gb.csv")


In [None]:
def prepare_data(df):
    '''
    Returns a copy of the transactions dataframe with appropriate data types, sorted
    chronologically, and without the columns irrelevant to the modelling process.

    The input dataframe is left untouched, so the calling cell can be re-run safely.

        Parameters:
            df (dataframe): transactions dataframe with engineered features; must
                contain 'transaction_date', 'week_no', 'review', 'month' and 'p_id'

        Returns:
            prepared (dataframe): transformed transactions dataframe, sorted by
                'transaction_date' with a fresh 0..n-1 index
    '''
    prepared = df.copy()
    prepared['transaction_date'] = pd.to_datetime(prepared['transaction_date'])
    # week_no and review are cast to object so they are one-hot encoded downstream.
    prepared = prepared.astype({'week_no': 'object', 'review': 'object'})
    prepared = prepared.drop(columns=['month', 'p_id'])
    # sort_values returns a new frame, so the result has to be kept. A stable sort
    # preserves the original order of same-day records, and the index is rebuilt so
    # that index labels equal row positions after sorting.
    prepared = (prepared
                .sort_values(by='transaction_date', kind='mergesort')
                .reset_index(drop=True))
    return prepared

In [None]:
def one_hot_encode_categorical(df):
    '''
    One hot encodes the dataframe with pandas.get_dummies and removes one fixed
    reference column per encoded feature so that k-1 dummies remain.
        Parameters:
            df (dataframe): transactions dataframe with engineered features
        Returns:
            df_encoded (dataframe): transformed input
    '''
    # Reference levels that are removed after encoding.
    reference_dummies = ['week_no_2', 'label_desc_lab_1', 'color_simple_Other',
                         'review_0.0']
    return pd.get_dummies(df).drop(columns=reference_dummies)

In [None]:
def log_price_quantity(df):
    '''
    Replaces the price and quantity columns with their log(x + 1) transforms,
    stored as price_log and quantity_log. The dataframe is modified in place and
    also returned.

            Parameters:
                df (dataframe): transactions dataframe

            Returns:
                df (dataframe): transformed input
    '''
    raw_columns = ['price', 'quantity']
    for column in raw_columns:
        # the +1 shift keeps a value of 0 from producing log(0)
        df[column + '_log'] = np.log(df[column] + 1)
    df.drop(columns=raw_columns, inplace=True)
    return df

In [None]:
def temporal_test_train_split_for_cv(df, split_date='2021-10-3'):
    '''
    Splits the dataframe at split_date. Rows dated before split_date are returned
    unaltered as the training dataframe (so they can still be handed to
    apply_tscv_split_to_training); rows dated on or after split_date are returned
    as ready-to-use X and y test sets.

        Parameters:
            df (dataframe): transactions dataframe
            split_date (str): date in 'YYYY-MM-DD' (default '2021-10-3')

        Returns:
            df_train (dataframe): dataframe of training data
            X_test (dataframe): X test set (model test inputs)
            y_test_log (series): y test set (model test targets)
    '''
    record_dates = df['transaction_date']
    before_split = record_dates < split_date
    from_split_on = record_dates >= split_date

    df_train = df[before_split]
    held_out = df[from_split_on]

    # the target and the date column are not model inputs
    X_test = held_out.drop(columns=['quantity_log', 'transaction_date'])
    y_test_log = held_out['quantity_log']

    return df_train, X_test, y_test_log

In [None]:
def apply_tscv_split_to_training(df, validation_split_date=None):
    '''
    Generates the time series cross-validation indices for the training data and
    separates the training features from the log target. The folds come from
    TimeBasedCV, which grows the training window while moving the validation
    window forward. To be applied to the df_train dataframe already processed
    through temporal_test_train_split_for_cv.

    Per the original author's note this is set up to create 3 folds on the project
    data; the actual number depends on the date range of df.

        Parameters:
            df (dataframe): df_train; must contain 'transaction_date' and 'quantity_log'
            validation_split_date (datetime.date): end of the first training window.
                Defaults to 2021-04-10 when omitted.
        Returns:
            index_output (list of tuples): train_index, test_index list similar to sklearn model selection
            train_features (dataframe): X train set (model train inputs)
            train_labels_log (series): y train set (model train targets)

    '''
    # Local alias: other notebook cells bind the global name `datetime` to both the
    # module and the class, so using it here would depend on cell execution order.
    import datetime as _dt

    if validation_split_date is None:
        validation_split_date = _dt.date(2021, 4, 10)

    # TimeBasedCV.split returns index labels, while scikit-learn reads the entries
    # of a cv iterable as row positions. A fresh 0..n-1 index makes the two agree
    # even when df is a filtered slice of a larger frame.
    df = df.reset_index(drop=True)

    tscv = TimeBasedCV(train_period=20,
                        test_period=7,
                        freq='weeks')

    index_output = tscv.split(df,
                              validation_split_date=validation_split_date,
                              date_column='transaction_date')

    train_labels_log = df['quantity_log']
    train_features = df.drop(['quantity_log', 'transaction_date'], axis=1)

    return index_output, train_features, train_labels_log

In [None]:
def create_results_dataframes(): 
    '''
    Builds two column-less dataframes, one for training and one for testing data,
    whose rows are the metrics 'R2', 'RMSE', 'MAE', 'MAPE', 'MedAE', 'MedAPE'.
    To be used with print_model_metrics.

            Parameters:

            Returns:
                train_results_df (dataframe): empty dataframe for populating
                test_results_df (dataframe): empty dataframe for populating
    '''
    metric_names = ['R2', 'RMSE', 'MAE', 'MAPE', 'MedAE', 'MedAPE']
    # two independent frames that share the same row labels
    train_results_df, test_results_df = (pd.DataFrame(index=metric_names) for _ in range(2))
    return train_results_df, test_results_df


In [None]:
# Function to output suite of metrics to dataframes
def print_model_metrics(model, y_train_log, X_train, y_test_log, X_test,
                        model_name, train_results_df, test_results_df):
    '''
    Scores a fitted model on the training and the test data and records the
    results. Targets and predictions are mapped back from the log scale with
    exp(x) - 1 before 'R2', 'RMSE', 'MAE', 'MAPE', 'MedAE' and 'MedAPE' are
    computed. Each results dataframe receives a new column named model_name
    (modified in place) and both dataframes are displayed.

        Parameters:
            model (estimator): fitted model to run data through
            y_train_log (*array): log-scale y for training data
            X_train (*array): X for training data
            y_test_log (*array): log-scale y for test data
            X_test (*array): X for test data
            model_name (str): becomes name of column in results dataframes
            train_results_df (dataframe): dataframe to store training metrics
            test_results_df (dataframe): dataframe to store test metrics
        Returns:
            None
    '''

    def _from_log_scale(values_log):
        # inverse of the log(x + 1) transform
        return np.exp(values_log)-1

    def _metric_column(actual, predicted):
        # metrics in the row order used by the results dataframes, rounded to 3 d.p.
        return [
            round(r2_score(actual, predicted),3),
            round(mean_squared_error(actual, predicted)**0.5,3),
            round(mean_absolute_error(actual, predicted), 3),
            round(mean_absolute_percentage_error(actual, predicted)*100,3),
            round(median_absolute_error(actual, predicted),3),
            round(np.median(np.abs((actual - predicted)/actual))*100, 3),
        ]

    print(model_name + ' model:')

    y_train = _from_log_scale(y_train_log)
    y_test = _from_log_scale(y_test_log)
    y_pred_train = _from_log_scale(model.predict(X_train))
    y_pred_test = _from_log_scale(model.predict(X_test))

    # both metric sets are computed before either dataframe is touched
    train_metrics = _metric_column(y_train, y_pred_train)
    test_metrics = _metric_column(y_test, y_pred_test)

    train_results_df[model_name] = train_metrics
    test_results_df[model_name] = test_metrics

    display(train_results_df)
    display(test_results_df)


In [None]:
# Parse transaction dates, cast week_no/review to object and drop the month and p_id columns.
knit_data = prepare_data(sales)

In [None]:
# One-hot encode the frame and drop one fixed reference dummy column per encoded feature.
knit_data = one_hot_encode_categorical(knit_data)

In [None]:
# Replace price and quantity with their log(x + 1) versions, price_log and quantity_log.
knit_data = log_price_quantity(knit_data)

In [None]:
# Hold out every record dated on or after the default split date ('2021-10-3') as the test set.
df_train, X_test, y_test_log = temporal_test_train_split_for_cv(knit_data)

In [None]:
# Build the time-based CV folds on the training frame and separate the features from the log target.
index_output, X_train, y_train_log = apply_tscv_split_to_training(df_train)

In [None]:
# Apply timeseries cv split to the Random Forest regression

In [None]:
def find_best_random_forest_hyperparameters(X_train, y_train_log, index_output):
    '''
    Runs a randomised hyperparameter search for a random forest regressor: 100
    parameter combinations are sampled from the grid defined inside this function
    and evaluated on the supplied cross-validation folds.
        Parameters:
            X_train (dataframe): X for training data (its column count bounds max_features)
            y_train_log (*array): y for training data
            index_output (list of tuples): train_index, test_index list similar to sklearn model selection
        Returns:
            rf_tuned_model (estimator): the fitted RandomizedSearchCV object
    '''
    n_features = len(X_train.columns)

    # Candidate values for every tuned hyperparameter
    random_grid = {
        # number of trees in the forest
        'n_estimators': [int(n_trees) for n_trees in np.linspace(100, 1000, 6)],
        # criterion to measure quality of a split
        'criterion': ['squared_error', 'absolute_error', 'poisson'],
        # number of features to consider at every split
        'max_features': list(range(2, n_features, 2)),
        # maximum number of levels in a tree
        'max_depth': list(range(2, 10)),
        # minimum number of samples required to split a node
        'min_samples_split': [2, 5, 10],
        # minimum number of samples required at each leaf node
        'min_samples_leaf': [1, 2, 4],
        # whether each tree is trained on a bootstrap sample
        'bootstrap': [True, False],
    }

    print(random_grid)

    # Base model whose hyperparameters are tuned
    base_forest = RandomForestRegressor(random_state=0)

    # Randomised search over the grid using the time-based folds and all available cores
    search = RandomizedSearchCV(
        estimator=base_forest,
        param_distributions=random_grid,
        n_iter=100,
        cv=index_output,  # indexes generated from tscv
        verbose=2,
        random_state=42,
        n_jobs=-1,
    )
    search.fit(X_train, y_train_log)

    return search

In [None]:
# Run the following to get result
# rf_tuned_model = find_best_random_forest_hyperparameters(X_train, y_train_log, index_output)

In [None]:
#rf_tuned_model.best_params_

In [None]:
# Create the empty metric tables that print_model_metrics fills with one column per model.
train_results_df, test_results_df = create_results_dataframes()

# Best Random Forest Model:

In [None]:
# Fit the random forest with fixed hyperparameters on the full training set.
# NOTE(review): presumably these values are the best_params_ of the random search
# above - confirm before changing the search space.
rf_best_model = RandomForestRegressor(max_features=50,
                                    max_depth=6,
                                    n_estimators=820,
                                    random_state=0,
                                    min_samples_split=2,
                                    min_samples_leaf=2,
                                    criterion='squared_error',
                                    bootstrap=False
                                    )
rf_best_model.fit(X_train, y_train_log)

# The name becomes the column label in both results dataframes, so it must not
# carry trailing whitespace.
model_name = 'rf_best_model'
print_model_metrics(rf_best_model, y_train_log, X_train, y_test_log, X_test,
                    model_name, train_results_df, test_results_df)

In [None]:
# Print every feature with its importance, from most to least important
feature_names = X_train.columns
coef = rf_best_model.feature_importances_

# positions of the importances in descending order of magnitude
indices = np.abs(coef).argsort()[::-1]

for i in indices:
    print(feature_names[i], ':', coef[i])