In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (r2_score, mean_absolute_error)

from mip import Model, xsum, maximize, BINARY, OptimizationStatus
from scipy.optimize import linprog

### Build prediction model *Can be modified*

In [None]:
# Load the interim knit transactions CSV (path is relative to the notebook's working directory).
knit_data = pd.read_csv("../data/interim/transactions_sd_knits_resampled_engin_synth_gt.csv")

In [None]:
def create_unseen_data(df, cutoff_date='2021-10-3'):
    '''
    Creates a copy of 'unseen data', that is not used to train the model. 'unseen data' is used to compare real prices
    to predicted optimal prices.

    Dates are compared as parsed datetimes rather than as raw strings, so that e.g. '2021-10-10' is correctly
    treated as later than '2021-10-3' even when 'transaction_date' is still a string column.
        Parameters:
            df (df): Dataframe from which to extract 'unseen data'. Must contain a 'transaction_date' column
                     (strings or datetimes).
            cutoff_date (str or datetime-like): First date (inclusive) that counts as 'unseen'.
        Returns:
            unseen_data (df): Independent copy of the rows on or after the cut-off. Column dtypes are left as in df.
    '''
    # parse only for the comparison; the returned frame keeps the original 'transaction_date' values
    transaction_dates = pd.to_datetime(df['transaction_date'])
    is_unseen = transaction_dates >= pd.to_datetime(cutoff_date)
    unseen_data = df[is_unseen].copy()
    return unseen_data

In [None]:
# Set aside the 'unseen' rows before knit_data is transformed by the later preparation cells.
unseen_data = create_unseen_data(knit_data)

In [None]:
# DO NOT ADD TO CLASS
def prepare_data(df):
    '''
    Prepares the raw transactions dataframe for one hot encoding and modelling.
    Operates on the dataframe that is passed in (in place) rather than on a notebook-level global.
        Parameters:
            df (df): Dataframe containing 'transaction_date', 'week_no', 'review', 'month' and 'p_id' columns.
        Returns:
            df (df): The same dataframe with 'transaction_date' parsed to datetime, 'week_no' and 'review' cast to
                     object (so they are treated as categorical by pd.get_dummies), and 'month' and 'p_id' dropped.
    '''
    df['transaction_date'] = pd.to_datetime(df['transaction_date'])
    # cast to object so that downstream one hot encoding treats these as categories, not numbers
    df['week_no'] = df['week_no'].astype('object')
    df['review'] = df['review'].astype('object')
    df.drop(columns=['month', 'p_id'], inplace=True)
    return df

In [None]:
# Parse dates, cast categorical fields and drop 'month'/'p_id'. Not re-runnable: the dropped columns are gone after the first run.
knit_data = prepare_data(knit_data)

In [None]:
def one_hot_encode_categorical_aa(df):
    '''
    One hot encodes the categorical columns of a dataframe and removes one reference level per encoded variable.
        Parameters:
            df (df): Dataframe to one hot encode.
        Returns:
            df_encoded (df): One hot encoded dataframe with the reference-level columns removed (k-1 columns per variable).
            ohe_dropped_cols (list): Names of the one hot encoded columns that were removed.
    '''
    # reference levels removed so that each categorical variable is represented by k-1 columns
    ohe_dropped_cols = ['week_no_2', 'label_desc_lab_1', 'color_simple_Other', 'review_0.0']
    df_encoded = pd.get_dummies(df).drop(columns=ohe_dropped_cols)
    return df_encoded, ohe_dropped_cols
    

In [None]:
# One hot encode; ohe_dropped_cols is also captured as the default argument of build_X_unseen further down.
knit_data, ohe_dropped_cols = one_hot_encode_categorical_aa(knit_data)

In [None]:
# DO NOT ADD TO CLASS
def log_price_quantity(df):
    '''
    Replaces 'price' and 'quantity' with their log(x + 1) transforms.
    The dataframe is modified in place and also returned.
        Parameters:
            df (df): Dataframe containing 'price' and 'quantity' columns.
        Returns:
            df (df): Same dataframe with 'price_log' and 'quantity_log' added and 'price' and 'quantity' removed.
    '''
    for raw_col in ('price', 'quantity'):
        # +1 keeps zero values finite under the log
        df[raw_col + '_log'] = np.log(df[raw_col] + 1)

    df.drop(columns=['price', 'quantity'], inplace=True)
    return df

In [None]:
# Swap price/quantity for their log(x + 1) versions; the model target is 'quantity_log'.
knit_data = log_price_quantity(knit_data)

In [None]:
def build_prediction_model(df):
    '''
    Fits a random forest regressor on the 'historic' part of the data (transactions before 2021-10-3) and returns
    the fitted model together with the data it was trained on.
        Parameters:
            df (df): Dataframe from which to extract 'historic' data. Must contain 'transaction_date' and 'quantity_log'.
        Returns:
            RF_cen_model (model): Fitted RandomForestRegressor.
            X_train (df): Feature dataframe used to train the random forest model.
            y_train (Series): Target ('quantity_log') used to train the random forest model.
    '''
    historic_rows = df['transaction_date'] < '2021-10-3'
    df_train = df[historic_rows]

    # the date is only used for the train split and is not a model feature
    X_train = df_train.drop(['quantity_log', 'transaction_date'], axis=1)
    y_train = df_train['quantity_log']

    # fixed hyper-parameters ("best parameters")
    rf_params = dict(
        max_features=50,
        max_depth=6,
        n_estimators=820,
        random_state=0,
        min_samples_split=2,
        min_samples_leaf=2,
        criterion='squared_error',
        bootstrap=False,
    )
    RF_cen_model = RandomForestRegressor(**rf_params)
    RF_cen_model.fit(X_train, y_train)

    return RF_cen_model, X_train, y_train

In [None]:
# Fit the model on historic rows; RF_cen_model is read as a global by predict_demand.
RF_cen_model, X_train_historic, y_train_historic = build_prediction_model(knit_data)

### Apply model to unseen data

In [None]:
def build_X_unseen(X_historic, week_no, label_desc, color_simple, price, relative_price, ohe_dropped_cols=ohe_dropped_cols):
    '''
    Builds dataframe containing one row, which is used to predict demand for 'unseen'/future items.
    X_historic is only read; it is never modified by this function.
        Parameters:
            X_historic (df): Dataframe used to train random forest model.
            week_no (int): Week in which 'unseen'/future item is to be sold. For testing pipeline, values between 44 - 52 are sensible.
            label_desc (object): Label to which 'unseen'/future item belongs. Takes the following values: 'lab_1', 'lab_2', 'lab_3', 'lab_4'.
            color_simple (object): Colour of 'unseen'/future item. Takes the following values: 'White', 'Pink', 'Black', 'Other', 'Blue',
                                    'Zebra', 'Yellow', 'Green', 'Brown', 'Cream'.
            price (float): Suggested price of 'unseen'/future item.
            relative_price (float): Suggested relative price of 'unseen'/future item.
            ohe_dropped_cols (list): List of one hot encoded columns that were dropped in data preparation. The default is
                                     the notebook-level list as it was when this function was defined.
        Returns:
            X_unseen (df): One-row float dataframe with exactly the columns of X_historic, in the same order.
        Raises:
            KeyError: If the label or colour is neither a column of X_historic nor one of ohe_dropped_cols, or if an
                      expected feature column ('star_rating', 'google_trends_knit', 'google_trends_colour',
                      'review_-1.0', 'review_1.0') is missing from X_historic.
    '''
    # TODO throw error if variables other than those expected are shown e.g. a new week

    feature_cols = list(X_historic.columns)

    week_no_match = 'week_no_' + str(week_no)
    label_desc_match = 'label_desc_' + label_desc
    color_simple_match = 'color_simple_' + color_simple

    def _level_mask(column, prefix):
        # Boolean mask of the historic rows that belong to one level of a one hot encoded variable.
        if column in X_historic.columns:
            return X_historic[column] == 1
        if column in ohe_dropped_cols:
            # the dropped reference level is implied by all of its sibling dummy columns being 0
            sibling_cols = [col for col in feature_cols if col.startswith(prefix)]
            return X_historic[sibling_cols].sum(axis=1) == 0
        raise KeyError(column)

    # for rating, google trend, and review field, take the median value from historic data based on label and item colour
    # as this information would not be available for predicting demand live
    # TODO if historic data for 1+ years is available, match based on week too - this should give better demand prediction
    same_label_and_colour = _level_mask(label_desc_match, 'label_desc_') & _level_mask(color_simple_match, 'color_simple_')
    matched = X_historic[same_label_and_colour]
    # if a label and colour combination hasn't been seen before take the median of all historic data
    reference = matched if matched.shape[0] > 0 else X_historic

    values = {
        'price_log': np.log(price + 1),
        'price_comp_week': relative_price,
        week_no_match: 1.0,
        label_desc_match: 1.0,
        color_simple_match: 1.0,
    }
    for col in ('star_rating', 'google_trends_knit', 'google_trends_colour'):
        values[col] = reference[col].median()

    # the review dummies are set to whichever sentiment has the larger historic median; ties go to positive
    neg_rev_av = reference['review_-1.0'].median()
    pos_rev_av = reference['review_1.0'].median()
    negative_dominates = neg_rev_av > pos_rev_av
    values['review_-1.0'] = 1.0 if negative_dominates else 0.0
    values['review_1.0'] = 0.0 if negative_dominates else 1.0

    # start from an all-zero row; values for columns the model was not trained on (dropped reference
    # levels, unknown weeks) are ignored so the output matches the training columns exactly
    row = dict.fromkeys(feature_cols, 0.0)
    for col, value in values.items():
        if col in row:
            row[col] = value

    # medians of empty/all-NaN columns are NaN: fill with 0
    X_unseen = pd.DataFrame([row], columns=feature_cols).astype(float).fillna(0)

    return X_unseen

In [None]:
def predict_demand(X_unseen):
    '''
    Predicts demand for a single item using the notebook-level RF_cen_model.
        Parameters:
            X_unseen: Dataframe containing 1 row.
        Returns:
            prediction: Predicted demand for the 'unseen'/future item, converted back from the log scale and rounded.
    '''
    log_demand = RF_cen_model.predict(X_unseen)
    # invert the log(x + 1) transform applied to the training target, then round to whole units
    demand = np.round(np.exp(log_demand) - 1)

    return demand[0]

### Select competing products

In [None]:
# Number of unseen rows per week_no, largest first; n_products passed to the sampling functions must not exceed a week's count.
unseen_data.groupby(['week_no']).size().sort_values(ascending=False)

In [None]:
# Test/unseen data is from week 44 - 52
def select_competing_products(unseen_data, n_products, week_no, pc_lower_price_bound, pc_upper_price_bound, random_state):
    '''
    Randomly selects competing products and related features to test demand prediction and optimisation step.
        Parameters:
            unseen_data (df): Dataframe from which to randomly select competing items. This should be the 'unseen'/future data.
            n_products (int): Number of competing items to select. This should not be greater than the number of items that were actually sold for a particular week.
            week_no (int): Week number to select competing items from. For testing pipeline, values between 44 - 52 are sensible.
            pc_lower_price_bound (int): Percentage value by which to lower the price of an item for testing price optimisation.
            pc_upper_price_bound (int): Percentage value by which to increase the price of an item for testing price optimisation.
            random_state (int): Random state for reproducibility.
        Returns:
            competing_items_dict (dict): A dictionary keyed 'unseen_1', 'unseen_2', ... whose values are lists of
                                         [week_no, label_desc, color_simple, lower price bound, upper price bound].
        Raises:
            ValueError: Raised by DataFrame.sample if n_products exceeds the number of rows available for week_no.
    '''
    week_rows = unseen_data[unseen_data['week_no'] == week_no]
    unseen_sample = week_rows.sample(n=n_products, replace=False, random_state=random_state)
    unseen_sample_details = unseen_sample[['week_no', 'label_desc', 'color_simple', 'price']].reset_index(drop=True)

    competing_items_dict = {}
    for i in range(len(unseen_sample_details)):
        # access fields by label: integer keys on a string-labelled Series rely on deprecated positional fallback
        item = unseen_sample_details.iloc[i]
        price = item['price']
        lpb = round(price - (price * (pc_lower_price_bound/100)), 2)
        upb = round(price + (price * (pc_upper_price_bound/100)), 2)
        competing_items_dict['unseen_' + str(i+1)] = [item['week_no'], item['label_desc'], item['color_simple'], lpb, upb]

    return competing_items_dict

In [None]:
def calc_revenue_get_prices(unseen_data, n_products, week_no, random_state):
    '''
    Returns the actual total revenue and the actual prices of the sampled competing products. Inputs should match
    those passed to select_competing_products() so that the same rows are sampled.
        Parameters:
            unseen_data (df): Dataframe from which to randomly select competing items. This should be the 'unseen'/future data.
            n_products (int): Number of competing items to select. This should not be greater than the number of items that were actually sold for a particular week.
            week_no (int): Week number to select competing items from. For testing pipeline, values between 44 - 52 are sensible.
            random_state (int): Random state for reproducibility.
        Returns:
            total_revenue (float): Sum of price * quantity over the sampled items, rounded to 2 decimals.
            actual_prices (list): Prices of the sampled items, rounded to 2 decimals.
    '''
    in_week = unseen_data['week_no'] == week_no
    sampled = unseen_data[in_week].sample(n=n_products, replace=False, random_state=random_state)

    revenue_per_item = sampled['price'] * sampled['quantity']
    total_revenue = round(revenue_per_item.sum(), 2)
    actual_prices = list(round(sampled['price'], 2))

    return total_revenue, actual_prices

In [None]:
# Both calls use the same n_products, week_no and random_state so that they sample the same rows of unseen_data.
competing_items_dict = select_competing_products(unseen_data=unseen_data, n_products=8, week_no=48, pc_lower_price_bound=10, pc_upper_price_bound=20, random_state=0)
total_revenue, actual_prices = calc_revenue_get_prices(unseen_data=unseen_data, n_products=8, week_no=48, random_state=0)

In [None]:
# Show the sampled items: [week_no, label_desc, color_simple, lower price bound, upper price bound] per key.
competing_items_dict

### Build demand matrix

In [None]:
def build_demand_matrix(min_price, max_price, increment, competing_items_dict, X_historic):
    '''
    Builds the price grid, the grid of price sums and the 3D matrix of predicted demands that feed the price optimisation step.
        Parameters:
            min_price (int): Minimum price to consider for items when predicting their demand.
            max_price (int): Maximum price to consider for items when predicting their demand.
            increment (int): Increment by which to increase searched prices for predicting demand.
            competing_items_dict (dict): Dictionary of competing items; each value is
                                         [week_no, label_desc, color_simple, lower price bound, upper price bound].
            X_historic (df): Dataframe used to train random forest model.
        Returns:
            demand_matrix (ndarray): Predicted demands of dimension (n,k,j) with n the # of items, k the # of candidate prices,
                                     and j the # of price sums considered. Entries for prices outside an item's
                                     [lower bound, upper bound] range are 0.
            prices (ndarray): 1D-array containing the 'k' candidate prices.
            sum_prices (ndarray): 1D-array containing the 'j' price sums considered.
    '''
    n_items = len(competing_items_dict)
    price_grid = list(range(min_price, max_price + increment, increment))
    sum_prices = np.arange(min_price * n_items, max_price * n_items + increment, increment)

    # zero-initialised, so only in-bounds prices need to be filled in
    demand_matrix = np.zeros((n_items, len(price_grid), len(sum_prices)))

    for item_idx, item in enumerate(competing_items_dict.values()):
        week_no, label_desc, color_simple = item[0], item[1], item[2]
        lpb, upb = item[3], item[4]

        for price_idx, candidate_price in enumerate(price_grid):
            # prices outside this item's allowed range keep a demand of 0
            if not (lpb <= candidate_price <= upb):
                continue

            for sum_idx, price_sum in enumerate(sum_prices):
                # candidate price relative to the average price implied by the price sum
                relative_price = candidate_price / (price_sum / n_items)

                # TODO this can be sped up
                X_unseen = build_X_unseen(X_historic=X_historic,
                                          week_no=week_no,
                                          label_desc=label_desc,
                                          color_simple=color_simple,
                                          price=candidate_price,
                                          relative_price=relative_price)

                demand_matrix[item_idx, price_idx, sum_idx] = predict_demand(X_unseen)

    return demand_matrix, np.array(price_grid), sum_prices

In [None]:
# Candidate prices run from 20 to 370 in steps of 2. Slow: one model prediction is made per (item, in-bounds price, price sum).
demand_matrix, prices, sum_prices = build_demand_matrix(min_price=20, max_price=370, increment=2, competing_items_dict=competing_items_dict, X_historic=X_train_historic)

In [None]:
# Inspect the predicted demands; axes are (item, candidate price, price sum).
print(demand_matrix)

In [None]:
# DO NOT ADD TO CLASS
def lp_mip_solver(demands,prices,sum_prices):
    '''
    Chooses one price per item so that predicted revenue is maximised, following the LP bound algorithm in
    `Analytics for an Online Retailer: Demand Forecasting and Price Optimization`.
        Parameters:
            demands (ndarray): Matrix of size (n,k,j) with n the # of items, k the # of price possibilities, and j the
                               # of sums of prices considered.
            prices (ndarray): 1D-array of the 'k' possible prices.
            sum_prices (ndarray): 1D-array of the 'j' price sums, typically ranging from n*min(prices) to n*max(prices).
        Returns:
            optimal_prices (ndarray): Selected price for each item (all zeros if no MIP solution was recorded).
            revenue_prediction (float): Objective value of the recorded MIP solution (0 if none was recorded).
    '''
    n, k, num_loops = np.shape(demands)  # n products, k candidate prices, num_loops price sums

    # sanity check
    assert num_loops == len(sum_prices), 'the demands matrix last dimension is different from len(sum_prices)'
    assert k == len(prices), 'the demands matrix middle dimension is different from len(prices)'

    n_vars = n * k

    # defaults returned when no MIP solution beats the lower bound
    optimum_solution = np.zeros(n_vars)
    revenue_prediction = 0

    # Constraints are written as A*x = b.
    # First n rows: the binary price indicators of item i sum to 1, i.e. each item gets exactly one price.
    assignment_rows = np.array([
        [1 if k*i <= j < k*(i+1) else 0 for j in range(n_vars)]
        for i in range(n)
    ])
    # Last row: the selected prices must add up to the price sum under consideration.
    tiled_prices = np.tile([prices], n)
    A = np.append(assignment_rows, tiled_prices, axis=0)

    # step 1: LP relaxation objective and lower bound for every price sum
    objective_loop, LBk = loop_k(demands, prices, sum_prices, A)

    # step 2 (LP bound Algorithm): order everything by decreasing LP objective
    order = np.argsort(objective_loop)[::-1]
    sum_prices_sorted = sum_prices[order]
    objective_sorted = objective_loop[order]
    demands_sorted = demands[:, :, order]
    LBk_sorted = LBk[order]

    # step 3 (LP bound Algorithm): best lower bound found by the LP relaxations
    k_hat = np.argmax(LBk_sorted)
    LB = LBk_sorted[k_hat]

    # solve the MIP for successive price sums until the remaining LP bounds cannot beat LB
    for ll in range(num_loops):
        # r holds price * demand for every (item, price) pair; the objective is r*x
        r = np.multiply(tiled_prices, np.array(demands_sorted[:, :, ll]).reshape(1, n_vars)).flatten()
        b = np.append(np.ones(n), sum_prices_sorted[ll])  # constraint vector: [1....1 sum_prices[ll]]

        m = Model()
        x = [m.add_var(var_type=BINARY) for _ in range(n_vars)]
        m.objective = maximize(xsum(r[i] * x[i] for i in range(n_vars)))
        for j in range(n + 1):
            m += xsum(A[j, i] * x[i] for i in range(n_vars)) == b[j]

        status = m.optimize()
        solved = status == OptimizationStatus.OPTIMAL

        if solved and m.objective_value > LB:
            # new incumbent: raise the lower bound and record the solution
            k_hat = ll
            LB = m.objective_value
            revenue_prediction = m.objective_value
            optimum_solution = np.array([var.x for var in x])

        if ll == num_loops - 1:
            break
        if solved and LB >= objective_sorted[ll + 1]:
            # the next (and therefore every later) LP bound cannot exceed the current lower bound
            break

    # map the binary selection back to one price per item
    optimal_prices = np.matmul(optimum_solution.reshape(n, k), prices)

    return optimal_prices, revenue_prediction

In [None]:
# DO NOT ADD TO CLASS
def loop_k(demands, prices, sum_prices, A):
    '''
    Solves the LP relaxation of the pricing problem once per price sum.
        Parameters:
            demands (ndarray): Matrix of size (n,k,j): n items, k candidate prices, j price sums.
            prices (ndarray): 1D-array of the 'k' candidate prices.
            sum_prices (ndarray): 1D-array of the 'j' price sums.
            A (ndarray): Equality constraint matrix with n+1 rows (one row per item plus the price-sum row).
        Returns:
            objective_loop (ndarray): LP objective for each price sum (0 where the LP did not solve successfully).
            LBk (ndarray): LP objective minus the largest per-item spread of price * demand, for each price sum
                           (0 where the LP did not solve successfully).
    '''
    n, k, num_loops = np.shape(demands)

    objective_loop = np.zeros(len(sum_prices))
    LBk = np.zeros(len(sum_prices))

    tiled_prices = np.tile([prices], n)

    for idx in range(num_loops):
        demand_slice = demands[:, :, idx]

        # price * demand for every (item, price) pair: the coefficients of the objective to maximise
        revenue = np.multiply(tiled_prices, np.array(demand_slice).reshape(1, k*n))
        rhs = [np.append(np.ones(n), sum_prices[idx])]  # constraint vector: [1....1 sum_prices[idx]]

        # linprog minimises, so the revenue is negated
        lp_sol = linprog(-revenue.flatten(), A_eq=A, b_eq=rhs)
        if lp_sol.status != 0:
            continue

        item_revenue = prices * demand_slice
        spread_per_item = np.max(item_revenue, axis=1) - np.min(item_revenue, axis=1)

        objective_loop[idx] = -lp_sol.fun
        LBk[idx] = -lp_sol.fun - np.max(spread_per_item)

    return objective_loop, LBk

In [None]:
# NOTE: lp_mip_solver returns (optimal_prices, revenue_prediction); despite their names,
# objective_loop receives the optimal prices and LBk the predicted revenue.
objective_loop, LBk = lp_mip_solver(demand_matrix, np.array(prices), sum_prices)

In [None]:
# Re-display the competing items for comparison with the optimisation result printed below.
competing_items_dict

In [None]:
# Compare actual prices/revenue with the optimiser's output (objective_loop = optimal prices, LBk = predicted revenue).
print('Real Prices:' + str(actual_prices) + '\nReal Revenue: ' + str(total_revenue) + '\nPredicted Prices :' + str(objective_loop) + '\nPredicted Revenue: ' + str(LBk))