# Production code setup

## Capture all the project stages in functions

### Cleaning the data

In [None]:
def cleaning_data(x):
    """Return a cleaned copy of the raw sales dataframe.

    Steps applied:
      * ``month`` and ``wday`` are cast to object dtype.
      * Null ``event_name_1`` values are replaced by ``'no_event'``.
      * Null ``sell_price`` values are replaced by the most frequent
        (mode) price of the same ``item_id``.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain the columns ``month``, ``wday``, ``event_name_1``,
        ``item_id`` and ``sell_price``.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the same index, row order and columns as
        ``x``. The input dataframe is not modified.
    """
    # astype returns a new frame, so the caller's data is left untouched.
    temp = x.astype({"month": "O", "wday": "O"})

    # Fill null values
    temp.loc[temp["event_name_1"].isna(), "event_name_1"] = 'no_event'

    def item_mode(prices):
        """Return the mode of a price series, or NaN if it has no values.

        When several prices tie, the smallest one is used (``mode()``
        returns its results sorted).
        """
        modes = prices.mode()
        if modes.empty:
            # Every price of this item is null: nothing to fill with.
            return float("nan")
        return modes.iloc[0]

    # transform keeps the original index and row order, unlike
    # groupby.apply whose output layout depends on the pandas version.
    per_item_mode = temp.groupby("item_id")["sell_price"].transform(item_mode)

    # Only the missing prices are replaced; existing ones are kept as-is.
    temp["sell_price"] = temp["sell_price"].fillna(per_item_mode)

    return temp

### Create variables

In [None]:
def create_variables(x):
    """Build stock-outage, lag and moving-window features per store/item.

    The dataframe is sorted by ``store_id``, ``item_id`` and ``date`` and
    every feature is computed independently inside each
    (``store_id``, ``item_id``) group, so values never leak from one
    series into another.

    Features created:
      * ``stock_outage_{3,7,15}`` flags (intermediate, removed at the end)
        and their 1-step lags.
      * ``sell_price_lag_1`` .. ``sell_price_lag_7``.
      * ``sales_lag_1`` .. ``sales_lag_15``.
      * ``sales_mvgmin_i``, ``sales_mvgmean_i``, ``sales_mvgmax_i`` for
        ``i`` in 2..15, computed over the ``i`` values preceding each row.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain the columns ``store_id``, ``item_id``, ``date``,
        ``sales``, ``sell_price``, ``d`` and ``wm_yr_wk``.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with a leading ``product_store`` column
        (``store_id + "_" + item_id``), the remaining input columns and
        the generated features. Rows holding a null in any column are
        dropped; ``d``, ``wm_yr_wk``, ``sell_price``, the intermediate
        stock-outage flags, ``item_id`` and ``store_id`` are removed.
    """
    series_keys = ["store_id", "item_id"]

    # Intermittent demand

    def stock_outage(sales, n):
        """Return 1 where the last ``n`` values of ``sales`` are all 0, else 0.

        The first ``n - 1`` positions have an incomplete window and are 0.
        """
        zero_sales = sales.eq(0).astype(int)
        return zero_sales.rolling(n).sum().eq(n).astype(int)

    # Lags variables

    def create_lags(df, variable, n_lags):
        """Return a dataframe with ``variable`` shifted 1..n_lags steps.

        Columns are named ``<variable>_lag_<i>``. Shifting is done within
        each store/item group and the index of ``df`` is preserved.
        """
        grouped = df.groupby(series_keys)[variable]
        return pd.DataFrame({
            variable + "_lag_" + str(i): grouped.shift(i)
            for i in range(1, n_lags + 1)
        })

    # Moving window variables

    def moving_window(df, variable, n_days, label, stat):
        """Return rolling ``stat`` columns for window sizes 2..n_days.

        Each window covers the values *before* the current row (the
        series is shifted by one first), so a row never sees its own
        value. Columns are named ``<variable>_<label>_<i>``.
        """
        grouped = df.groupby(series_keys)[variable]
        return pd.DataFrame({
            variable + "_" + label + "_" + str(i): grouped.transform(
                # window is bound as a default to avoid late binding.
                lambda s, window=i: getattr(s.shift(1).rolling(window), stat)()
            )
            for i in range(2, n_days + 1)
        })

    def moving_minimum(df, variable, n_days):
        """Minimum of the previous i values, for i in 2..n_days."""
        return moving_window(df, variable, n_days, "mvgmin", "min")

    def moving_average(df, variable, n_days):
        """Mean of the previous i values, for i in 2..n_days."""
        return moving_window(df, variable, n_days, "mvgmean", "mean")

    def moving_maximum(df, variable, n_days):
        """Maximum of the previous i values, for i in 2..n_days."""
        return moving_window(df, variable, n_days, "mvgmax", "max")

    # sort_values returns a new frame, so the caller's data is not modified.
    x = x.sort_values(by=series_keys + ["date"])

    for n_zero_days in (3, 7, 15):
        x["stock_outage_" + str(n_zero_days)] = (
            x.groupby(series_keys)["sales"]
            .transform(lambda s, n=n_zero_days: stock_outage(s, n))
        )

    # Join all the dataframes (every piece shares the index of x)
    temp = pd.concat([create_lags(df=x, variable="sell_price", n_lags=7),
                      create_lags(df=x, variable="stock_outage_3", n_lags=1),
                      create_lags(df=x, variable="stock_outage_7", n_lags=1),
                      create_lags(df=x, variable="stock_outage_15", n_lags=1),
                      create_lags(df=x, variable="sales", n_lags=15),
                      moving_minimum(df=x, variable="sales", n_days=15),
                      moving_average(df=x, variable="sales", n_days=15),
                      moving_maximum(df=x, variable="sales", n_days=15)], axis=1)

    x_concat = pd.concat([x, temp], axis=1)

    # Drops the warm-up rows of each series (lags/windows not yet
    # available) as well as any row with a null in an input column.
    x_concat.dropna(inplace=True)

    to_remove = ["d", "wm_yr_wk", "sell_price", "stock_outage_3", "stock_outage_7", "stock_outage_15"]
    x_concat.drop(columns=to_remove, inplace=True)

    # Create a single variable for the product-store feature
    x_concat.insert(loc=0, column="product_store", value=(x_concat["store_id"] + "_" + x_concat["item_id"]))
    x_concat.drop(columns=["item_id", "store_id"], inplace=True)

    return x_concat



### Feature Engineering