## Feature Creation
- Statistics (Maximum, Minimum, Mean, Standard deviation, and Median) of input variables like Sales value.
    - Output: The statistics listed above, added to the original dataframe at the given granularity for each input variable (Sales, Unit sales, etc.).
- Time trend variables - Ratio
    - Output: Ratio of time trend variables as a new column for each input variable.
    - The user also supplies the lag periods for which ratios are required, e.g. 1, 3, 6 gives the ratio of the current week to the previous week, to the third previous week, and to the sixth previous week. The period can be a month or a quarter instead of a week.
- Output files with the added features are saved in the Output/Feature_Creation folder.

In [1]:
import numpy as np
import pandas as pd
import datetime
pd.options.mode.chained_assignment = None

In [2]:
def stats(df,
          date_col,
          gran_list,
          feature_col,
          period_level,
          *args):
    """
    Function to add statistics (max, min, mean, std, median) of the required features.

        Parameters in order:
            df : Dataframe. It is not modified; the function works on a copy.
            date_col : Date column, datetime64 or any value pd.to_datetime can parse
            gran_list (list): Granularity list e.g. ['Segment','Retailer','Manufacturer','Brand','APN']
                              for Segment X Retailer X ... X APN
            feature_col : Column for which statistics are required
            period_level : Name of an existing period column in df, e.g. a Week column or Month column
            *args : Optional further feature columns for which statistics are required.
                    Columns named more than once are processed only once.

        Returns:
            Copy of df with an added 'Year' column and, for every feature, the columns
            <feature>_max, <feature>_min, <feature>_mean, <feature>_std and <feature>_median.
            The statistics are computed per granularity X Year X period_level group and
            repeated on every row of that group. The statistics are joined back with an
            inner merge, so rows whose grouping keys are missing from the aggregate
            (e.g. keys that are NaN) are not returned.

        Side effects:
            Writes the result to ../Output/Feature_Creation/Stats_features_added.csv
            (relative to the current working directory), creating the folder if needed.

    """
    import os

    # Work on a copy so the caller's dataframe keeps its columns and dtypes.
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col])
    df['Year'] = df[date_col].dt.year

    keys = list(gran_list) + ['Year', period_level]

    # feature_col first, then the optional columns, without repeats (a repeated
    # column would otherwise produce clashing output column names).
    features = list(dict.fromkeys((feature_col,) + args))

    # One named aggregation per (feature, statistic). String names select the
    # pandas implementations; 'std' is the sample standard deviation, which is
    # also what pandas computes when np.std is passed to agg.
    named_aggs = {}
    for col in features:
        for stat in ('max', 'min', 'mean', 'std', 'median'):
            named_aggs[col + '_' + stat] = (col, stat)

    dg = df.groupby(keys).agg(**named_aggs).reset_index()
    df = pd.merge(df, dg, on=keys)

    # exporting
    out_path = '../Output/Feature_Creation/Stats_features_added.csv'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df.to_csv(out_path, index=False)

    return df

In [3]:
def time_trend_ratio(df,
                     date_col,
                     units_col,
                     gran_list,
                     period = 'weekly', # weekly, monthly, quarterly
                     lag_period = [1,2],  # only read, never mutated
                     *args):
    """
    Function to get time trend variables for the given inputs. Gives ratio of current period feature/lag period feature.

        Parameters in order:
            df : Dataframe. It is not modified; the function works on a copy.
            date_col : Date column, datetime64 or any value pd.to_datetime can parse
            units_col : Feature column. Can be Units column, Sales column etc.
            gran_list (list): Granularity list across which to do calculations.
                              e.g. ['Segment','Retailer','Manufacturer','Brand','APN'] for Segment X Retailer .. X APN
            period : Period level on which to calculate. A value containing 'week' selects weeks,
                     one containing 'month' selects months, anything else selects quarters.
            lag_period (list)(int): List of lag period required e.g for previous two weeks, lag_period=[1,2]
            *args : Optional feature columns. Can be volume sales, sales column, etc.
                    Columns named more than once are processed only once.

        Returns:
            Copy of df with added columns:
                Year, Period      : period keys. For weekly periods these are the ISO year and
                                    ISO week, so a week spanning New Year stays one period.
                Agg_<feature>     : feature summed per granularity X Year X Period
                Ratio_<lag>_<feature> : Agg of the current period / Agg of the period <lag> rows earlier
            Lags are taken by row position within each granularity group after sorting by
            Year and Period, so a period missing from the data makes the lag reach further back.
            The first <lag> periods of a group have no lag value and get NaN. Infinite ratios
            (non-zero value over a zero lag value) are replaced with 0.0.

        Side effects:
            Writes the result to ../Output/Feature_Creation/Time_trend_features_added.csv
            (relative to the current working directory), creating the folder if needed.

    """
    import os

    # Work on a copy so the caller's dataframe keeps its columns and dtypes.
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col])
    dates = df[date_col]

    period_key = period.lower()
    if 'week' in period_key:
        # ISO week numbers belong to the ISO year, not the calendar year:
        # 2017-01-01 is week 52 of ISO year 2016. Pairing the week with the
        # calendar year would merge it with the last week of December 2017.
        iso = dates.dt.isocalendar()
        # isocalendar() yields nullable unsigned ints; use plain int64 unless
        # missing dates force a nullable dtype.
        int_dtype = 'Int64' if dates.isna().any() else 'int64'
        df['Year'] = iso['year'].astype(int_dtype)
        df['Period'] = iso['week'].astype(int_dtype)
    else:
        df['Year'] = dates.dt.year
        if 'month' in period_key:
            df['Period'] = dates.dt.month
        else:
            df['Period'] = dates.dt.quarter

    gran = list(gran_list)
    keys = gran + ['Year', 'Period']

    # units_col first, then the optional columns, without repeats.
    features = list(dict.fromkeys((units_col,) + args))

    # Period totals, ordered so that shifting inside a granularity group moves
    # back in time one period per row.
    dg = df.groupby(keys)[features].sum().reset_index()
    dg = dg.sort_values(keys, ignore_index=True)

    # Ratios are built from the shifted series directly. No helper 'lag_*'
    # columns are created, so nothing has to be dropped by name pattern
    # afterwards (a pattern drop would also remove user columns such as 'Flag').
    grouped = dg.groupby(gran)
    ratios = {}
    for col in features:
        for lag in lag_period:
            ratio = dg[col] / grouped[col].shift(lag)
            ratios['Ratio_' + str(lag) + '_' + str(col)] = ratio.replace([np.inf, -np.inf], 0.0)

    dg = dg.assign(**ratios)
    dg = dg.rename(columns={col: 'Agg_' + str(col) for col in features})

    df = pd.merge(df, dg, on=keys)

    # exporting
    out_path = '../Output/Feature_Creation/Time_trend_features_added.csv'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df.to_csv(out_path, index=False)

    return df

### Validation on Model dataset

In [4]:
# Load the dataset used to validate the feature functions (path is relative to the notebook).
df = pd.read_excel('../Data/Testing_Data.xlsx')

In [5]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,Segment,Date,Retailer,Manufacturer,APN,Packsize,Brand,Sales,Unit_sales,Vol_Sales,wtd_distribution,PPG_Item_No
0,Bitesize,2017-04-02,Coles,Mars Wrigley,5000159491532,88g,Galaxy,9201.32,2570.0,226.16,75.9,Item_5000159491532
1,Bitesize,2017-04-02,Coles,Mars Wrigley,5000159494762,149g,M&M's,61.76,61.0,9.089,1.9,Item_5000159494762
2,Bitesize,2017-04-02,Coles,Mars Wrigley,9300682002134,140g,Maltesers,178508.416,61625.0,8627.5,99.9,Item_9300682002134
3,Bitesize,2017-04-02,Coles,Mars Wrigley,9300682013376,145g,M&M's,87435.36,30179.0,4375.955,100.0,Item_9300682013376
4,Bitesize,2017-04-02,Coles,Mars Wrigley,9300682015479,145g,M&M's,30701.95,10589.0,1535.405,95.9,Item_9300682015479


- Stats function

In [6]:
# Separate frame for the stats check, carrying the calendar month of each row;
# 'Month' is the period column handed to stats() below.
ds = df.assign(Month=df['Date'].dt.month)

In [7]:
# Monthly statistics of Sales and Unit_sales per Segment x Retailer x Manufacturer x Brand x APN.
# 'Sales' is the main feature column, 'Month' the period column, and 'Unit_sales' is passed
# as an additional feature through *args.
ds = stats(ds,'Date',['Segment','Retailer','Manufacturer','Brand','APN'],'Sales','Month','Unit_sales')

In [8]:
# Preview the first rows with the added statistics columns.
ds.head()

Unnamed: 0,Segment,Date,Retailer,Manufacturer,APN,Packsize,Brand,Sales,Unit_sales,Vol_Sales,...,Sales_max,Sales_min,Sales_mean,Sales_std,Sales_median,Unit_sales_max,Unit_sales_min,Unit_sales_mean,Unit_sales_std,Unit_sales_median
0,Bitesize,2017-04-02,Coles,Mars Wrigley,5000159491532,88g,Galaxy,9201.32,2570.0,226.16,...,9201.32,4357.28,6301.038,2433.176391,4646.83,2586.0,951.0,1627.6,868.013998,1019.0
1,Bitesize,2017-04-09,Coles,Mars Wrigley,5000159491532,88g,Galaxy,8712.86,2586.0,227.568,...,9201.32,4357.28,6301.038,2433.176391,4646.83,2586.0,951.0,1627.6,868.013998,1019.0
2,Bitesize,2017-04-16,Coles,Mars Wrigley,5000159491532,88g,Galaxy,4646.83,1012.0,89.056,...,9201.32,4357.28,6301.038,2433.176391,4646.83,2586.0,951.0,1627.6,868.013998,1019.0
3,Bitesize,2017-04-23,Coles,Mars Wrigley,5000159491532,88g,Galaxy,4357.28,951.0,83.688,...,9201.32,4357.28,6301.038,2433.176391,4646.83,2586.0,951.0,1627.6,868.013998,1019.0
4,Bitesize,2017-04-30,Coles,Mars Wrigley,5000159491532,88g,Galaxy,4586.9,1019.0,89.672,...,9201.32,4357.28,6301.038,2433.176391,4646.83,2586.0,951.0,1627.6,868.013998,1019.0


- Time trend ratio function

In [9]:
# Weekly Unit_sales ratios per Segment x Retailer x Manufacturer x Brand x APN,
# for lags of 1 and 3 periods.
dx = time_trend_ratio(df,'Date','Unit_sales',['Segment','Retailer','Manufacturer','Brand','APN'],'weekly',[1,3])

In [10]:
# Display the dataframe returned by time_trend_ratio.
dx

Unnamed: 0,Segment,Date,Retailer,Manufacturer,APN,Packsize,Brand,Sales,Unit_sales,Vol_Sales,wtd_distribution,PPG_Item_No,Year,Period,Agg_Unit_sales,Ratio_1_Unit_sales,Ratio_3_Unit_sales
0,Bitesize,2017-04-02,Coles,Mars Wrigley,5000159491532,88g,Galaxy,9201.320,2570.0,226.160,75.9,Item_5000159491532,2017,13,2570.0,,
1,Bitesize,2017-04-02,Coles,Mars Wrigley,5000159494762,149g,M&M's,61.760,61.0,9.089,1.9,Item_5000159494762,2017,13,61.0,,
2,Bitesize,2017-04-02,Coles,Mars Wrigley,9300682002134,140g,Maltesers,178508.416,61625.0,8627.500,99.9,Item_9300682002134,2017,13,61625.0,,
3,Bitesize,2017-04-02,Coles,Mars Wrigley,9300682013376,145g,M&M's,87435.360,30179.0,4375.955,100.0,Item_9300682013376,2017,13,30179.0,,
4,Bitesize,2017-04-02,Coles,Mars Wrigley,9300682015479,145g,M&M's,30701.950,10589.0,1535.405,95.9,Item_9300682015479,2017,13,10589.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96297,Block,2020-03-22,Woolworths,All Other Manufacturer,9403142002375,200g,Whittakers,4.796,1.0,0.200,0.2,Item_9403142002375,2020,12,1.0,1.000000,1.000000
96298,Block,2020-03-22,Woolworths,All Other Manufacturer,9403142004652,100g,Whittakers,2186.008,489.0,48.900,26.9,Item_9403142004652,2020,12,489.0,1.038217,0.564665
96299,Block,2020-03-22,Woolworths,All Other Manufacturer,9403142004836,100g,Whittakers,2678.324,600.0,60.000,27.8,Item_9403142004836,2020,12,600.0,1.011804,0.574713
96300,Block,2020-03-22,Woolworths,Lindt & Sprungli,9542009984,100g,Lindt Excellence,39306.531,11410.0,1141.000,100.0,Item_9542009984,2020,12,11410.0,0.430436,0.465315
