# Feature Engineering

1. create price feature based on POS sales and POS dollars
2. Impute nulls in the dataset
3. Vectorization
4. Moving Average Feature

In [28]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import warnings

warnings.filterwarnings('ignore')

In [29]:
df = pd.read_csv('ds_challenge_data_cleaned.csv')

In [30]:
df.columns

Index(['Hash_Trans_ID', 'SKU_ID', 'Transaction_Date', 'Transaction_Year',
       'Transaction_Month', 'Transaction_Day', 'Transaction_Hour',
       'Transaction_Min', 'Transaction_Sec', 'Transaction_Day_of_Week',
       'Transaction_Week', 'Transaction_Day_of_Year', 'holiday_nm',
       'Transaction_Type', 'Transaction_Sales_Type', 'POS_Sales',
       'POS_UnitsSold', 'Department_ID', 'Class_ID', 'SubClass_ID',
       'SubClass_Desc', 'Product_Weight_Grams', 'Product_Height_CM',
       'Product_Length_CM', 'Product_Width_CM', 'Product_Volumn_Metric',
       'PRO'],
      dtype='object')

## Pricing features

In [31]:
df['Price'] = df['POS_Sales']/df['POS_UnitsSold']

#### create SKU level price features seperate for each year

In [32]:
#Main assumption is that we know the price one year before
sku_df = df.groupby(['SKU_ID','Transaction_Year']).agg({'Price':np.mean}).reset_index()
sku_df['SKU_AVG_Price'] = sku_df['Price']
sku_df = sku_df[['SKU_ID','Transaction_Year','SKU_AVG_Price']]

sku_df2 = df.groupby(['SKU_ID','Transaction_Year']).agg({'Price':np.min}).reset_index()
sku_df2['SKU_MIN_Price'] = sku_df2['Price']
sku_df2 = sku_df2[['SKU_ID','Transaction_Year','SKU_MIN_Price']]

sku_df3 = df.groupby(['SKU_ID','Transaction_Year']).agg({'Price':np.max}).reset_index()
sku_df3['SKU_MAX_Price'] = sku_df3['Price']
sku_df3 = sku_df3[['SKU_ID','Transaction_Year','SKU_MAX_Price']]

In [33]:
sku_df = sku_df.merge(sku_df2, on=['SKU_ID','Transaction_Year']).merge(sku_df3, on=['SKU_ID','Transaction_Year'])
df = df.merge(sku_df, on=['SKU_ID','Transaction_Year'], how='left')

# Vectorization

Data is better to be aggregated to transaction level instead of transaction-sku level
At transaction level vectors that are built based on product attributes are immune to problems like similar SKUs and product cannibalization 


In [34]:
#Remove Nulls to start Vectorization
#df.fillna('none', inplace = True)
df.fillna(0, inplace = True)

### Txn Type, Sales Type and Holiday

In [35]:
df.groupby('Transaction_Type')['PRO'].mean()

Transaction_Type
Commercial Line Sale    0.613808
Front line - Sales      0.419146
Front line -Self Chk    0.309424
Garden Sales            0.230607
Pro Desk - Refund       0.737839
Pro Desk - Sales        0.590201
Returns desk - Refnd    0.406545
Sp. Srvc. Dsk  VPOS     0.475993
Sp. Srvc. Dsk Refnd     0.407097
Tool Rental -Refund     0.651173
Tools - Refund          0.674731
Name: PRO, dtype: float64

In [None]:
df[((df['Transaction_Type']=='Pro Desk - Refund') & (df['PRO']==0))]

In [37]:
df.groupby('holiday_nm')['PRO'].mean()

holiday_nm
0                        0.525168
Boxing Day               0.328440
Boxing Day (Observed)    0.315217
Canada Day               0.376829
Civic Holiday            0.410287
Family Day               0.276243
Good Friday              0.328205
Labour Day               0.377809
New Year's Day           0.349398
Thanksgiving             0.347150
Victoria Day             0.350222
Name: PRO, dtype: float64

In [38]:
df.groupby('Transaction_Sales_Type')['PRO'].mean()

Transaction_Sales_Type
Gift Card Sales         0.238806
Returns                 0.380371
Sales                   0.490536
Special Order Return    0.562130
Special Order Sales     0.712379
Name: PRO, dtype: float64

In [39]:
df.groupby('Department_ID')['PRO'].mean()

Department_ID
11    0.463248
13    0.238806
16    0.603203
17    0.225131
21    0.562079
22    0.590631
23    0.456892
24    0.529044
25    0.555719
26    0.553793
27    0.469824
28    0.413927
29    0.553191
30    0.569670
59    0.432815
78    0.782479
Name: PRO, dtype: float64

### Clustering SubClass Desc

In [40]:
df['SubClass_Desc_Clean'] = df['SubClass_Desc'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))

In [41]:
df['SubClass_Desc_Clean']= df['SubClass_Desc_Clean'].apply(lambda x: x.lower().replace('XX','').replace('xx','')  )

In [42]:
df['SubClass_Desc_Clean']= df['SubClass_Desc_Clean']\
        .apply(lambda x: x.replace('&','').replace('-','').replace('/','').replace('$','')\
                .replace('%','').replace(' ','').replace("'",'').replace(".",'').strip()  )

In [43]:
#vectorizer = TfidfVectorizer(stop_words='english')
vectorizer = CountVectorizer(stop_words="english", analyzer='char', ngram_range=(3, 5), max_df=1.0, min_df=1, max_features=None)
X = vectorizer.fit_transform(df['SubClass_Desc_Clean'])

In [44]:
true_k = 10
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, n_jobs=-1)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=10, n_init=1, n_jobs=-1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [45]:
df['SubClass_Desc_Cluster'] = df['SubClass_Desc_Clean'].apply(lambda x: model.predict(vectorizer.transform([x])).astype('int')[0] )

In [46]:
df.groupby(['SubClass_Desc_Cluster']).count()['Hash_Trans_ID']

SubClass_Desc_Cluster
0      1205
1      4707
2      2495
3       244
4      2770
5      9953
6      1578
7      7947
8      2150
9    288798
Name: Hash_Trans_ID, dtype: int64

In [47]:
df.groupby(['SubClass_Desc_Cluster'])['PRO'].mean()

SubClass_Desc_Cluster
0    0.527801
1    0.591884
2    0.560321
3    0.520492
4    0.653791
5    0.462976
6    0.582383
7    0.523216
8    0.611628
9    0.519817
Name: PRO, dtype: float64

In [48]:
df = pd.get_dummies(data=df, columns = ['Transaction_Type','Transaction_Sales_Type','Department_ID','SubClass_Desc_Cluster'])

In [None]:
df.columns

## Put back everything into Transaction level

In [50]:
df_final = df.groupby('Hash_Trans_ID').agg({'SKU_ID':np.size,
                                            'Transaction_Date':np.max,
                                            'Transaction_Year':np.max, 'Transaction_Month':np.max,
                                            'Transaction_Day':np.max, 'Transaction_Hour':np.max, 
                                            'Transaction_Day_of_Week':np.max,
                                            'Transaction_Week':np.max, 
                                            'Transaction_Day_of_Year':np.max, 
                                            'holiday_nm':np.max,
                                            'PRO':np.max}).reset_index()

df_final = df_final.rename(columns={'SKU_ID': 'SKU_ID_Count'})

### Create Sum/Min/Max/Std Vectors 

In [51]:
df_sum = df.groupby('Hash_Trans_ID').agg({
                                'POS_Sales':np.sum,
                                'POS_UnitsSold':np.sum, 
                                'Product_Length_CM':np.sum,
                                'Product_Height_CM':np.sum,
                                'Product_Width_CM':np.sum,
                                'Product_Weight_Grams':np.sum, 
                                'Product_Volumn_Metric':np.sum,
                                'Price':np.sum, 
                                'SKU_AVG_Price':np.sum,
                                'SKU_MIN_Price':np.sum, 
                                'SKU_MAX_Price':np.sum, 
                                'Transaction_Type_1':np.sum,
                                'Transaction_Type_2':np.sum,
                                'Transaction_Type_3':np.sum,
                                'Transaction_Type_4':np.sum,
                                'Transaction_Type_5':np.sum,
                                'Transaction_Type_6':np.sum,
                                'Transaction_Type_7':np.sum,
                                'Transaction_Type_8':np.sum,
                                'Transaction_Type_9':np.sum,
                                'Transaction_Type_10':np.sum,
                                'Transaction_Type_11':np.sum,
                                'Transaction_Sales_Type_12':np.sum,
                                'Transaction_Sales_Type_13':np.sum, 
                                'Transaction_Sales_Type_14':np.sum,
                                'Transaction_Sales_Type_15':np.sum,
                                'Transaction_Sales_Type_16':np.sum,
                                'Department_ID_13':np.sum, 'Department_ID_16':np.sum, 'Department_ID_17':np.sum,
                                'Department_ID_21':np.sum, 'Department_ID_22':np.sum, 'Department_ID_23':np.sum,
                                'Department_ID_24':np.sum, 'Department_ID_25':np.sum, 'Department_ID_26':np.sum,
                                'Department_ID_27':np.sum, 'Department_ID_28':np.sum, 'Department_ID_29':np.sum,
                                'Department_ID_30':np.sum, 'Department_ID_59':np.sum, 'Department_ID_78':np.sum,
                                'SubClass_Desc_Cluster_0':np.sum, 'SubClass_Desc_Cluster_1':np.sum,
                                'SubClass_Desc_Cluster_2':np.sum, 'SubClass_Desc_Cluster_3':np.sum,
                                'SubClass_Desc_Cluster_4':np.sum, 'SubClass_Desc_Cluster_5':np.sum,
                                'SubClass_Desc_Cluster_6':np.sum, 'SubClass_Desc_Cluster_7':np.sum,
                                'SubClass_Desc_Cluster_8':np.sum, 'SubClass_Desc_Cluster_9':np.sum
                                }).reset_index()

df_sum = df_sum.rename(columns={'POS_Sales': 'POS_Sales_Sum',
                                    'POS_UnitsSold': 'POS_UnitsSold_Sum', 'Product_Length_CM': 'Product_Lengthe_Sum',
                                    'Product_Width_CM': 'Product_Widthe_Sum', 'Product_Weight_Grams': 'Product_Weighte_Sum',
                                    'Price': 'Price_Sum',  
                                    'SKU_AVG_Price':'SKU_AVG_Price_Sum',
                                    'SKU_MIN_Price':'SKU_MIN_Price_Sum', 
                                    'SKU_MAX_Price':'SKU_MAX_Price_Sum',
                                    'Product_Height_CM': 'Product_Heighte_Sum',
                                    'Product_Volumn_Metric': 'Product_Volumee_Sum', 'Product_Height_CM': 'Product_Heighte_Sum'})

df_final = df_final.merge(df_sum, on=['Hash_Trans_ID'], how='left')

In [52]:
df_mean = df.groupby('Hash_Trans_ID').agg({
                                'Price':np.mean, 
                                'SKU_AVG_Price':np.mean,
                                'SKU_MIN_Price':np.mean, 
                                'SKU_MAX_Price':np.mean, 
                                'POS_Sales':np.mean,
                                'POS_UnitsSold':np.mean, 
                                'Product_Length_CM':np.mean, 
                                'Product_Width_CM':np.mean,
                                'Product_Weight_Grams':np.mean, 
                                'Product_Height_CM':np.mean,
                                'Product_Volumn_Metric':np.mean}).reset_index()

df_mean = df_mean.rename(columns={'Price':'Price_Avg', 
                                'SKU_AVG_Price':'SKU_AVG_Price_Avg',
                                'SKU_MIN_Price':'SKU_MIN_Price_Avg', 
                                'SKU_MAX_Price':'SKU_MAX_Price_Avg', 
                                'POS_Sales':'POS_Sales_Avg',
                                'POS_UnitsSold':'POS_UnitsSold_Avg', 
                                'Product_Length_CM':'Product_Length_Avg', 
                                'Product_Width_CM':'Product_Width_Avg',
                                'Product_Weight_Grams':'Product_Weight_Avg', 
                                'Product_Height_CM':'Product_Height_Avg',
                                'Product_Volumn_Metric':'Product_Volumn_Avg'})

df_final = df_final.merge(df_mean, on=['Hash_Trans_ID'], how='left')

In [53]:
df_max = df.groupby('Hash_Trans_ID').agg({
                                'Price':np.max, 
                                'SKU_AVG_Price':np.max,
                                'SKU_MIN_Price':np.max, 
                                'SKU_MAX_Price':np.max,
                                'POS_Sales':np.max,'POS_UnitsSold':np.max, 
                                'Product_Length_CM':np.max, 'Product_Width_CM':np.max,
                                'Product_Weight_Grams':np.max, 'Product_Height_CM':np.max,
                                'Product_Volumn_Metric':np.max}).reset_index()

df_max = df_max.rename(columns={'POS_Sales': 'POS_Sales_Max', 'POS_UnitsSold': 'POS_UnitsSold_Max',
                                    'Product_Length_CM': 'Product_Length_Max', 'Product_Width_CM': 'Product_Width_Max',
                                    'Product_Weight_Grams': 'Product_Weight_Max', 
                                    'Price': 'Price_Max',  
                                    'SKU_AVG_Price':'SKU_AVG_Price_Max',
                                    'SKU_MIN_Price':'SKU_MIN_Price_Max', 
                                    'SKU_MAX_Price':'SKU_MAX_Price_Max',
                                    'Product_Height_CM': 'Product_Height_Max', 'Product_Volumn_Metric': 'Product_Volume_Max'})

df_final = df_final.merge(df_max, on=['Hash_Trans_ID'], how='left')

In [54]:
df_min = df.groupby('Hash_Trans_ID').agg({
                                'Price':np.min, 
                                'SKU_AVG_Price':np.min,
                                'SKU_MIN_Price':np.min, 
                                'SKU_MAX_Price':np.min,
                                'POS_Sales':np.min,'POS_UnitsSold':np.min, 
                                'Product_Length_CM':np.min, 'Product_Width_CM':np.min,
                                'Product_Weight_Grams':np.min, 'Product_Height_CM':np.min,
                                'Product_Volumn_Metric':np.min}).reset_index()

df_min = df_min.rename(columns={'POS_Sales': 'POS_Sales_Min', 'POS_UnitsSold': 'POS_UnitsSold_Min',
                                    'Product_Length_CM': 'Product_Length_Min', 'Product_Width_CM': 'Product_Width_Min',
                                    'Product_Weight_Grams': 'Product_Weight_Min', 
                                    'Price': 'Price_Min',  
                                    'SKU_AVG_Price':'SKU_AVG_Price_Min',
                                    'SKU_MIN_Price':'SKU_MIN_Price_Min', 
                                    'SKU_MAX_Price':'SKU_MAX_Price_Min',
                                    'Product_Height_CM': 'Product_Height_Min', 'Product_Volumn_Metric': 'Product_Volume_Min'})

df_final = df_final.merge(df_min, on=['Hash_Trans_ID'], how='left')

In [55]:
df_std = df.groupby('Hash_Trans_ID').agg({
                                'Price':np.std, 
                                'SKU_AVG_Price':np.std,
                                'SKU_MIN_Price':np.std, 
                                'SKU_MAX_Price':np.std,
                                'POS_Sales':np.std,'POS_UnitsSold':np.std, 
                                'Product_Length_CM':np.std, 'Product_Width_CM':np.std,
                                'Product_Weight_Grams':np.std, 'Product_Height_CM':np.std,
                                'Product_Volumn_Metric':np.std}).reset_index()

df_std = df_std.rename(columns={'POS_Sales': 'POS_Sales_Std', 'POS_UnitsSold': 'POS_UnitsSold_Std',
                                    'Product_Length_CM': 'Product_Length_Std', 'Product_Width_CM': 'Product_Width_Std',
                                    'Product_Weight_Grams': 'Product_Weight_Std', 
                                    'Price': 'Price_Std',  
                                    'SKU_AVG_Price':'SKU_AVG_Price_Std',
                                    'SKU_MIN_Price':'SKU_MIN_Price_Std', 
                                    'SKU_MAX_Price':'SKU_MAX_Price_Std',
                                    'Product_Height_CM': 'Product_Height_Std', 'Product_Volumn_Metric': 'Product_Volume_Std'})

df_final = df_final.merge(df_std, on=['Hash_Trans_ID'], how='left')

## ClassID and SubClassID Vectorization

In [56]:
df['basket_row'] = df.sort_values(['SKU_ID'], ascending=[True]) \
             .groupby(['Hash_Trans_ID']) \
             .cumcount() + 1
df['basket_row'] = df['basket_row'].apply(lambda x: 11 if x>10 else x)

In [57]:
pt = pd.pivot_table(df, values=['POS_Sales','Class_ID','SubClass_ID'], index='Hash_Trans_ID', 
                    columns=['basket_row'], aggfunc=np.max).reset_index()
pt.fillna(0, inplace = True)

In [58]:
df_final = df_final.merge(pt, on=['Hash_Trans_ID'], how='left')

# Moving Average Feature

In [59]:
df_final = df_final.sort_values(by=['Transaction_Date','Hash_Trans_ID'])

In [60]:
for col in ['Transaction_Type_Commercial Line Sale','Transaction_Type_Front line - Sales','Transaction_Type_Front line -Self Chk',
            'Transaction_Type_Garden Sales','Transaction_Type_Pro Desk - Refund','Transaction_Type_Pro Desk - Sales',
            'Transaction_Type_Returns desk - Refnd','Transaction_Type_Sp. Srvc. Dsk  VPOS','Transaction_Type_Sp. Srvc. Dsk Refnd',
            'Transaction_Type_Tool Rental -Refund','Transaction_Type_Tools - Refund']:
    df_cat = df_final[df_final[col]>0]
    df_cat = df_cat[['Transaction_Date','PRO']].rolling(50, on='Transaction_Date').mean()
    df_cat.fillna(0, inplace = True)
    df_cat = df_cat.groupby('Transaction_Date').mean().reset_index()
    df_cat = df_cat.rename(columns={'PRO':'Moving_Avg_Txn_'+col})
    df_final = df_final.merge(df_cat, on=['Transaction_Date'], how='left')

df_final.fillna(0, inplace = True)

## Write the outputs for modeling

In [61]:
df_final.to_csv('ds_challenge_data_final.csv', index=False, sep='\t')