In [2]:
import featuretools as ft
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")



In [3]:
# Load the training and test files with pandas' default CSV settings.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("Train_UWu5bXk.txt", "Test_u94Q5KV.txt")
)

In [4]:
# Peek at one randomly drawn training row (no seed is set, so the row differs between runs).
train.sample(n=1)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
474,FDU25,12.35,Low Fat,0.026681,Canned,56.4246,OUT046,1997,Small,Tier 1,Supermarket Type1,810.9444


In [5]:
# Peek at five randomly drawn test rows (no seed is set, so the rows differ between runs).
test.sample(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
5101,FDJ03,12.35,Regular,0.072381,Dairy,48.4692,OUT035,2004,Small,Tier 2,Supermarket Type1
3100,FDJ48,11.3,Low Fat,0.056665,Baking Goods,248.0118,OUT018,2009,Medium,Tier 3,Supermarket Type2
1061,FDL15,17.85,Low Fat,0.046635,Meat,154.2682,OUT046,1997,Small,Tier 1,Supermarket Type1
2054,NCA41,16.75,Low Fat,0.032719,Health and Hygiene,194.2162,OUT018,2009,Medium,Tier 3,Supermarket Type2
1533,NCD55,14.0,Low Fat,0.02443,Household,42.6454,OUT018,2009,Medium,Tier 3,Supermarket Type2


In [6]:
# Data preparation: split the target off the training frame and keep the
# test-set identifier columns for later use.

# pop() returns the target column and removes it from `train` in place.
sales = train.pop('Item_Outlet_Sales')
test_Outlet_Identifier = test['Outlet_Identifier']
test_Item_identifier = test['Item_Identifier']

In [7]:
# Combine the train and test sets into a single frame so that the cleaning
# and feature-engineering steps are applied to both.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; ignore_index=True renumbers the rows
# 0..n-1 exactly as the original append call did.

combined = pd.concat([train, test], ignore_index=True)

In [8]:
# Count the missing entries in every column of the combined frame.
combined.isnull().sum()

Item_Identifier                 0
Item_Weight                  2439
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [9]:
# Fill in the missing values.
# Each result is assigned back to its column instead of calling
# fillna(..., inplace=True) on a column selection: that chained in-place form
# is deprecated in recent pandas and leaves `combined` unchanged under
# Copy-on-Write, and the warnings filter set at the top of the notebook would
# hide the resulting warning.
combined['Item_Weight'] = combined['Item_Weight'].fillna(combined['Item_Weight'].mean())  # mean over train + test rows
combined['Outlet_Size'] = combined['Outlet_Size'].fillna('missing')  # explicit placeholder category

In [10]:
# Data preprocessing: list the distinct fat-content labels and how often each occurs.
combined.loc[:, 'Item_Fat_Content'].value_counts()

Low Fat    8485
Regular    4824
LF          522
reg         195
low fat     178
Name: Item_Fat_Content, dtype: int64

In [11]:
# Collapse the alternate spellings of the fat-content labels onto the two
# canonical values, 'Low Fat' and 'Regular'.

fat_content_aliases = {
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular',
}
combined['Item_Fat_Content'] = combined['Item_Fat_Content'].replace(fat_content_aliases)



### Feature Engineering using Featuretools
<p>Featuretools needs a column that uniquely identifies each row of the dataset, so we create one first.</p>

In [12]:
# Build a row identifier by concatenating the item code with the outlet code,
# then discard the item code (the outlet code stays in the frame).

combined['id'] = combined['Item_Identifier'].str.cat(combined['Outlet_Identifier'])
del combined['Item_Identifier']

In [13]:
# Create an entity set with the id 'sales'; no data is attached to it yet.

es = ft.EntitySet(id="sales")

In [14]:
# Register the combined frame as the 'bigmart' entity, using the 'id' column as its index.

es.entity_from_dataframe(
    dataframe=combined,
    entity_id='bigmart',
    index='id',
)

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 11]
  Relationships:
    No relationships

In [15]:
# Split the outlet-level columns out of 'bigmart' into a separate 'outlet'
# entity keyed by Outlet_Identifier.
outlet_columns = [
    'Outlet_Establishment_Year',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]
es.normalize_entity(
    base_entity_id='bigmart',
    new_entity_id='outlet',
    index='Outlet_Identifier',
    additional_variables=outlet_columns,
)

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 7]
    outlet [Rows: 10, Columns: 5]
  Relationships:
    bigmart.Outlet_Identifier -> outlet.Outlet_Identifier

In [16]:
# Print the text summary of the entity set (its entities and relationships).
print(str(es))

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 7]
    outlet [Rows: 10, Columns: 5]
  Relationships:
    bigmart.Outlet_Identifier -> outlet.Outlet_Identifier


In [17]:
# Deep feature synthesis: generate features for the 'bigmart' entity up to
# depth 2, computing them on 3 parallel workers with progress output enabled.

feature_matrix, feature_names = ft.dfs(
    entityset=es,
    target_entity='bigmart',
    max_depth=2,
    n_jobs=3,
    verbose=1,
)

Built 33 features
EntitySet scattered to 3 workers in 2 seconds
Elapsed: 00:01 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


In [18]:
# Show the column labels of the feature matrix returned by ft.dfs.
feature_matrix.columns

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
       'Item_MRP', 'Outlet_Identifier', 'outlet.Outlet_Establishment_Year',
       'outlet.Outlet_Size', 'outlet.Outlet_Location_Type',
       'outlet.Outlet_Type', 'outlet.SUM(bigmart.Item_Weight)',
       'outlet.SUM(bigmart.Item_Visibility)', 'outlet.SUM(bigmart.Item_MRP)',
       'outlet.STD(bigmart.Item_Weight)',
       'outlet.STD(bigmart.Item_Visibility)', 'outlet.STD(bigmart.Item_MRP)',
       'outlet.MAX(bigmart.Item_Weight)',
       'outlet.MAX(bigmart.Item_Visibility)', 'outlet.MAX(bigmart.Item_MRP)',
       'outlet.SKEW(bigmart.Item_Weight)',
       'outlet.SKEW(bigmart.Item_Visibility)', 'outlet.SKEW(bigmart.Item_MRP)',
       'outlet.MIN(bigmart.Item_Weight)',
       'outlet.MIN(bigmart.Item_Visibility)', 'outlet.MIN(bigmart.Item_MRP)',
       'outlet.MEAN(bigmart.Item_Weight)',
       'outlet.MEAN(bigmart.Item_Visibility)', 'outlet.MEAN(bigmart.Item_MRP)',
       'outlet.COUNT(bigmart)', 'outlet.NU

In [19]:
# Preview the first five rows of the generated feature matrix.
feature_matrix.head(n=5)

Unnamed: 0_level_0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,outlet.Outlet_Establishment_Year,outlet.Outlet_Size,outlet.Outlet_Location_Type,outlet.Outlet_Type,...,outlet.MIN(bigmart.Item_Visibility),outlet.MIN(bigmart.Item_MRP),outlet.MEAN(bigmart.Item_Weight),outlet.MEAN(bigmart.Item_Visibility),outlet.MEAN(bigmart.Item_MRP),outlet.COUNT(bigmart),outlet.NUM_UNIQUE(bigmart.Item_Fat_Content),outlet.NUM_UNIQUE(bigmart.Item_Type),outlet.MODE(bigmart.Item_Fat_Content),outlet.MODE(bigmart.Item_Type)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DRA12OUT010,11.6,Low Fat,0.068535,Soft Drinks,143.0154,OUT010,1998,missing,Tier 3,Grocery Store,...,0.0,32.6558,12.72287,0.101939,141.159742,925,2,16,Low Fat,Fruits and Vegetables
DRA12OUT013,11.6,Low Fat,0.040912,Soft Drinks,142.3154,OUT013,1987,High,Tier 3,Supermarket Type1,...,0.0,31.49,12.788139,0.060242,141.128428,1553,2,16,Low Fat,Fruits and Vegetables
DRA12OUT017,11.6,Low Fat,0.041178,Soft Drinks,140.3154,OUT017,2007,missing,Tier 2,Supermarket Type1,...,0.0,32.09,12.78208,0.061142,140.998931,1543,2,16,Low Fat,Snack Foods
DRA12OUT018,11.6,Low Fat,0.041113,Soft Drinks,142.0154,OUT018,2009,Medium,Tier 3,Supermarket Type2,...,0.0,31.89,12.803638,0.059976,141.000899,1546,2,16,Low Fat,Fruits and Vegetables
DRA12OUT027,12.792854,Low Fat,0.040748,Soft Drinks,140.0154,OUT027,1985,Medium,Tier 3,Supermarket Type3,...,0.0,31.29,12.792854,0.060344,141.012347,1559,2,16,Low Fat,Fruits and Vegetables


In [20]:
# Put the feature matrix rows in the same order as `combined`, then turn the
# 'id' index back into an ordinary column.
feature_matrix = (
    feature_matrix
    .reindex(index=combined['id'])
    .reset_index()
)

In [21]:
# Preview the first five rows after reordering and resetting the index.
feature_matrix.head(n=5)

Unnamed: 0,id,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,outlet.Outlet_Establishment_Year,outlet.Outlet_Size,outlet.Outlet_Location_Type,...,outlet.MIN(bigmart.Item_Visibility),outlet.MIN(bigmart.Item_MRP),outlet.MEAN(bigmart.Item_Weight),outlet.MEAN(bigmart.Item_Visibility),outlet.MEAN(bigmart.Item_MRP),outlet.COUNT(bigmart),outlet.NUM_UNIQUE(bigmart.Item_Fat_Content),outlet.NUM_UNIQUE(bigmart.Item_Type),outlet.MODE(bigmart.Item_Fat_Content),outlet.MODE(bigmart.Item_Type)
0,FDA15OUT049,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,...,0.0,32.4558,12.803003,0.059,141.163199,1550,2,16,Low Fat,Fruits and Vegetables
1,DRC01OUT018,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,...,0.0,31.89,12.803638,0.059976,141.000899,1546,2,16,Low Fat,Fruits and Vegetables
2,FDN15OUT049,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,...,0.0,32.4558,12.803003,0.059,141.163199,1550,2,16,Low Fat,Fruits and Vegetables
3,FDX07OUT010,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,missing,Tier 3,...,0.0,32.6558,12.72287,0.101939,141.159742,925,2,16,Low Fat,Fruits and Vegetables
4,NCD19OUT013,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,...,0.0,31.49,12.788139,0.060242,141.128428,1553,2,16,Low Fat,Fruits and Vegetables
