# Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import pairwise_distances as pw_dist
from datetime import datetime

In [2]:
class MercariFeatureEngineering(object):
    
    def __init__(self, train_filepath, test_filepath, delimiter=','):
        self.train_df = pd.read_csv(train_filepath, delimiter=delimiter)
        self.test_df = pd.read_csv(test_filepath, delimiter=delimiter)
        #self.stop_words = set(stopwords.words('english'))
        self.alphabet = set(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
                             'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
                             'u', 'v', 'w', 'x', 'y', 'z'])
        self.rf = None
        self.brand_encoder = LabelEncoder()
        self.cat_top_encoder = LabelEncoder()
        self.cat_mid_encoder = LabelEncoder()
        self.cat_bot_encoder = LabelEncoder()
    
    def fill_na(self, df, column_name, new_col, fill_with):
        df[new_col] = df[column_name].isnull().astype(int)
        df[column_name] = df[column_name].fillna(fill_with)
    
    def split_categories(self, df, column_name, split_on):
        top, middle, bottom = [], [], []
        for i, row in df.iterrows():
            hierarchy_string = row[column_name]
            hierarchy_list = hierarchy_string.split(split_on)
            top.append(hierarchy_list[0])
            middle.append(hierarchy_list[1])
            bottom.append(hierarchy_list[2])
        df['cat_top'] = top
        df['cat_mid'] = middle
        df['cat_bot'] = bottom
    
    def clean_column(self, brand_name):
        word = brand_name.lower()
        word = ''.join([letter for letter in word if letter in self.alphabet])
        return word
    
    def drop_rows_with_value(self, df, column, value):
        df = df[df[column] != value]
    
    def apply_func(self, df, new_name, from_col, func):
        df[new_name] = df[from_col].apply(lambda x: func(x))
    
    def _categorical_labels(self, df, column_name, new_col, encoder):
        df[new_col] = encoder.transform(df[column_name])
        
    def dummify_categories(self, col_top, col_mid, col_bot, col_brand, df):
        brand_dummies = pd.get_dummies(df[col_brand])
        cat_top_dummies = pd.get_dummies(df[col_top])
        cat_mid_dummies = pd.get_dummies(df[col_mid])
        cat_bot_dummies = pd.get_dummies(df[col_bot])
        result = pd.concat([brand_dummies, cat_top_dummies,
                            cat_mid_dummies, cat_bot_dummies],
                            axis=1, join_axes=[brand_dummies.index])
        return result
    
    def train_encoders(self):
        self.brand_encoder.fit(list(set(self.train_df['brand_name']).union(set(self.test_df['brand_name']))))
        self.cat_top_encoder.fit(list(set(self.train_df['cat_top']).union(set(self.test_df['cat_top']))))
        self.cat_mid_encoder.fit(list(set(self.train_df['cat_mid']).union(set(self.test_df['cat_mid']))))
        self.cat_bot_encoder.fit(list(set(self.train_df['cat_bot']).union(set(self.test_df['cat_bot']))))
        
    def create_labels(self, df):
        self._categorical_labels(df, 'brand_name', 'brand_numeric', self.brand_encoder)
        self._categorical_labels(df, 'cat_top', 'cat_top_numeric', self.cat_top_encoder)
        self._categorical_labels(df, 'cat_mid', 'cat_mid_numeric', self.cat_mid_encoder)
        self._categorical_labels(df, 'cat_bot', 'cat_bot_numeric', self.cat_bot_encoder)
    
    def engineer_features(self, df):
        if 'price' in df.columns:
            self.drop_rows_with_value(df, 'price', 0)
        self.fill_na(df, 'category_name', 'cat_Was_null', 'None/None/None')
        self.fill_na(df, 'brand_name', 'brand_was_null', 'no_label')
        self.fill_na(df, 'item_description', 'desc_was_null', 'No description')
        print('All Nulls Filled, New Binary Columns Created!')
        self.split_categories(df, 'category_name', '/')
        print('Categories Split!')
        self.apply_func(df, 'brand_name', 'brand_name', self.clean_column)
        self.apply_func(df, 'cat_top', 'cat_top', self.clean_column)
        self.apply_func(df, 'cat_mid', 'cat_mid', self.clean_column)
        self.apply_func(df, 'cat_bot', 'cat_bot', self.clean_column)
        print('Cleaned Columns')
  

In [3]:
start = datetime.now()
f_eng = MercariFeatureEngineering('data/train.tsv', 'data/test.tsv', delimiter='\t')
f_eng.engineer_features(f_eng.train_df)
f_eng.engineer_features(f_eng.test_df)
end = datetime.now()
print (end-start)

All Nulls Filled, New Binary Columns Created!
Categories Split!
Cleaned Columns
All Nulls Filled, New Binary Columns Created!
Categories Split!
Cleaned Columns
0:02:30.770584


In [4]:
start = datetime.now()
f_eng.train_encoders()
f_eng.create_labels(f_eng.train_df)
f_eng.create_labels(f_eng.test_df)
end = datetime.now()
print('Created Categorical Labels: {}'.format(end - start))    

Created Categorical Labels: 0:00:11.857281


In [5]:
f_eng.train_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,cat_Was_null,brand_was_null,desc_was_null,cat_top,cat_mid,cat_bot,brand_numeric,cat_top_numeric,cat_mid_numeric,cat_bot_numeric
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,nolabel,10.0,1,No description yet,0,1,0,men,tops,tshirts,3375,5,102,824
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,razer,52.0,0,This keyboard is in great condition and works ...,0,0,0,electronics,computerstablets,componentsparts,3906,1,30,212
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,target,10.0,1,Adorable top with a hint of lace and a key hol...,0,0,0,women,topsblouses,blouse,4600,10,103,94
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,nolabel,35.0,1,New with tags. Leather horses. Retail for [rm]...,0,1,0,home,homedcor,homedcoraccents,3375,3,54,406
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,nolabel,44.0,0,Complete with certificate of authenticity,0,1,0,women,jewelry,necklaces,3375,10,58,538


In [6]:
f_eng.test_df.head()

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description,cat_Was_null,brand_was_null,desc_was_null,cat_top,cat_mid,cat_bot,brand_numeric,cat_top_numeric,cat_mid_numeric,cat_bot_numeric
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,nolabel,1,Size 7,0,1,0,women,jewelry,rings,3375,10,58,663
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,nolabel,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined...",0,1,0,other,officesupplies,shippingsupplies,3375,7,72,698
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,coach,1,Brand new coach bag. Bought for [rm] at a Coac...,0,0,0,vintagecollectibles,bagsandpurses,handbag,1081,9,7,379
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,nolabel,0,-floral kimono -never worn -lightweight and pe...,0,1,0,women,sweaters,cardigan,3375,10,97,155
4,4,Life after Death,3,Other/Books/Religion & Spirituality,nolabel,1,Rediscovering life after the loss of a loved o...,0,1,0,other,books,religionspirituality,3375,7,14,658


# Regression

In [7]:
class MercariRegression(object):
    
    def __init__(self):
        self.sim_rf = None
        
    def generator(self, X, subset_size=10):
        n_rows = X.shape[0]
        last = n_rows%subset_size
        n_iters = (n_rows - n_rows%subset_size)/subset_size
        for i in xrange(n_iters):
            yield X[(i*subset_size):(i+1)*subset_size]
        yield X[-last:]
    
    def which_leaf(self, X):
        ret_mat = np.empty((X.shape[0], len(self.sim_rf.estimators_)))
        for i, tree in enumerate(self.sim_rf.estimators_):
            labels = tree.apply(X)
            ret_mat[:, i] = labels
        return ret_mat
    
    def jac_sim(self, train_leaf, test_leaf):
        similarity_matrix = 1 - pw_dist(test_leaf, train_leaf,
                                        n_jobs=-1, metric='hamming')
        return similarity_matrix
        
    def rf_sim(self, n_estimators, X_train, y_train, X_test):
        self.sim_rf = RandomForestRegressor(n_estimators=n_estimators,
                                            verbose=5, n_jobs=-1)
        self.sim_rf.fit(X_train, y_train)
        print ('Done Fitting Random Forest')
        
        

In [8]:
start = datetime.now()
X_train = f_eng.train_df[['brand_numeric', 'cat_top_numeric', 'cat_mid_numeric', 'cat_bot_numeric']]
y_train = f_eng.train_df['price']
X_test = f_eng.test_df[['brand_numeric', 'cat_top_numeric', 'cat_mid_numeric', 'cat_bot_numeric']]
end = datetime.now()
print (end - start)

0:00:00.858850


In [9]:
start = datetime.now()
reg = MercariRegression()
reg.rf_sim(100, X_train, y_train, X_test)
end = datetime.now()
print (end - start)

building tree 1 of 100building tree 2 of 100building tree 5 of 100building tree 4 of 100building tree 3 of 100 

building tree 7 of 100


building tree 6 of 100building tree 8 of 100


building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.4s


building tree 13 of 100
building tree 14 of 100 
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100building tree 18 of 100

building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100building tree 26 of 100 

building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
 building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree

[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   33.3s


building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100building tree 92 of 100

building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100
Done Fitting Random Forest
0:00:59.925188


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   59.6s finished


In [10]:
start = datetime.now()
X_leaves = X_train.sample(frac=0.25)
train_leaf = reg.which_leaf(X_leaves)
mid = datetime.now()
print ('Leaves for Train Set Found: {}'.format(mid-start))
test_leaf = reg.which_leaf(X_test)
end = datetime.now()
print ("Leaves for Test Set Found: {}".format(end - mid))
print ('Total Time: {}'.format(end - start))

Leaves for Train Set Found: 0:00:07.099745
Leaves for Test Set Found: 0:00:12.931854
Total Time: 0:00:20.031599


In [11]:
total = 0
for item in reg.generator(train_leaf):
    total += item.shape[0]
print total

370634


In [None]:
train_leaf.shape

(370634, 100)

In [None]:
sim_mat = []
count = 0
start = datetime.now()
for item in reg.generator(test_leaf, 1000):
    count +=1
    sim = reg.jac_sim(train_leaf, item)
    sim_mat.extend(sim)
    print ('Loop: {}'.format(count))
end = datetime.now()
print (end - start)

Loop: 1
Loop: 2
Loop: 3
Loop: 4
Loop: 5
Loop: 6
Loop: 7
Loop: 8
Loop: 9
Loop: 10
Loop: 11
Loop: 12
Loop: 13
Loop: 14
Loop: 15
Loop: 16
Loop: 17
Loop: 18
Loop: 19
Loop: 20


In [24]:
# 3 min feature engineering
# 6 min 500 tree random forest training
# 4 min determine which leaf for each row
# _ min similarity. 7000 iterations * __ sec per iteration __> too long at 500 trees