# Feature Engineering

In [6]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

In [23]:
class MercariFeatureEngineering(object):
    
    def __init__(self, filepath, delimiter=','):
        self.df = pd.read_csv(filepath, delimiter=delimiter)
        #self.stop_words = set(stopwords.words('english'))
        self.alphabet = set(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
                             'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
                             'u', 'v', 'w', 'x', 'y', 'z'])
        self.rf = None
    
    def fill_na(self, column_name, new_col, fill_with):
        self.df[new_col] = self.df[column_name].isnull().astype(int)
        self.df[column_name] = self.df[column_name].fillna(fill_with)
    
    def split_categories(self, column_name, split_on):
        top, middle, bottom = [], [], []
        for i, row in self.df.iterrows():
            hierarchy_string = row[column_name]
            hierarchy_list = hierarchy_string.split(split_on)
            top.append(hierarchy_list[0])
            middle.append(hierarchy_list[1])
            bottom.append(hierarchy_list[2])
        self.df['cat_top'] = top
        self.df['cat_mid'] = middle
        self.df['cat_bot'] = bottom
    
    def clean_column(self, brand_name):
        word = brand_name.lower()
        word = ''.join([letter for letter in word if letter in self.alphabet])
        return word
    
    def drop_rows_with_value(self, column, value):
        self.df = self.df[self.df[column] != value]
    
    def apply_func(self, new_name, from_col, func):
        self.df[new_name] = self.df[from_col].apply(lambda x: func(x))
    
    def create_categorical_labels(self, column_name, new_col):
        self.df[new_col] = LabelEncoder().fit_transform(self.df[column_name])
    
    def engineer_features(self):
        if 'price' in self.df.columns:
            self.drop_rows_with_value('price', 0)
        self.fill_na('category_name', 'cat_Was_null', 'None/None/None')
        self.fill_na('brand_name', 'brand_was_null', 'no_label')
        self.fill_na('item_description', 'desc_was_null', 'No description')
        print ('All Nulls Filled, New Binary Columns Created!')
        self.split_categories('category_name', '/')
        print('Categories Split!')
        self.apply_func('brand_name', 'brand_name', self.clean_column)
        self.apply_func('category_top', 'cat_top', self.clean_column)
        self.apply_func('category_middle', 'cat_mid', self.clean_column)
        self.apply_func('category_bottom', 'cat_bot', self.clean_column)
        print('Cleaned Columns')
        self.create_categorical_labels('brand_name', 'brand_numeric')
        self.create_categorical_labels('cat_top', 'cat_top_numeric')
        self.create_categorical_labels('cat_mid', 'cat_mid_numeric')
        self.create_categorical_labels('cat_bot', 'cat_bot_numeric')
        print('Created Categorical Labels')
        

In [21]:
start = datetime.now()
train = MercariFeatureEngineering('data/train.tsv', delimiter='\t')
train.engineer_features()
end = datetime.now()
print end - start

All Nulls Filled, New Binary Columns Created!
Categories Split!
Cleaned Columns
0:01:59.600541
