# Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import pairwise_distances as pw_dist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
from string import punctuation
from multiprocessing import Pool
from datetime import datetime

In [2]:
class MercariFeatureEngineering(object):
    
    def __init__(self, train_filepath, test_filepath, delimiter=','):
        self.train_df = pd.read_csv(train_filepath, delimiter=delimiter)
        self.test_df = pd.read_csv(test_filepath, delimiter=delimiter)
        self.stop_words = set(stopwords.words('english'))
        self.alphabet = set(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
                             'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
                             'u', 'v', 'w', 'x', 'y', 'z', '1', '2', '3', '4',
                             '5', '6', '7', '8', '9', '0'])
        self.rf = None
        self.brand_encoder = LabelEncoder()
        self.cat_top_encoder = LabelEncoder()
        self.cat_mid_encoder = LabelEncoder()
        self.cat_bot_encoder = LabelEncoder()
    
    def fill_na(self, df, column_name, new_col, fill_with):
        df[new_col] = df[column_name].isnull().astype(int)
        df[column_name] = df[column_name].fillna(fill_with)
    
    def split_categories(self, df, column_name, split_on):
        top, middle, bottom = [], [], []
        for i, row in df.iterrows():
            hierarchy_string = row[column_name]
            hierarchy_list = hierarchy_string.split(split_on)
            top.append(hierarchy_list[0])
            middle.append(hierarchy_list[1])
            bottom.append(hierarchy_list[2])
        df['cat_top'] = top
        df['cat_mid'] = middle
        df['cat_bot'] = bottom
    
    def clean_column(self, brand_name):
        word = brand_name.lower()
        word = ''.join([letter for letter in word if letter in self.alphabet])
        return word
    
    def drop_extremes(self, df, column, low, high):
        df = df.loc[(df[column] >= low) & (df[column] <= high)]
    
    def apply_func(self, df, new_name, from_col, func):
        df[new_name] = df[from_col].apply(lambda x: func(x))
    
    def _categorical_labels(self, df, column_name, new_col, encoder):
        df[new_col] = encoder.transform(df[column_name])
        
    def combine_columns(self, df, new_col, col_one, col_two):
        df[new_col] = df[col_one] + " " + df[col_two]
        
    def tokenize(self, string):
        clean_list = []
        description = string.lower()
        desc_list = description.split()
        for word in desc_list:
            word = word.strip(punctuation)
            if len(word) > 0:
                clean_list.append(word)
        return clean_list

    def no_stopwords(self, token_list):
        no_stop_words = [word for word in token_list
                         if word not in self.stop_words]
        return no_stop_words

    def only_alphabetical(self, token_list):
        cleaned = []
        for word in token_list:
            word = ''.join([letter for letter in word
                            if letter in self.alphabet])
            cleaned.append(word)
        return cleaned
    
    def train_encoders(self):
        self.brand_encoder.fit(list(set(self.train_df['brand_name']).union(set(self.test_df['brand_name']))))
        self.cat_top_encoder.fit(list(set(self.train_df['cat_top']).union(set(self.test_df['cat_top']))))
        self.cat_mid_encoder.fit(list(set(self.train_df['cat_mid']).union(set(self.test_df['cat_mid']))))
        self.cat_bot_encoder.fit(list(set(self.train_df['cat_bot']).union(set(self.test_df['cat_bot']))))
        
    def create_labels(self, df):
        self._categorical_labels(df, 'brand_name', 'brand_numeric', self.brand_encoder)
        self._categorical_labels(df, 'cat_top', 'cat_top_numeric', self.cat_top_encoder)
        self._categorical_labels(df, 'cat_mid', 'cat_mid_numeric', self.cat_mid_encoder)
        self._categorical_labels(df, 'cat_bot', 'cat_bot_numeric', self.cat_bot_encoder)
    
    def engineer_features(self, df):
        if 'price' in df.columns:
            self.drop_extremes(df, 'price', 3, 2000)
        self.fill_na(df, 'category_name', 'cat_Was_null', 'None/None/None')
        self.fill_na(df, 'brand_name', 'brand_was_null', 'missing label')
        self.fill_na(df, 'item_description', 'desc_was_null', 'missing description')
        print('All Nulls Filled, New Binary Columns Created!')
        self.split_categories(df, 'category_name', '/')
        print('Categories Split!')
        self.apply_func(df, 'brand_name', 'brand_name', self.clean_column)
        self.apply_func(df, 'cat_top', 'cat_top', self.clean_column)
        self.apply_func(df, 'cat_mid', 'cat_mid', self.clean_column)
        self.apply_func(df, 'cat_bot', 'cat_bot', self.clean_column)
        print('Cleaned Columns')
        self.combine_columns(df, 'name_desc', 'name', 'item_description')
        self.apply_func(df, 'item_tokens', 'name_desc', self.tokenize)
        self.apply_func(df, 'item_tokens', 'item_tokens', self.no_stopwords)
        self.apply_func(df, 'item_tokens', 'item_tokens', self.only_alphabetical)
        print ('NLP Done')

In [3]:
start = datetime.now()
f_eng = MercariFeatureEngineering('data/train.tsv', 'data/test.tsv', delimiter='\t')
f_eng.engineer_features(f_eng.train_df)
f_eng.engineer_features(f_eng.test_df)
end = datetime.now()
print (end-start)

All Nulls Filled, New Binary Columns Created!
Categories Split!
Cleaned Columns
NLP Done
All Nulls Filled, New Binary Columns Created!
Categories Split!
Cleaned Columns
NLP Done
0:04:49.320158


In [4]:
start = datetime.now()
f_eng.train_encoders()
f_eng.create_labels(f_eng.train_df)
f_eng.create_labels(f_eng.test_df)
end = datetime.now()
print('Created Categorical Labels: {}'.format(end - start))    

Created Categorical Labels: 0:00:12.249075


In [15]:
start = datetime.now()
vectorizer = TfidfVectorizer(tokenizer=lambda x:x, preprocessor=lambda x:x,
                             min_df=0.01, max_df=0.99)
train_vect = vectorizer.fit_transform(f_eng.train_df['item_tokens'])
test_vect = vectorizer.transform(f_eng.test_df['item_tokens'])
svd = TruncatedSVD(n_components=150, n_iter=10)
train_reduced = svd.fit_transform(train_vect)
test_reduced = svd.transform(test_vect)
end = datetime.now()
print (end - start)

0:04:08.877306


In [5]:
# 3 minutes

In [5]:
f_eng.train_df.columns

Index([u'train_id', u'name', u'item_condition_id', u'category_name',
       u'brand_name', u'price', u'shipping', u'item_description',
       u'cat_Was_null', u'brand_was_null', u'desc_was_null', u'cat_top',
       u'cat_mid', u'cat_bot', u'brand_numeric', u'cat_top_numeric',
       u'cat_mid_numeric', u'cat_bot_numeric'],
      dtype='object')

# Regression

In [17]:
class MercariRegression(object):
    
    def __init__(self):
        self.sim_rf = None
        
    def generator(self, X, subset_size=10):
        n_rows = X.shape[0]
        last = n_rows%subset_size
        n_iters = (n_rows - n_rows%subset_size)/subset_size
        for i in xrange(n_iters):
            yield X[(i*subset_size):(i+1)*subset_size]
        yield X[-last:]
    
    def which_leaf(self, X):
        ret_mat = np.empty((X.shape[0], len(self.sim_rf.estimators_)))
        for i, tree in enumerate(self.sim_rf.estimators_):
            labels = tree.apply(X)
            ret_mat[:, i] = labels
        return ret_mat

    def top_n_similar(self, sim_mat, idx_list, n_similar):
        idx_top_sim = np.empty((sim_mat.shape[0], n_similar))
        for i in xrange(sim_mat.shape[0]):
            top_sim = sim_mat[i].argsort()[-n_similar:][::-1]
            idx_top_sim[i] = idx_list[top_sim]
        return idx_top_sim

    def avg_top_sim(self, df, idx_top_sim):
        avg_list = []
        for row in idx_top_sim:
            avg_price = np.mean(df.iloc[row]['price'])
            avg_list.append(avg_price)
        return avg_list
    
    def jac_sim(self, train_leaf, test_leaf):
        similarity_matrix = 1 - pw_dist(test_leaf, train_leaf,
                                        n_jobs=-1, metric='hamming')
        return similarity_matrix

    def find_avgs(self, to_find, compare_to, price_df, num_comp=2000, batch_size=20000):
        all_avgs = []
        count = 0
        for item in self.generator(to_find, batch_size):
            begin = datetime.now()
            count += 1
            idx_list = np.random.choice(compare_to.shape[0], num_comp, replace=False)
            print ("Similar to {} items".format(len(idx_list)))
            sim = reg.jac_sim(compare_to[idx_list], item)
            idx_top_sim = reg.top_n_similar(sim, idx_list, 10)
            avg_top_sim = reg.avg_top_sim(price_df, idx_top_sim)
            all_avgs.extend(avg_top_sim)
            after = datetime.now()
            print ('Loop: {}, Time: {}'.format(count, (after - begin)))
        return all_avgs
        
    def rf_sim(self, n_estimators, X_train, y_train, X_test):
        self.sim_rf = RandomForestRegressor(n_estimators=n_estimators,
                                            verbose=5, n_jobs=-1)
        self.sim_rf.fit(X_train, y_train)
        print ('Done Fitting Random Forest')
        
        

In [17]:
start = datetime.now()
X_train = f_eng.train_df[['brand_numeric', 'cat_top_numeric', 'cat_mid_numeric', 'cat_bot_numeric']]
y_train = f_eng.train_df['price']
X_test = f_eng.test_df[['brand_numeric', 'cat_top_numeric', 'cat_mid_numeric', 'cat_bot_numeric']]
end = datetime.now()
print (end - start)

0:00:08.343796


In [18]:
start = datetime.now()
reg = MercariRegression()
reg.rf_sim(100, X_train, y_train, X_test)
end = datetime.now()
print (end - start)

building tree 1 of 100building tree 2 of 100

building tree 3 of 100
building tree 5 of 100
building tree 6 of 100building tree 4 of 100 building tree 8 of 100


building tree 7 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100 building tree 15 of 100

building tree 16 of 100


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.7s


building tree 17 of 100
building tree 18 of 100
 building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100building tree 39 of 100

building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 5

[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   37.2s


building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
 building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100
Done Fitting Random Forest
0:01:08.692005


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.1min finished


In [19]:
start = datetime.now()
leaves_df = f_eng.train_df[['brand_numeric', 'cat_top_numeric',
                            'cat_mid_numeric', 'cat_bot_numeric',
                            'price']]
X_leaves = leaves_df.drop('price', axis=1)
train_leaf = reg.which_leaf(X_leaves)
mid = datetime.now()
print ('Leaves for Train Set Found: {}'.format(mid-start))
test_leaf = reg.which_leaf(X_test)
end = datetime.now()
print ("Leaves for Test Set Found: {}".format(end - mid))
print ('Total Time: {}'.format(end - start))

Leaves for Train Set Found: 0:00:36.669502
Leaves for Test Set Found: 0:00:16.651613
Total Time: 0:00:53.321115


In [20]:
train_leaf.shape

(1482535, 100)

In [21]:
count = 0
i = 0
all_avgs = []
start = datetime.now()
for item in reg.generator(test_leaf, 5000):
    beg = datetime.now()
    count += 1
    idx_list = np.random.choice(train_leaf.shape[0], 2500, replace=False)
    sim = reg.jac_sim(train_leaf[idx_list], item)
    idx_top_sim = reg.top_n_similar(sim, idx_list, 10)
    avg_top_sim = reg.avg_top_sim(leaves_df, idx_top_sim)
    all_avgs.extend(avg_top_sim)
    after = datetime.now()
    print ('Loop: {}, Time: {}'.format(count, (after - beg)))
end = datetime.now()
print (end - start)

Loop: 1, Time: 0:00:53.728301
Loop: 2, Time: 0:00:13.088060
Loop: 3, Time: 0:00:07.808701
Loop: 4, Time: 0:00:07.625103
Loop: 5, Time: 0:00:07.746861
Loop: 6, Time: 0:00:07.701253
Loop: 7, Time: 0:00:08.073790
Loop: 8, Time: 0:00:08.039145
Loop: 9, Time: 0:00:08.634748
Loop: 10, Time: 0:00:08.770345
Loop: 11, Time: 0:00:08.560938
Loop: 12, Time: 0:00:09.085440
Loop: 13, Time: 0:00:08.168853
Loop: 14, Time: 0:00:08.209926
Loop: 15, Time: 0:00:08.061992
Loop: 16, Time: 0:00:08.301048
Loop: 17, Time: 0:00:08.001143
Loop: 18, Time: 0:00:08.199261
Loop: 19, Time: 0:00:08.822162
Loop: 20, Time: 0:00:10.059440
Loop: 21, Time: 0:00:08.738553
Loop: 22, Time: 0:00:08.330455
Loop: 23, Time: 0:00:08.189765
Loop: 24, Time: 0:00:08.389605
Loop: 25, Time: 0:00:07.969993
Loop: 26, Time: 0:00:08.884329
Loop: 27, Time: 0:00:09.687402
Loop: 28, Time: 0:00:10.205257
Loop: 29, Time: 0:00:09.129762
Loop: 30, Time: 0:00:08.174897
Loop: 31, Time: 0:00:08.345604
Loop: 32, Time: 0:00:07.987470
Loop: 33, Time: 0

In [22]:
count = 0
i = 0
all_avgs = []
start = datetime.now()
for item in reg.generator(train_leaf, 5000):
    beg = datetime.now()
    count += 1
    idx_list = np.random.choice(train_leaf.shape[0], 2500, replace=False)
    sim = reg.jac_sim(train_leaf[idx_list], item)
    idx_top_sim = reg.top_n_similar(sim, idx_list, 10)
    avg_top_sim = reg.avg_top_sim(leaves_df, idx_top_sim)
    all_avgs.extend(avg_top_sim)
    after = datetime.now()
    print ('Loop: {}, Time: {}'.format(count, (after - beg)))
end = datetime.now()
print (end - start)

Loop: 1, Time: 0:00:19.448310
Loop: 2, Time: 0:00:11.115330
Loop: 3, Time: 0:00:08.458359
Loop: 4, Time: 0:00:09.096590
Loop: 5, Time: 0:00:09.379714
Loop: 6, Time: 0:00:08.912444
Loop: 7, Time: 0:00:08.552984
Loop: 8, Time: 0:00:08.615740
Loop: 9, Time: 0:00:08.725061
Loop: 10, Time: 0:00:09.079894
Loop: 11, Time: 0:00:08.246678
Loop: 12, Time: 0:00:08.668587
Loop: 13, Time: 0:00:08.663419
Loop: 14, Time: 0:00:09.478912
Loop: 15, Time: 0:00:09.313366
Loop: 16, Time: 0:00:09.490402
Loop: 17, Time: 0:00:08.789027
Loop: 18, Time: 0:00:08.489424
Loop: 19, Time: 0:00:08.481500
Loop: 20, Time: 0:00:08.340848
Loop: 21, Time: 0:00:08.410691
Loop: 22, Time: 0:00:08.168076
Loop: 23, Time: 0:00:08.754104
Loop: 24, Time: 0:00:09.886593
Loop: 25, Time: 0:00:09.246459
Loop: 26, Time: 0:00:08.753126
Loop: 27, Time: 0:00:08.770223
Loop: 28, Time: 0:00:08.463101
Loop: 29, Time: 0:00:07.996607
Loop: 30, Time: 0:00:08.525240
Loop: 31, Time: 0:00:08.720608
Loop: 32, Time: 0:00:12.515937
Loop: 33, Time: 0

Loop: 261, Time: 0:00:08.790620
Loop: 262, Time: 0:00:08.140617
Loop: 263, Time: 0:00:09.250968
Loop: 264, Time: 0:00:10.619122
Loop: 265, Time: 0:00:09.305966
Loop: 266, Time: 0:00:08.725270
Loop: 267, Time: 0:00:08.324572
Loop: 268, Time: 0:00:08.460697
Loop: 269, Time: 0:00:08.414351
Loop: 270, Time: 0:00:08.319448
Loop: 271, Time: 0:00:07.971329
Loop: 272, Time: 0:00:07.996500
Loop: 273, Time: 0:00:08.209360
Loop: 274, Time: 0:00:07.958920
Loop: 275, Time: 0:00:08.422141
Loop: 276, Time: 0:00:08.372811
Loop: 277, Time: 0:00:09.093251
Loop: 278, Time: 0:00:09.166479
Loop: 279, Time: 0:00:08.303124
Loop: 280, Time: 0:00:07.947342
Loop: 281, Time: 0:00:08.373092
Loop: 282, Time: 0:00:08.335529
Loop: 283, Time: 0:00:08.340300
Loop: 284, Time: 0:00:08.086688
Loop: 285, Time: 0:00:08.884186
Loop: 286, Time: 0:00:08.956147
Loop: 287, Time: 0:00:08.248792
Loop: 288, Time: 0:00:08.301870
Loop: 289, Time: 0:00:08.506952
Loop: 290, Time: 0:00:08.788411
Loop: 291, Time: 0:00:08.491071
Loop: 29

In [14]:
# 3 min feature engineering
# 6 min 500 tree random forest training
# 4 min determine which leaf for each row
# _ min similarity. 7000 iterations * __ sec per iteration __> too long at 500 trees

In [26]:
np.unique(np.array(all_avgs), return_counts=True)

(array([   3.6,    3.9,    4.2, ...,  282.2,  319. ,  358.3]),
 array([14, 28, 13, ...,  4,  1,  3]))

In [27]:
all_avgs

[20.9,
 13.1,
 39.8,
 31.5,
 22.2,
 11.5,
 39.8,
 22.4,
 39.8,
 14.6,
 18.0,
 13.0,
 39.8,
 54.3,
 22.2,
 27.2,
 39.8,
 23.2,
 25.0,
 8.4,
 28.1,
 11.3,
 23.2,
 39.8,
 39.8,
 11.5,
 27.9,
 39.8,
 37.2,
 18.6,
 39.8,
 40.7,
 39.8,
 31.3,
 19.5,
 20.0,
 38.6,
 39.8,
 13.4,
 39.8,
 46.5,
 33.5,
 29.7,
 20.7,
 18.0,
 39.8,
 17.7,
 17.4,
 39.8,
 8.4,
 39.1,
 39.8,
 20.2,
 39.8,
 41.4,
 27.9,
 39.6,
 11.5,
 19.6,
 39.8,
 16.8,
 39.8,
 18.5,
 33.2,
 39.8,
 24.3,
 22.7,
 26.4,
 39.8,
 29.0,
 19.2,
 41.6,
 32.0,
 22.5,
 20.0,
 21.7,
 14.6,
 39.9,
 8.4,
 17.6,
 12.7,
 34.9,
 27.5,
 39.8,
 24.8,
 14.6,
 17.1,
 39.5,
 17.4,
 19.7,
 21.3,
 14.6,
 38.6,
 43.7,
 39.8,
 31.6,
 38.8,
 39.8,
 39.8,
 24.6,
 39.8,
 33.3,
 20.0,
 33.0,
 21.7,
 13.1,
 8.4,
 39.8,
 39.8,
 39.8,
 20.0,
 16.1,
 28.5,
 39.8,
 36.6,
 27.3,
 14.6,
 13.2,
 27.6,
 47.6,
 39.8,
 17.4,
 17.1,
 28.0,
 39.8,
 20.0,
 22.2,
 32.7,
 24.0,
 39.8,
 22.5,
 19.7,
 33.6,
 20.7,
 39.8,
 29.1,
 32.5,
 39.8,
 40.8,
 23.7,
 19.9,
 41.6,
 20.9,
 21

# Testing

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge

In [8]:
def rmsle(y, y0):
    return np.sqrt(np.mean(np.power(np.log1p(y)-np.log1p(y0), 2)))

In [9]:
start = datetime.now()
f_eng = MercariFeatureEngineering('data/train.tsv', 'data/test.tsv', delimiter='\t')
f_eng.engineer_features(f_eng.train_df)
f_eng.engineer_features(f_eng.test_df)
end = datetime.now()
print (end-start)

All Nulls Filled, New Binary Columns Created!
Categories Split!
Cleaned Columns
NLP Done
All Nulls Filled, New Binary Columns Created!
Categories Split!
Cleaned Columns
NLP Done
0:05:13.138558


Process PoolWorker-3:
    racquire()
Process PoolWorker-4:
Process PoolWorker-1:
Process PoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/npng/anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/Users/npng/anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/Users/npng/anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/Users/npng/anaconda/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
    self.run()
    self.run()
    self.run()
  File "/Users/npng/anaconda/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/Users/npng/anaconda/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/Users/npng/anaconda/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/Users/npng/anaconda/lib/python2.7/mul

In [10]:
start = datetime.now()
f_eng.train_encoders()
f_eng.create_labels(f_eng.train_df)
f_eng.create_labels(f_eng.test_df)
end = datetime.now()
print('Created Categorical Labels: {}'.format(end - start)) 

Created Categorical Labels: 0:00:15.603686


In [31]:
y = f_eng.train_df['price']
X_leaves, X_test, y_train, y_test= train_test_split(f_eng.train_df, y, test_size=0.2)
leaves_train = X_leaves[['brand_numeric', 'cat_top_numeric', 'cat_mid_numeric', 'cat_bot_numeric']]
leaves_test = X_test[['brand_numeric', 'cat_top_numeric', 'cat_mid_numeric', 'cat_bot_numeric']]

In [32]:
start = datetime.now()
reg = MercariRegression()
reg.rf_sim(100, leaves_train, y_train, leaves_test)
end = datetime.now()
print (end - start)

building tree 1 of 100building tree 2 of 100
building tree 3 of 100
building tree 4 of 100building tree 5 of 100
building tree 7 of 100
 building tree 8 of 100


building tree 6 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
 building tree 14 of 100
building tree 15 of 100
building tree 16 of 100


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.2s


building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100 building tree 23 of 100

building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
 building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 

[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   25.4s


building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 77 of 100building tree 76 of 100

building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100
Done Fitting Random Forest
0:00:46.986142


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   46.6s finished


In [33]:
train_leaf = reg.which_leaf(leaves_train)
test_leaf = reg.which_leaf(leaves_test)

In [34]:
train_leaf.shape

(1186028, 100)

In [35]:
start = datetime.now()
test_avgs = reg.find_avgs(test_leaf, train_leaf, X_leaves, 2500, 50000)
end = datetime.now()
print ("Found Test Avgs: {}".format(end - start))
start = datetime.now()
train_avgs = reg.find_avgs(train_leaf, train_leaf, X_leaves, 2500, 50000)
end = datetime.now()
print ("Found Train Avgs: {}".format(end - start))

Similar to 2500 items
Loop: 1, Time: 0:00:53.038106
Similar to 2500 items
Loop: 2, Time: 0:00:37.703106
Similar to 2500 items
Loop: 3, Time: 0:00:36.615471
Similar to 2500 items
Loop: 4, Time: 0:00:36.823964
Similar to 2500 items
Loop: 5, Time: 0:00:36.037319
Similar to 2500 items
Loop: 6, Time: 0:00:33.149136
Found Test Avgs: 0:03:53.468822
Similar to 2500 items
Loop: 1, Time: 0:00:32.875996
Similar to 2500 items
Loop: 2, Time: 0:00:34.210873
Similar to 2500 items
Loop: 3, Time: 0:00:36.953344
Similar to 2500 items
Loop: 4, Time: 0:00:35.929450
Similar to 2500 items
Loop: 5, Time: 0:00:34.454099
Similar to 2500 items
Loop: 6, Time: 0:00:33.566481
Similar to 2500 items
Loop: 7, Time: 0:00:31.724407
Similar to 2500 items
Loop: 8, Time: 0:00:33.511072
Similar to 2500 items
Loop: 9, Time: 0:00:32.977810
Similar to 2500 items
Loop: 10, Time: 0:00:31.939333
Similar to 2500 items
Loop: 11, Time: 0:00:31.401375
Similar to 2500 items
Loop: 12, Time: 0:00:33.248974
Similar to 2500 items
Loop: 1

In [56]:
only_avg_rmsle = rmsle(np.array(y_test), np.array(test_avgs))
only_avg_rmsle

0.7336505723894812

In [41]:
tv = TfidfVectorizer(tokenizer=lambda x:x, preprocessor=lambda x:x, max_df=0.99, min_df=0.01)
train_vect = tv.fit_transform(X_leaves['item_tokens'])
test_vect = tv.transform(X_test['item_tokens'])

In [42]:
svd = TruncatedSVD(n_components=150, n_iter=10)
reduced_train = svd.fit_transform(train_vect)
reduced_test = svd.transform(test_vect)

In [39]:
print len(train_avgs)
print y_train.shape

1186028
(1186028,)


In [63]:
train_res = y_train - train_avgs
test_res = y_test - test_avgs

In [61]:
lasso = Lasso(alpha=0.1)
lasso.fit(reduced_train, train_res)
lasso_pred = lasso.predict(reduced_test)
pred = test_avgs + lasso_pred
print rmsle(y_test, pred)

0.7234239059732754


  


In [62]:
lasso = Lasso(alpha=0.1)
lasso.fit(reduced_train, y_train)
lass_pred = lasso.predict(reduced_test)
pred = np.array(test_avgs)*0.5+np.array(lass_pred)*0.5
print rmsle(y_test, pred)

0.7288304967878699


In [60]:
import lightgbm as lgb

In [77]:
params = {'learning_rate': 0.76,
          'application': 'regression',
          'max_depth': 3,
          'num_leaves': 99,
          'verbosity': -1,
          'metric': 'RMSE',
          'nthread': 4}
lgb_train = lgb.Dataset(reduced_train, y_train)
lgb_eval = lgb.Dataset(reduced_test, y_test)
watchlist = [reduced_test, test_res]

model = lgb.train(params, lgb_train, num_boost_round=7500,
                  valid_sets=lgb_eval, early_stopping_rounds=500, verbose_eval=500) 
predsL = model.predict(reduced_test)

pred_avg = np.array(test_avgs)*0.5 + np.array(predsL)*0.5
print rmsle(y_test, pred_avg)

Training until validation scores don't improve for 500 rounds.
[500]	valid_0's rmse: 35.2911
[1000]	valid_0's rmse: 35.3227
Early stopping, best iteration is:
[527]	valid_0's rmse: 35.276
0.688337127601009


  


In [76]:
pred_avg = np.array(test_avgs)*0.5 + np.array(predsL)*0.5
print rmsle(y_test, pred_avg)

1.1739884738421165


  


In [82]:
import h2o
from h2o.automl import H2OAutoML

In [None]:
h2o.init()
train_col = list(X_train.columns)
test_col = list(X_test.columns)
train = h2o.H2OFrame.from_python(X_train, column_names=train_col)
test = h2o.H2OFrame.from_python(X_test, column_names=test_col)