# Feature Engineering

In [32]:
import pandas as pd
import numpy as np
import time
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import pairwise_distances as pw_dist
from multiprocessing import Pool
from datetime import datetime

In [34]:
class MercariFeatureEngineering(object):
    
    def __init__(self, train_filepath, test_filepath, delimiter=','):
        self.pool = Pool(processes=4)
        self.train_df = pd.read_csv(train_filepath, delimiter=delimiter)
        self.test_df = pd.read_csv(test_filepath, delimiter=delimiter)
        #self.stop_words = set(stopwords.words('english'))
        self.alphabet = set(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
                             'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
                             'u', 'v', 'w', 'x', 'y', 'z'])
        self.rf = None
        self.brand_encoder = LabelEncoder()
        self.cat_top_encoder = LabelEncoder()
        self.cat_mid_encoder = LabelEncoder()
        self.cat_bot_encoder = LabelEncoder()
    
    def fill_na(self, df, column_name, new_col, fill_with):
        df[new_col] = df[column_name].isnull().astype(int)
        df[column_name] = df[column_name].fillna(fill_with)
    
    def split_categories(self, df, column_name, split_on):
        top, middle, bottom = [], [], []
        for i, row in df.iterrows():
            hierarchy_string = row[column_name]
            hierarchy_list = hierarchy_string.split(split_on)
            top.append(hierarchy_list[0])
            middle.append(hierarchy_list[1])
            bottom.append(hierarchy_list[2])
        df['cat_top'] = top
        df['cat_mid'] = middle
        df['cat_bot'] = bottom
    
    def clean_column(self, brand_name):
        word = brand_name.lower()
        word = ''.join([letter for letter in word if letter in self.alphabet])
        return word
    
    def drop_rows_with_value(self, df, column, value):
        df = df[df[column] != value]
    
    def apply_func(self, df, new_name, from_col, func):
        df[new_name] = df[from_col].apply(lambda x: func(x))
    
    def _categorical_labels(self, df, column_name, new_col, encoder):
        df[new_col] = encoder.transform(df[column_name])
        
    def dummify_categories(self, col_top, col_mid, col_bot, col_brand, df):
        brand_dummies = pd.get_dummies(df[col_brand])
        cat_top_dummies = pd.get_dummies(df[col_top])
        cat_mid_dummies = pd.get_dummies(df[col_mid])
        cat_bot_dummies = pd.get_dummies(df[col_bot])
        result = pd.concat([brand_dummies, cat_top_dummies,
                            cat_mid_dummies, cat_bot_dummies],
                            axis=1, join_axes=[brand_dummies.index])
        return result
    
    def train_encoders(self):
        self.brand_encoder.fit(list(set(self.train_df['brand_name']).union(set(self.test_df['brand_name']))))
        self.cat_top_encoder.fit(list(set(self.train_df['cat_top']).union(set(self.test_df['cat_top']))))
        self.cat_mid_encoder.fit(list(set(self.train_df['cat_mid']).union(set(self.test_df['cat_mid']))))
        self.cat_bot_encoder.fit(list(set(self.train_df['cat_bot']).union(set(self.test_df['cat_bot']))))
        
    def create_labels(self, df):
        self._categorical_labels(df, 'brand_name', 'brand_numeric', self.brand_encoder)
        self._categorical_labels(df, 'cat_top', 'cat_top_numeric', self.cat_top_encoder)
        self._categorical_labels(df, 'cat_mid', 'cat_mid_numeric', self.cat_mid_encoder)
        self._categorical_labels(df, 'cat_bot', 'cat_bot_numeric', self.cat_bot_encoder)
    
    def engineer_features(self, df):
        if 'price' in df.columns:
            self.drop_rows_with_value(df, 'price', 0)
            df = df[df['price'] < 2000]
        self.fill_na(df, 'category_name', 'cat_Was_null', 'None/None/None')
        self.fill_na(df, 'brand_name', 'brand_was_null', 'no_label')
        self.fill_na(df, 'item_description', 'desc_was_null', 'No description')
        print('All Nulls Filled, New Binary Columns Created!')
        self.split_categories(df, 'category_name', '/')
        print('Categories Split!')
        self.apply_func(df, 'brand_name', 'brand_name', self.clean_column)
        self.apply_func(df, 'cat_top', 'cat_top', self.clean_column)
        self.apply_func(df, 'cat_mid', 'cat_mid', self.clean_column)
        self.apply_func(df, 'cat_bot', 'cat_bot', self.clean_column)
        print('Cleaned Columns')

In [4]:
start = datetime.now()
f_eng = MercariFeatureEngineering('data/train.tsv', 'data/test.tsv', delimiter='\t')
f_eng.engineer_features(f_eng.train_df)
f_eng.engineer_features(f_eng.test_df)
end = datetime.now()
print (end-start)

PicklingError: Can't pickle <type 'instancemethod'>: attribute lookup __builtin__.instancemethod failed

In [5]:
start = datetime.now()
f_eng.train_encoders()
f_eng.create_labels(f_eng.train_df)
f_eng.create_labels(f_eng.test_df)
end = datetime.now()
print('Created Categorical Labels: {}'.format(end - start))    

Created Categorical Labels: 0:00:14.406529


In [5]:
# 3 minutes

# Regression

In [3]:
class MercariRegression(object):
    
    def __init__(self):
        self.sim_rf = None
        
    def generator(self, X, subset_size=10):
        n_rows = X.shape[0]
        last = n_rows%subset_size
        n_iters = (n_rows - n_rows%subset_size)/subset_size
        for i in xrange(n_iters):
            yield X[(i*subset_size):(i+1)*subset_size]
        yield X[-last:]
    
    def which_leaf(self, X):
        ret_mat = np.empty((X.shape[0], len(self.sim_rf.estimators_)))
        for i, tree in enumerate(self.sim_rf.estimators_):
            labels = tree.apply(X)
            ret_mat[:, i] = labels
        return ret_mat

    def top_n_similar(self, sim_mat, idx_list, n_similar):
        idx_top_sim = np.empty((sim_mat.shape[0], n_similar))
        for i in xrange(sim_mat.shape[0]):
            top_sim = sim_mat[i].argsort()[-n_similar:][::-1]
            idx_top_sim[i] = idx_list[top_sim]
        return idx_top_sim

    def avg_top_sim(self, df, idx_top_sim):
        avg_list = []
        for row in idx_top_sim:
            avg_price = np.mean(df.iloc[row]['price'])
            avg_list.append(avg_price)
        return avg_list
    
    def jac_sim(self, train_leaf, test_leaf):
        similarity_matrix = 1 - pw_dist(test_leaf, train_leaf,
                                        n_jobs=-1, metric='hamming')
        return similarity_matrix
        
    def rf_sim(self, n_estimators, X_train, y_train, X_test):
        self.sim_rf = RandomForestRegressor(n_estimators=n_estimators,
                                            verbose=5, n_jobs=-1)
        self.sim_rf.fit(X_train, y_train)
        print ('Done Fitting Random Forest')
        
        

In [7]:
start = datetime.now()
X_train = f_eng.train_df[['brand_numeric', 'cat_top_numeric', 'cat_mid_numeric', 'cat_bot_numeric']]
y_train = f_eng.train_df['price']
X_test = f_eng.test_df[['brand_numeric', 'cat_top_numeric', 'cat_mid_numeric', 'cat_bot_numeric']]
end = datetime.now()
print (end - start)

0:00:00.905672


In [8]:
start = datetime.now()
reg = MercariRegression()
reg.rf_sim(200, X_train, y_train, X_test)
end = datetime.now()
print (end - start)

building tree 1 of 200 building tree 3 of 200building tree 5 of 200building tree 4 of 200building tree 6 of 200 
building tree 2 of 200



building tree 7 of 200

building tree 8 of 200
building tree 9 of 200
building tree 10 of 200
building tree 11 of 200
building tree 12 of 200


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.6s


building tree 13 of 200
building tree 14 of 200
building tree 15 of 200
building tree 16 of 200
building tree 17 of 200
building tree 18 of 200
building tree 19 of 200
building tree 20 of 200
building tree 21 of 200
building tree 22 of 200
building tree 23 of 200
building tree 24 of 200
building tree 25 of 200
building tree 26 of 200building tree 27 of 200

building tree 28 of 200
building tree 29 of 200
building tree 30 of 200
building tree 31 of 200
building tree 32 of 200
building tree 33 of 200
building tree 34 of 200
building tree 35 of 200
building tree 36 of 200
building tree 37 of 200
building tree 38 of 200
building tree 39 of 200
building tree 40 of 200
building tree 41 of 200
building tree 42 of 200
building tree 43 of 200
building tree 44 of 200
building tree 45 of 200
building tree 46 of 200
building tree 47 of 200
building tree 48 of 200
building tree 49 of 200
building tree 50 of 200
building tree 51 of 200
building tree 52 of 200
building tree 53 of 200building tree 54 

[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   34.3s


building tree 64 of 200
building tree 65 of 200
building tree 66 of 200
building tree 67 of 200
building tree 68 of 200
building tree 69 of 200
building tree 70 of 200
building tree 71 of 200
building tree 72 of 200
building tree 73 of 200
building tree 74 of 200
building tree 75 of 200
building tree 76 of 200
building tree 77 of 200
building tree 78 of 200
building tree 79 of 200
building tree 80 of 200
building tree 81 of 200
building tree 82 of 200
building tree 83 of 200
building tree 84 of 200
building tree 85 of 200
building tree 86 of 200
building tree 87 of 200
building tree 88 of 200
building tree 89 of 200
building tree 90 of 200
building tree 91 of 200
building tree 92 of 200
building tree 93 of 200
building tree 94 of 200
building tree 95 of 200
building tree 96 of 200
building tree 97 of 200
building tree 98 of 200
building tree 99 of 200
building tree 100 of 200
building tree 101 of 200
building tree 102 of 200
building tree 103 of 200
building tree 104 of 200
building tr

[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.6min


building tree 154 of 200
building tree 155 of 200
building tree 156 of 200
building tree 157 of 200
building tree 158 of 200
building tree 159 of 200
building tree 160 of 200
building tree 161 of 200
building tree 162 of 200
building tree 163 of 200
building tree 164 of 200
building tree 165 of 200
building tree 166 of 200
building tree 167 of 200
building tree 168 of 200
building tree 169 of 200
building tree 170 of 200
building tree 171 of 200
building tree 172 of 200
building tree 173 of 200
building tree 174 of 200
building tree 175 of 200
building tree 176 of 200
building tree 177 of 200
building tree 178 of 200
building tree 179 of 200
building tree 180 of 200
building tree 181 of 200
building tree 182 of 200
building tree 183 of 200
building tree 184 of 200
building tree 185 of 200
building tree 186 of 200
building tree 187 of 200
building tree 188 of 200
building tree 189 of 200
building tree 190 of 200
building tree 191 of 200
building tree 192 of 200
building tree 193 of 200


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  2.1min finished


In [9]:
start = datetime.now()
leaves_df = f_eng.train_df[['brand_numeric', 'cat_top_numeric',
                            'cat_mid_numeric', 'cat_bot_numeric',
                            'price']]
X_leaves = leaves_df.drop('price', axis=1)
train_leaf = reg.which_leaf(X_leaves)
mid = datetime.now()
print ('Leaves for Train Set Found: {}'.format(mid-start))
test_leaf = reg.which_leaf(X_test)
end = datetime.now()
print ("Leaves for Test Set Found: {}".format(end - mid))
print ('Total Time: {}'.format(end - start))

Leaves for Train Set Found: 0:01:01.858149
Leaves for Test Set Found: 0:00:30.028553
Total Time: 0:01:31.886702


In [10]:
train_leaf.shape

(1482535, 200)

In [18]:
count = 0
i = 0
all_avgs = []
start = datetime.now()
for item in reg.generator(test_leaf, 1000):
    beg = datetime.now()
    count += 1
    idx_list = np.random.choice(train_leaf.shape[0], 20000, replace=False)
    sim = reg.jac_sim(train_leaf[idx_list], item)
    idx_top_sim = reg.top_n_similar(sim, idx_list, 10)
    avg_top_sim = reg.avg_top_sim(leaves_df, idx_top_sim)
    all_avgs.extend(avg_top_sim)
    after = datetime.now()
    print ('Loop: {}, Time: {}'.format(count, (after - beg)))
end = datetime.now()
print (end - start)

Loop: 1, Time: 0:00:02.622087
Loop: 2, Time: 0:00:02.653745
Loop: 3, Time: 0:00:02.532209
Loop: 4, Time: 0:00:02.496474
Loop: 5, Time: 0:00:02.527320
Loop: 6, Time: 0:00:02.700418
Loop: 7, Time: 0:00:02.473603
Loop: 8, Time: 0:00:02.575439
Loop: 9, Time: 0:00:02.610487
Loop: 10, Time: 0:00:02.631745
Loop: 11, Time: 0:00:02.434194
Loop: 12, Time: 0:00:02.510135
Loop: 13, Time: 0:00:02.471904
Loop: 14, Time: 0:00:02.516701
Loop: 15, Time: 0:00:02.502873
Loop: 16, Time: 0:00:02.405672
Loop: 17, Time: 0:00:02.489064
Loop: 18, Time: 0:00:02.489061
Loop: 19, Time: 0:00:02.474514
Loop: 20, Time: 0:00:02.575966
Loop: 21, Time: 0:00:02.488491
Loop: 22, Time: 0:00:02.492019
Loop: 23, Time: 0:00:02.445964
Loop: 24, Time: 0:00:02.367798
Loop: 25, Time: 0:00:02.493596
Loop: 26, Time: 0:00:02.476551
Loop: 27, Time: 0:00:02.466858
Loop: 28, Time: 0:00:02.466986
Loop: 29, Time: 0:00:02.526523
Loop: 30, Time: 0:00:02.488721
Loop: 31, Time: 0:00:02.441587
Loop: 32, Time: 0:00:02.395893
Loop: 33, Time: 0

Loop: 261, Time: 0:00:02.413084
Loop: 262, Time: 0:00:02.501333
Loop: 263, Time: 0:00:02.478998
Loop: 264, Time: 0:00:02.509419
Loop: 265, Time: 0:00:02.481637
Loop: 266, Time: 0:00:02.492062
Loop: 267, Time: 0:00:02.510234
Loop: 268, Time: 0:00:02.518705
Loop: 269, Time: 0:00:02.500593
Loop: 270, Time: 0:00:02.514235
Loop: 271, Time: 0:00:02.506950
Loop: 272, Time: 0:00:02.499798
Loop: 273, Time: 0:00:02.479496
Loop: 274, Time: 0:00:02.515961
Loop: 275, Time: 0:00:02.488471
Loop: 276, Time: 0:00:02.525177
Loop: 277, Time: 0:00:02.500430
Loop: 278, Time: 0:00:02.578047
Loop: 279, Time: 0:00:02.390090
Loop: 280, Time: 0:00:02.411274
Loop: 281, Time: 0:00:02.496937
Loop: 282, Time: 0:00:02.487449
Loop: 283, Time: 0:00:02.490916
Loop: 284, Time: 0:00:02.516212
Loop: 285, Time: 0:00:02.520355
Loop: 286, Time: 0:00:02.504663
Loop: 287, Time: 0:00:02.407710
Loop: 288, Time: 0:00:02.490959
Loop: 289, Time: 0:00:02.587709
Loop: 290, Time: 0:00:02.501692
Loop: 291, Time: 0:00:02.569960
Loop: 29

Loop: 518, Time: 0:00:02.511146
Loop: 519, Time: 0:00:02.529345
Loop: 520, Time: 0:00:02.553939
Loop: 521, Time: 0:00:02.408495
Loop: 522, Time: 0:00:02.465753
Loop: 523, Time: 0:00:02.529673
Loop: 524, Time: 0:00:02.571656
Loop: 525, Time: 0:00:02.547872
Loop: 526, Time: 0:00:02.428824
Loop: 527, Time: 0:00:02.535617
Loop: 528, Time: 0:00:02.534724
Loop: 529, Time: 0:00:02.420826
Loop: 530, Time: 0:00:02.520923
Loop: 531, Time: 0:00:02.503204
Loop: 532, Time: 0:00:02.504613
Loop: 533, Time: 0:00:02.556733
Loop: 534, Time: 0:00:02.447005
Loop: 535, Time: 0:00:02.561417
Loop: 536, Time: 0:00:02.641827
Loop: 537, Time: 0:00:02.568548
Loop: 538, Time: 0:00:02.431593
Loop: 539, Time: 0:00:02.572932
Loop: 540, Time: 0:00:02.572143
Loop: 541, Time: 0:00:02.529605
Loop: 542, Time: 0:00:02.559100
Loop: 543, Time: 0:00:02.518294
Loop: 544, Time: 0:00:02.551771
Loop: 545, Time: 0:00:02.433267
Loop: 546, Time: 0:00:02.562174
Loop: 547, Time: 0:00:02.499299
Loop: 548, Time: 0:00:02.445174
Loop: 54

In [14]:
# 3 min feature engineering
# 6 min 500 tree random forest training
# 4 min determine which leaf for each row
# _ min similarity. 7000 iterations * __ sec per iteration __> too long at 500 trees

In [19]:
np.unique(np.array(all_avgs), return_counts=True)

(array([   3.2,    3.3,    3.4, ...,  529.4,  566.5,  654.1]),
 array([1, 1, 4, ..., 1, 2, 1]))

In [20]:
all_avgs

[31.5,
 9.6,
 31.4,
 29.7,
 29.8,
 9.8,
 28.8,
 41.4,
 25.5,
 14.8,
 20.8,
 13.1,
 29.6,
 51.6,
 30.5,
 27.5,
 33.4,
 15.6,
 32.7,
 10.6,
 9.2,
 12.6,
 11.5,
 29.8,
 53.0,
 9.8,
 29.6,
 29.6,
 71.7,
 24.4,
 14.0,
 18.7,
 29.2,
 22.2,
 15.0,
 25.6,
 29.5,
 29.6,
 9.4,
 19.5,
 43.4,
 60.7,
 33.7,
 11.9,
 14.1,
 29.6,
 10.8,
 26.0,
 29.6,
 10.6,
 21.3,
 26.2,
 15.4,
 29.6,
 14.6,
 26.7,
 28.8,
 9.8,
 14.3,
 28.8,
 28.9,
 28.8,
 21.0,
 38.3,
 27.5,
 39.9,
 37.5,
 19.7,
 29.6,
 17.5,
 12.1,
 18.5,
 92.7,
 26.3,
 22.5,
 15.2,
 18.9,
 28.4,
 10.6,
 20.9,
 13.3,
 25.0,
 99.2,
 29.6,
 21.5,
 18.9,
 17.8,
 16.9,
 34.7,
 20.1,
 12.7,
 18.9,
 29.5,
 70.7,
 51.2,
 35.0,
 18.1,
 28.3,
 13.9,
 34.3,
 29.6,
 39.3,
 22.5,
 27.5,
 19.4,
 9.6,
 10.6,
 20.7,
 16.9,
 29.0,
 25.6,
 15.7,
 21.0,
 26.2,
 23.2,
 16.6,
 14.8,
 18.4,
 17.6,
 40.3,
 29.6,
 34.7,
 17.8,
 15.5,
 27.4,
 22.5,
 30.5,
 71.5,
 30.2,
 29.6,
 28.2,
 20.1,
 34.7,
 27.1,
 27.9,
 19.7,
 25.1,
 27.3,
 36.9,
 12.1,
 23.1,
 37.1,
 31.5,
 22.7,

# Testing

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
def rmsle(pred, actual):
    sle = (np.log10(pred+1) - np.log10(actual+1))**2
    msle = sum(sle)/len(pred)
    rmsle = np.sqrt(msle)
    return rmsle

In [19]:
X = f_eng.train_df[['brand_numeric', 'cat_top_numeric', 'cat_mid_numeric', 'cat_bot_numeric', 'price']]
y = f_eng.train_df['price']
X_leaves, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2)
X_train = X_leaves.drop('price', axis=1)
X_test = X_test.drop('price', axis=1)

In [24]:
start = datetime.now()
reg = MercariRegression()
reg.rf_sim(200, X_train, y_train, X_test)
end = datetime.now()
print (end - start)

building tree 2 of 200building tree 1 of 200building tree 3 of 200

building tree 4 of 200 
 building tree 6 of 200building tree 8 of 200
building tree 5 of 200 

building tree 7 of 200

building tree 9 of 200
building tree 10 of 200
building tree 11 of 200
building tree 12 of 200
building tree 13 of 200
building tree 14 of 200

[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.5s


building tree 15 of 200

building tree 16 of 200
building tree 17 of 200
building tree 18 of 200
building tree 19 of 200
building tree 20 of 200
building tree 21 of 200
building tree 22 of 200
building tree 23 of 200
building tree 24 of 200
building tree 25 of 200
building tree 26 of 200
building tree 27 of 200
building tree 28 of 200
building tree 29 of 200
building tree 30 of 200
building tree 31 of 200
building tree 32 of 200
building tree 33 of 200
building tree 34 of 200
building tree 35 of 200
building tree 36 of 200
building tree 37 of 200
building tree 38 of 200
building tree 39 of 200
building tree 40 of 200
building tree 41 of 200
building tree 42 of 200
building tree 43 of 200
building tree 44 of 200
building tree 45 of 200
building tree 46 of 200
building tree 47 of 200
building tree 48 of 200
building tree 49 of 200
building tree 50 of 200
building tree 51 of 200
building tree 52 of 200
building tree 53 of 200
building tree 54 of 200
building tree 55 of 200
building tree 5

[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   25.5s


building tree 65 of 200
building tree 66 of 200
building tree 67 of 200
building tree 68 of 200
building tree 69 of 200
building tree 70 of 200
building tree 71 of 200
building tree 72 of 200
building tree 73 of 200
building tree 74 of 200
building tree 75 of 200
building tree 76 of 200
building tree 77 of 200
building tree 78 of 200
building tree 79 of 200
building tree 80 of 200
building tree 81 of 200
building tree 82 of 200
building tree 83 of 200
building tree 84 of 200
building tree 85 of 200
building tree 86 of 200
building tree 87 of 200
building tree 88 of 200
building tree 89 of 200
building tree 90 of 200
building tree 91 of 200
building tree 92 of 200
building tree 93 of 200
building tree 94 of 200
building tree 95 of 200
building tree 96 of 200
building tree 97 of 200
building tree 98 of 200
building tree 99 of 200
building tree 100 of 200
building tree 101 of 200
building tree 102 of 200
building tree 103 of 200
building tree 104 of 200
building tree 105 of 200
building t

[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.2min


building tree 154 of 200
building tree 155 of 200
building tree 156 of 200
building tree 157 of 200
building tree 158 of 200
building tree 159 of 200
building tree 160 of 200
building tree 161 of 200
building tree 162 of 200
building tree 163 of 200
building tree 164 of 200
building tree 165 of 200
building tree 166 of 200
building tree 167 of 200
building tree 168 of 200
building tree 169 of 200
building tree 170 of 200
building tree 171 of 200
building tree 172 of 200
building tree 173 of 200
building tree 174 of 200
building tree 175 of 200
building tree 176 of 200
building tree 177 of 200
building tree 178 of 200
building tree 179 of 200
building tree 180 of 200
building tree 181 of 200
building tree 182 of 200
building tree 183 of 200
building tree 184 of 200
building tree 185 of 200
building tree 186 of 200
building tree 187 of 200
building tree 188 of 200
building tree 189 of 200
building tree 190 of 200
building tree 191 of 200
building tree 192 of 200
building tree 193 of 200


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.6min finished


In [26]:
train_leaf = reg.which_leaf(X_train)
test_leaf = reg.which_leaf(X_test)

In [27]:
count = 0
i = 0
all_avgs = []
start = datetime.now()
for item in reg.generator(test_leaf, 1000):
    beg = datetime.now()
    count += 1
    idx_list = np.random.choice(train_leaf.shape[0], 20000, replace=False)
    sim = reg.jac_sim(train_leaf[idx_list], item)
    idx_top_sim = reg.top_n_similar(sim, idx_list, 10)
    avg_top_sim = reg.avg_top_sim(X_leaves, idx_top_sim)
    all_avgs.extend(avg_top_sim)
    after = datetime.now()
    print ('Loop: {}, Time: {}'.format(count, (after - beg)))
end = datetime.now()
print (end - start)

Loop: 1, Time: 0:00:02.307632
Loop: 2, Time: 0:00:02.380220
Loop: 3, Time: 0:00:02.915831
Loop: 4, Time: 0:00:02.766734
Loop: 5, Time: 0:00:02.302242
Loop: 6, Time: 0:00:02.425524
Loop: 7, Time: 0:00:02.372600
Loop: 8, Time: 0:00:02.427255
Loop: 9, Time: 0:00:02.428044
Loop: 10, Time: 0:00:02.432093
Loop: 11, Time: 0:00:02.652630
Loop: 12, Time: 0:00:02.858418
Loop: 13, Time: 0:00:02.442779
Loop: 14, Time: 0:00:02.414294
Loop: 15, Time: 0:00:02.469371
Loop: 16, Time: 0:00:02.636700
Loop: 17, Time: 0:00:02.530799
Loop: 18, Time: 0:00:02.390215
Loop: 19, Time: 0:00:02.424533
Loop: 20, Time: 0:00:02.399109
Loop: 21, Time: 0:00:02.382813
Loop: 22, Time: 0:00:02.316239
Loop: 23, Time: 0:00:02.379700
Loop: 24, Time: 0:00:02.369058
Loop: 25, Time: 0:00:02.390194
Loop: 26, Time: 0:00:02.375532
Loop: 27, Time: 0:00:02.290495
Loop: 28, Time: 0:00:02.378007
Loop: 29, Time: 0:00:02.292451
Loop: 30, Time: 0:00:02.598207
Loop: 31, Time: 0:00:02.403705
Loop: 32, Time: 0:00:02.327301
Loop: 33, Time: 0

Loop: 261, Time: 0:00:02.288392
Loop: 262, Time: 0:00:02.332055
Loop: 263, Time: 0:00:02.308499
Loop: 264, Time: 0:00:02.439256
Loop: 265, Time: 0:00:02.300237
Loop: 266, Time: 0:00:02.371130
Loop: 267, Time: 0:00:02.427758
Loop: 268, Time: 0:00:02.303662
Loop: 269, Time: 0:00:02.678517
Loop: 270, Time: 0:00:02.388829
Loop: 271, Time: 0:00:02.455218
Loop: 272, Time: 0:00:02.362820
Loop: 273, Time: 0:00:02.269138
Loop: 274, Time: 0:00:02.293557
Loop: 275, Time: 0:00:02.291566
Loop: 276, Time: 0:00:02.271517
Loop: 277, Time: 0:00:02.324339
Loop: 278, Time: 0:00:02.282557
Loop: 279, Time: 0:00:02.369848
Loop: 280, Time: 0:00:02.297490
Loop: 281, Time: 0:00:02.316858
Loop: 282, Time: 0:00:02.271352
Loop: 283, Time: 0:00:02.294967
Loop: 284, Time: 0:00:02.296791
Loop: 285, Time: 0:00:02.413761
Loop: 286, Time: 0:00:02.389017
Loop: 287, Time: 0:00:02.425538
Loop: 288, Time: 0:00:02.376033
Loop: 289, Time: 0:00:02.368161
Loop: 290, Time: 0:00:02.305753
Loop: 291, Time: 0:00:02.834967
Loop: 29

In [28]:
all_avgs

[24.9,
 53.1,
 23.0,
 20.8,
 23.2,
 21.8,
 22.8,
 13.8,
 25.5,
 279.1,
 30.0,
 25.1,
 25.3,
 20.3,
 23.5,
 32.6,
 25.7,
 16.0,
 30.8,
 30.6,
 34.5,
 13.5,
 26.8,
 26.8,
 17.9,
 18.6,
 26.4,
 23.4,
 17.8,
 26.7,
 21.4,
 14.6,
 34.7,
 21.7,
 26.8,
 30.7,
 19.1,
 16.3,
 27.3,
 11.9,
 14.0,
 34.5,
 18.1,
 28.6,
 24.5,
 26.8,
 12.1,
 112.5,
 23.2,
 16.1,
 21.1,
 26.2,
 26.8,
 35.2,
 34.1,
 8.6,
 92.1,
 18.1,
 26.8,
 16.2,
 24.2,
 24.8,
 22.4,
 26.7,
 26.8,
 7.9,
 15.8,
 14.6,
 22.9,
 53.1,
 17.1,
 23.0,
 21.1,
 28.3,
 28.1,
 17.7,
 20.6,
 15.5,
 30.3,
 26.9,
 24.6,
 40.9,
 6.4,
 10.0,
 26.8,
 9.9,
 26.8,
 17.9,
 26.2,
 19.2,
 26.8,
 16.4,
 21.0,
 61.9,
 22.7,
 23.0,
 28.6,
 17.4,
 23.1,
 112.5,
 32.8,
 14.7,
 26.2,
 18.6,
 23.7,
 8.8,
 10.0,
 29.8,
 26.8,
 26.6,
 22.7,
 26.8,
 11.9,
 21.8,
 14.6,
 19.7,
 21.5,
 28.8,
 21.1,
 11.1,
 111.0,
 28.1,
 14.0,
 16.2,
 21.1,
 6.4,
 22.7,
 22.4,
 36.7,
 19.3,
 14.6,
 16.4,
 25.2,
 11.7,
 26.8,
 24.5,
 34.5,
 25.8,
 28.6,
 28.2,
 32.8,
 27.4,
 41.5,
 

In [33]:
only_avg_rmsle = rmsle(np.array(all_avgs), np.array(y_test))
only_avg_rmsle

0.3127888174099005