In [33]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import scipy as sp
import xgboost as xgb

from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

pd.options.display.max_colwidth = -1

%matplotlib inline

In [2]:
# look at distrubution in number of categories over test and train

def num_cats(text):
    try:
        return len(text.split('/'))
    except:
        return 0
    
def split_cat(text):
    try:
        split = text.split('/')
        if len(split) == 3:
            split.extend([np.nan, np.nan])
        elif len(split) == 4:
            split.extend([np.nan]) 
        return split
    except:
        # choosing np.nan enables a get_dummies option to ignore / include missing values
        return (np.nan, np.nan, np.nan, np.nan, np.nan)
    
def print_horiz_line():
    print("-----------------------")

In [None]:
train = pd.read_csv('data/train.tsv', sep='\t')
test = pd.read_csv('data/test.tsv', sep='\t')

# size of data

print("Size of data:")
print("train, {}".format(train.shape))
print("test, {}".format(test.shape))

In [None]:
train['num_cats'] = train['category_name'].apply(lambda x: num_cats(x))
test['num_cats'] = train['category_name'].apply(lambda x: num_cats(x))

print("Training distribution")
print(train.num_cats.value_counts())
print("Testing distribution")
print(test.num_cats.value_counts())

In [None]:
train['cat0'], train['cat1'], train['cat2'], train['cat3'], train['cat4'] =\
    zip(*train['category_name'].apply(lambda x: split_cat(x)))
    
test['cat0'], test['cat1'], test['cat2'], test['cat3'], test['cat4'] = \
    zip(*test['category_name'].apply(lambda x: split_cat(x)))

In [None]:
# # sanity check
train.loc[train.num_cats==4, ['category_name', 'cat0', 'cat1', 
                              'cat2', 'cat3', 'cat4']].sample(1)

# # sanity check
# train.loc[train.num_cats==5, ['category_name', 'cat0', 'cat1', 
#                               'cat2', 'cat3', 'cat4']].sample(1)

# # sanity check
# train.loc[train.num_cats==3, ['category_name', 'cat0', 'cat1',
#                               'cat2', 'cat3', 'cat4']].sample(1)

# # sanity check
# train.loc[train.num_cats==0, ['category_name', 'cat0', 'cat1',
#                               'cat2', 'cat3', 'cat4']].sample(1)

In [9]:
def print_train_test_diff(train, test, column):
    num_examples = 10
    
    print("Training and testing differences in column {}"
         .format(column))
    unique_train = set(train.loc[:, column].unique())
    unique_test = set(test.loc[:, column].unique())
    unique_total = unique_train.union(unique_test)
    print("Number of unique categories, {}"
         .format(len(unique_total)))
    print("Number of unique training categories, {}"
         .format(len(unique_train)))
    print("Number of unique testing categories, {}"
         .format(len(unique_test)))
    print("Number of training categories not in test, {}"
         .format(len(unique_train.difference(unique_test))))
    if len(unique_train.difference(unique_test)) > 0:
        print("... some examples are: {}"
             .format(", ".join(list(unique_train.difference(unique_test))[:num_examples])))
    print("Number of test categories not in training, {}"
         .format(len(unique_test.difference(unique_train))))
    if len(unique_test.difference(unique_train)) > 0:
        print("... some examples are: {}"
             .format(", ".join(list(unique_test.difference(unique_train))[:num_examples])))
    print("Size of intersection, {}"
         .format(len(unique_test.intersection(unique_train))))
    print_horiz_line()

print_train_test_diff(train, test, 'cat0')
print_train_test_diff(train, test, 'cat1')
print_train_test_diff(train, test, 'cat2')
print_train_test_diff(train, test, 'cat3')
print_train_test_diff(train, test, 'cat4')
print_train_test_diff(train, test, 'brand_name')

Training and testing differences in column cat0
Number of unique categories, 11
Number of unique training categories, 11
Number of unique testing categories, 11
Number of training categories not in test, 0
Number of test categories not in training, 0
Size of intersection, 11
-----------------------
Training and testing differences in column cat1
Number of unique categories, 114
Number of unique training categories, 114
Number of unique testing categories, 114
Number of training categories not in test, 0
Number of test categories not in training, 0
Size of intersection, 114
-----------------------
Training and testing differences in column cat2
Number of unique categories, 883
Number of unique training categories, 871
Number of unique testing categories, 834
Number of training categories not in test, 49
... some examples are: Fabric Postcard, St Patricks, Towel, House, Cozy, Something Blue, Competitive Swimwear, Non-Slip Bath Mats, Bass Guitars, Tiles
Number of test categories not in tr

In [10]:
def print_column_info(df, column, num_top, num_bot=None):
    print("Top {} value counts:\n{}\n"
          .format(num_top, df[column].value_counts().iloc[:num_top]))
    if num_bot is not None:
        print("Bottom {} value counts:\n{}\n"
             .format(num_bot, df[column].value_counts().iloc[-num_bot:]))

In [11]:
print("Train data, cat0 info")
print_column_info(train, 'cat0', 11)
print("Test data, cat0 info")
print_column_info(test, 'cat0', 11)
print_horiz_line()

print("Train data, cat1 info")
print_column_info(train, 'cat1', 10, 10)
print("Test data, cat1 info")
print_column_info(test, 'cat1', 10, 10)
print_horiz_line()

print("Train data, cat2 info")
print_column_info(train, 'cat2', 10, 10)
print("Test data, cat2 info")
print_column_info(test, 'cat2', 10, 10)
print_horiz_line()

print("Train data, cat3 info")
print_column_info(train, 'cat3', 10)
print("Test data, cat3 info")
print_column_info(test, 'cat3', 10)
print_horiz_line()

print("Train data, cat4 info")
print_column_info(train, 'cat4', 10)
print("Test data, cat4 info")
print_column_info(test, 'cat4', 10)
print_horiz_line()

Train data, cat0 info
Top 11 value counts:
Women                     664385
Beauty                    207828
Kids                      171689
Electronics               122690
Men                       93680 
Home                      67871 
Vintage & Collectibles    46530 
Other                     45351 
Handmade                  30842 
Sports & Outdoors         25342 
Name: cat0, dtype: int64

Test data, cat0 info
Top 11 value counts:
Women                     310048
Beauty                    97265 
Kids                      80225 
Electronics               57679 
Men                       44066 
Home                      31966 
Vintage & Collectibles    21410 
Other                     21409 
Handmade                  14335 
Sports & Outdoors         11898 
Name: cat0, dtype: int64

-----------------------
Train data, cat1 info
Top 10 value counts:
Athletic Apparel             134383
Makeup                       124624
Tops & Blouses               106960
Shoes                       

In [43]:
def onehot_sparse(train, test, column, prefix='_', dummy_na=True,
                   rel_thresh=5, only_rel_cats=True, verbose=False):
    
    num_train = len(train)
    num_test = len(test)
    
    all_data = pd.concat([train, test], axis=0)

    if only_rel_cats:
        train_counts = train[column].value_counts()

        # these are good categories to train on
        train_thresh = train_counts.loc[train_counts >= rel_thresh]

        if verbose:
            print_horiz_line()
            print('Column, {}'.format(column))
            print("{} of {} values pass threshold of {}"
                  .format(len(train_thresh), len(train_counts), rel_thresh))
            print("Ignoring these categories, we include {}% of the train data"
                  .format(100*train_thresh.sum() / train_counts.sum()))

        test_cats = test[column].unique()
        rel_cats = set(train_thresh.index).intersection(set(test_cats))
        all_data.loc[~all_data[column].isin(rel_cats), column] = np.nan
        sparse_df = pd.get_dummies(all_data[column], prefix=prefix,
                                     dummy_na=dummy_na, sparse=True)
        sparse_cats = sparse_df.to_coo().tocsr()
        sparse_train = sparse_cats[:num_train]
        sparse_test = sparse_cats[-num_test:]
        return sparse_train, sparse_test

In [None]:
tr, ts = onehot_sparse(train, test, 'brand_name', prefix='bn')

In [44]:
train = pd.read_csv('data/train.tsv', sep='\t')
test = pd.read_csv('data/test.tsv', sep='\t')

train['cat0'], train['cat1'], train['cat2'], train['cat3'], train['cat4'] =\
    zip(*train['category_name'].apply(lambda x: split_cat(x)))
    
test['cat0'], test['cat1'], test['cat2'], test['cat3'], test['cat4'] = \
    zip(*test['category_name'].apply(lambda x: split_cat(x)))

bn_train, bn_test = onehot_sparse(train, test, 'brand_name', prefix='bn', verbose=True)
cat0_train, cat0_test = onehot_sparse(train, test, 'cat0', prefix='cat0', verbose=True)
cat1_train, cat1_test = onehot_sparse(train, test, 'cat1', prefix='cat1', verbose=True)
cat2_train, cat2_test = onehot_sparse(train, test, 'cat2', prefix='cat2', verbose=True)
cat3_train, cat3_test = onehot_sparse(train, test, 'cat3', prefix='cat3', verbose=True)
cat4_train, cat4_test = onehot_sparse(train, test, 'cat4', prefix='cat4', verbose=True)

-----------------------
Column, brand_name
2390 of 4809 values pass threshold of 5
Ignoring these categories, we include 99.47579169574033% of the train data
-----------------------
Column, cat0
10 of 10 values pass threshold of 5
Ignoring these categories, we include 100.0% of the train data
-----------------------
Column, cat1
113 of 113 values pass threshold of 5
Ignoring these categories, we include 100.0% of the train data
-----------------------
Column, cat2
750 of 870 values pass threshold of 5
Ignoring these categories, we include 99.98157441227795% of the train data
-----------------------
Column, cat3
5 of 6 values pass threshold of 5
Ignoring these categories, we include 99.95443153337891% of the train data
-----------------------
Column, cat4
2 of 2 values pass threshold of 5
Ignoring these categories, we include 100.0% of the train data


In [45]:
train_values = sp.sparse.hstack([bn_train, cat0_train, cat1_train, cat2_train,
                          cat3_train, cat4_train])
test_values = sp.sparse.hstack([bn_test, cat0_test, cat1_test, cat2_test,
                          cat3_test, cat4_test])
target = train.price.apply(lambda x: math.log(1+x))

In [46]:
xtrain, xval, ytrain, yval = train_test_split(train_values, target, test_size=0.1,
                                              random_state=0)

In [49]:
dtrain = xgb.DMatrix(xtrain, ytrain)
dval = xgb.DMatrix(xval, yval)
max_rounds = 100000

params = {
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'eta': .3,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8
#     'predictor': 'gpu_predictor'
}

evallist = [(dtrain, 'train'), (dval, 'val')]

model = xgb.train(params, dtrain, max_rounds, evallist,
                  early_stopping_rounds=120, verbose_eval=True)

[0]	train-rmse:1.87651	val-rmse:1.87216
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 120 rounds.
[1]	train-rmse:1.39884	val-rmse:1.39411
[2]	train-rmse:1.08991	val-rmse:1.08466
[3]	train-rmse:0.895474	val-rmse:0.88975
[4]	train-rmse:0.783265	val-rmse:0.777212
[5]	train-rmse:0.719126	val-rmse:0.713245
[6]	train-rmse:0.685376	val-rmse:0.679592
[7]	train-rmse:0.666222	val-rmse:0.660674
[8]	train-rmse:0.655796	val-rmse:0.650366
[9]	train-rmse:0.649713	val-rmse:0.644472
[10]	train-rmse:0.645868	val-rmse:0.640523
[11]	train-rmse:0.643005	val-rmse:0.637858
[12]	train-rmse:0.640551	val-rmse:0.635321
[13]	train-rmse:0.638218	val-rmse:0.63303
[14]	train-rmse:0.636258	val-rmse:0.631223
[15]	train-rmse:0.634752	val-rmse:0.629695
[16]	train-rmse:0.633377	val-rmse:0.628321
[17]	train-rmse:0.631133	val-rmse:0.626008
[18]	train-rmse:0.629849	val-rmse:0.624728
[19]	train-rmse:0.628643	val-rmse:0.623647
[20]	train-rmse:

[187]	train-rmse:0.589632	val-rmse:0.586257
[188]	train-rmse:0.589482	val-rmse:0.586109
[189]	train-rmse:0.589413	val-rmse:0.586041
[190]	train-rmse:0.589364	val-rmse:0.586013
[191]	train-rmse:0.589303	val-rmse:0.58597
[192]	train-rmse:0.589262	val-rmse:0.585927
[193]	train-rmse:0.589203	val-rmse:0.58587
[194]	train-rmse:0.589112	val-rmse:0.585777
[195]	train-rmse:0.589072	val-rmse:0.585717
[196]	train-rmse:0.589024	val-rmse:0.585679
[197]	train-rmse:0.58897	val-rmse:0.585665
[198]	train-rmse:0.58893	val-rmse:0.585654
[199]	train-rmse:0.588886	val-rmse:0.585612
[200]	train-rmse:0.588841	val-rmse:0.585557
[201]	train-rmse:0.588789	val-rmse:0.585533
[202]	train-rmse:0.588647	val-rmse:0.585419
[203]	train-rmse:0.588596	val-rmse:0.585368
[204]	train-rmse:0.588553	val-rmse:0.585344
[205]	train-rmse:0.588502	val-rmse:0.585298
[206]	train-rmse:0.588464	val-rmse:0.585281
[207]	train-rmse:0.588419	val-rmse:0.585255
[208]	train-rmse:0.588347	val-rmse:0.585199
[209]	train-rmse:0.588312	val-rmse:0

[375]	train-rmse:0.583202	val-rmse:0.580921
[376]	train-rmse:0.583138	val-rmse:0.580859
[377]	train-rmse:0.583122	val-rmse:0.580846
[378]	train-rmse:0.58309	val-rmse:0.580818
[379]	train-rmse:0.583075	val-rmse:0.580809
[380]	train-rmse:0.583062	val-rmse:0.580797
[381]	train-rmse:0.583043	val-rmse:0.580787
[382]	train-rmse:0.583027	val-rmse:0.580775
[383]	train-rmse:0.583012	val-rmse:0.580765
[384]	train-rmse:0.582995	val-rmse:0.580746
[385]	train-rmse:0.582979	val-rmse:0.580736
[386]	train-rmse:0.582968	val-rmse:0.580727
[387]	train-rmse:0.582955	val-rmse:0.580722
[388]	train-rmse:0.582938	val-rmse:0.580718
[389]	train-rmse:0.582922	val-rmse:0.580709
[390]	train-rmse:0.582887	val-rmse:0.580685
[391]	train-rmse:0.582866	val-rmse:0.580673
[392]	train-rmse:0.582819	val-rmse:0.580617
[393]	train-rmse:0.582806	val-rmse:0.58061
[394]	train-rmse:0.582793	val-rmse:0.5806
[395]	train-rmse:0.582781	val-rmse:0.580594
[396]	train-rmse:0.58277	val-rmse:0.580593
[397]	train-rmse:0.582756	val-rmse:0.

[562]	train-rmse:0.580563	val-rmse:0.579068
[563]	train-rmse:0.580558	val-rmse:0.579064
[564]	train-rmse:0.580553	val-rmse:0.579058
[565]	train-rmse:0.580544	val-rmse:0.579059
[566]	train-rmse:0.580539	val-rmse:0.579057
[567]	train-rmse:0.58053	val-rmse:0.579055
[568]	train-rmse:0.580523	val-rmse:0.579052
[569]	train-rmse:0.580514	val-rmse:0.579042
[570]	train-rmse:0.580501	val-rmse:0.579037
[571]	train-rmse:0.580494	val-rmse:0.579036
[572]	train-rmse:0.580483	val-rmse:0.579018
[573]	train-rmse:0.580476	val-rmse:0.579011
[574]	train-rmse:0.580461	val-rmse:0.578999
[575]	train-rmse:0.580454	val-rmse:0.57899
[576]	train-rmse:0.580447	val-rmse:0.579002
[577]	train-rmse:0.580436	val-rmse:0.579003
[578]	train-rmse:0.580431	val-rmse:0.579
[579]	train-rmse:0.580426	val-rmse:0.578991
[580]	train-rmse:0.580417	val-rmse:0.578985
[581]	train-rmse:0.580408	val-rmse:0.578982
[582]	train-rmse:0.580392	val-rmse:0.57898
[583]	train-rmse:0.580385	val-rmse:0.578982
[584]	train-rmse:0.580378	val-rmse:0.5

[750]	train-rmse:0.579181	val-rmse:0.578344
[751]	train-rmse:0.579174	val-rmse:0.578332
[752]	train-rmse:0.57917	val-rmse:0.578334
[753]	train-rmse:0.579167	val-rmse:0.578335
[754]	train-rmse:0.579163	val-rmse:0.578338
[755]	train-rmse:0.579156	val-rmse:0.578345
[756]	train-rmse:0.579154	val-rmse:0.578357
[757]	train-rmse:0.57915	val-rmse:0.578355
[758]	train-rmse:0.579145	val-rmse:0.578358
[759]	train-rmse:0.579141	val-rmse:0.578355
[760]	train-rmse:0.579135	val-rmse:0.578358
[761]	train-rmse:0.579128	val-rmse:0.578357
[762]	train-rmse:0.579125	val-rmse:0.57835
[763]	train-rmse:0.579119	val-rmse:0.578342
[764]	train-rmse:0.579116	val-rmse:0.578341
[765]	train-rmse:0.579111	val-rmse:0.578342
[766]	train-rmse:0.579107	val-rmse:0.578339
[767]	train-rmse:0.579104	val-rmse:0.578336
[768]	train-rmse:0.5791	val-rmse:0.57833
[769]	train-rmse:0.579095	val-rmse:0.578334
[770]	train-rmse:0.579093	val-rmse:0.578331
[771]	train-rmse:0.579091	val-rmse:0.578323
[772]	train-rmse:0.579086	val-rmse:0.5

[938]	train-rmse:0.578359	val-rmse:0.578019
[939]	train-rmse:0.578355	val-rmse:0.578014
[940]	train-rmse:0.57835	val-rmse:0.57801
[941]	train-rmse:0.57835	val-rmse:0.57801
[942]	train-rmse:0.578348	val-rmse:0.578012
[943]	train-rmse:0.578346	val-rmse:0.57801
[944]	train-rmse:0.578344	val-rmse:0.578013
[945]	train-rmse:0.578341	val-rmse:0.578008
[946]	train-rmse:0.57834	val-rmse:0.578009
[947]	train-rmse:0.578335	val-rmse:0.578008
[948]	train-rmse:0.578333	val-rmse:0.578012
[949]	train-rmse:0.578311	val-rmse:0.578001
[950]	train-rmse:0.578308	val-rmse:0.578001
[951]	train-rmse:0.578308	val-rmse:0.578
[952]	train-rmse:0.578306	val-rmse:0.577994
[953]	train-rmse:0.578303	val-rmse:0.577996
[954]	train-rmse:0.578301	val-rmse:0.577993
[955]	train-rmse:0.578298	val-rmse:0.577994
[956]	train-rmse:0.578298	val-rmse:0.578
[957]	train-rmse:0.578295	val-rmse:0.578001
[958]	train-rmse:0.578295	val-rmse:0.578006
[959]	train-rmse:0.578293	val-rmse:0.578005
[960]	train-rmse:0.578291	val-rmse:0.578001


[1123]	train-rmse:0.577851	val-rmse:0.577876
[1124]	train-rmse:0.577843	val-rmse:0.577872
[1125]	train-rmse:0.57784	val-rmse:0.577875
[1126]	train-rmse:0.57784	val-rmse:0.57787
[1127]	train-rmse:0.577836	val-rmse:0.577872
[1128]	train-rmse:0.577835	val-rmse:0.577872
[1129]	train-rmse:0.577832	val-rmse:0.577878
[1130]	train-rmse:0.577831	val-rmse:0.577874
[1131]	train-rmse:0.577831	val-rmse:0.577878
[1132]	train-rmse:0.577829	val-rmse:0.577878
[1133]	train-rmse:0.577827	val-rmse:0.57788
[1134]	train-rmse:0.577826	val-rmse:0.577879
[1135]	train-rmse:0.577823	val-rmse:0.577883
[1136]	train-rmse:0.577822	val-rmse:0.577879
[1137]	train-rmse:0.577821	val-rmse:0.577879
[1138]	train-rmse:0.577821	val-rmse:0.57788
[1139]	train-rmse:0.57782	val-rmse:0.577873
[1140]	train-rmse:0.577819	val-rmse:0.577877
[1141]	train-rmse:0.577818	val-rmse:0.577884
[1142]	train-rmse:0.577817	val-rmse:0.577888
[1143]	train-rmse:0.577816	val-rmse:0.577884
[1144]	train-rmse:0.577814	val-rmse:0.57788
[1145]	train-rmse

[1307]	train-rmse:0.577596	val-rmse:0.57787
[1308]	train-rmse:0.577595	val-rmse:0.577868
[1309]	train-rmse:0.577594	val-rmse:0.577868
[1310]	train-rmse:0.577593	val-rmse:0.577872
[1311]	train-rmse:0.577592	val-rmse:0.577871
[1312]	train-rmse:0.577592	val-rmse:0.577873
[1313]	train-rmse:0.577591	val-rmse:0.577875
[1314]	train-rmse:0.577591	val-rmse:0.577878
[1315]	train-rmse:0.577589	val-rmse:0.57788
[1316]	train-rmse:0.577587	val-rmse:0.577881
[1317]	train-rmse:0.577587	val-rmse:0.57788
[1318]	train-rmse:0.577586	val-rmse:0.577882
[1319]	train-rmse:0.577585	val-rmse:0.577885
[1320]	train-rmse:0.577584	val-rmse:0.577894
[1321]	train-rmse:0.577583	val-rmse:0.5779
[1322]	train-rmse:0.577583	val-rmse:0.5779
[1323]	train-rmse:0.577584	val-rmse:0.577903
[1324]	train-rmse:0.577583	val-rmse:0.577898
[1325]	train-rmse:0.577583	val-rmse:0.577899
[1326]	train-rmse:0.577582	val-rmse:0.577901
[1327]	train-rmse:0.577581	val-rmse:0.577895
[1328]	train-rmse:0.57758	val-rmse:0.577895
[1329]	train-rmse:

[1491]	train-rmse:0.57739	val-rmse:0.577794
[1492]	train-rmse:0.577389	val-rmse:0.577793
[1493]	train-rmse:0.577388	val-rmse:0.577798
[1494]	train-rmse:0.577387	val-rmse:0.5778
[1495]	train-rmse:0.577383	val-rmse:0.577795
[1496]	train-rmse:0.577382	val-rmse:0.577794
[1497]	train-rmse:0.577382	val-rmse:0.577796
[1498]	train-rmse:0.577381	val-rmse:0.577801
[1499]	train-rmse:0.577379	val-rmse:0.577797
[1500]	train-rmse:0.577378	val-rmse:0.577797
[1501]	train-rmse:0.577376	val-rmse:0.577791
[1502]	train-rmse:0.577375	val-rmse:0.577792
[1503]	train-rmse:0.577373	val-rmse:0.577789
[1504]	train-rmse:0.577373	val-rmse:0.577793
[1505]	train-rmse:0.577372	val-rmse:0.577798
[1506]	train-rmse:0.577371	val-rmse:0.5778
[1507]	train-rmse:0.57737	val-rmse:0.577803
[1508]	train-rmse:0.57737	val-rmse:0.577803
[1509]	train-rmse:0.57737	val-rmse:0.577803
[1510]	train-rmse:0.577368	val-rmse:0.577806
[1511]	train-rmse:0.577368	val-rmse:0.577808
[1512]	train-rmse:0.577368	val-rmse:0.577806
[1513]	train-rmse:

[1674]	train-rmse:0.577233	val-rmse:0.577762
[1675]	train-rmse:0.577233	val-rmse:0.57776
[1676]	train-rmse:0.577233	val-rmse:0.577759
[1677]	train-rmse:0.577232	val-rmse:0.577758
[1678]	train-rmse:0.577232	val-rmse:0.577761
[1679]	train-rmse:0.577231	val-rmse:0.577759
[1680]	train-rmse:0.57723	val-rmse:0.577763
[1681]	train-rmse:0.577229	val-rmse:0.577763
[1682]	train-rmse:0.57723	val-rmse:0.577766
[1683]	train-rmse:0.57723	val-rmse:0.577765
[1684]	train-rmse:0.577228	val-rmse:0.577766
[1685]	train-rmse:0.577228	val-rmse:0.577768
[1686]	train-rmse:0.577228	val-rmse:0.57777
[1687]	train-rmse:0.577228	val-rmse:0.577775
[1688]	train-rmse:0.577227	val-rmse:0.577777
[1689]	train-rmse:0.577226	val-rmse:0.577777
[1690]	train-rmse:0.577226	val-rmse:0.57778
[1691]	train-rmse:0.577222	val-rmse:0.577783
[1692]	train-rmse:0.577222	val-rmse:0.57778
[1693]	train-rmse:0.57722	val-rmse:0.577777
[1694]	train-rmse:0.57722	val-rmse:0.577779
[1695]	train-rmse:0.57722	val-rmse:0.577779
[1696]	train-rmse:0.

In [None]:
rel_thresh = 5
column = 'brand_name'
train_counts = train[column].value_counts()
thresh_train = train_counts.loc[train_counts >= rel_thresh]
100*thresh_train.sum() / train_counts.sum()

In [None]:
train.head(1)

In [None]:
column = 'cat0'
a = train[column].value_counts()
a.loc[a > 30000].index
train[column].value_counts()
len(a)