In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import json

%matplotlib inline

plt.style.use('seaborn-deep')
plt.rcParams['figure.figsize'] = (12,8)

In [2]:
df_train = pd.read_csv('train.tsv', sep='\t', index_col=0)
df_test = pd.read_csv('test_nolabel.tsv', sep='\t', index_col=0)

In [47]:
np.unique(df_train.sold_mode)

array([1, 2], dtype=int64)

In [3]:
new_col = np.empty(df_train.properties.values.shape[0], dtype=object)
for i, item_prop in enumerate(df_train.properties.values):
    if "Cee\'d" in item_prop:
        item_prop = item_prop.replace('"Cee\'d"','\'Ceed\'')
    else:
        item_prop = item_prop.replace('"', '')   
    j = [json.loads(item_prop.replace('\'', '"').replace('\\xa0', ''))]
    string = ' '.join([a["slug_name"] + ' ' + a["value"] for a in j[0]])
    new_col[i] = string
    #print(string)

In [4]:
X = df_train.drop(['sold_fast', 'properties', 'product_id', 'owner_id'], axis=1)
y = df_train['sold_fast']

In [5]:
X['prop_text'] = new_col
X.head()

Unnamed: 0,category_id,city,date_created,delivery_available,desc_text,img_num,lat,long,name_text,payment_available,price,product_type,region,sold_mode,subcategory_id,prop_text
1,4,Краснодар,2018-10-08,False,"Продаю стол раскладной, деревянный, советский ...",3,45.0686,38.9518,Стол,True,500.0,1,Краснодарский край,1,410,Тип Столы
2,4,Тюмень,2018-06-18,False,"Тарелки глубокие 6 шт. Блюдца, чашки по 6 шт. ...",2,57.184,65.5674,Посуда,False,300.0,1,Тюменская область,1,405,Тип Тарелки
4,9,Омск,2018-07-31,True,"Новый,с этикеткой. Размер L. Не подошёл по раз...",1,54.9889,73.4312,Костюм,True,1100.0,1,Омская область,1,908,Тип Костюмы с юбкой Размер 46-48 (L)
6,3,Санкт-Петербург,2018-04-17,False,"Складывается тростью, все колеса вниз. Сплошна...",4,59.959,30.4877,Коляска,True,5000.0,1,Ленинградская область,1,312,Тип Прогулочная Возраст До 3 лет
10,5,Москва,2018-02-09,False,"Неразлучники, птичкам по 1,5 года. Продаю с бо...",2,55.6473,37.4118,Волнистые попугаи,True,2000.0,1,Московская область,1,504,


In [19]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5, random_state=42)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

name_text = X_train['name_text'].values
vectorizer_name_text = TfidfVectorizer(max_features=200, decode_error='ignore')
vectorizer_name_text.fit(name_text)

desc_text = X_train['desc_text'].values
vectorizer_desc_text = TfidfVectorizer(max_features=200, decode_error='ignore')
vectorizer_desc_text.fit(desc_text)

prop_text = X_train['prop_text'].values
vectorizer_prop_text = TfidfVectorizer(max_features=200, decode_error='ignore')
vectorizer_prop_text.fit(prop_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=200,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer, OneHotEncoder
onehot_preprocess = ColumnTransformer([("dummy_col", OneHotEncoder(
    handle_unknown='ignore',
    sparse=False,
    categories=[
        sorted(set(X_train['region']) & set(df_test['region'])),
        sorted(set(X_train['category_id']) & set(df_test['category_id'])),
        sorted(set(X_train['subcategory_id']) & set(df_test['subcategory_id'])),
        sorted(set(X_train['sold_mode']) & set(df_test['sold_mode'])),
        sorted(set(X_train['product_type']) & set(df_test['product_type']))
    ]), ['region', 'category_id', 'subcategory_id', 'sold_mode', 'product_type'])])

onehot_preprocess.fit(X_train)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('dummy_col',
                                 OneHotEncoder(categorical_features=None,
                                               categories=[['Адыгея',
                                                            'Алтайский край',
                                                            'Амурская область',
                                                            'Архангельская '
                                                            'область',
                                                            'Астраханская '
                                                            'область',
                                                            'Башкортостан',
                                                            'Белгородская '
                                                            'область',
                   

In [20]:
from scipy.sparse import hstack

def date2ymd(date):
    date_plitted = date.split('-')
    year = date_plitted[0]
    month = date_plitted[1]
    day = date_plitted[2]
    return int(month)

def preprocessing(data):
    
    x1 = data[['lat', 'long', 'price', 'sold_mode', 'img_num']].values
    x2 = np.array([date2ymd(i) for i in data['date_created']]).reshape(-1,1)
    x22 = (data['price'].values * data['product_type'].values).reshape(-1,1)
    x23 = (data['sold_mode'].values * data['subcategory_id'].values).reshape(-1,1)
    x24 = (data['sold_mode'].values * data['price'].values).reshape(-1,1)
    x3 = data[['payment_available', 'delivery_available']].values * 1.
    
    
    x1000 = vectorizer_name_text.transform(data['name_text']).toarray()
    x1001 = vectorizer_desc_text.transform(data['desc_text']).toarray()
    x1002 = vectorizer_prop_text.transform(data['prop_text']).toarray()
    
    x_new = onehot_preprocess.transform(data)
    # x2, x22, x23, x24, x3, x1000, x1001, x1002
    X = np.concatenate([x1, x2, x22, x23, x24, x3, x1000, x1001, x1002, x_new], axis=1)
    
    return X

In [21]:
X_train = preprocessing(X_train)#.tocsr()
X_val = preprocessing(X_val)#.tocsr()

In [24]:
X_sub = preprocessing(X)

In [11]:
X_train.shape, X_val.shape

((175640, 987), (175641, 987))

In [12]:
import catboost
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

In [13]:
cb = catboost.CatBoostClassifier(learning_rate=0.1, iterations=2000, depth=4)
tree = DecisionTreeClassifier(max_depth=4)

In [14]:
from sklearn.metrics import roc_auc_score

In [25]:
cb.fit(X_sub, y)

0:	learn: 0.6612106	total: 498ms	remaining: 16m 35s
1:	learn: 0.6355247	total: 964ms	remaining: 16m 3s
2:	learn: 0.6157374	total: 1.44s	remaining: 15m 58s
3:	learn: 0.5994293	total: 1.88s	remaining: 15m 38s
4:	learn: 0.5869003	total: 2.29s	remaining: 15m 12s
5:	learn: 0.5764090	total: 2.69s	remaining: 14m 53s
6:	learn: 0.5681556	total: 3.09s	remaining: 14m 40s
7:	learn: 0.5610138	total: 3.52s	remaining: 14m 37s
8:	learn: 0.5555694	total: 3.97s	remaining: 14m 38s
9:	learn: 0.5512238	total: 4.39s	remaining: 14m 33s
10:	learn: 0.5475781	total: 4.91s	remaining: 14m 46s
11:	learn: 0.5445090	total: 5.48s	remaining: 15m 7s
12:	learn: 0.5419921	total: 6.02s	remaining: 15m 20s
13:	learn: 0.5399368	total: 6.52s	remaining: 15m 25s
14:	learn: 0.5382130	total: 7.02s	remaining: 15m 28s
15:	learn: 0.5367062	total: 7.45s	remaining: 15m 24s
16:	learn: 0.5356171	total: 7.84s	remaining: 15m 15s
17:	learn: 0.5345579	total: 8.26s	remaining: 15m 9s
18:	learn: 0.5337512	total: 8.66s	remaining: 15m 2s
19:	lea

155:	learn: 0.5215760	total: 1m 19s	remaining: 15m 38s
156:	learn: 0.5215312	total: 1m 20s	remaining: 15m 41s
157:	learn: 0.5215056	total: 1m 20s	remaining: 15m 40s
158:	learn: 0.5214781	total: 1m 21s	remaining: 15m 39s
159:	learn: 0.5214319	total: 1m 21s	remaining: 15m 37s
160:	learn: 0.5214135	total: 1m 21s	remaining: 15m 36s
161:	learn: 0.5213971	total: 1m 22s	remaining: 15m 34s
162:	learn: 0.5213524	total: 1m 22s	remaining: 15m 34s
163:	learn: 0.5213299	total: 1m 23s	remaining: 15m 34s
164:	learn: 0.5213061	total: 1m 23s	remaining: 15m 33s
165:	learn: 0.5212813	total: 1m 24s	remaining: 15m 32s
166:	learn: 0.5212608	total: 1m 25s	remaining: 15m 34s
167:	learn: 0.5212452	total: 1m 25s	remaining: 15m 36s
168:	learn: 0.5212071	total: 1m 26s	remaining: 15m 41s
169:	learn: 0.5211819	total: 1m 27s	remaining: 15m 41s
170:	learn: 0.5211590	total: 1m 28s	remaining: 15m 41s
171:	learn: 0.5211292	total: 1m 28s	remaining: 15m 44s
172:	learn: 0.5211143	total: 1m 29s	remaining: 15m 44s
173:	learn

306:	learn: 0.5186091	total: 2m 27s	remaining: 13m 33s
307:	learn: 0.5185980	total: 2m 27s	remaining: 13m 32s
308:	learn: 0.5185673	total: 2m 28s	remaining: 13m 31s
309:	learn: 0.5185551	total: 2m 28s	remaining: 13m 31s
310:	learn: 0.5185439	total: 2m 29s	remaining: 13m 31s
311:	learn: 0.5185332	total: 2m 29s	remaining: 13m 30s
312:	learn: 0.5185213	total: 2m 30s	remaining: 13m 30s
313:	learn: 0.5185106	total: 2m 30s	remaining: 13m 30s
314:	learn: 0.5185006	total: 2m 31s	remaining: 13m 29s
315:	learn: 0.5184883	total: 2m 31s	remaining: 13m 28s
316:	learn: 0.5184748	total: 2m 32s	remaining: 13m 27s
317:	learn: 0.5184652	total: 2m 32s	remaining: 13m 26s
318:	learn: 0.5184440	total: 2m 32s	remaining: 13m 25s
319:	learn: 0.5184263	total: 2m 33s	remaining: 13m 25s
320:	learn: 0.5184153	total: 2m 33s	remaining: 13m 24s
321:	learn: 0.5184005	total: 2m 34s	remaining: 13m 23s
322:	learn: 0.5183878	total: 2m 34s	remaining: 13m 22s
323:	learn: 0.5183744	total: 2m 34s	remaining: 13m 21s
324:	learn

457:	learn: 0.5165749	total: 3m 32s	remaining: 11m 54s
458:	learn: 0.5165702	total: 3m 32s	remaining: 11m 53s
459:	learn: 0.5165604	total: 3m 33s	remaining: 11m 53s
460:	learn: 0.5165485	total: 3m 33s	remaining: 11m 53s
461:	learn: 0.5165436	total: 3m 34s	remaining: 11m 52s
462:	learn: 0.5165370	total: 3m 34s	remaining: 11m 52s
463:	learn: 0.5165299	total: 3m 35s	remaining: 11m 51s
464:	learn: 0.5165199	total: 3m 35s	remaining: 11m 51s
465:	learn: 0.5165052	total: 3m 35s	remaining: 11m 50s
466:	learn: 0.5164901	total: 3m 36s	remaining: 11m 50s
467:	learn: 0.5164823	total: 3m 36s	remaining: 11m 49s
468:	learn: 0.5164719	total: 3m 37s	remaining: 11m 48s
469:	learn: 0.5164639	total: 3m 37s	remaining: 11m 48s
470:	learn: 0.5164550	total: 3m 37s	remaining: 11m 47s
471:	learn: 0.5164425	total: 3m 38s	remaining: 11m 46s
472:	learn: 0.5164267	total: 3m 38s	remaining: 11m 46s
473:	learn: 0.5164161	total: 3m 39s	remaining: 11m 45s
474:	learn: 0.5164062	total: 3m 39s	remaining: 11m 45s
475:	learn

607:	learn: 0.5150837	total: 4m 41s	remaining: 10m 44s
608:	learn: 0.5150742	total: 4m 42s	remaining: 10m 44s
609:	learn: 0.5150661	total: 4m 42s	remaining: 10m 43s
610:	learn: 0.5150583	total: 4m 43s	remaining: 10m 43s
611:	learn: 0.5150488	total: 4m 43s	remaining: 10m 42s
612:	learn: 0.5150406	total: 4m 43s	remaining: 10m 42s
613:	learn: 0.5150294	total: 4m 44s	remaining: 10m 41s
614:	learn: 0.5150201	total: 4m 44s	remaining: 10m 41s
615:	learn: 0.5150136	total: 4m 45s	remaining: 10m 40s
616:	learn: 0.5150044	total: 4m 45s	remaining: 10m 40s
617:	learn: 0.5149951	total: 4m 45s	remaining: 10m 39s
618:	learn: 0.5149848	total: 4m 46s	remaining: 10m 38s
619:	learn: 0.5149762	total: 4m 46s	remaining: 10m 38s
620:	learn: 0.5149619	total: 4m 47s	remaining: 10m 37s
621:	learn: 0.5149546	total: 4m 47s	remaining: 10m 37s
622:	learn: 0.5149474	total: 4m 48s	remaining: 10m 36s
623:	learn: 0.5149387	total: 4m 48s	remaining: 10m 36s
624:	learn: 0.5149272	total: 4m 49s	remaining: 10m 36s
625:	learn

759:	learn: 0.5136722	total: 5m 50s	remaining: 9m 32s
760:	learn: 0.5136658	total: 5m 51s	remaining: 9m 31s
761:	learn: 0.5136596	total: 5m 51s	remaining: 9m 31s
762:	learn: 0.5136553	total: 5m 51s	remaining: 9m 30s
763:	learn: 0.5136470	total: 5m 52s	remaining: 9m 30s
764:	learn: 0.5136408	total: 5m 52s	remaining: 9m 29s
765:	learn: 0.5136319	total: 5m 53s	remaining: 9m 29s
766:	learn: 0.5136241	total: 5m 53s	remaining: 9m 28s
767:	learn: 0.5136158	total: 5m 54s	remaining: 9m 27s
768:	learn: 0.5136047	total: 5m 54s	remaining: 9m 27s
769:	learn: 0.5135914	total: 5m 54s	remaining: 9m 26s
770:	learn: 0.5135811	total: 5m 55s	remaining: 9m 26s
771:	learn: 0.5135736	total: 5m 55s	remaining: 9m 25s
772:	learn: 0.5135644	total: 5m 56s	remaining: 9m 25s
773:	learn: 0.5135544	total: 5m 56s	remaining: 9m 25s
774:	learn: 0.5135436	total: 5m 57s	remaining: 9m 25s
775:	learn: 0.5135372	total: 5m 58s	remaining: 9m 24s
776:	learn: 0.5135319	total: 5m 58s	remaining: 9m 24s
777:	learn: 0.5135256	total:

913:	learn: 0.5124372	total: 7m 4s	remaining: 8m 24s
914:	learn: 0.5124274	total: 7m 5s	remaining: 8m 24s
915:	learn: 0.5124174	total: 7m 5s	remaining: 8m 23s
916:	learn: 0.5124113	total: 7m 5s	remaining: 8m 23s
917:	learn: 0.5124047	total: 7m 6s	remaining: 8m 22s
918:	learn: 0.5123990	total: 7m 6s	remaining: 8m 21s
919:	learn: 0.5123786	total: 7m 7s	remaining: 8m 21s
920:	learn: 0.5123679	total: 7m 7s	remaining: 8m 20s
921:	learn: 0.5123604	total: 7m 8s	remaining: 8m 20s
922:	learn: 0.5123499	total: 7m 8s	remaining: 8m 20s
923:	learn: 0.5123429	total: 7m 9s	remaining: 8m 19s
924:	learn: 0.5123330	total: 7m 9s	remaining: 8m 19s
925:	learn: 0.5123188	total: 7m 10s	remaining: 8m 18s
926:	learn: 0.5123134	total: 7m 10s	remaining: 8m 18s
927:	learn: 0.5123068	total: 7m 10s	remaining: 8m 17s
928:	learn: 0.5122998	total: 7m 11s	remaining: 8m 17s
929:	learn: 0.5122924	total: 7m 11s	remaining: 8m 16s
930:	learn: 0.5122797	total: 7m 12s	remaining: 8m 16s
931:	learn: 0.5122687	total: 7m 12s	rema

1065:	learn: 0.5112684	total: 8m 12s	remaining: 7m 11s
1066:	learn: 0.5112582	total: 8m 12s	remaining: 7m 10s
1067:	learn: 0.5112539	total: 8m 13s	remaining: 7m 10s
1068:	learn: 0.5112495	total: 8m 13s	remaining: 7m 9s
1069:	learn: 0.5112375	total: 8m 13s	remaining: 7m 9s
1070:	learn: 0.5112293	total: 8m 14s	remaining: 7m 8s
1071:	learn: 0.5112238	total: 8m 14s	remaining: 7m 8s
1072:	learn: 0.5112163	total: 8m 15s	remaining: 7m 7s
1073:	learn: 0.5112076	total: 8m 15s	remaining: 7m 7s
1074:	learn: 0.5112000	total: 8m 16s	remaining: 7m 6s
1075:	learn: 0.5111955	total: 8m 16s	remaining: 7m 6s
1076:	learn: 0.5111853	total: 8m 16s	remaining: 7m 5s
1077:	learn: 0.5111770	total: 8m 17s	remaining: 7m 5s
1078:	learn: 0.5111659	total: 8m 17s	remaining: 7m 4s
1079:	learn: 0.5111618	total: 8m 18s	remaining: 7m 4s
1080:	learn: 0.5111516	total: 8m 18s	remaining: 7m 3s
1081:	learn: 0.5111423	total: 8m 18s	remaining: 7m 3s
1082:	learn: 0.5111360	total: 8m 19s	remaining: 7m 2s
1083:	learn: 0.5111256	to

1216:	learn: 0.5100743	total: 9m 21s	remaining: 6m 1s
1217:	learn: 0.5100660	total: 9m 22s	remaining: 6m 1s
1218:	learn: 0.5100603	total: 9m 22s	remaining: 6m
1219:	learn: 0.5100454	total: 9m 23s	remaining: 6m
1220:	learn: 0.5100385	total: 9m 23s	remaining: 5m 59s
1221:	learn: 0.5100335	total: 9m 24s	remaining: 5m 59s
1222:	learn: 0.5100237	total: 9m 24s	remaining: 5m 58s
1223:	learn: 0.5100162	total: 9m 25s	remaining: 5m 58s
1224:	learn: 0.5100105	total: 9m 25s	remaining: 5m 57s
1225:	learn: 0.5100010	total: 9m 26s	remaining: 5m 57s
1226:	learn: 0.5099957	total: 9m 26s	remaining: 5m 56s
1227:	learn: 0.5099867	total: 9m 27s	remaining: 5m 56s
1228:	learn: 0.5099810	total: 9m 27s	remaining: 5m 56s
1229:	learn: 0.5099758	total: 9m 27s	remaining: 5m 55s
1230:	learn: 0.5099693	total: 9m 28s	remaining: 5m 55s
1231:	learn: 0.5099595	total: 9m 28s	remaining: 5m 54s
1232:	learn: 0.5099529	total: 9m 29s	remaining: 5m 54s
1233:	learn: 0.5099437	total: 9m 29s	remaining: 5m 53s
1234:	learn: 0.50993

1366:	learn: 0.5090586	total: 10m 25s	remaining: 4m 49s
1367:	learn: 0.5090511	total: 10m 26s	remaining: 4m 49s
1368:	learn: 0.5090407	total: 10m 26s	remaining: 4m 48s
1369:	learn: 0.5090359	total: 10m 27s	remaining: 4m 48s
1370:	learn: 0.5090211	total: 10m 28s	remaining: 4m 48s
1371:	learn: 0.5090126	total: 10m 28s	remaining: 4m 47s
1372:	learn: 0.5090080	total: 10m 29s	remaining: 4m 47s
1373:	learn: 0.5089993	total: 10m 29s	remaining: 4m 46s
1374:	learn: 0.5089909	total: 10m 30s	remaining: 4m 46s
1375:	learn: 0.5089823	total: 10m 30s	remaining: 4m 46s
1376:	learn: 0.5089784	total: 10m 31s	remaining: 4m 45s
1377:	learn: 0.5089710	total: 10m 31s	remaining: 4m 45s
1378:	learn: 0.5089645	total: 10m 31s	remaining: 4m 44s
1379:	learn: 0.5089613	total: 10m 32s	remaining: 4m 44s
1380:	learn: 0.5089542	total: 10m 32s	remaining: 4m 43s
1381:	learn: 0.5089461	total: 10m 33s	remaining: 4m 43s
1382:	learn: 0.5089396	total: 10m 33s	remaining: 4m 42s
1383:	learn: 0.5089336	total: 10m 33s	remaining:

1514:	learn: 0.5079637	total: 11m 30s	remaining: 3m 41s
1515:	learn: 0.5079596	total: 11m 30s	remaining: 3m 40s
1516:	learn: 0.5079526	total: 11m 31s	remaining: 3m 40s
1517:	learn: 0.5079406	total: 11m 31s	remaining: 3m 39s
1518:	learn: 0.5079342	total: 11m 32s	remaining: 3m 39s
1519:	learn: 0.5079315	total: 11m 32s	remaining: 3m 38s
1520:	learn: 0.5079258	total: 11m 33s	remaining: 3m 38s
1521:	learn: 0.5079207	total: 11m 33s	remaining: 3m 37s
1522:	learn: 0.5079146	total: 11m 34s	remaining: 3m 37s
1523:	learn: 0.5079105	total: 11m 34s	remaining: 3m 36s
1524:	learn: 0.5079002	total: 11m 35s	remaining: 3m 36s
1525:	learn: 0.5078908	total: 11m 35s	remaining: 3m 36s
1526:	learn: 0.5078872	total: 11m 35s	remaining: 3m 35s
1527:	learn: 0.5078800	total: 11m 36s	remaining: 3m 35s
1528:	learn: 0.5078708	total: 11m 36s	remaining: 3m 34s
1529:	learn: 0.5078583	total: 11m 37s	remaining: 3m 34s
1530:	learn: 0.5078514	total: 11m 37s	remaining: 3m 33s
1531:	learn: 0.5078465	total: 11m 37s	remaining:

1662:	learn: 0.5069907	total: 12m 35s	remaining: 2m 32s
1663:	learn: 0.5069871	total: 12m 35s	remaining: 2m 32s
1664:	learn: 0.5069843	total: 12m 35s	remaining: 2m 32s
1665:	learn: 0.5069734	total: 12m 36s	remaining: 2m 31s
1666:	learn: 0.5069681	total: 12m 36s	remaining: 2m 31s
1667:	learn: 0.5069596	total: 12m 37s	remaining: 2m 30s
1668:	learn: 0.5069532	total: 12m 37s	remaining: 2m 30s
1669:	learn: 0.5069443	total: 12m 38s	remaining: 2m 29s
1670:	learn: 0.5069407	total: 12m 38s	remaining: 2m 29s
1671:	learn: 0.5069349	total: 12m 39s	remaining: 2m 28s
1672:	learn: 0.5069273	total: 12m 39s	remaining: 2m 28s
1673:	learn: 0.5069210	total: 12m 39s	remaining: 2m 27s
1674:	learn: 0.5069170	total: 12m 40s	remaining: 2m 27s
1675:	learn: 0.5069103	total: 12m 40s	remaining: 2m 27s
1676:	learn: 0.5069029	total: 12m 41s	remaining: 2m 26s
1677:	learn: 0.5068941	total: 12m 41s	remaining: 2m 26s
1678:	learn: 0.5068873	total: 12m 41s	remaining: 2m 25s
1679:	learn: 0.5068824	total: 12m 42s	remaining:

1810:	learn: 0.5060301	total: 13m 41s	remaining: 1m 25s
1811:	learn: 0.5060240	total: 13m 41s	remaining: 1m 25s
1812:	learn: 0.5060204	total: 13m 42s	remaining: 1m 24s
1813:	learn: 0.5060168	total: 13m 42s	remaining: 1m 24s
1814:	learn: 0.5060085	total: 13m 42s	remaining: 1m 23s
1815:	learn: 0.5060041	total: 13m 43s	remaining: 1m 23s
1816:	learn: 0.5059939	total: 13m 43s	remaining: 1m 22s
1817:	learn: 0.5059861	total: 13m 44s	remaining: 1m 22s
1818:	learn: 0.5059824	total: 13m 44s	remaining: 1m 22s
1819:	learn: 0.5059739	total: 13m 44s	remaining: 1m 21s
1820:	learn: 0.5059675	total: 13m 45s	remaining: 1m 21s
1821:	learn: 0.5059621	total: 13m 45s	remaining: 1m 20s
1822:	learn: 0.5059556	total: 13m 46s	remaining: 1m 20s
1823:	learn: 0.5059437	total: 13m 46s	remaining: 1m 19s
1824:	learn: 0.5059363	total: 13m 47s	remaining: 1m 19s
1825:	learn: 0.5059287	total: 13m 47s	remaining: 1m 18s
1826:	learn: 0.5059220	total: 13m 47s	remaining: 1m 18s
1827:	learn: 0.5059201	total: 13m 48s	remaining:

1960:	learn: 0.5050702	total: 14m 52s	remaining: 17.8s
1961:	learn: 0.5050667	total: 14m 53s	remaining: 17.3s
1962:	learn: 0.5050606	total: 14m 53s	remaining: 16.8s
1963:	learn: 0.5050572	total: 14m 53s	remaining: 16.4s
1964:	learn: 0.5050525	total: 14m 54s	remaining: 15.9s
1965:	learn: 0.5050453	total: 14m 54s	remaining: 15.5s
1966:	learn: 0.5050331	total: 14m 55s	remaining: 15s
1967:	learn: 0.5050224	total: 14m 55s	remaining: 14.6s
1968:	learn: 0.5050152	total: 14m 55s	remaining: 14.1s
1969:	learn: 0.5050088	total: 14m 56s	remaining: 13.6s
1970:	learn: 0.5050032	total: 14m 56s	remaining: 13.2s
1971:	learn: 0.5050004	total: 14m 57s	remaining: 12.7s
1972:	learn: 0.5049951	total: 14m 58s	remaining: 12.3s
1973:	learn: 0.5049869	total: 14m 59s	remaining: 11.8s
1974:	learn: 0.5049821	total: 15m	remaining: 11.4s
1975:	learn: 0.5049751	total: 15m	remaining: 10.9s
1976:	learn: 0.5049684	total: 15m 1s	remaining: 10.5s
1977:	learn: 0.5049618	total: 15m 1s	remaining: 10s
1978:	learn: 0.5049549	t

<catboost.core.CatBoostClassifier at 0x212469a0e80>

In [23]:
y_val_pred = cb.predict_proba(X_val)
auc_val = roc_auc_score(y_val, y_val_pred[:, 1])

print("Test AUC: ", auc_val)

Test AUC:  0.6291615135728217


In [97]:
# 0.6364087100299958
# 0.62545 100 iterations

y_val_pred = cb.predict_proba(X_val)
auc_val = roc_auc_score(y_val, y_val_pred[:, 1])

print("Test AUC: ", auc_val)

Test AUC:  0.6313649623956481


In [108]:
cb.get_feature_importance().argmax()

5

In [59]:
y_val_pred = cb.predict_proba(X_val)
auc_val = roc_auc_score(y_val, y_val_pred[:, 1])

print("Test AUC: ", auc_val)

Test AUC:  0.6350649384325602


In [90]:
%%time
tree.fit(X_train, y_train)

Wall time: 11.8 s


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [91]:
y_val_pred = tree.predict_proba(X_val)
auc_val = roc_auc_score(y_val, y_val_pred[:, 1])

print("Test AUC: ", auc_val)

Test AUC:  0.5751529350914758


In [109]:
from sklearn.decomposition import PCA

In [114]:
pca = PCA(n_components=50)
X_train_pca = pca.fit_transform(X_train) 

In [115]:
gb = GradientBoostingClassifier(learning_rate=0.1, max_depth=4, n_estimators=200, verbose=True)

In [116]:
gb.fit(X_train_pca, y_train)

      Iter       Train Loss   Remaining Time 
         1           1.0741            8.33m
         2           1.0709            8.73m
         3           1.0683            8.89m
         4           1.0661            9.02m
         5           1.0639            8.84m
         6           1.0622            8.72m
         7           1.0602            8.57m
         8           1.0585            8.46m
         9           1.0567            8.35m
        10           1.0557            8.36m
        20           1.0463            7.83m
        30           1.0412            7.40m
        40           1.0377            6.95m
        50           1.0350            6.44m
        60           1.0327            6.01m
        70           1.0308            5.52m
        80           1.0290            5.19m
        90           1.0273            4.72m
       100           1.0261            4.28m
       200           1.0144            0.00s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=4,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=200,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=True,
                           warm_start=False)

In [118]:
X_val_pca = pca.transform(X_val)

In [119]:
y_val_pred = gb.predict_proba(X_val_pca)
auc_val = roc_auc_score(y_val, y_val_pred[:, 1])

print("Test AUC: ", auc_val)

Test AUC:  0.6296121922129543


In [35]:
# %%time
# n = 5
# step = X_train.shape[0] // n
# clfs = []
# for split in np.arange(0, X_train.shape[0], step).astype(int):
#     print('__________', vstack([X_train[:split], X_train[split+step:]]).shape, np.concatenate([y_train.values[:split], y_train.values[split+step:]]).shape)
    
#     clf = XGBClassifier(n_estimators=200, learning_rate=0.02, max_depth=8, n_jobs=3, colsample_bytree=0.7, scale_pos_weight=1.)
#     clf.fit(vstack([X_train[:split], X_train[split+step:]]), np.concatenate([y_train.values[:split], y_train.values[split+step:]]))
#     clfs.append(clf)

In [36]:
from xgboost import XGBClassifier

#clf = RandomForestClassifier(n_estimators=100, n_jobs=3, class_weight='balanced', min_samples_leaf=100)
clf = XGBClassifier(n_estimators=200, learning_rate=0.02, max_depth=8, n_jobs=3, colsample_bytree=0.7, scale_pos_weight=1., verbosity=3)
clf.fit(X_train, y_train)

[17:09:12] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 316 extra nodes, 0 pruned nodes, max_depth=8
[17:09:12] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 298 extra nodes, 0 pruned nodes, max_depth=8
[17:09:12] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 318 extra nodes, 0 pruned nodes, max_depth=8
[17:09:13] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 288 extra nodes, 0 pruned nodes, max_depth=8
[17:09:13] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 312 extra nodes, 0 pruned nodes, max_depth=8
[17:09:14] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 290 extra nodes, 0 pruned nodes, max_depth=8
[17:09:14] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 282 extra nodes, 0 pruned nodes, max_depth=8
[17:09:15] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 284 e

[17:09:42] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 202 extra nodes, 0 pruned nodes, max_depth=8
[17:09:43] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 278 extra nodes, 0 pruned nodes, max_depth=8
[17:09:43] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 282 extra nodes, 0 pruned nodes, max_depth=8
[17:09:44] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 290 extra nodes, 0 pruned nodes, max_depth=8
[17:09:44] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 300 extra nodes, 0 pruned nodes, max_depth=8
[17:09:45] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 178 extra nodes, 0 pruned nodes, max_depth=8
[17:09:45] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 236 extra nodes, 0 pruned nodes, max_depth=8
[17:09:46] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 270 e

[17:10:11] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 274 extra nodes, 0 pruned nodes, max_depth=8
[17:10:12] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 176 extra nodes, 0 pruned nodes, max_depth=8
[17:10:12] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 204 extra nodes, 0 pruned nodes, max_depth=8
[17:10:13] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 178 extra nodes, 0 pruned nodes, max_depth=8
[17:10:13] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 192 extra nodes, 0 pruned nodes, max_depth=8
[17:10:14] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 276 extra nodes, 0 pruned nodes, max_depth=8
[17:10:14] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 212 extra nodes, 0 pruned nodes, max_depth=8
[17:10:15] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 148 e

[17:10:40] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 132 extra nodes, 0 pruned nodes, max_depth=8
[17:10:41] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 120 extra nodes, 0 pruned nodes, max_depth=8
[17:10:41] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 96 extra nodes, 0 pruned nodes, max_depth=8
[17:10:41] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 202 extra nodes, 0 pruned nodes, max_depth=8
[17:10:42] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 172 extra nodes, 0 pruned nodes, max_depth=8
[17:10:42] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 174 extra nodes, 0 pruned nodes, max_depth=8
[17:10:43] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 148 extra nodes, 0 pruned nodes, max_depth=8
[17:10:43] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 168 ex

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0,
       learning_rate=0.02, max_delta_step=0, max_depth=8,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=3,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1.0, seed=None,
       silent=None, subsample=1, verbosity=3)

In [37]:
from sklearn.metrics import roc_auc_score

print("train AUC: ", roc_auc_score(y_train, clf.predict_proba(X_train)[:, 1]))
print("val AUC: ", roc_auc_score(y_val, clf.predict_proba(X_val)[:, 1]))

train AUC:  0.6866320659076522
val AUC:  0.633924980095857


In [38]:
# Test AUC:  0.6289922934899153
# Test AUC:  0.6294445887736262
# Test AUC:  0.6299592194176079
# Test AUC:  0.6287220133901563
# Test AUC:  0.6298864684093403 lem words

In [43]:
ids = list(map(lambda x: x[0], (filter(lambda x: x[1]**2 > 1e-9, enumerate(clf.feature_importances_)))))
len(ids)

1610

In [44]:
X_train[:, ids]

<175640x1610 sparse matrix of type '<class 'numpy.float64'>'
	with 4172728 stored elements in Compressed Sparse Row format>

In [45]:
from xgboost import XGBClassifier

#clf = RandomForestClassifier(n_estimators=100, n_jobs=3, class_weight='balanced', min_samples_leaf=100)
clf2 = XGBClassifier(n_estimators=200, learning_rate=0.02, max_depth=8, n_jobs=3, colsample_bytree=0.7, scale_pos_weight=1., verbosity=3)
clf2.fit(X_train[:, ids], y_train)

[17:11:30] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 334 extra nodes, 0 pruned nodes, max_depth=8
[17:11:31] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 316 extra nodes, 0 pruned nodes, max_depth=8
[17:11:31] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 308 extra nodes, 0 pruned nodes, max_depth=8
[17:11:31] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 280 extra nodes, 0 pruned nodes, max_depth=8
[17:11:31] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 248 extra nodes, 0 pruned nodes, max_depth=8
[17:11:32] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 326 extra nodes, 0 pruned nodes, max_depth=8
[17:11:32] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 280 extra nodes, 0 pruned nodes, max_depth=8
[17:11:32] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 300 e

[17:11:50] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 282 extra nodes, 0 pruned nodes, max_depth=8
[17:11:50] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 246 extra nodes, 0 pruned nodes, max_depth=8
[17:11:50] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 250 extra nodes, 0 pruned nodes, max_depth=8
[17:11:51] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 296 extra nodes, 0 pruned nodes, max_depth=8
[17:11:51] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 264 extra nodes, 0 pruned nodes, max_depth=8
[17:11:51] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 260 extra nodes, 0 pruned nodes, max_depth=8
[17:11:52] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 314 extra nodes, 0 pruned nodes, max_depth=8
[17:11:52] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 288 e

[17:12:09] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 170 extra nodes, 0 pruned nodes, max_depth=8
[17:12:09] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 162 extra nodes, 0 pruned nodes, max_depth=8
[17:12:09] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 200 extra nodes, 0 pruned nodes, max_depth=8
[17:12:10] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 166 extra nodes, 0 pruned nodes, max_depth=8
[17:12:10] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 278 extra nodes, 0 pruned nodes, max_depth=8
[17:12:10] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 198 extra nodes, 0 pruned nodes, max_depth=8
[17:12:11] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 200 extra nodes, 0 pruned nodes, max_depth=8
[17:12:11] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 170 e

[17:12:27] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 240 extra nodes, 0 pruned nodes, max_depth=8
[17:12:27] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 186 extra nodes, 0 pruned nodes, max_depth=8
[17:12:28] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 184 extra nodes, 0 pruned nodes, max_depth=8
[17:12:28] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 112 extra nodes, 0 pruned nodes, max_depth=8
[17:12:28] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 246 extra nodes, 0 pruned nodes, max_depth=8
[17:12:29] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 180 extra nodes, 0 pruned nodes, max_depth=8
[17:12:29] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 104 extra nodes, 0 pruned nodes, max_depth=8
[17:12:29] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 212 e

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0,
       learning_rate=0.02, max_delta_step=0, max_depth=8,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=3,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1.0, seed=None,
       silent=None, subsample=1, verbosity=3)

In [48]:
print("train AUC: ", roc_auc_score(y_train, clf2.predict_proba(X_train[:, ids])[:, 1]))
print("val AUC: ", roc_auc_score(y_val, clf2.predict_proba(X_val[:, ids])[:, 1]))

train AUC:  0.6867789544016315
val AUC:  0.6340381123883563


In [49]:
X = preprocessing(X).tocsr()

In [50]:
from xgboost import XGBClassifier

#clf = RandomForestClassifier(n_estimators=100, n_jobs=3, class_weight='balanced', min_samples_leaf=100)
clf = XGBClassifier(n_estimators=200, learning_rate=0.02, max_depth=8, n_jobs=3, colsample_bytree=0.7, scale_pos_weight=1., verbosity=3)
clf.fit(X[:, ids], y)

[17:20:30] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 372 extra nodes, 0 pruned nodes, max_depth=8
[17:20:31] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 374 extra nodes, 0 pruned nodes, max_depth=8
[17:20:32] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 362 extra nodes, 0 pruned nodes, max_depth=8
[17:20:32] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 360 extra nodes, 0 pruned nodes, max_depth=8
[17:20:33] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 308 extra nodes, 0 pruned nodes, max_depth=8
[17:20:33] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 368 extra nodes, 0 pruned nodes, max_depth=8
[17:20:34] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 316 extra nodes, 0 pruned nodes, max_depth=8
[17:20:35] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 376 e

[17:21:14] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 318 extra nodes, 0 pruned nodes, max_depth=8
[17:21:14] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 278 extra nodes, 0 pruned nodes, max_depth=8
[17:21:15] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 314 extra nodes, 0 pruned nodes, max_depth=8
[17:21:16] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 348 extra nodes, 0 pruned nodes, max_depth=8
[17:21:16] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 350 extra nodes, 0 pruned nodes, max_depth=8
[17:21:17] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 340 extra nodes, 0 pruned nodes, max_depth=8
[17:21:17] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 348 extra nodes, 0 pruned nodes, max_depth=8
[17:21:18] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 312 e

[17:21:55] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 196 extra nodes, 0 pruned nodes, max_depth=8
[17:21:56] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 258 extra nodes, 0 pruned nodes, max_depth=8
[17:21:56] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 236 extra nodes, 0 pruned nodes, max_depth=8
[17:21:57] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 286 extra nodes, 0 pruned nodes, max_depth=8
[17:21:58] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 244 extra nodes, 0 pruned nodes, max_depth=8
[17:21:58] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 248 extra nodes, 0 pruned nodes, max_depth=8
[17:21:59] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 250 extra nodes, 0 pruned nodes, max_depth=8
[17:22:00] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 266 e

[17:22:35] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 290 extra nodes, 0 pruned nodes, max_depth=8
[17:22:36] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 200 extra nodes, 0 pruned nodes, max_depth=8
[17:22:37] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 210 extra nodes, 0 pruned nodes, max_depth=8
[17:22:37] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 182 extra nodes, 0 pruned nodes, max_depth=8
[17:22:38] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 222 extra nodes, 0 pruned nodes, max_depth=8
[17:22:39] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 224 extra nodes, 0 pruned nodes, max_depth=8
[17:22:39] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 172 extra nodes, 0 pruned nodes, max_depth=8
[17:22:40] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 166 e

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0,
       learning_rate=0.02, max_delta_step=0, max_depth=8,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=3,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1.0, seed=None,
       silent=None, subsample=1, verbosity=3)

In [57]:
X.shape

(351281, 20389)

In [51]:
print("train AUC: ", roc_auc_score(y, clf.predict_proba(X[:, ids])[:, 1]))

train AUC:  0.6678732450937732


In [26]:
data_submit = pd.read_csv('test_nolabel.tsv', sep = '\t')

In [41]:
data_submit.shape

(89251, 19)

In [56]:
new_col = np.empty(data_submit.properties.values.shape[0], dtype=object)
for i, item_prop in enumerate(data_submit.properties.values):
    if "Cee\'d" in item_prop:
        item_prop = item_prop.replace('"Cee\'d"','\'Ceed\'')
    if "Levi\'s" in item_prop:
        item_prop = item_prop.replace('"Levi\'s"', '\'Levis\'')
    if "Victoria\'s Secret" in item_prop:
        item_prop = item_prop.replace('"Victoria\'s Secret"', '\'Victorias Secret\'')
    if "O\'Stin" in item_prop:
        item_prop = item_prop.replace('"O\'Stin"', '\'OStin\'')
    if "Carter\'s" in item_prop:
        item_prop = item_prop.replace('"Carter\'s"', '\'Carters\'')
    if "Colin\'s" in item_prop:
        item_prop = item_prop.replace('"Colin\'s"', '\'Colins\'')
    else:
        item_prop = item_prop.replace('"', '')  
    j = [json.loads(item_prop.replace('\'', '"').replace('\\xa0', ''))]
    string = ' '.join([a["slug_name"] + ' ' + a["value"] for a in j[0]])
    new_col[i] = string
    #print(string)

In [58]:
data_submit['prop_text'] = new_col

In [59]:
X_submit = preprocessing(data_submit)

In [61]:
y_submit_pred = cb.predict_proba(X_submit)[:, 1]

In [63]:
product_id = data_submit['product_id'].values
data_submit = pd.DataFrame.from_dict({'product_id' : product_id, 'score' : y_submit_pred})
data_submit.to_csv('./to_submit_0.634', sep = ',', index = False)

In [62]:
y_submit_pred.shape

(89251,)