In [1]:
import scipy
import string
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv('train.tsv', sep='\t', index_col=0)
df_test = pd.read_csv('test_nolabel.tsv', sep='\t', index_col=0)

In [3]:
dates = sorted(set(df_train['date_created']) | set(df_test['date_created']))
date2week = dict(zip(dates, np.arange(365) % 7))

df_train['week'] = df_train['date_created'].apply(lambda x: date2week[x])
df_test['week'] = df_test['date_created'].apply(lambda x: date2week[x])

In [4]:
def get_prop_text(data):
    prop_col = []
    for item_prop in data['properties'].values:
        prop_text = []
        for text in item_prop.split('}')[:-1]:
            props = text[text.find('{'):].split(',')
            name = props[1][props[1].find(': '):].translate(str.maketrans('','',string.punctuation))
            val = props[2][props[2].find(': '):].translate(str.maketrans('','',string.punctuation))
            if 'Нет' not in val.lower():
#                 prop_text.append(name)
                if 'Есть' not in val.lower():
                    prop_text.append(val)
        prop_col.append(' '.join(prop_text))
    return prop_col

df_train['prop_text'] = get_prop_text(df_train)
df_test['prop_text'] = get_prop_text(df_test)

In [5]:
X = df_train.drop(['sold_fast', 'properties', 'product_id', 'owner_id'], axis=1)
y = df_train['sold_fast']

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5, random_state=42, shuffle=False)

# Preprocessing

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer, OneHotEncoder

In [8]:
name_text = X_train['name_text']
vectorizer_name_text = TfidfVectorizer(max_features=200, decode_error='ignore')
vectorizer_name_text.fit(name_text)

desc_text = X_train['desc_text']
vectorizer_desc_text = TfidfVectorizer(max_features=200, decode_error='ignore')
vectorizer_desc_text.fit(desc_text)
             
prop_text = X_train['prop_text']
vectorizer_prop_text = TfidfVectorizer(max_features=200, decode_error='ignore')
vectorizer_prop_text.fit(desc_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=200, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [9]:
onehot_preprocess = ColumnTransformer([("dummy_col", OneHotEncoder(
    handle_unknown='ignore',
    sparse=False,
    categories=[
        sorted(set(X_train['region']) & set(df_test['region'])),
        sorted(set(X_train['category_id']) & set(df_test['category_id'])),
        sorted(set(X_train['subcategory_id']) & set(df_test['subcategory_id'])),
        sorted(set(X_train['sold_mode']) & set(df_test['sold_mode'])),
        sorted(set(X_train['product_type']) & set(df_test['product_type']))
    ]), ['region', 'category_id', 'subcategory_id', 'sold_mode', 'product_type'])])

onehot_preprocess.fit(X_train)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('dummy_col', OneHotEncoder(categorical_features=None,
       categories=[['Адыгея', 'Алтайский край', 'Амурская область', 'Архангельская область', 'Астраханская область', 'Башкортостан', 'Белгородская область', 'Брянская область', 'Владимирская область', 'Волгоградская область', 'Воло...lues=None, sparse=False), ['region', 'category_id', 'subcategory_id', 'sold_mode', 'product_type'])])

In [11]:
def date2ymd(date):
    date_plitted = date.split('-')
    year = date_plitted[0]
    month = date_plitted[1]
    day = date_plitted[2]
    return int(month + day)


def preprocessing(data):
    return np.concatenate([
        data[['lat', 'long', 'price', 'sold_mode', 'img_num']].values,
        np.array([date2ymd(i) for i in data['date_created']]).reshape(-1,1),
        data[['payment_available', 'delivery_available']].values * 1.,    
        (data['price'].values * data['product_type'].values).reshape(-1,1),
        (data['sold_mode'].values * data['subcategory_id'].values).reshape(-1,1),
        (data['sold_mode'].values * data['price'].values).reshape(-1,1),

        vectorizer_name_text.transform(data['name_text']).toarray(),
        vectorizer_desc_text.transform(data['desc_text']).toarray(),
        vectorizer_prop_text.transform(data['prop_text']).toarray(),

        onehot_preprocess.transform(data)
    ], axis=1)

# Model

In [12]:
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

In [13]:
X_train = preprocessing(X_train)
X_val = preprocessing(X_val)
X_train.shape, X_val.shape

((175640, 990), (175641, 990))

In [14]:
cb = CatBoostClassifier(learning_rate=0.1, iterations=2000, depth=4)
cb.fit(X_train, y_train)

0:	learn: 0.6618750	total: 236ms	remaining: 7m 51s
1:	learn: 0.6370865	total: 330ms	remaining: 5m 29s
2:	learn: 0.6170760	total: 429ms	remaining: 4m 45s
3:	learn: 0.6010326	total: 547ms	remaining: 4m 32s
4:	learn: 0.5885475	total: 641ms	remaining: 4m 15s
5:	learn: 0.5785959	total: 739ms	remaining: 4m 5s
6:	learn: 0.5698375	total: 863ms	remaining: 4m 5s
7:	learn: 0.5631317	total: 959ms	remaining: 3m 58s
8:	learn: 0.5579344	total: 1.05s	remaining: 3m 53s
9:	learn: 0.5530022	total: 1.17s	remaining: 3m 53s
10:	learn: 0.5491585	total: 1.27s	remaining: 3m 50s
11:	learn: 0.5461270	total: 1.37s	remaining: 3m 47s
12:	learn: 0.5434887	total: 1.49s	remaining: 3m 47s
13:	learn: 0.5414591	total: 1.59s	remaining: 3m 45s
14:	learn: 0.5396683	total: 1.69s	remaining: 3m 43s
15:	learn: 0.5381216	total: 1.81s	remaining: 3m 44s
16:	learn: 0.5363399	total: 1.93s	remaining: 3m 45s
17:	learn: 0.5352086	total: 2.03s	remaining: 3m 43s
18:	learn: 0.5340124	total: 2.14s	remaining: 3m 42s
19:	learn: 0.5332907	tot

159:	learn: 0.5197931	total: 20s	remaining: 3m 49s
160:	learn: 0.5197644	total: 20.1s	remaining: 3m 49s
161:	learn: 0.5197399	total: 20.2s	remaining: 3m 48s
162:	learn: 0.5197070	total: 20.3s	remaining: 3m 48s
163:	learn: 0.5196659	total: 20.5s	remaining: 3m 49s
164:	learn: 0.5196442	total: 20.6s	remaining: 3m 48s
165:	learn: 0.5195961	total: 20.7s	remaining: 3m 49s
166:	learn: 0.5195609	total: 20.9s	remaining: 3m 49s
167:	learn: 0.5195334	total: 21s	remaining: 3m 48s
168:	learn: 0.5195147	total: 21.1s	remaining: 3m 48s
169:	learn: 0.5194916	total: 21.2s	remaining: 3m 48s
170:	learn: 0.5194552	total: 21.4s	remaining: 3m 48s
171:	learn: 0.5194225	total: 21.5s	remaining: 3m 48s
172:	learn: 0.5193982	total: 21.6s	remaining: 3m 48s
173:	learn: 0.5193740	total: 21.7s	remaining: 3m 48s
174:	learn: 0.5193364	total: 21.9s	remaining: 3m 48s
175:	learn: 0.5193041	total: 22s	remaining: 3m 47s
176:	learn: 0.5192842	total: 22.1s	remaining: 3m 47s
177:	learn: 0.5192178	total: 22.3s	remaining: 3m 48s

316:	learn: 0.5157783	total: 40.1s	remaining: 3m 33s
317:	learn: 0.5157611	total: 40.3s	remaining: 3m 32s
318:	learn: 0.5157406	total: 40.4s	remaining: 3m 32s
319:	learn: 0.5157274	total: 40.5s	remaining: 3m 32s
320:	learn: 0.5157107	total: 40.6s	remaining: 3m 32s
321:	learn: 0.5156959	total: 40.8s	remaining: 3m 32s
322:	learn: 0.5156806	total: 40.9s	remaining: 3m 32s
323:	learn: 0.5156565	total: 41s	remaining: 3m 32s
324:	learn: 0.5156340	total: 41.1s	remaining: 3m 32s
325:	learn: 0.5156212	total: 41.3s	remaining: 3m 31s
326:	learn: 0.5156076	total: 41.4s	remaining: 3m 31s
327:	learn: 0.5155960	total: 41.5s	remaining: 3m 31s
328:	learn: 0.5155780	total: 41.7s	remaining: 3m 31s
329:	learn: 0.5155438	total: 41.8s	remaining: 3m 31s
330:	learn: 0.5155358	total: 41.9s	remaining: 3m 31s
331:	learn: 0.5155164	total: 42s	remaining: 3m 31s
332:	learn: 0.5154858	total: 42.2s	remaining: 3m 31s
333:	learn: 0.5154731	total: 42.3s	remaining: 3m 31s
334:	learn: 0.5154516	total: 42.4s	remaining: 3m 3

473:	learn: 0.5129399	total: 1m	remaining: 3m 13s
474:	learn: 0.5129277	total: 1m	remaining: 3m 13s
475:	learn: 0.5129069	total: 1m	remaining: 3m 13s
476:	learn: 0.5128918	total: 1m	remaining: 3m 13s
477:	learn: 0.5128776	total: 1m	remaining: 3m 13s
478:	learn: 0.5128642	total: 1m	remaining: 3m 13s
479:	learn: 0.5128454	total: 1m	remaining: 3m 12s
480:	learn: 0.5128270	total: 1m 1s	remaining: 3m 12s
481:	learn: 0.5128053	total: 1m 1s	remaining: 3m 12s
482:	learn: 0.5127791	total: 1m 1s	remaining: 3m 12s
483:	learn: 0.5127580	total: 1m 1s	remaining: 3m 12s
484:	learn: 0.5127395	total: 1m 1s	remaining: 3m 12s
485:	learn: 0.5127313	total: 1m 1s	remaining: 3m 12s
486:	learn: 0.5127143	total: 1m 1s	remaining: 3m 12s
487:	learn: 0.5126984	total: 1m 2s	remaining: 3m 12s
488:	learn: 0.5126858	total: 1m 2s	remaining: 3m 12s
489:	learn: 0.5126586	total: 1m 2s	remaining: 3m 12s
490:	learn: 0.5126427	total: 1m 2s	remaining: 3m 11s
491:	learn: 0.5126232	total: 1m 2s	remaining: 3m 11s
492:	learn: 0.

629:	learn: 0.5104534	total: 1m 19s	remaining: 2m 53s
630:	learn: 0.5104333	total: 1m 20s	remaining: 2m 53s
631:	learn: 0.5104163	total: 1m 20s	remaining: 2m 53s
632:	learn: 0.5104019	total: 1m 20s	remaining: 2m 53s
633:	learn: 0.5103897	total: 1m 20s	remaining: 2m 53s
634:	learn: 0.5103781	total: 1m 20s	remaining: 2m 53s
635:	learn: 0.5103670	total: 1m 20s	remaining: 2m 52s
636:	learn: 0.5103521	total: 1m 20s	remaining: 2m 52s
637:	learn: 0.5103430	total: 1m 20s	remaining: 2m 52s
638:	learn: 0.5103282	total: 1m 21s	remaining: 2m 52s
639:	learn: 0.5103177	total: 1m 21s	remaining: 2m 52s
640:	learn: 0.5103054	total: 1m 21s	remaining: 2m 52s
641:	learn: 0.5102953	total: 1m 21s	remaining: 2m 52s
642:	learn: 0.5102829	total: 1m 21s	remaining: 2m 52s
643:	learn: 0.5102690	total: 1m 21s	remaining: 2m 52s
644:	learn: 0.5102593	total: 1m 21s	remaining: 2m 51s
645:	learn: 0.5102482	total: 1m 21s	remaining: 2m 51s
646:	learn: 0.5102416	total: 1m 22s	remaining: 2m 51s
647:	learn: 0.5102254	total:

781:	learn: 0.5082831	total: 1m 40s	remaining: 2m 36s
782:	learn: 0.5082631	total: 1m 40s	remaining: 2m 36s
783:	learn: 0.5082508	total: 1m 40s	remaining: 2m 35s
784:	learn: 0.5082388	total: 1m 40s	remaining: 2m 35s
785:	learn: 0.5082252	total: 1m 40s	remaining: 2m 35s
786:	learn: 0.5082085	total: 1m 40s	remaining: 2m 35s
787:	learn: 0.5081984	total: 1m 41s	remaining: 2m 35s
788:	learn: 0.5081758	total: 1m 41s	remaining: 2m 35s
789:	learn: 0.5081652	total: 1m 41s	remaining: 2m 35s
790:	learn: 0.5081530	total: 1m 41s	remaining: 2m 35s
791:	learn: 0.5081444	total: 1m 41s	remaining: 2m 34s
792:	learn: 0.5081267	total: 1m 41s	remaining: 2m 34s
793:	learn: 0.5081102	total: 1m 41s	remaining: 2m 34s
794:	learn: 0.5080918	total: 1m 41s	remaining: 2m 34s
795:	learn: 0.5080777	total: 1m 42s	remaining: 2m 34s
796:	learn: 0.5080647	total: 1m 42s	remaining: 2m 34s
797:	learn: 0.5080585	total: 1m 42s	remaining: 2m 34s
798:	learn: 0.5080424	total: 1m 42s	remaining: 2m 34s
799:	learn: 0.5080312	total:

935:	learn: 0.5063089	total: 2m 1s	remaining: 2m 18s
936:	learn: 0.5062984	total: 2m 2s	remaining: 2m 18s
937:	learn: 0.5062899	total: 2m 2s	remaining: 2m 18s
938:	learn: 0.5062814	total: 2m 2s	remaining: 2m 18s
939:	learn: 0.5062679	total: 2m 2s	remaining: 2m 18s
940:	learn: 0.5062180	total: 2m 2s	remaining: 2m 18s
941:	learn: 0.5062042	total: 2m 2s	remaining: 2m 17s
942:	learn: 0.5061923	total: 2m 2s	remaining: 2m 17s
943:	learn: 0.5061777	total: 2m 3s	remaining: 2m 17s
944:	learn: 0.5061670	total: 2m 3s	remaining: 2m 17s
945:	learn: 0.5061549	total: 2m 3s	remaining: 2m 17s
946:	learn: 0.5061341	total: 2m 3s	remaining: 2m 17s
947:	learn: 0.5061303	total: 2m 3s	remaining: 2m 17s
948:	learn: 0.5061132	total: 2m 3s	remaining: 2m 17s
949:	learn: 0.5061013	total: 2m 3s	remaining: 2m 17s
950:	learn: 0.5060915	total: 2m 4s	remaining: 2m 16s
951:	learn: 0.5060788	total: 2m 4s	remaining: 2m 16s
952:	learn: 0.5060726	total: 2m 4s	remaining: 2m 16s
953:	learn: 0.5060626	total: 2m 4s	remaining: 

1089:	learn: 0.5044220	total: 2m 23s	remaining: 1m 59s
1090:	learn: 0.5044088	total: 2m 23s	remaining: 1m 59s
1091:	learn: 0.5044020	total: 2m 23s	remaining: 1m 59s
1092:	learn: 0.5043904	total: 2m 23s	remaining: 1m 59s
1093:	learn: 0.5043861	total: 2m 23s	remaining: 1m 59s
1094:	learn: 0.5043772	total: 2m 24s	remaining: 1m 59s
1095:	learn: 0.5043649	total: 2m 24s	remaining: 1m 58s
1096:	learn: 0.5043551	total: 2m 24s	remaining: 1m 58s
1097:	learn: 0.5043454	total: 2m 24s	remaining: 1m 58s
1098:	learn: 0.5043408	total: 2m 24s	remaining: 1m 58s
1099:	learn: 0.5043324	total: 2m 24s	remaining: 1m 58s
1100:	learn: 0.5043194	total: 2m 24s	remaining: 1m 58s
1101:	learn: 0.5042973	total: 2m 25s	remaining: 1m 58s
1102:	learn: 0.5042856	total: 2m 25s	remaining: 1m 58s
1103:	learn: 0.5042790	total: 2m 25s	remaining: 1m 57s
1104:	learn: 0.5042716	total: 2m 25s	remaining: 1m 57s
1105:	learn: 0.5042540	total: 2m 25s	remaining: 1m 57s
1106:	learn: 0.5042437	total: 2m 25s	remaining: 1m 57s
1107:	lear

1239:	learn: 0.5026751	total: 2m 44s	remaining: 1m 40s
1240:	learn: 0.5026658	total: 2m 44s	remaining: 1m 40s
1241:	learn: 0.5026541	total: 2m 44s	remaining: 1m 40s
1242:	learn: 0.5026416	total: 2m 45s	remaining: 1m 40s
1243:	learn: 0.5026347	total: 2m 45s	remaining: 1m 40s
1244:	learn: 0.5026183	total: 2m 45s	remaining: 1m 40s
1245:	learn: 0.5026100	total: 2m 45s	remaining: 1m 40s
1246:	learn: 0.5025932	total: 2m 45s	remaining: 1m 40s
1247:	learn: 0.5025846	total: 2m 45s	remaining: 1m 39s
1248:	learn: 0.5025707	total: 2m 45s	remaining: 1m 39s
1249:	learn: 0.5025598	total: 2m 46s	remaining: 1m 39s
1250:	learn: 0.5025497	total: 2m 46s	remaining: 1m 39s
1251:	learn: 0.5025369	total: 2m 46s	remaining: 1m 39s
1252:	learn: 0.5025258	total: 2m 46s	remaining: 1m 39s
1253:	learn: 0.5025177	total: 2m 46s	remaining: 1m 39s
1254:	learn: 0.5025053	total: 2m 46s	remaining: 1m 39s
1255:	learn: 0.5024960	total: 2m 46s	remaining: 1m 38s
1256:	learn: 0.5024859	total: 2m 47s	remaining: 1m 38s
1257:	lear

1391:	learn: 0.5008800	total: 3m 6s	remaining: 1m 21s
1392:	learn: 0.5008739	total: 3m 6s	remaining: 1m 21s
1393:	learn: 0.5008630	total: 3m 6s	remaining: 1m 20s
1394:	learn: 0.5008542	total: 3m 6s	remaining: 1m 20s
1395:	learn: 0.5008451	total: 3m 6s	remaining: 1m 20s
1396:	learn: 0.5008402	total: 3m 6s	remaining: 1m 20s
1397:	learn: 0.5008286	total: 3m 6s	remaining: 1m 20s
1398:	learn: 0.5008221	total: 3m 6s	remaining: 1m 20s
1399:	learn: 0.5008077	total: 3m 7s	remaining: 1m 20s
1400:	learn: 0.5007927	total: 3m 7s	remaining: 1m 20s
1401:	learn: 0.5007830	total: 3m 7s	remaining: 1m 19s
1402:	learn: 0.5007721	total: 3m 7s	remaining: 1m 19s
1403:	learn: 0.5007644	total: 3m 7s	remaining: 1m 19s
1404:	learn: 0.5007526	total: 3m 7s	remaining: 1m 19s
1405:	learn: 0.5007406	total: 3m 7s	remaining: 1m 19s
1406:	learn: 0.5007325	total: 3m 8s	remaining: 1m 19s
1407:	learn: 0.5007213	total: 3m 8s	remaining: 1m 19s
1408:	learn: 0.5007105	total: 3m 8s	remaining: 1m 19s
1409:	learn: 0.5006998	total

1542:	learn: 0.4991548	total: 3m 27s	remaining: 1m 1s
1543:	learn: 0.4991241	total: 3m 27s	remaining: 1m 1s
1544:	learn: 0.4991136	total: 3m 27s	remaining: 1m 1s
1545:	learn: 0.4991050	total: 3m 27s	remaining: 1m
1546:	learn: 0.4990997	total: 3m 27s	remaining: 1m
1547:	learn: 0.4990871	total: 3m 27s	remaining: 1m
1548:	learn: 0.4990780	total: 3m 27s	remaining: 1m
1549:	learn: 0.4990637	total: 3m 28s	remaining: 1m
1550:	learn: 0.4990517	total: 3m 28s	remaining: 1m
1551:	learn: 0.4990455	total: 3m 28s	remaining: 1m
1552:	learn: 0.4990378	total: 3m 28s	remaining: 60s
1553:	learn: 0.4990213	total: 3m 28s	remaining: 59.8s
1554:	learn: 0.4990113	total: 3m 28s	remaining: 59.7s
1555:	learn: 0.4989978	total: 3m 28s	remaining: 59.6s
1556:	learn: 0.4989887	total: 3m 28s	remaining: 59.4s
1557:	learn: 0.4989748	total: 3m 28s	remaining: 59.3s
1558:	learn: 0.4989617	total: 3m 29s	remaining: 59.2s
1559:	learn: 0.4989517	total: 3m 29s	remaining: 59s
1560:	learn: 0.4989383	total: 3m 29s	remaining: 58.9s

1696:	learn: 0.4973860	total: 3m 47s	remaining: 40.6s
1697:	learn: 0.4973722	total: 3m 47s	remaining: 40.4s
1698:	learn: 0.4973589	total: 3m 47s	remaining: 40.3s
1699:	learn: 0.4973536	total: 3m 47s	remaining: 40.1s
1700:	learn: 0.4973357	total: 3m 47s	remaining: 40s
1701:	learn: 0.4973265	total: 3m 47s	remaining: 39.9s
1702:	learn: 0.4973115	total: 3m 47s	remaining: 39.7s
1703:	learn: 0.4972986	total: 3m 48s	remaining: 39.6s
1704:	learn: 0.4972876	total: 3m 48s	remaining: 39.5s
1705:	learn: 0.4972820	total: 3m 48s	remaining: 39.3s
1706:	learn: 0.4972717	total: 3m 48s	remaining: 39.2s
1707:	learn: 0.4972597	total: 3m 48s	remaining: 39.1s
1708:	learn: 0.4972515	total: 3m 48s	remaining: 38.9s
1709:	learn: 0.4972375	total: 3m 48s	remaining: 38.8s
1710:	learn: 0.4972298	total: 3m 49s	remaining: 38.7s
1711:	learn: 0.4972202	total: 3m 49s	remaining: 38.5s
1712:	learn: 0.4972116	total: 3m 49s	remaining: 38.4s
1713:	learn: 0.4972042	total: 3m 49s	remaining: 38.3s
1714:	learn: 0.4971887	total: 

1850:	learn: 0.4957312	total: 4m 6s	remaining: 19.9s
1851:	learn: 0.4957180	total: 4m 6s	remaining: 19.7s
1852:	learn: 0.4957075	total: 4m 7s	remaining: 19.6s
1853:	learn: 0.4956911	total: 4m 7s	remaining: 19.5s
1854:	learn: 0.4956776	total: 4m 7s	remaining: 19.3s
1855:	learn: 0.4956656	total: 4m 7s	remaining: 19.2s
1856:	learn: 0.4956558	total: 4m 7s	remaining: 19.1s
1857:	learn: 0.4956471	total: 4m 7s	remaining: 18.9s
1858:	learn: 0.4956396	total: 4m 7s	remaining: 18.8s
1859:	learn: 0.4956259	total: 4m 7s	remaining: 18.7s
1860:	learn: 0.4956136	total: 4m 8s	remaining: 18.5s
1861:	learn: 0.4956033	total: 4m 8s	remaining: 18.4s
1862:	learn: 0.4955958	total: 4m 8s	remaining: 18.3s
1863:	learn: 0.4955789	total: 4m 8s	remaining: 18.1s
1864:	learn: 0.4955718	total: 4m 8s	remaining: 18s
1865:	learn: 0.4955640	total: 4m 8s	remaining: 17.9s
1866:	learn: 0.4955461	total: 4m 8s	remaining: 17.7s
1867:	learn: 0.4955350	total: 4m 8s	remaining: 17.6s
1868:	learn: 0.4955280	total: 4m 8s	remaining: 1

<catboost.core.CatBoostClassifier at 0x7f46b83f2a20>

In [15]:
print("train AUC: ", roc_auc_score(y_train, cb.predict_proba(X_train)[:, 1]))
print("val AUC: ", roc_auc_score(y_val, cb.predict_proba(X_val)[:, 1]))

train AUC:  0.7078511179992582
val AUC:  0.6365163155071684


# Submittion

In [16]:
X = preprocessing(X)
X.shape

(351281, 990)

In [17]:
cb.fit(X, y)

0:	learn: 0.6617073	total: 192ms	remaining: 6m 23s
1:	learn: 0.6363144	total: 381ms	remaining: 6m 20s
2:	learn: 0.6163250	total: 579ms	remaining: 6m 25s
3:	learn: 0.5994457	total: 771ms	remaining: 6m 24s
4:	learn: 0.5866287	total: 976ms	remaining: 6m 29s
5:	learn: 0.5760268	total: 1.16s	remaining: 6m 26s
6:	learn: 0.5676686	total: 1.37s	remaining: 6m 31s
7:	learn: 0.5609060	total: 1.56s	remaining: 6m 29s
8:	learn: 0.5554994	total: 1.76s	remaining: 6m 28s
9:	learn: 0.5511340	total: 1.95s	remaining: 6m 28s
10:	learn: 0.5473679	total: 2.21s	remaining: 6m 39s
11:	learn: 0.5442670	total: 2.48s	remaining: 6m 51s
12:	learn: 0.5415788	total: 2.74s	remaining: 6m 58s
13:	learn: 0.5396942	total: 3s	remaining: 7m 6s
14:	learn: 0.5380743	total: 3.26s	remaining: 7m 11s
15:	learn: 0.5361825	total: 3.53s	remaining: 7m 17s
16:	learn: 0.5350053	total: 3.79s	remaining: 7m 22s
17:	learn: 0.5339767	total: 4.05s	remaining: 7m 25s
18:	learn: 0.5331425	total: 4.31s	remaining: 7m 29s
19:	learn: 0.5325336	total

159:	learn: 0.5194759	total: 39.9s	remaining: 7m 38s
160:	learn: 0.5194396	total: 40.1s	remaining: 7m 37s
161:	learn: 0.5194030	total: 40.3s	remaining: 7m 36s
162:	learn: 0.5193823	total: 40.4s	remaining: 7m 35s
163:	learn: 0.5193566	total: 40.6s	remaining: 7m 34s
164:	learn: 0.5193214	total: 40.9s	remaining: 7m 34s
165:	learn: 0.5192895	total: 41.1s	remaining: 7m 34s
166:	learn: 0.5192610	total: 41.4s	remaining: 7m 33s
167:	learn: 0.5192341	total: 41.6s	remaining: 7m 33s
168:	learn: 0.5192046	total: 41.8s	remaining: 7m 33s
169:	learn: 0.5190810	total: 42.1s	remaining: 7m 32s
170:	learn: 0.5190627	total: 42.3s	remaining: 7m 32s
171:	learn: 0.5190311	total: 42.6s	remaining: 7m 32s
172:	learn: 0.5190087	total: 42.8s	remaining: 7m 32s
173:	learn: 0.5189794	total: 43s	remaining: 7m 31s
174:	learn: 0.5189428	total: 43.3s	remaining: 7m 31s
175:	learn: 0.5189207	total: 43.5s	remaining: 7m 31s
176:	learn: 0.5188969	total: 43.8s	remaining: 7m 30s
177:	learn: 0.5188802	total: 44s	remaining: 7m 3

315:	learn: 0.5163637	total: 1m 18s	remaining: 6m 57s
316:	learn: 0.5163518	total: 1m 18s	remaining: 6m 57s
317:	learn: 0.5163400	total: 1m 18s	remaining: 6m 57s
318:	learn: 0.5163226	total: 1m 19s	remaining: 6m 57s
319:	learn: 0.5163095	total: 1m 19s	remaining: 6m 57s
320:	learn: 0.5162999	total: 1m 19s	remaining: 6m 57s
321:	learn: 0.5162921	total: 1m 20s	remaining: 6m 57s
322:	learn: 0.5162808	total: 1m 20s	remaining: 6m 57s
323:	learn: 0.5162737	total: 1m 20s	remaining: 6m 56s
324:	learn: 0.5162610	total: 1m 20s	remaining: 6m 56s
325:	learn: 0.5162437	total: 1m 21s	remaining: 6m 56s
326:	learn: 0.5162348	total: 1m 21s	remaining: 6m 56s
327:	learn: 0.5162180	total: 1m 21s	remaining: 6m 56s
328:	learn: 0.5162089	total: 1m 21s	remaining: 6m 56s
329:	learn: 0.5161977	total: 1m 22s	remaining: 6m 55s
330:	learn: 0.5161882	total: 1m 22s	remaining: 6m 55s
331:	learn: 0.5161720	total: 1m 22s	remaining: 6m 55s
332:	learn: 0.5161548	total: 1m 23s	remaining: 6m 55s
333:	learn: 0.5161389	total:

467:	learn: 0.5145666	total: 1m 55s	remaining: 6m 19s
468:	learn: 0.5145578	total: 1m 56s	remaining: 6m 18s
469:	learn: 0.5145416	total: 1m 56s	remaining: 6m 18s
470:	learn: 0.5145313	total: 1m 56s	remaining: 6m 18s
471:	learn: 0.5145204	total: 1m 56s	remaining: 6m 18s
472:	learn: 0.5145097	total: 1m 57s	remaining: 6m 18s
473:	learn: 0.5145028	total: 1m 57s	remaining: 6m 17s
474:	learn: 0.5144898	total: 1m 57s	remaining: 6m 17s
475:	learn: 0.5144809	total: 1m 57s	remaining: 6m 17s
476:	learn: 0.5144667	total: 1m 58s	remaining: 6m 16s
477:	learn: 0.5144575	total: 1m 58s	remaining: 6m 16s
478:	learn: 0.5144417	total: 1m 58s	remaining: 6m 16s
479:	learn: 0.5144346	total: 1m 58s	remaining: 6m 16s
480:	learn: 0.5144270	total: 1m 59s	remaining: 6m 16s
481:	learn: 0.5144186	total: 1m 59s	remaining: 6m 15s
482:	learn: 0.5144007	total: 1m 59s	remaining: 6m 15s
483:	learn: 0.5143878	total: 1m 59s	remaining: 6m 15s
484:	learn: 0.5143786	total: 2m	remaining: 6m 14s
485:	learn: 0.5143687	total: 2m	

621:	learn: 0.5129765	total: 2m 33s	remaining: 5m 40s
622:	learn: 0.5129695	total: 2m 33s	remaining: 5m 39s
623:	learn: 0.5129568	total: 2m 34s	remaining: 5m 39s
624:	learn: 0.5129489	total: 2m 34s	remaining: 5m 39s
625:	learn: 0.5129408	total: 2m 34s	remaining: 5m 39s
626:	learn: 0.5129352	total: 2m 34s	remaining: 5m 38s
627:	learn: 0.5129280	total: 2m 35s	remaining: 5m 38s
628:	learn: 0.5129198	total: 2m 35s	remaining: 5m 38s
629:	learn: 0.5129077	total: 2m 35s	remaining: 5m 38s
630:	learn: 0.5128991	total: 2m 35s	remaining: 5m 37s
631:	learn: 0.5128910	total: 2m 35s	remaining: 5m 37s
632:	learn: 0.5128826	total: 2m 36s	remaining: 5m 37s
633:	learn: 0.5128733	total: 2m 36s	remaining: 5m 36s
634:	learn: 0.5128680	total: 2m 36s	remaining: 5m 36s
635:	learn: 0.5128569	total: 2m 36s	remaining: 5m 36s
636:	learn: 0.5128482	total: 2m 37s	remaining: 5m 36s
637:	learn: 0.5128412	total: 2m 37s	remaining: 5m 35s
638:	learn: 0.5128347	total: 2m 37s	remaining: 5m 35s
639:	learn: 0.5127728	total:

775:	learn: 0.5114440	total: 3m 11s	remaining: 5m 2s
776:	learn: 0.5114357	total: 3m 11s	remaining: 5m 1s
777:	learn: 0.5114269	total: 3m 11s	remaining: 5m 1s
778:	learn: 0.5114228	total: 3m 12s	remaining: 5m 1s
779:	learn: 0.5114160	total: 3m 12s	remaining: 5m
780:	learn: 0.5114080	total: 3m 12s	remaining: 5m
781:	learn: 0.5113950	total: 3m 12s	remaining: 5m
782:	learn: 0.5113793	total: 3m 13s	remaining: 5m
783:	learn: 0.5113748	total: 3m 13s	remaining: 5m
784:	learn: 0.5113706	total: 3m 13s	remaining: 4m 59s
785:	learn: 0.5113595	total: 3m 13s	remaining: 4m 59s
786:	learn: 0.5113498	total: 3m 14s	remaining: 4m 59s
787:	learn: 0.5113456	total: 3m 14s	remaining: 4m 59s
788:	learn: 0.5113386	total: 3m 14s	remaining: 4m 58s
789:	learn: 0.5113247	total: 3m 14s	remaining: 4m 58s
790:	learn: 0.5113190	total: 3m 15s	remaining: 4m 58s
791:	learn: 0.5113141	total: 3m 15s	remaining: 4m 58s
792:	learn: 0.5113034	total: 3m 15s	remaining: 4m 57s
793:	learn: 0.5112968	total: 3m 15s	remaining: 4m 57

928:	learn: 0.5100731	total: 3m 48s	remaining: 4m 23s
929:	learn: 0.5100678	total: 3m 48s	remaining: 4m 23s
930:	learn: 0.5100597	total: 3m 49s	remaining: 4m 22s
931:	learn: 0.5100563	total: 3m 49s	remaining: 4m 22s
932:	learn: 0.5100473	total: 3m 49s	remaining: 4m 22s
933:	learn: 0.5100396	total: 3m 49s	remaining: 4m 22s
934:	learn: 0.5100342	total: 3m 49s	remaining: 4m 21s
935:	learn: 0.5100279	total: 3m 50s	remaining: 4m 21s
936:	learn: 0.5100189	total: 3m 50s	remaining: 4m 21s
937:	learn: 0.5100103	total: 3m 50s	remaining: 4m 21s
938:	learn: 0.5100037	total: 3m 50s	remaining: 4m 20s
939:	learn: 0.5099933	total: 3m 51s	remaining: 4m 20s
940:	learn: 0.5099864	total: 3m 51s	remaining: 4m 20s
941:	learn: 0.5099815	total: 3m 51s	remaining: 4m 20s
942:	learn: 0.5099783	total: 3m 51s	remaining: 4m 19s
943:	learn: 0.5099699	total: 3m 52s	remaining: 4m 19s
944:	learn: 0.5099618	total: 3m 52s	remaining: 4m 19s
945:	learn: 0.5099542	total: 3m 52s	remaining: 4m 19s
946:	learn: 0.5099433	total:

1081:	learn: 0.5089111	total: 4m 25s	remaining: 3m 45s
1082:	learn: 0.5089057	total: 4m 25s	remaining: 3m 45s
1083:	learn: 0.5089016	total: 4m 25s	remaining: 3m 44s
1084:	learn: 0.5088922	total: 4m 26s	remaining: 3m 44s
1085:	learn: 0.5088867	total: 4m 26s	remaining: 3m 44s
1086:	learn: 0.5088789	total: 4m 26s	remaining: 3m 44s
1087:	learn: 0.5088714	total: 4m 26s	remaining: 3m 43s
1088:	learn: 0.5088637	total: 4m 27s	remaining: 3m 43s
1089:	learn: 0.5088575	total: 4m 27s	remaining: 3m 43s
1090:	learn: 0.5088497	total: 4m 27s	remaining: 3m 43s
1091:	learn: 0.5088405	total: 4m 27s	remaining: 3m 42s
1092:	learn: 0.5088275	total: 4m 28s	remaining: 3m 42s
1093:	learn: 0.5088208	total: 4m 28s	remaining: 3m 42s
1094:	learn: 0.5088131	total: 4m 28s	remaining: 3m 42s
1095:	learn: 0.5088047	total: 4m 28s	remaining: 3m 41s
1096:	learn: 0.5087987	total: 4m 29s	remaining: 3m 41s
1097:	learn: 0.5087937	total: 4m 29s	remaining: 3m 41s
1098:	learn: 0.5087862	total: 4m 29s	remaining: 3m 41s
1099:	lear

1231:	learn: 0.5078270	total: 5m 2s	remaining: 3m 8s
1232:	learn: 0.5078193	total: 5m 2s	remaining: 3m 8s
1233:	learn: 0.5078112	total: 5m 2s	remaining: 3m 7s
1234:	learn: 0.5078058	total: 5m 2s	remaining: 3m 7s
1235:	learn: 0.5078007	total: 5m 3s	remaining: 3m 7s
1236:	learn: 0.5077920	total: 5m 3s	remaining: 3m 7s
1237:	learn: 0.5077803	total: 5m 3s	remaining: 3m 6s
1238:	learn: 0.5077716	total: 5m 3s	remaining: 3m 6s
1239:	learn: 0.5077683	total: 5m 4s	remaining: 3m 6s
1240:	learn: 0.5077642	total: 5m 4s	remaining: 3m 6s
1241:	learn: 0.5077592	total: 5m 4s	remaining: 3m 5s
1242:	learn: 0.5077493	total: 5m 4s	remaining: 3m 5s
1243:	learn: 0.5077407	total: 5m 5s	remaining: 3m 5s
1244:	learn: 0.5077319	total: 5m 5s	remaining: 3m 5s
1245:	learn: 0.5077249	total: 5m 5s	remaining: 3m 4s
1246:	learn: 0.5077172	total: 5m 5s	remaining: 3m 4s
1247:	learn: 0.5077125	total: 5m 6s	remaining: 3m 4s
1248:	learn: 0.5077047	total: 5m 6s	remaining: 3m 4s
1249:	learn: 0.5076976	total: 5m 6s	remaining:

1382:	learn: 0.5066682	total: 5m 38s	remaining: 2m 30s
1383:	learn: 0.5066603	total: 5m 38s	remaining: 2m 30s
1384:	learn: 0.5066556	total: 5m 38s	remaining: 2m 30s
1385:	learn: 0.5066469	total: 5m 39s	remaining: 2m 30s
1386:	learn: 0.5066396	total: 5m 39s	remaining: 2m 30s
1387:	learn: 0.5066351	total: 5m 39s	remaining: 2m 29s
1388:	learn: 0.5066294	total: 5m 39s	remaining: 2m 29s
1389:	learn: 0.5066260	total: 5m 40s	remaining: 2m 29s
1390:	learn: 0.5066173	total: 5m 40s	remaining: 2m 28s
1391:	learn: 0.5066090	total: 5m 40s	remaining: 2m 28s
1392:	learn: 0.5065991	total: 5m 40s	remaining: 2m 28s
1393:	learn: 0.5065934	total: 5m 41s	remaining: 2m 28s
1394:	learn: 0.5065835	total: 5m 41s	remaining: 2m 28s
1395:	learn: 0.5065791	total: 5m 41s	remaining: 2m 27s
1396:	learn: 0.5065720	total: 5m 41s	remaining: 2m 27s
1397:	learn: 0.5065644	total: 5m 42s	remaining: 2m 27s
1398:	learn: 0.5065603	total: 5m 42s	remaining: 2m 27s
1399:	learn: 0.5065564	total: 5m 42s	remaining: 2m 26s
1400:	lear

1533:	learn: 0.5056657	total: 6m 15s	remaining: 1m 53s
1534:	learn: 0.5056587	total: 6m 15s	remaining: 1m 53s
1535:	learn: 0.5056553	total: 6m 15s	remaining: 1m 53s
1536:	learn: 0.5056492	total: 6m 15s	remaining: 1m 53s
1537:	learn: 0.5056426	total: 6m 16s	remaining: 1m 52s
1538:	learn: 0.5056335	total: 6m 16s	remaining: 1m 52s
1539:	learn: 0.5056264	total: 6m 16s	remaining: 1m 52s
1540:	learn: 0.5056192	total: 6m 16s	remaining: 1m 52s
1541:	learn: 0.5056124	total: 6m 17s	remaining: 1m 51s
1542:	learn: 0.5056087	total: 6m 17s	remaining: 1m 51s
1543:	learn: 0.5055969	total: 6m 17s	remaining: 1m 51s
1544:	learn: 0.5055872	total: 6m 17s	remaining: 1m 51s
1545:	learn: 0.5055793	total: 6m 18s	remaining: 1m 51s
1546:	learn: 0.5055745	total: 6m 18s	remaining: 1m 50s
1547:	learn: 0.5055664	total: 6m 18s	remaining: 1m 50s
1548:	learn: 0.5055619	total: 6m 18s	remaining: 1m 50s
1549:	learn: 0.5055575	total: 6m 18s	remaining: 1m 50s
1550:	learn: 0.5055462	total: 6m 19s	remaining: 1m 49s
1551:	lear

1682:	learn: 0.5046378	total: 6m 51s	remaining: 1m 17s
1683:	learn: 0.5046310	total: 6m 51s	remaining: 1m 17s
1684:	learn: 0.5046218	total: 6m 52s	remaining: 1m 17s
1685:	learn: 0.5046179	total: 6m 52s	remaining: 1m 16s
1686:	learn: 0.5046141	total: 6m 52s	remaining: 1m 16s
1687:	learn: 0.5046089	total: 6m 52s	remaining: 1m 16s
1688:	learn: 0.5046015	total: 6m 53s	remaining: 1m 16s
1689:	learn: 0.5045933	total: 6m 53s	remaining: 1m 15s
1690:	learn: 0.5045848	total: 6m 53s	remaining: 1m 15s
1691:	learn: 0.5045762	total: 6m 53s	remaining: 1m 15s
1692:	learn: 0.5045719	total: 6m 54s	remaining: 1m 15s
1693:	learn: 0.5045634	total: 6m 54s	remaining: 1m 14s
1694:	learn: 0.5045564	total: 6m 54s	remaining: 1m 14s
1695:	learn: 0.5045498	total: 6m 55s	remaining: 1m 14s
1696:	learn: 0.5045437	total: 6m 55s	remaining: 1m 14s
1697:	learn: 0.5045369	total: 6m 55s	remaining: 1m 13s
1698:	learn: 0.5045312	total: 6m 56s	remaining: 1m 13s
1699:	learn: 0.5045241	total: 6m 56s	remaining: 1m 13s
1700:	lear

1835:	learn: 0.5037050	total: 7m 34s	remaining: 40.6s
1836:	learn: 0.5036977	total: 7m 34s	remaining: 40.3s
1837:	learn: 0.5036946	total: 7m 34s	remaining: 40.1s
1838:	learn: 0.5036868	total: 7m 35s	remaining: 39.8s
1839:	learn: 0.5036823	total: 7m 35s	remaining: 39.6s
1840:	learn: 0.5036760	total: 7m 35s	remaining: 39.4s
1841:	learn: 0.5036677	total: 7m 35s	remaining: 39.1s
1842:	learn: 0.5036558	total: 7m 36s	remaining: 38.9s
1843:	learn: 0.5036499	total: 7m 36s	remaining: 38.6s
1844:	learn: 0.5036437	total: 7m 36s	remaining: 38.4s
1845:	learn: 0.5036377	total: 7m 37s	remaining: 38.1s
1846:	learn: 0.5036316	total: 7m 37s	remaining: 37.9s
1847:	learn: 0.5036284	total: 7m 37s	remaining: 37.7s
1848:	learn: 0.5036237	total: 7m 38s	remaining: 37.4s
1849:	learn: 0.5036109	total: 7m 38s	remaining: 37.2s
1850:	learn: 0.5036064	total: 7m 38s	remaining: 36.9s
1851:	learn: 0.5036027	total: 7m 38s	remaining: 36.7s
1852:	learn: 0.5035936	total: 7m 39s	remaining: 36.4s
1853:	learn: 0.5035863	total

1989:	learn: 0.5026657	total: 8m 17s	remaining: 2.5s
1990:	learn: 0.5026602	total: 8m 17s	remaining: 2.25s
1991:	learn: 0.5026550	total: 8m 17s	remaining: 2s
1992:	learn: 0.5026486	total: 8m 18s	remaining: 1.75s
1993:	learn: 0.5026436	total: 8m 18s	remaining: 1.5s
1994:	learn: 0.5026389	total: 8m 18s	remaining: 1.25s
1995:	learn: 0.5026332	total: 8m 19s	remaining: 1s
1996:	learn: 0.5026261	total: 8m 19s	remaining: 750ms
1997:	learn: 0.5026187	total: 8m 19s	remaining: 500ms
1998:	learn: 0.5026134	total: 8m 19s	remaining: 250ms
1999:	learn: 0.5026072	total: 8m 20s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f46b83f2a20>

In [18]:
print("train AUC: ", roc_auc_score(y, cb.predict_proba(X)[:, 1]))

train AUC:  0.6818271273574927


In [19]:
X_submit = preprocessing(df_test)

In [20]:
y_submit_pred = cb.predict_proba(X_submit)[:, 1]

In [21]:
product_id = df_test['product_id'].values
df_test = pd.DataFrame.from_dict({'product_id' : product_id, 'score' : y_submit_pred})
df_test.to_csv('./to_submit', sep = ',', index = False)

One hot and text embeddings helped achieve the quality. One hot worked better here than probobalistic encodings. 