In [46]:
import pandas as pd
import sklearn
import numpy as np
import catboost as cb
import xgboost as xgb

from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeRegressor
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
TRAIN_PATH = "./train.csv"
TEST_PATH = "./test.csv"

In [30]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH).drop('RowId', axis=1)

In [4]:
(train.shape, test.shape)

((56222, 10), (87804, 8))

In [5]:
train.head(5)

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition,Clicks,Impressions
0,150950690.0,7953835000.0,116881275.0,may loan,NEAR_EXACT,HIGH_END_MOBILE,2016-08-26,1.0,1.0,1.0
1,150950690.0,7953835000.0,116881275.0,my lloan,NEAR_EXACT,DESKTOP,2016-09-16,1.0,1.0,1.0
2,150950690.0,7953835000.0,116881275.0,my loan,EXACT,DESKTOP,2016-04-05,2.0,1.0,1.0
3,150950690.0,7953835000.0,116881275.0,my loan,EXACT,DESKTOP,2016-04-12,2.0,1.0,1.0
4,150950690.0,7953835000.0,116881275.0,my loan,EXACT,DESKTOP,2016-04-18,1.0,0.0,1.0


In [6]:
test.head(5)

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition
0,150950810.0,15401620000.0,62521740000.0,consida,EXACT,DESKTOP,2017-02-13,1.0
1,150951290.0,7953845000.0,191069700.0,billån,EXACT,DESKTOP,2017-02-10,1.5
2,150950810.0,24573360000.0,570188800.0,wasa kredit,EXACT,HIGH_END_MOBILE,2017-01-22,1.6
3,150950810.0,24573360000.0,570188800.0,wasa kredit,EXACT,DESKTOP,2017-01-18,1.2
4,150950810.0,15401620000.0,62521740000.0,consida,EXACT,HIGH_END_MOBILE,2017-02-14,1.0


# Посмотрим, сколько всего уникальных кампаний есть в train

In [7]:
train["CampaignId"].unique().shape

(9,)

# Посмотрим уникальные значения для каждого признака

In [8]:
train["Query"].unique().shape # Достаточно много уникальных значений

(4289,)

In [9]:
train["QueryMatchTypeWithVariant"].unique()

array(['NEAR_EXACT', 'EXACT', 'NEAR_PHRASE', 'PHRASE', 'EXPANDED'],
      dtype=object)

In [10]:
train["Device"].unique()

array(['HIGH_END_MOBILE', 'DESKTOP', 'TABLET'], dtype=object)

# Убедимся в том, что симметрическая разность множеств уникальных значений по каждому признакому между тренировочной и тестовой выборкой является пустым множеством, другими словами, что в train есть те же уникальные значения, что и в  test и наоборот

In [11]:
(set(test["Query"].unique()) ^ set(train["Query"].unique())).__len__() # симметрическая разность

4385

In [12]:
# В test есть такие запросы, которые нет в train => будем брать объединение множеств запросов
# для обучения и построения BoW

In [13]:
# Основной обработчик данных
def handler(data, cidcols=None): # cidcols = campaign id columns in train
    ds = data.copy(deep=True)
    if (cidcols != None):
        cidcols = [("CID_" + str(x)) for x in cidcols]
    
    # One hot encoding campaign id
    ohcid = pd.get_dummies(data["CampaignId"], prefix="CID") # one hot campaign ids
    if (ohcid.columns.shape[0] == 3): # test case
        for col in list(set(cidcols) - set(ohcid.columns)): # lost columns
            ohcid[col] = np.zeros(len(ds))
    sorted_ohcids = sorted(ohcid.columns)
    for col in sorted_ohcids:
        ds[col] = ohcid[col].copy(deep=True)
    del ds["CampaignId"]
        
    # One hot encoding device
    ds = ds.join(pd.get_dummies(ds["Device"]))
    del ds["Device"]
    
    # One hot encoding QueryMatchTypeWithVariant
    ds = ds.join(pd.get_dummies(train["QueryMatchTypeWithVariant"]))
    del ds["QueryMatchTypeWithVariant"]
    
    # Extracting seasoning, only months, one hot encoding
    ds["Date"] = ds["Date"].map(lambda x : x[5:7])
    ds = ds.join(pd.get_dummies(ds["Date"], prefix="mnth"))
    if (cidcols != None): # Got test
        ds = ds.join(pd.DataFrame({"mnth_0" + str(i) : np.zeros(len(ds)) for i in range(3, 10)}, dtype="float64"))
        ds = ds.join(pd.DataFrame({"mnth_" + str(i) : np.zeros(len(ds)) for i in range(10, 13)}, dtype="float64"))
    del ds["Date"]
    
    # Creating target column
    if (cidcols == None): # Got train
        ds["CTR"] = (ds["Clicks"] / ds["Impressions"]).copy(deep=True)
        del ds["Clicks"]
        del ds["Impressions"]
        
    del ds["AdGroupId"]
    del ds["KeywordId"]
    
    return ds

In [31]:
X_train = handler(train)

In [32]:
X_test = handler(pd.read_csv(TEST_PATH).drop('RowId', axis=1), cidcols=list(train["CampaignId"].unique()))

# Пробуем word embedding

In [33]:
from gensim.models import Word2Vec

#model = Word2Vec(common_texts, size=50, window=5, min_count=1, workers=4)

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import euclidean

vect = CountVectorizer(ngram_range=(3, 3), analyzer='char_wb', max_features=500) 

uniq_queries = (train["Query"].append(test["Query"])).unique()
res = vect.fit_transform(uniq_queries).toarray()

In [35]:
res

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [36]:
res.shape

(4784, 500)

In [37]:
X_features = pd.DataFrame({"w2v_" + str(i + 1) : np.zeros(len(X_train)) for i in range(res.shape[1])})

In [38]:
X_features.head()

Unnamed: 0,w2v_1,w2v_2,w2v_3,w2v_4,w2v_5,w2v_6,w2v_7,w2v_8,w2v_9,w2v_10,...,w2v_491,w2v_492,w2v_493,w2v_494,w2v_495,w2v_496,w2v_497,w2v_498,w2v_499,w2v_500
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
c = 0
for w in X_train["Query"]:
    X_features.iloc[c] = res[list(uniq_queries).index(w)]
    c += 1

In [49]:
# X_features.to_csv("X_features-500.csv")

In [43]:
X_train = X_train.join(X_features)

In [48]:
X_train = shuffle(X_train)

In [50]:
y_train = X_train["CTR"]

In [51]:
del X_train["CTR"]

# CatBoost

In [56]:
model = cb.CatBoostClassifier()
model.fit(X_train[:55000], y_train[:55000])

0:	learn: 0.6659506	total: 468ms	remaining: 7m 47s
1:	learn: 0.6401518	total: 774ms	remaining: 6m 26s
2:	learn: 0.6169004	total: 1.02s	remaining: 5m 40s
3:	learn: 0.5951822	total: 1.28s	remaining: 5m 19s
4:	learn: 0.5754659	total: 1.53s	remaining: 5m 5s
5:	learn: 0.5573996	total: 1.79s	remaining: 4m 56s
6:	learn: 0.5423467	total: 2.06s	remaining: 4m 51s
7:	learn: 0.5296598	total: 2.32s	remaining: 4m 48s
8:	learn: 0.5162491	total: 2.58s	remaining: 4m 43s
9:	learn: 0.5041947	total: 2.84s	remaining: 4m 41s
10:	learn: 0.4930428	total: 3.1s	remaining: 4m 39s
11:	learn: 0.4828722	total: 3.36s	remaining: 4m 36s
12:	learn: 0.4741302	total: 3.64s	remaining: 4m 36s
13:	learn: 0.4655123	total: 3.9s	remaining: 4m 34s
14:	learn: 0.4568722	total: 4.16s	remaining: 4m 33s
15:	learn: 0.4491242	total: 4.45s	remaining: 4m 33s
16:	learn: 0.4415442	total: 4.72s	remaining: 4m 33s
17:	learn: 0.4353413	total: 5s	remaining: 4m 32s
18:	learn: 0.4292022	total: 5.27s	remaining: 4m 32s
19:	learn: 0.4233000	total: 

158:	learn: 0.3123610	total: 46.1s	remaining: 4m 3s
159:	learn: 0.3122139	total: 46.3s	remaining: 4m 3s
160:	learn: 0.3120471	total: 46.6s	remaining: 4m 2s
161:	learn: 0.3117464	total: 46.9s	remaining: 4m 2s
162:	learn: 0.3115940	total: 47.2s	remaining: 4m 2s
163:	learn: 0.3114626	total: 47.4s	remaining: 4m 1s
164:	learn: 0.3113498	total: 47.7s	remaining: 4m 1s
165:	learn: 0.3111777	total: 48s	remaining: 4m 1s
166:	learn: 0.3110891	total: 48.3s	remaining: 4m 1s
167:	learn: 0.3109662	total: 48.6s	remaining: 4m
168:	learn: 0.3108763	total: 48.9s	remaining: 4m
169:	learn: 0.3107576	total: 49.2s	remaining: 4m
170:	learn: 0.3106361	total: 49.5s	remaining: 3m 59s
171:	learn: 0.3105198	total: 49.8s	remaining: 3m 59s
172:	learn: 0.3103321	total: 50.1s	remaining: 3m 59s
173:	learn: 0.3101918	total: 50.3s	remaining: 3m 58s
174:	learn: 0.3100787	total: 50.6s	remaining: 3m 58s
175:	learn: 0.3099257	total: 50.9s	remaining: 3m 58s
176:	learn: 0.3098273	total: 51.2s	remaining: 3m 58s
177:	learn: 0.30

312:	learn: 0.2938752	total: 1m 30s	remaining: 3m 19s
313:	learn: 0.2937877	total: 1m 31s	remaining: 3m 19s
314:	learn: 0.2936775	total: 1m 31s	remaining: 3m 18s
315:	learn: 0.2935751	total: 1m 31s	remaining: 3m 18s
316:	learn: 0.2935218	total: 1m 32s	remaining: 3m 18s
317:	learn: 0.2934393	total: 1m 32s	remaining: 3m 17s
318:	learn: 0.2933436	total: 1m 32s	remaining: 3m 17s
319:	learn: 0.2932760	total: 1m 32s	remaining: 3m 17s
320:	learn: 0.2930588	total: 1m 33s	remaining: 3m 17s
321:	learn: 0.2929857	total: 1m 33s	remaining: 3m 16s
322:	learn: 0.2929300	total: 1m 33s	remaining: 3m 16s
323:	learn: 0.2928321	total: 1m 34s	remaining: 3m 16s
324:	learn: 0.2927091	total: 1m 34s	remaining: 3m 15s
325:	learn: 0.2926326	total: 1m 34s	remaining: 3m 15s
326:	learn: 0.2925113	total: 1m 34s	remaining: 3m 15s
327:	learn: 0.2924576	total: 1m 35s	remaining: 3m 14s
328:	learn: 0.2924031	total: 1m 35s	remaining: 3m 14s
329:	learn: 0.2923364	total: 1m 35s	remaining: 3m 14s
330:	learn: 0.2922911	total:

466:	learn: 0.2828871	total: 2m 15s	remaining: 2m 35s
467:	learn: 0.2828417	total: 2m 16s	remaining: 2m 34s
468:	learn: 0.2827657	total: 2m 16s	remaining: 2m 34s
469:	learn: 0.2826756	total: 2m 16s	remaining: 2m 34s
470:	learn: 0.2825694	total: 2m 16s	remaining: 2m 33s
471:	learn: 0.2824911	total: 2m 17s	remaining: 2m 33s
472:	learn: 0.2824592	total: 2m 17s	remaining: 2m 33s
473:	learn: 0.2824032	total: 2m 17s	remaining: 2m 32s
474:	learn: 0.2823660	total: 2m 18s	remaining: 2m 32s
475:	learn: 0.2823258	total: 2m 18s	remaining: 2m 32s
476:	learn: 0.2822732	total: 2m 18s	remaining: 2m 32s
477:	learn: 0.2822082	total: 2m 18s	remaining: 2m 31s
478:	learn: 0.2821703	total: 2m 19s	remaining: 2m 31s
479:	learn: 0.2821135	total: 2m 19s	remaining: 2m 31s
480:	learn: 0.2820557	total: 2m 19s	remaining: 2m 30s
481:	learn: 0.2820093	total: 2m 20s	remaining: 2m 30s
482:	learn: 0.2819782	total: 2m 20s	remaining: 2m 30s
483:	learn: 0.2819122	total: 2m 20s	remaining: 2m 29s
484:	learn: 0.2817891	total:

619:	learn: 0.2753887	total: 2m 59s	remaining: 1m 50s
620:	learn: 0.2753437	total: 3m	remaining: 1m 49s
621:	learn: 0.2752680	total: 3m	remaining: 1m 49s
622:	learn: 0.2752386	total: 3m	remaining: 1m 49s
623:	learn: 0.2751548	total: 3m 1s	remaining: 1m 49s
624:	learn: 0.2751092	total: 3m 1s	remaining: 1m 48s
625:	learn: 0.2750648	total: 3m 1s	remaining: 1m 48s
626:	learn: 0.2750237	total: 3m 1s	remaining: 1m 48s
627:	learn: 0.2749839	total: 3m 2s	remaining: 1m 47s
628:	learn: 0.2749354	total: 3m 2s	remaining: 1m 47s
629:	learn: 0.2748886	total: 3m 2s	remaining: 1m 47s
630:	learn: 0.2748345	total: 3m 3s	remaining: 1m 47s
631:	learn: 0.2747818	total: 3m 3s	remaining: 1m 46s
632:	learn: 0.2747360	total: 3m 3s	remaining: 1m 46s
633:	learn: 0.2746895	total: 3m 3s	remaining: 1m 46s
634:	learn: 0.2746728	total: 3m 4s	remaining: 1m 45s
635:	learn: 0.2746092	total: 3m 4s	remaining: 1m 45s
636:	learn: 0.2745878	total: 3m 4s	remaining: 1m 45s
637:	learn: 0.2745253	total: 3m 5s	remaining: 1m 45s
6

772:	learn: 0.2691415	total: 3m 44s	remaining: 1m 5s
773:	learn: 0.2690501	total: 3m 44s	remaining: 1m 5s
774:	learn: 0.2689903	total: 3m 45s	remaining: 1m 5s
775:	learn: 0.2689524	total: 3m 45s	remaining: 1m 5s
776:	learn: 0.2689019	total: 3m 45s	remaining: 1m 4s
777:	learn: 0.2688560	total: 3m 45s	remaining: 1m 4s
778:	learn: 0.2688194	total: 3m 46s	remaining: 1m 4s
779:	learn: 0.2687764	total: 3m 46s	remaining: 1m 3s
780:	learn: 0.2687630	total: 3m 46s	remaining: 1m 3s
781:	learn: 0.2687527	total: 3m 47s	remaining: 1m 3s
782:	learn: 0.2687210	total: 3m 47s	remaining: 1m 3s
783:	learn: 0.2687128	total: 3m 47s	remaining: 1m 2s
784:	learn: 0.2686613	total: 3m 47s	remaining: 1m 2s
785:	learn: 0.2686312	total: 3m 48s	remaining: 1m 2s
786:	learn: 0.2686012	total: 3m 48s	remaining: 1m 1s
787:	learn: 0.2685662	total: 3m 48s	remaining: 1m 1s
788:	learn: 0.2685220	total: 3m 49s	remaining: 1m 1s
789:	learn: 0.2684965	total: 3m 49s	remaining: 1m
790:	learn: 0.2684466	total: 3m 49s	remaining: 1m

928:	learn: 0.2643120	total: 4m 33s	remaining: 20.9s
929:	learn: 0.2642780	total: 4m 33s	remaining: 20.6s
930:	learn: 0.2642564	total: 4m 34s	remaining: 20.3s
931:	learn: 0.2642246	total: 4m 34s	remaining: 20s
932:	learn: 0.2642000	total: 4m 34s	remaining: 19.7s
933:	learn: 0.2641838	total: 4m 34s	remaining: 19.4s
934:	learn: 0.2641693	total: 4m 35s	remaining: 19.1s
935:	learn: 0.2641436	total: 4m 35s	remaining: 18.8s
936:	learn: 0.2641170	total: 4m 35s	remaining: 18.5s
937:	learn: 0.2640904	total: 4m 36s	remaining: 18.2s
938:	learn: 0.2640570	total: 4m 36s	remaining: 18s
939:	learn: 0.2640262	total: 4m 36s	remaining: 17.7s
940:	learn: 0.2640129	total: 4m 36s	remaining: 17.4s
941:	learn: 0.2639945	total: 4m 37s	remaining: 17.1s
942:	learn: 0.2639694	total: 4m 37s	remaining: 16.8s
943:	learn: 0.2639489	total: 4m 37s	remaining: 16.5s
944:	learn: 0.2639243	total: 4m 38s	remaining: 16.2s
945:	learn: 0.2639026	total: 4m 38s	remaining: 15.9s
946:	learn: 0.2638690	total: 4m 38s	remaining: 15.

<catboost.core.CatBoostClassifier at 0x7ffa2cb501d0>

In [62]:
np.mean((model.predict_proba(X_train[55000:])[:, 1] - y_train[55000:]) ** 2)

0.07568899854208644

In [64]:
X_test.head()

Unnamed: 0,Query,AveragePosition,CID_150950690.0,CID_150950810.0,CID_150950930.0,CID_150951050.0,CID_150951170.0,CID_150951290.0,CID_150951410.0,CID_150951530.0,...,mnth_03,mnth_04,mnth_05,mnth_06,mnth_07,mnth_08,mnth_09,mnth_10,mnth_11,mnth_12
0,consida,1.0,0,1,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,billån,1.5,0,0,0.0,0.0,0.0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,wasa kredit,1.6,0,1,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,wasa kredit,1.2,0,1,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,consida,1.0,0,1,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
X_features_test = pd.DataFrame({"w2v_" + str(i + 1) : np.zeros(len(X_test)) for i in range(res.shape[1])})

In [66]:
c = 0
for w in X_test["Query"]:
    X_features_test.iloc[c] = res[list(uniq_queries).index(w)]
    c += 1

In [68]:
del X_test["Query"]

In [70]:
X_test = X_test.join(X_features_test)

In [73]:
model.predict_proba(X_test)[:, 1]

array([0.10420868, 0.11770246, 0.01664335, ..., 0.02156293, 0.03271539,
       0.04179146])

In [75]:
pd.DataFrame({'RowId' : range(len(X_test)), 'ClickProbability' : model.predict_proba(X_test)[:, 1]}).to_csv('2.csv', index=False)