In [1]:
import pandas as pd
import sklearn
import numpy as np
import catboost as cb
import xgboost as xgb

from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeRegressor
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
TRAIN_PATH = "./train.csv"
TEST_PATH = "./test.csv"

In [3]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH).drop('RowId', axis=1)

In [4]:
(train.shape, test.shape)

((56222, 10), (87804, 8))

In [5]:
train.head(5)

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition,Clicks,Impressions
0,150950690.0,7953835000.0,116881275.0,may loan,NEAR_EXACT,HIGH_END_MOBILE,2016-08-26,1.0,1.0,1.0
1,150950690.0,7953835000.0,116881275.0,my lloan,NEAR_EXACT,DESKTOP,2016-09-16,1.0,1.0,1.0
2,150950690.0,7953835000.0,116881275.0,my loan,EXACT,DESKTOP,2016-04-05,2.0,1.0,1.0
3,150950690.0,7953835000.0,116881275.0,my loan,EXACT,DESKTOP,2016-04-12,2.0,1.0,1.0
4,150950690.0,7953835000.0,116881275.0,my loan,EXACT,DESKTOP,2016-04-18,1.0,0.0,1.0


In [6]:
test.head(5)

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition
0,150950810.0,15401620000.0,62521740000.0,consida,EXACT,DESKTOP,2017-02-13,1.0
1,150951290.0,7953845000.0,191069700.0,billån,EXACT,DESKTOP,2017-02-10,1.5
2,150950810.0,24573360000.0,570188800.0,wasa kredit,EXACT,HIGH_END_MOBILE,2017-01-22,1.6
3,150950810.0,24573360000.0,570188800.0,wasa kredit,EXACT,DESKTOP,2017-01-18,1.2
4,150950810.0,15401620000.0,62521740000.0,consida,EXACT,HIGH_END_MOBILE,2017-02-14,1.0


# Посмотрим, сколько всего уникальных кампаний есть в train

In [7]:
train["CampaignId"].unique().shape

(9,)

# Посмотрим уникальные значения для каждого признака

In [8]:
train["Query"].unique().shape # Достаточно много уникальных значений

(4289,)

In [9]:
train["QueryMatchTypeWithVariant"].unique()

array(['NEAR_EXACT', 'EXACT', 'NEAR_PHRASE', 'PHRASE', 'EXPANDED'],
      dtype=object)

In [10]:
train["Device"].unique()

array(['HIGH_END_MOBILE', 'DESKTOP', 'TABLET'], dtype=object)

# Убедимся в том, что симметрическая разность множеств уникальных значений по каждому признакому между тренировочной и тестовой выборкой является пустым множеством, другими словами, что в train есть те же уникальные значения, что и в  test и наоборот

In [11]:
(set(test["Query"].unique()) ^ set(train["Query"].unique())).__len__() # симметрическая разность

4385

In [12]:
# В test есть такие запросы, которые нет в train => будем брать объединение множеств запросов
# для обучения и построения BoW

In [13]:
# Основной обработчик данных
def handler(data, cidcols=None): # cidcols = campaign id columns in train
    ds = data.copy(deep=True)
    if (cidcols != None):
        cidcols = [("CID_" + str(x)) for x in cidcols]
    
    # One hot encoding campaign id
    ohcid = pd.get_dummies(data["CampaignId"], prefix="CID") # one hot campaign ids
    if (ohcid.columns.shape[0] == 3): # test case
        for col in list(set(cidcols) - set(ohcid.columns)): # lost columns
            ohcid[col] = np.zeros(len(ds))
    sorted_ohcids = sorted(ohcid.columns)
    for col in sorted_ohcids:
        ds[col] = ohcid[col].copy(deep=True)
    del ds["CampaignId"]
        
    # One hot encoding device
    ds = ds.join(pd.get_dummies(ds["Device"]))
    del ds["Device"]
    
    # One hot encoding QueryMatchTypeWithVariant
    ds = ds.join(pd.get_dummies(train["QueryMatchTypeWithVariant"]))
    del ds["QueryMatchTypeWithVariant"]
    
    # Extracting seasoning, only months, one hot encoding
    ds["Date"] = ds["Date"].map(lambda x : x[5:7])
    ds = ds.join(pd.get_dummies(ds["Date"], prefix="mnth"))
    if (cidcols != None): # Got test
        ds = ds.join(pd.DataFrame({"mnth_0" + str(i) : np.zeros(len(ds)) for i in range(3, 10)}, dtype="float64"))
        ds = ds.join(pd.DataFrame({"mnth_" + str(i) : np.zeros(len(ds)) for i in range(10, 13)}, dtype="float64"))
    del ds["Date"]
    
    # Creating target column
    if (cidcols == None): # Got train
        ds["CTR"] = (ds["Clicks"] / ds["Impressions"]).copy(deep=True)
        del ds["Clicks"]
        del ds["Impressions"]
        
    del ds["AdGroupId"]
    del ds["KeywordId"]
    
    return ds

In [14]:
X_train = handler(train)

In [15]:
X_test = handler(pd.read_csv(TEST_PATH).drop('RowId', axis=1), cidcols=list(train["CampaignId"].unique()))

# Пробуем word embedding

In [33]:
from gensim.models import Word2Vec

#model = Word2Vec(common_texts, size=50, window=5, min_count=1, workers=4)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import euclidean

vect = CountVectorizer(ngram_range=(3, 3), analyzer='char_wb', max_features=100) 

uniq_queries = (train["Query"].append(test["Query"])).unique()
res = vect.fit_transform(uniq_queries).toarray()

In [17]:
res

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [18]:
res.shape

(4784, 100)

In [19]:
X_features = pd.DataFrame({"w2v_" + str(i + 1) : np.zeros(len(X_train)) for i in range(res.shape[1])})

In [20]:
X_features.head()

Unnamed: 0,w2v_1,w2v_2,w2v_3,w2v_4,w2v_5,w2v_6,w2v_7,w2v_8,w2v_9,w2v_10,...,w2v_91,w2v_92,w2v_93,w2v_94,w2v_95,w2v_96,w2v_97,w2v_98,w2v_99,w2v_100
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
c = 0
for w in X_train["Query"]:
    X_features.iloc[c] = res[list(uniq_queries).index(w)]
    c += 1

In [22]:
# X_features.to_csv("X_features-100.csv")

In [23]:
X_train = X_train.join(X_features)
del X_features

In [24]:
X_train = shuffle(X_train)

In [25]:
y_train = X_train["CTR"]

In [26]:
del X_train["CTR"]

In [28]:
del X_train["Query"]

# CatBoost

In [35]:
model = cb.CatBoostClassifier(learning_rate=0.1, l2_leaf_reg=0.8)
model.fit(X_train[:55000], y_train[:55000])

0:	learn: 0.6073409	total: 69.9ms	remaining: 1m 9s
1:	learn: 0.5411171	total: 139ms	remaining: 1m 9s
2:	learn: 0.4947705	total: 213ms	remaining: 1m 10s
3:	learn: 0.4617881	total: 305ms	remaining: 1m 15s
4:	learn: 0.4364496	total: 377ms	remaining: 1m 15s
5:	learn: 0.4186818	total: 452ms	remaining: 1m 14s
6:	learn: 0.4013383	total: 536ms	remaining: 1m 15s
7:	learn: 0.3871275	total: 612ms	remaining: 1m 15s
8:	learn: 0.3770618	total: 680ms	remaining: 1m 14s
9:	learn: 0.3693566	total: 767ms	remaining: 1m 15s
10:	learn: 0.3620814	total: 842ms	remaining: 1m 15s
11:	learn: 0.3572870	total: 917ms	remaining: 1m 15s
12:	learn: 0.3524144	total: 1000ms	remaining: 1m 15s
13:	learn: 0.3482283	total: 1.07s	remaining: 1m 15s
14:	learn: 0.3443455	total: 1.14s	remaining: 1m 14s
15:	learn: 0.3414622	total: 1.22s	remaining: 1m 15s
16:	learn: 0.3393200	total: 1.3s	remaining: 1m 15s
17:	learn: 0.3369536	total: 1.37s	remaining: 1m 14s
18:	learn: 0.3351322	total: 1.46s	remaining: 1m 15s
19:	learn: 0.3329051	to

160:	learn: 0.2754318	total: 14.4s	remaining: 1m 14s
161:	learn: 0.2753596	total: 14.4s	remaining: 1m 14s
162:	learn: 0.2751307	total: 14.6s	remaining: 1m 14s
163:	learn: 0.2749032	total: 14.7s	remaining: 1m 15s
164:	learn: 0.2748379	total: 14.8s	remaining: 1m 15s
165:	learn: 0.2745829	total: 14.9s	remaining: 1m 14s
166:	learn: 0.2744808	total: 15s	remaining: 1m 14s
167:	learn: 0.2744390	total: 15.1s	remaining: 1m 14s
168:	learn: 0.2743218	total: 15.2s	remaining: 1m 14s
169:	learn: 0.2741338	total: 15.3s	remaining: 1m 14s
170:	learn: 0.2738823	total: 15.4s	remaining: 1m 14s
171:	learn: 0.2737308	total: 15.5s	remaining: 1m 14s
172:	learn: 0.2736188	total: 15.6s	remaining: 1m 14s
173:	learn: 0.2734187	total: 15.7s	remaining: 1m 14s
174:	learn: 0.2733212	total: 15.8s	remaining: 1m 14s
175:	learn: 0.2732376	total: 15.9s	remaining: 1m 14s
176:	learn: 0.2731810	total: 16s	remaining: 1m 14s
177:	learn: 0.2730669	total: 16.1s	remaining: 1m 14s
178:	learn: 0.2729683	total: 16.2s	remaining: 1m 1

319:	learn: 0.2590560	total: 27.6s	remaining: 58.7s
320:	learn: 0.2589312	total: 27.7s	remaining: 58.6s
321:	learn: 0.2589178	total: 27.8s	remaining: 58.5s
322:	learn: 0.2588993	total: 27.9s	remaining: 58.4s
323:	learn: 0.2587654	total: 28s	remaining: 58.4s
324:	learn: 0.2586694	total: 28.1s	remaining: 58.3s
325:	learn: 0.2586483	total: 28.1s	remaining: 58.2s
326:	learn: 0.2585004	total: 28.2s	remaining: 58.1s
327:	learn: 0.2584891	total: 28.3s	remaining: 58s
328:	learn: 0.2584113	total: 28.4s	remaining: 57.9s
329:	learn: 0.2583644	total: 28.5s	remaining: 57.8s
330:	learn: 0.2582812	total: 28.5s	remaining: 57.7s
331:	learn: 0.2581686	total: 28.6s	remaining: 57.5s
332:	learn: 0.2581069	total: 28.7s	remaining: 57.5s
333:	learn: 0.2579557	total: 28.8s	remaining: 57.4s
334:	learn: 0.2578928	total: 29s	remaining: 57.5s
335:	learn: 0.2578284	total: 29.1s	remaining: 57.6s
336:	learn: 0.2577314	total: 29.3s	remaining: 57.6s
337:	learn: 0.2576696	total: 29.3s	remaining: 57.5s
338:	learn: 0.2575

479:	learn: 0.2488298	total: 41.6s	remaining: 45s
480:	learn: 0.2488178	total: 41.7s	remaining: 45s
481:	learn: 0.2487648	total: 41.8s	remaining: 44.9s
482:	learn: 0.2487201	total: 41.8s	remaining: 44.8s
483:	learn: 0.2486978	total: 42s	remaining: 44.7s
484:	learn: 0.2486382	total: 42.1s	remaining: 44.7s
485:	learn: 0.2485683	total: 42.2s	remaining: 44.6s
486:	learn: 0.2485159	total: 42.2s	remaining: 44.5s
487:	learn: 0.2484553	total: 42.3s	remaining: 44.4s
488:	learn: 0.2484304	total: 42.4s	remaining: 44.3s
489:	learn: 0.2483776	total: 42.5s	remaining: 44.2s
490:	learn: 0.2483284	total: 42.6s	remaining: 44.1s
491:	learn: 0.2482803	total: 42.7s	remaining: 44s
492:	learn: 0.2482448	total: 42.8s	remaining: 44s
493:	learn: 0.2481700	total: 42.9s	remaining: 43.9s
494:	learn: 0.2481194	total: 42.9s	remaining: 43.8s
495:	learn: 0.2480625	total: 43s	remaining: 43.7s
496:	learn: 0.2480201	total: 43.1s	remaining: 43.6s
497:	learn: 0.2479961	total: 43.2s	remaining: 43.5s
498:	learn: 0.2479614	to

638:	learn: 0.2413433	total: 55.4s	remaining: 31.3s
639:	learn: 0.2413269	total: 55.5s	remaining: 31.2s
640:	learn: 0.2412775	total: 55.5s	remaining: 31.1s
641:	learn: 0.2412573	total: 55.6s	remaining: 31s
642:	learn: 0.2412053	total: 55.7s	remaining: 30.9s
643:	learn: 0.2411644	total: 55.8s	remaining: 30.8s
644:	learn: 0.2411388	total: 55.8s	remaining: 30.7s
645:	learn: 0.2411326	total: 55.9s	remaining: 30.6s
646:	learn: 0.2410720	total: 56s	remaining: 30.5s
647:	learn: 0.2410293	total: 56.1s	remaining: 30.5s
648:	learn: 0.2410053	total: 56.1s	remaining: 30.4s
649:	learn: 0.2409829	total: 56.2s	remaining: 30.3s
650:	learn: 0.2409103	total: 56.3s	remaining: 30.2s
651:	learn: 0.2408291	total: 56.4s	remaining: 30.1s
652:	learn: 0.2407968	total: 56.4s	remaining: 30s
653:	learn: 0.2407686	total: 56.5s	remaining: 29.9s
654:	learn: 0.2407422	total: 56.6s	remaining: 29.8s
655:	learn: 0.2406935	total: 56.7s	remaining: 29.7s
656:	learn: 0.2406323	total: 56.8s	remaining: 29.6s
657:	learn: 0.2406

800:	learn: 0.2348476	total: 1m 7s	remaining: 16.9s
801:	learn: 0.2348025	total: 1m 7s	remaining: 16.8s
802:	learn: 0.2347859	total: 1m 7s	remaining: 16.7s
803:	learn: 0.2347712	total: 1m 8s	remaining: 16.6s
804:	learn: 0.2347538	total: 1m 8s	remaining: 16.5s
805:	learn: 0.2347428	total: 1m 8s	remaining: 16.4s
806:	learn: 0.2346845	total: 1m 8s	remaining: 16.3s
807:	learn: 0.2346678	total: 1m 8s	remaining: 16.2s
808:	learn: 0.2345965	total: 1m 8s	remaining: 16.2s
809:	learn: 0.2345842	total: 1m 8s	remaining: 16.1s
810:	learn: 0.2345652	total: 1m 8s	remaining: 16s
811:	learn: 0.2345285	total: 1m 8s	remaining: 15.9s
812:	learn: 0.2344628	total: 1m 8s	remaining: 15.8s
813:	learn: 0.2344299	total: 1m 8s	remaining: 15.7s
814:	learn: 0.2344011	total: 1m 8s	remaining: 15.6s
815:	learn: 0.2343775	total: 1m 8s	remaining: 15.6s
816:	learn: 0.2343600	total: 1m 9s	remaining: 15.5s
817:	learn: 0.2343084	total: 1m 9s	remaining: 15.4s
818:	learn: 0.2342819	total: 1m 9s	remaining: 15.3s
819:	learn: 0.

956:	learn: 0.2298145	total: 1m 19s	remaining: 3.59s
957:	learn: 0.2297991	total: 1m 20s	remaining: 3.51s
958:	learn: 0.2297814	total: 1m 20s	remaining: 3.42s
959:	learn: 0.2297569	total: 1m 20s	remaining: 3.34s
960:	learn: 0.2297448	total: 1m 20s	remaining: 3.26s
961:	learn: 0.2297230	total: 1m 20s	remaining: 3.17s
962:	learn: 0.2296746	total: 1m 20s	remaining: 3.09s
963:	learn: 0.2296430	total: 1m 20s	remaining: 3.01s
964:	learn: 0.2295823	total: 1m 20s	remaining: 2.92s
965:	learn: 0.2295635	total: 1m 20s	remaining: 2.84s
966:	learn: 0.2295512	total: 1m 20s	remaining: 2.75s
967:	learn: 0.2295159	total: 1m 20s	remaining: 2.67s
968:	learn: 0.2294935	total: 1m 20s	remaining: 2.59s
969:	learn: 0.2294743	total: 1m 20s	remaining: 2.5s
970:	learn: 0.2294169	total: 1m 21s	remaining: 2.42s
971:	learn: 0.2293712	total: 1m 21s	remaining: 2.34s
972:	learn: 0.2293636	total: 1m 21s	remaining: 2.25s
973:	learn: 0.2293197	total: 1m 21s	remaining: 2.17s
974:	learn: 0.2292993	total: 1m 21s	remaining: 

<catboost.core.CatBoostClassifier at 0x7fa005684940>

In [36]:
np.mean((model.predict_proba(X_train[55000:])[:, 1] - y_train[55000:]) ** 2) ** (1 / 2)

0.27719775711352107

In [37]:
X_test.head()

Unnamed: 0,Query,AveragePosition,CID_150950690.0,CID_150950810.0,CID_150950930.0,CID_150951050.0,CID_150951170.0,CID_150951290.0,CID_150951410.0,CID_150951530.0,...,mnth_03,mnth_04,mnth_05,mnth_06,mnth_07,mnth_08,mnth_09,mnth_10,mnth_11,mnth_12
0,consida,1.0,0,1,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,billån,1.5,0,0,0.0,0.0,0.0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,wasa kredit,1.6,0,1,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,wasa kredit,1.2,0,1,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,consida,1.0,0,1,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
X_features_test = pd.DataFrame({"w2v_" + str(i + 1) : np.zeros(len(X_test)) for i in range(res.shape[1])})

In [39]:
c = 0
for w in X_test["Query"]:
    X_features_test.iloc[c] = res[list(uniq_queries).index(w)]
    c += 1

In [40]:
del X_test["Query"]

In [41]:
X_test = X_test.join(X_features_test)

In [42]:
model.predict_proba(X_test)[:, 1]

array([0.17218741, 0.13060129, 0.00354877, ..., 0.01266531, 0.02571138,
       0.02850572])

In [43]:
pd.DataFrame({'RowId' : range(len(X_test)), 'ClickProbability' : model.predict_proba(X_test)[:, 1]}).to_csv('4.csv', index=False)

# Keras

In [83]:
import keras

Using TensorFlow backend.


In [90]:
from keras.models import Sequential
from keras.layers import Dense
from keras.regularizers import l1, l2

In [142]:
nn = Sequential()
nn.add(Dense(530,
             activation="sigmoid",
             kernel_regularizer=l2(0.7),
             input_shape=(530, )))
nn.add(Dense(1,
            activation="sigmoid",
            input_shape=(530, )))
nn.compile("sgd", loss="binary_crossentropy", metrics=["mse"])
nn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_61 (Dense)             (None, 530)               281430    
_________________________________________________________________
dense_62 (Dense)             (None, 1)                 531       
Total params: 281,961
Trainable params: 281,961
Non-trainable params: 0
_________________________________________________________________


In [143]:
nn.fit(X_train[:55000], y_train[:55000], epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7ff9bdbece48>