In [46]:
import pandas as pd
import sklearn
import numpy as np
import catboost as cb
import xgboost as xgb

from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeRegressor
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
TRAIN_PATH = "./train.csv"
TEST_PATH = "./test.csv"

In [30]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH).drop('RowId', axis=1)

In [4]:
(train.shape, test.shape)

((56222, 10), (87804, 8))

In [5]:
train.head(5)

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition,Clicks,Impressions
0,150950690.0,7953835000.0,116881275.0,may loan,NEAR_EXACT,HIGH_END_MOBILE,2016-08-26,1.0,1.0,1.0
1,150950690.0,7953835000.0,116881275.0,my lloan,NEAR_EXACT,DESKTOP,2016-09-16,1.0,1.0,1.0
2,150950690.0,7953835000.0,116881275.0,my loan,EXACT,DESKTOP,2016-04-05,2.0,1.0,1.0
3,150950690.0,7953835000.0,116881275.0,my loan,EXACT,DESKTOP,2016-04-12,2.0,1.0,1.0
4,150950690.0,7953835000.0,116881275.0,my loan,EXACT,DESKTOP,2016-04-18,1.0,0.0,1.0


In [6]:
test.head(5)

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition
0,150950810.0,15401620000.0,62521740000.0,consida,EXACT,DESKTOP,2017-02-13,1.0
1,150951290.0,7953845000.0,191069700.0,billån,EXACT,DESKTOP,2017-02-10,1.5
2,150950810.0,24573360000.0,570188800.0,wasa kredit,EXACT,HIGH_END_MOBILE,2017-01-22,1.6
3,150950810.0,24573360000.0,570188800.0,wasa kredit,EXACT,DESKTOP,2017-01-18,1.2
4,150950810.0,15401620000.0,62521740000.0,consida,EXACT,HIGH_END_MOBILE,2017-02-14,1.0


# Посмотрим, сколько всего уникальных кампаний есть в train

In [7]:
train["CampaignId"].unique().shape

(9,)

# Посмотрим уникальные значения для каждого признака

In [8]:
train["Query"].unique().shape # Достаточно много уникальных значений

(4289,)

In [9]:
train["QueryMatchTypeWithVariant"].unique()

array(['NEAR_EXACT', 'EXACT', 'NEAR_PHRASE', 'PHRASE', 'EXPANDED'],
      dtype=object)

In [10]:
train["Device"].unique()

array(['HIGH_END_MOBILE', 'DESKTOP', 'TABLET'], dtype=object)

# Убедимся в том, что симметрическая разность множеств уникальных значений по каждому признакому между тренировочной и тестовой выборкой является пустым множеством, другими словами, что в train есть те же уникальные значения, что и в  test и наоборот

In [11]:
(set(test["Query"].unique()) ^ set(train["Query"].unique())).__len__() # симметрическая разность

4385

In [12]:
# В test есть такие запросы, которые нет в train => будем брать объединение множеств запросов
# для обучения и построения BoW

In [13]:
# Основной обработчик данных
def handler(data, cidcols=None): # cidcols = campaign id columns in train
    ds = data.copy(deep=True)
    if (cidcols != None):
        cidcols = [("CID_" + str(x)) for x in cidcols]
    
    # One hot encoding campaign id
    ohcid = pd.get_dummies(data["CampaignId"], prefix="CID") # one hot campaign ids
    if (ohcid.columns.shape[0] == 3): # test case
        for col in list(set(cidcols) - set(ohcid.columns)): # lost columns
            ohcid[col] = np.zeros(len(ds))
    sorted_ohcids = sorted(ohcid.columns)
    for col in sorted_ohcids:
        ds[col] = ohcid[col].copy(deep=True)
    del ds["CampaignId"]
        
    # One hot encoding device
    ds = ds.join(pd.get_dummies(ds["Device"]))
    del ds["Device"]
    
    # One hot encoding QueryMatchTypeWithVariant
    ds = ds.join(pd.get_dummies(train["QueryMatchTypeWithVariant"]))
    del ds["QueryMatchTypeWithVariant"]
    
    # Extracting seasoning, only months, one hot encoding
    ds["Date"] = ds["Date"].map(lambda x : x[5:7])
    ds = ds.join(pd.get_dummies(ds["Date"], prefix="mnth"))
    if (cidcols != None): # Got test
        ds = ds.join(pd.DataFrame({"mnth_0" + str(i) : np.zeros(len(ds)) for i in range(3, 10)}, dtype="float64"))
        ds = ds.join(pd.DataFrame({"mnth_" + str(i) : np.zeros(len(ds)) for i in range(10, 13)}, dtype="float64"))
    del ds["Date"]
    
    # Creating target column
    if (cidcols == None): # Got train
        ds["CTR"] = (ds["Clicks"] / ds["Impressions"]).copy(deep=True)
        del ds["Clicks"]
        del ds["Impressions"]
        
    del ds["AdGroupId"]
    del ds["KeywordId"]
    
    return ds

In [149]:
X_train = handler(train)

In [150]:
X_test = handler(pd.read_csv(TEST_PATH).drop('RowId', axis=1), cidcols=list(train["CampaignId"].unique()))

# Пробуем word embedding

In [33]:
from gensim.models import Word2Vec

#model = Word2Vec(common_texts, size=50, window=5, min_count=1, workers=4)

In [151]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import euclidean

vect = CountVectorizer(ngram_range=(3, 3), analyzer='char_wb', max_features=2000) 

uniq_queries = (train["Query"].append(test["Query"])).unique()
res = vect.fit_transform(uniq_queries).toarray()

In [152]:
res

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [153]:
res.shape

(4784, 2000)

In [154]:
X_features = pd.DataFrame({"w2v_" + str(i + 1) : np.zeros(len(X_train)) for i in range(res.shape[1])})

MemoryError: 

In [38]:
X_features.head()

Unnamed: 0,w2v_1,w2v_2,w2v_3,w2v_4,w2v_5,w2v_6,w2v_7,w2v_8,w2v_9,w2v_10,...,w2v_491,w2v_492,w2v_493,w2v_494,w2v_495,w2v_496,w2v_497,w2v_498,w2v_499,w2v_500
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [148]:
c = 0
for w in X_train["Query"]:
    X_features.iloc[c] = res[list(uniq_queries).index(w)]
    c += 1

KeyError: 'Query'

In [49]:
# X_features.to_csv("X_features-500.csv")

In [43]:
X_train = X_train.join(X_features)

In [48]:
X_train = shuffle(X_train)

In [50]:
y_train = X_train["CTR"]

In [51]:
del X_train["CTR"]

# CatBoost

In [113]:
model = cb.CatBoostClassifier(learning_rate=0.05, depth=8, l2_leaf_reg=0.5)
model.fit(X_train[:55000], y_train[:55000])

0:	learn: 0.6467510	total: 307ms	remaining: 5m 6s
1:	learn: 0.6066220	total: 627ms	remaining: 5m 12s
2:	learn: 0.5704524	total: 945ms	remaining: 5m 14s
3:	learn: 0.5406144	total: 1.27s	remaining: 5m 15s
4:	learn: 0.5167096	total: 1.59s	remaining: 5m 16s
5:	learn: 0.4950251	total: 1.91s	remaining: 5m 16s
6:	learn: 0.4760564	total: 2.25s	remaining: 5m 18s
7:	learn: 0.4599215	total: 2.58s	remaining: 5m 19s
8:	learn: 0.4464948	total: 2.92s	remaining: 5m 21s
9:	learn: 0.4332024	total: 3.27s	remaining: 5m 23s
10:	learn: 0.4224452	total: 3.64s	remaining: 5m 27s
11:	learn: 0.4126704	total: 3.98s	remaining: 5m 28s
12:	learn: 0.4042447	total: 4.34s	remaining: 5m 29s
13:	learn: 0.3951411	total: 4.67s	remaining: 5m 28s
14:	learn: 0.3883579	total: 5.01s	remaining: 5m 29s
15:	learn: 0.3826145	total: 5.38s	remaining: 5m 30s
16:	learn: 0.3768833	total: 5.72s	remaining: 5m 30s
17:	learn: 0.3720041	total: 6.08s	remaining: 5m 31s
18:	learn: 0.3684997	total: 6.47s	remaining: 5m 33s
19:	learn: 0.3648966	to

158:	learn: 0.2821163	total: 57.4s	remaining: 5m 3s
159:	learn: 0.2819212	total: 57.7s	remaining: 5m 3s
160:	learn: 0.2817932	total: 58.1s	remaining: 5m 2s
161:	learn: 0.2816329	total: 58.4s	remaining: 5m 2s
162:	learn: 0.2811687	total: 58.8s	remaining: 5m 1s
163:	learn: 0.2810479	total: 59.1s	remaining: 5m 1s
164:	learn: 0.2808642	total: 59.5s	remaining: 5m
165:	learn: 0.2806037	total: 59.8s	remaining: 5m
166:	learn: 0.2804615	total: 1m	remaining: 5m
167:	learn: 0.2803152	total: 1m	remaining: 4m 59s
168:	learn: 0.2801929	total: 1m	remaining: 4m 59s
169:	learn: 0.2799446	total: 1m 1s	remaining: 4m 59s
170:	learn: 0.2798117	total: 1m 1s	remaining: 4m 58s
171:	learn: 0.2795959	total: 1m 2s	remaining: 4m 58s
172:	learn: 0.2794526	total: 1m 2s	remaining: 4m 58s
173:	learn: 0.2792842	total: 1m 2s	remaining: 4m 57s
174:	learn: 0.2790507	total: 1m 3s	remaining: 4m 57s
175:	learn: 0.2789053	total: 1m 3s	remaining: 4m 57s
176:	learn: 0.2788087	total: 1m 3s	remaining: 4m 56s
177:	learn: 0.278695

311:	learn: 0.2607697	total: 1m 52s	remaining: 4m 8s
312:	learn: 0.2607026	total: 1m 52s	remaining: 4m 7s
313:	learn: 0.2605975	total: 1m 53s	remaining: 4m 7s
314:	learn: 0.2604926	total: 1m 53s	remaining: 4m 6s
315:	learn: 0.2604268	total: 1m 53s	remaining: 4m 6s
316:	learn: 0.2602747	total: 1m 54s	remaining: 4m 6s
317:	learn: 0.2602067	total: 1m 54s	remaining: 4m 5s
318:	learn: 0.2600944	total: 1m 55s	remaining: 4m 5s
319:	learn: 0.2600246	total: 1m 55s	remaining: 4m 5s
320:	learn: 0.2598918	total: 1m 55s	remaining: 4m 4s
321:	learn: 0.2598316	total: 1m 56s	remaining: 4m 4s
322:	learn: 0.2596536	total: 1m 56s	remaining: 4m 4s
323:	learn: 0.2595868	total: 1m 56s	remaining: 4m 3s
324:	learn: 0.2594668	total: 1m 57s	remaining: 4m 3s
325:	learn: 0.2593962	total: 1m 57s	remaining: 4m 2s
326:	learn: 0.2593457	total: 1m 57s	remaining: 4m 2s
327:	learn: 0.2592469	total: 1m 58s	remaining: 4m 2s
328:	learn: 0.2591565	total: 1m 58s	remaining: 4m 1s
329:	learn: 0.2590230	total: 1m 58s	remaining:

464:	learn: 0.2476990	total: 2m 47s	remaining: 3m 12s
465:	learn: 0.2475714	total: 2m 47s	remaining: 3m 12s
466:	learn: 0.2475421	total: 2m 48s	remaining: 3m 11s
467:	learn: 0.2474535	total: 2m 48s	remaining: 3m 11s
468:	learn: 0.2473841	total: 2m 48s	remaining: 3m 11s
469:	learn: 0.2473195	total: 2m 49s	remaining: 3m 10s
470:	learn: 0.2472312	total: 2m 49s	remaining: 3m 10s
471:	learn: 0.2471770	total: 2m 49s	remaining: 3m 9s
472:	learn: 0.2471365	total: 2m 50s	remaining: 3m 9s
473:	learn: 0.2470864	total: 2m 50s	remaining: 3m 9s
474:	learn: 0.2470024	total: 2m 50s	remaining: 3m 8s
475:	learn: 0.2469707	total: 2m 51s	remaining: 3m 8s
476:	learn: 0.2469237	total: 2m 51s	remaining: 3m 8s
477:	learn: 0.2468705	total: 2m 52s	remaining: 3m 7s
478:	learn: 0.2467837	total: 2m 52s	remaining: 3m 7s
479:	learn: 0.2467091	total: 2m 52s	remaining: 3m 7s
480:	learn: 0.2466542	total: 2m 53s	remaining: 3m 6s
481:	learn: 0.2465810	total: 2m 53s	remaining: 3m 6s
482:	learn: 0.2465117	total: 2m 53s	rem

618:	learn: 0.2380945	total: 3m 42s	remaining: 2m 17s
619:	learn: 0.2380208	total: 3m 43s	remaining: 2m 16s
620:	learn: 0.2379915	total: 3m 43s	remaining: 2m 16s
621:	learn: 0.2379152	total: 3m 43s	remaining: 2m 16s
622:	learn: 0.2378486	total: 3m 44s	remaining: 2m 15s
623:	learn: 0.2377989	total: 3m 44s	remaining: 2m 15s
624:	learn: 0.2377131	total: 3m 44s	remaining: 2m 14s
625:	learn: 0.2376687	total: 3m 45s	remaining: 2m 14s
626:	learn: 0.2375546	total: 3m 45s	remaining: 2m 14s
627:	learn: 0.2374927	total: 3m 46s	remaining: 2m 13s
628:	learn: 0.2374427	total: 3m 46s	remaining: 2m 13s
629:	learn: 0.2373860	total: 3m 46s	remaining: 2m 13s
630:	learn: 0.2373148	total: 3m 47s	remaining: 2m 12s
631:	learn: 0.2372711	total: 3m 47s	remaining: 2m 12s
632:	learn: 0.2372327	total: 3m 47s	remaining: 2m 12s
633:	learn: 0.2371975	total: 3m 48s	remaining: 2m 11s
634:	learn: 0.2371597	total: 3m 48s	remaining: 2m 11s
635:	learn: 0.2371164	total: 3m 48s	remaining: 2m 10s
636:	learn: 0.2370655	total:

772:	learn: 0.2301950	total: 4m 40s	remaining: 1m 22s
773:	learn: 0.2301123	total: 4m 40s	remaining: 1m 21s
774:	learn: 0.2300824	total: 4m 40s	remaining: 1m 21s
775:	learn: 0.2300012	total: 4m 41s	remaining: 1m 21s
776:	learn: 0.2299229	total: 4m 41s	remaining: 1m 20s
777:	learn: 0.2298862	total: 4m 41s	remaining: 1m 20s
778:	learn: 0.2298212	total: 4m 42s	remaining: 1m 20s
779:	learn: 0.2297863	total: 4m 42s	remaining: 1m 19s
780:	learn: 0.2297180	total: 4m 42s	remaining: 1m 19s
781:	learn: 0.2296616	total: 4m 43s	remaining: 1m 18s
782:	learn: 0.2296276	total: 4m 43s	remaining: 1m 18s
783:	learn: 0.2295995	total: 4m 44s	remaining: 1m 18s
784:	learn: 0.2295621	total: 4m 44s	remaining: 1m 17s
785:	learn: 0.2295208	total: 4m 44s	remaining: 1m 17s
786:	learn: 0.2294626	total: 4m 45s	remaining: 1m 17s
787:	learn: 0.2294430	total: 4m 45s	remaining: 1m 16s
788:	learn: 0.2293753	total: 4m 45s	remaining: 1m 16s
789:	learn: 0.2293235	total: 4m 46s	remaining: 1m 16s
790:	learn: 0.2292825	total:

928:	learn: 0.2236151	total: 5m 35s	remaining: 25.6s
929:	learn: 0.2235975	total: 5m 35s	remaining: 25.3s
930:	learn: 0.2235745	total: 5m 36s	remaining: 24.9s
931:	learn: 0.2234714	total: 5m 36s	remaining: 24.6s
932:	learn: 0.2234512	total: 5m 36s	remaining: 24.2s
933:	learn: 0.2233995	total: 5m 37s	remaining: 23.8s
934:	learn: 0.2233684	total: 5m 37s	remaining: 23.5s
935:	learn: 0.2233277	total: 5m 38s	remaining: 23.1s
936:	learn: 0.2232979	total: 5m 38s	remaining: 22.8s
937:	learn: 0.2232696	total: 5m 38s	remaining: 22.4s
938:	learn: 0.2231778	total: 5m 39s	remaining: 22s
939:	learn: 0.2231192	total: 5m 39s	remaining: 21.7s
940:	learn: 0.2230968	total: 5m 39s	remaining: 21.3s
941:	learn: 0.2230585	total: 5m 40s	remaining: 21s
942:	learn: 0.2230226	total: 5m 40s	remaining: 20.6s
943:	learn: 0.2229657	total: 5m 41s	remaining: 20.2s
944:	learn: 0.2229360	total: 5m 41s	remaining: 19.9s
945:	learn: 0.2229140	total: 5m 41s	remaining: 19.5s
946:	learn: 0.2228549	total: 5m 42s	remaining: 19.

<catboost.core.CatBoostClassifier at 0x7ff9e05c8048>

In [124]:
np.mean((model.predict_proba(X_train[55000:])[:, 1] - y_train[55000:]) ** 2) ** (1 / 2)

0.2726947449959501

In [64]:
X_test.head()

Unnamed: 0,Query,AveragePosition,CID_150950690.0,CID_150950810.0,CID_150950930.0,CID_150951050.0,CID_150951170.0,CID_150951290.0,CID_150951410.0,CID_150951530.0,...,mnth_03,mnth_04,mnth_05,mnth_06,mnth_07,mnth_08,mnth_09,mnth_10,mnth_11,mnth_12
0,consida,1.0,0,1,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,billån,1.5,0,0,0.0,0.0,0.0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,wasa kredit,1.6,0,1,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,wasa kredit,1.2,0,1,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,consida,1.0,0,1,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
X_features_test = pd.DataFrame({"w2v_" + str(i + 1) : np.zeros(len(X_test)) for i in range(res.shape[1])})

In [66]:
c = 0
for w in X_test["Query"]:
    X_features_test.iloc[c] = res[list(uniq_queries).index(w)]
    c += 1

In [68]:
del X_test["Query"]

In [70]:
X_test = X_test.join(X_features_test)

In [73]:
model.predict_proba(X_test)[:, 1]

array([0.10420868, 0.11770246, 0.01664335, ..., 0.02156293, 0.03271539,
       0.04179146])

In [77]:
pd.DataFrame({'RowId' : range(len(X_test)), 'ClickProbability' : model.predict_proba(X_test)[:, 1]}).to_csv('3.csv', index=False)

# Keras

In [83]:
import keras

Using TensorFlow backend.


In [90]:
from keras.models import Sequential
from keras.layers import Dense
from keras.regularizers import l1, l2

In [142]:
nn = Sequential()
nn.add(Dense(530,
             activation="sigmoid",
             kernel_regularizer=l2(0.7),
             input_shape=(530, )))
nn.add(Dense(1,
            activation="sigmoid",
            input_shape=(530, )))
nn.compile("sgd", loss="binary_crossentropy", metrics=["mse"])
nn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_61 (Dense)             (None, 530)               281430    
_________________________________________________________________
dense_62 (Dense)             (None, 1)                 531       
Total params: 281,961
Trainable params: 281,961
Non-trainable params: 0
_________________________________________________________________


In [143]:
nn.fit(X_train[:55000], y_train[:55000], epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7ff9bdbece48>