In [1]:
import pandas as pd
import sklearn
import numpy as np
import catboost as cb
import xgboost as xgb

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
TRAIN_PATH = "./train.csv"
TEST_PATH = "./test.csv"

In [6]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH).drop('RowId', axis=1).drop_duplicates()

In [17]:
(train.shape, test.shape)

((56222, 10), (4684, 8))

In [9]:
train.head(5)

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition,Clicks,Impressions
0,150950690.0,7953835000.0,116881275.0,may loan,NEAR_EXACT,HIGH_END_MOBILE,2016-08-26,1.0,1.0,1.0
1,150950690.0,7953835000.0,116881275.0,my lloan,NEAR_EXACT,DESKTOP,2016-09-16,1.0,1.0,1.0
2,150950690.0,7953835000.0,116881275.0,my loan,EXACT,DESKTOP,2016-04-05,2.0,1.0,1.0
3,150950690.0,7953835000.0,116881275.0,my loan,EXACT,DESKTOP,2016-04-12,2.0,1.0,1.0
4,150950690.0,7953835000.0,116881275.0,my loan,EXACT,DESKTOP,2016-04-18,1.0,0.0,1.0


In [10]:
test.head(5)

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition
0,150950810.0,15401620000.0,62521740000.0,consida,EXACT,DESKTOP,2017-02-13,1.0
1,150951290.0,7953845000.0,191069700.0,billån,EXACT,DESKTOP,2017-02-10,1.5
2,150950810.0,24573360000.0,570188800.0,wasa kredit,EXACT,HIGH_END_MOBILE,2017-01-22,1.6
3,150950810.0,24573360000.0,570188800.0,wasa kredit,EXACT,DESKTOP,2017-01-18,1.2
4,150950810.0,15401620000.0,62521740000.0,consida,EXACT,HIGH_END_MOBILE,2017-02-14,1.0


# Посмотрим, сколько всего уникальных компаний есть в train

In [16]:
train["CampaignId"].unique().shape

(9,)

# Посмотрим уникальные значения для каждого признака

In [15]:
train["Query"].unique().shape # Достаточно много уникальных значений

(4289,)

In [19]:
train["QueryMatchTypeWithVariant"].unique()

array(['NEAR_EXACT', 'EXACT', 'NEAR_PHRASE', 'PHRASE', 'EXPANDED'],
      dtype=object)

In [20]:
train["Device"].unique()

array(['HIGH_END_MOBILE', 'DESKTOP', 'TABLET'], dtype=object)

# Убедимся в том, что симметрическая разность множеств уникальных значений по каждому признакому между тренировочной и тестовой выборкой является пустым множеством, другими словами, что в train есть те же уникальные значения, что и в  test и наоборот

In [47]:
(set(test["Query"].unique()) ^ set(train["Query"].unique())).__len__() # симметрическая разность

4385

In [24]:
(train["Query"].unique().shape, test["Query"].unique().shape)

((4289,), (894,))

In [27]:
(set(test["Query"].unique()) - set(train["Query"].unique())).__len__()

495

In [30]:
# В test есть такие запросы, которые нет в train => будем брать объединение множеств запросов
# для обучения и построения BoW

In [76]:
all_requests = list(set(test["Query"].unique()) | set(train["Query"].unique()))

In [78]:
len(all_requests)

4784

In [87]:
list(train["CampaignId"].unique())

[150950690.0,
 150950810.0,
 150950930.0,
 150951050.0,
 150951170.0,
 150951290.0,
 150951410.0,
 150951530.0,
 150951650.0]

In [86]:
list(test["CampaignId"].unique())

[150950810.0, 150951290.0, 150950690.0]

In [177]:
# Основной обработчик данных
def handler(data, cidcols=None): # cidcols = campaign id columns in train
    ds = data.copy(deep=True)
    
    
    cidgd = pd.get_dummies(ds["CampaignId"], prefix="CID_")
    for col in cidgd:
        ds[col] = cidgd[col].copy(deep=True)
    if (cidcols != None): # Got test, without some unique campaign id
        for col in cidcols[3:]:
            ds["CID_" + str(col)] = np.zeros(len(ds))
    
    return ds

In [178]:
handler(train)

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition,Clicks,Impressions,CID__150950690.0,CID__150950810.0,CID__150950930.0,CID__150951050.0,CID__150951170.0,CID__150951290.0,CID__150951410.0,CID__150951530.0,CID__150951650.0
0,150950690.0,7.953835e+09,1.168813e+08,may loan,NEAR_EXACT,HIGH_END_MOBILE,2016-08-26,1.0,1.0,1.0,1,0,0,0,0,0,0,0,0
1,150950690.0,7.953835e+09,1.168813e+08,my lloan,NEAR_EXACT,DESKTOP,2016-09-16,1.0,1.0,1.0,1,0,0,0,0,0,0,0,0
2,150950690.0,7.953835e+09,1.168813e+08,my loan,EXACT,DESKTOP,2016-04-05,2.0,1.0,1.0,1,0,0,0,0,0,0,0,0
3,150950690.0,7.953835e+09,1.168813e+08,my loan,EXACT,DESKTOP,2016-04-12,2.0,1.0,1.0,1,0,0,0,0,0,0,0,0
4,150950690.0,7.953835e+09,1.168813e+08,my loan,EXACT,DESKTOP,2016-04-18,1.0,0.0,1.0,1,0,0,0,0,0,0,0,0
5,150950690.0,7.953835e+09,1.168813e+08,my loan,EXACT,DESKTOP,2016-04-21,2.0,0.0,1.0,1,0,0,0,0,0,0,0,0
6,150950690.0,7.953835e+09,1.168813e+08,my loan,EXACT,DESKTOP,2016-04-23,2.0,0.0,1.0,1,0,0,0,0,0,0,0,0
7,150950690.0,7.953835e+09,1.168813e+08,my loan,EXACT,DESKTOP,2016-04-25,2.0,1.0,1.0,1,0,0,0,0,0,0,0,0
8,150950690.0,7.953835e+09,1.168813e+08,my loan,EXACT,DESKTOP,2016-05-09,1.0,1.0,1.0,1,0,0,0,0,0,0,0,0
9,150950690.0,7.953835e+09,1.168813e+08,my loan,EXACT,DESKTOP,2016-05-14,1.0,0.0,1.0,1,0,0,0,0,0,0,0,0


In [179]:
handler(test, cidcols=list(train["CampaignId"].unique()))

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition,CID__150950690.0,CID__150950810.0,CID__150951290.0,CID_150951050.0,CID_150951170.0,CID_150951290.0,CID_150951410.0,CID_150951530.0,CID_150951650.0
0,150950810.0,1.540162e+10,6.252174e+10,consida,EXACT,DESKTOP,2017-02-13,1.0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0
1,150951290.0,7.953845e+09,1.910697e+08,billån,EXACT,DESKTOP,2017-02-10,1.5,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0
2,150950810.0,2.457336e+10,5.701888e+08,wasa kredit,EXACT,HIGH_END_MOBILE,2017-01-22,1.6,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0
3,150950810.0,2.457336e+10,5.701888e+08,wasa kredit,EXACT,DESKTOP,2017-01-18,1.2,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0
4,150950810.0,1.540162e+10,6.252174e+10,consida,EXACT,HIGH_END_MOBILE,2017-02-14,1.0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0
5,150950690.0,7.953858e+09,3.260069e+08,mylan,EXACT,HIGH_END_MOBILE,2017-01-07,1.0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0
6,150950810.0,7.953835e+09,4.181418e+09,lendo,EXACT,HIGH_END_MOBILE,2017-02-04,5.3,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0
7,150951290.0,7.953845e+09,1.910697e+08,billån,EXACT,DESKTOP,2017-01-13,2.7,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0
8,150950810.0,2.457348e+10,2.415573e+10,marginalen bank,EXACT,TABLET,2017-01-02,2.0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0
9,150950810.0,2.457336e+10,5.701888e+08,wasa kredit,EXACT,HIGH_END_MOBILE,2017-02-04,1.5,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0
