In [332]:
import pandas as pd
import sklearn
import numpy as np
import catboost as cb
import xgboost as xgb

from sklearn.tree import DecisionTreeRegressor
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Relative paths to the training and test CSV files.
TRAIN_PATH = "./train.csv"
TEST_PATH = "./test.csv"

In [3]:
# Read both splits. The test frame loses its RowId column and any rows that
# become exact duplicates once that column is gone.
train = pd.read_csv(TRAIN_PATH)
test = (
    pd.read_csv(TEST_PATH)
    .drop(columns="RowId")
    .drop_duplicates()
)

In [4]:
(train.shape, test.shape)

((56222, 10), (4684, 8))

In [5]:
train.head(5)

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition,Clicks,Impressions
0,150950690.0,7953835000.0,116881275.0,may loan,NEAR_EXACT,HIGH_END_MOBILE,2016-08-26,1.0,1.0,1.0
1,150950690.0,7953835000.0,116881275.0,my lloan,NEAR_EXACT,DESKTOP,2016-09-16,1.0,1.0,1.0
2,150950690.0,7953835000.0,116881275.0,my loan,EXACT,DESKTOP,2016-04-05,2.0,1.0,1.0
3,150950690.0,7953835000.0,116881275.0,my loan,EXACT,DESKTOP,2016-04-12,2.0,1.0,1.0
4,150950690.0,7953835000.0,116881275.0,my loan,EXACT,DESKTOP,2016-04-18,1.0,0.0,1.0


In [6]:
test.head(5)

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition
0,150950810.0,15401620000.0,62521740000.0,consida,EXACT,DESKTOP,2017-02-13,1.0
1,150951290.0,7953845000.0,191069700.0,billån,EXACT,DESKTOP,2017-02-10,1.5
2,150950810.0,24573360000.0,570188800.0,wasa kredit,EXACT,HIGH_END_MOBILE,2017-01-22,1.6
3,150950810.0,24573360000.0,570188800.0,wasa kredit,EXACT,DESKTOP,2017-01-18,1.2
4,150950810.0,15401620000.0,62521740000.0,consida,EXACT,HIGH_END_MOBILE,2017-02-14,1.0


# Посмотрим, сколько всего уникальных кампаний есть в train

In [7]:
train["CampaignId"].unique().shape

(9,)

# Посмотрим уникальные значения для каждого признака

In [8]:
train["Query"].unique().shape # Quite a lot of unique values

(4289,)

In [9]:
train["QueryMatchTypeWithVariant"].unique()

array(['NEAR_EXACT', 'EXACT', 'NEAR_PHRASE', 'PHRASE', 'EXPANDED'],
      dtype=object)

In [10]:
train["Device"].unique()

array(['HIGH_END_MOBILE', 'DESKTOP', 'TABLET'], dtype=object)

# Проверим, является ли симметрическая разность множеств уникальных значений по каждому признаку между тренировочной и тестовой выборками пустым множеством, другими словами, есть ли в train те же уникальные значения, что и в test, и наоборот

In [11]:
# Size of the symmetric difference between the test and train query sets.
len(set(test["Query"].unique()) ^ set(train["Query"].unique()))

4385

In [12]:
(train["Query"].unique().shape, test["Query"].unique().shape)

((4289,), (894,))

In [13]:
# Number of queries that occur in test but never in train.
len(set(test["Query"].unique()) - set(train["Query"].unique()))

495

In [60]:
# test contains queries that are absent from train => we will use the union of both query sets
# for training and for building the BoW

In [15]:
# Union of the distinct queries found in either split, as a list
# (order is whatever the set iteration yields).
all_requests = list(set(test["Query"].unique()) | set(train["Query"].unique()))

In [16]:
len(all_requests)

4784

In [17]:
list(train["CampaignId"].unique())

[150950690.0,
 150950810.0,
 150950930.0,
 150951050.0,
 150951170.0,
 150951290.0,
 150951410.0,
 150951530.0,
 150951650.0]

In [18]:
list(test["CampaignId"].unique())

[150950810.0, 150951290.0, 150950690.0]

In [328]:
# Main data preprocessor
def handler(data, cidcols=None): # cidcols = campaign ids present in train
    """Turn a raw train/test frame into a numeric feature frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame holding at least ``CampaignId``, ``Device``,
        ``QueryMatchTypeWithVariant``, ``Date`` (a string whose characters
        5-6 are the month, e.g. ``"2016-08-26"``) and ``Query``.  When
        ``cidcols`` is None it must also hold ``Clicks`` and ``Impressions``.
        The input frame is never modified.
    cidcols : iterable or None
        ``None`` means ``data`` is the training split.  Anything else means
        ``data`` is the test split, and ``cidcols`` lists the campaign ids of
        the training split so that a ``CID_<id>`` column exists for each.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` (same index) in which ``CampaignId``, ``Device``,
        ``QueryMatchTypeWithVariant`` and ``Date`` are replaced by one-hot
        columns and ``Query`` is dropped.  For the training split ``Clicks``
        and ``Impressions`` are replaced by ``CTR = Clicks / Impressions``.
        For the test split all twelve ``mnth_01`` .. ``mnth_12`` columns are
        present.

    Notes
    -----
    ``Device`` and ``QueryMatchTypeWithVariant`` are encoded from the
    categories present in ``data`` itself, so a split that lacks a category
    lacks the matching column; align columns with the training frame before
    feeding a model.
    """
    ds = data.copy(deep=True)
    # Identity test: `cidcols != None` is element-wise (and then ambiguous
    # inside `if`) when a numpy array is passed.
    is_test = cidcols is not None

    # One-hot encode the campaign id.
    ohcid = pd.get_dummies(ds["CampaignId"], prefix="CID")
    if is_test:
        # Add an all-zero column for every train campaign that does not occur
        # in this frame, however many campaigns the frame itself contains.
        for col in ["CID_" + str(x) for x in cidcols]:
            if col not in ohcid.columns:
                ohcid[col] = 0
    for col in sorted(ohcid.columns):
        ds[col] = ohcid[col]
    del ds["CampaignId"]

    # One-hot encode the device.
    ds = ds.join(pd.get_dummies(ds["Device"]))
    del ds["Device"]

    # One-hot encode the match type from this frame's own rows.  Reading it
    # from the global `train` would index-align unrelated train rows onto a
    # test frame.
    ds = ds.join(pd.get_dummies(ds["QueryMatchTypeWithVariant"]))
    del ds["QueryMatchTypeWithVariant"]

    # Seasonality: keep only the month and one-hot encode it.
    month_dummies = pd.get_dummies(ds["Date"].str[5:7], prefix="mnth")
    if is_test:
        # Pad to all twelve months on the frame's own index.  Joining a
        # separately built zero frame would produce NaN wherever the index has
        # gaps and would raise if a padded month already exists.
        for col in ["mnth_%02d" % i for i in range(1, 13)]:
            if col not in month_dummies.columns:
                month_dummies[col] = 0
        month_dummies = month_dummies[sorted(month_dummies.columns)]
    ds = ds.join(month_dummies)
    del ds["Date"]

    # Build the target column (training split only).
    if not is_test:
        ds["CTR"] = ds["Clicks"] / ds["Impressions"]
        del ds["Clicks"]
        del ds["Impressions"]

    del ds["Query"]

    return ds

In [329]:
handler(train)

Unnamed: 0,AdGroupId,KeywordId,AveragePosition,CID_150950690.0,CID_150950810.0,CID_150950930.0,CID_150951050.0,CID_150951170.0,CID_150951290.0,CID_150951410.0,...,mnth_04,mnth_05,mnth_06,mnth_07,mnth_08,mnth_09,mnth_10,mnth_11,mnth_12,CTR
0,7.953835e+09,1.168813e+08,1.0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1.000000
1,7.953835e+09,1.168813e+08,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1.000000
2,7.953835e+09,1.168813e+08,2.0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1.000000
3,7.953835e+09,1.168813e+08,2.0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1.000000
4,7.953835e+09,1.168813e+08,1.0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0.000000
5,7.953835e+09,1.168813e+08,2.0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0.000000
6,7.953835e+09,1.168813e+08,2.0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0.000000
7,7.953835e+09,1.168813e+08,2.0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1.000000
8,7.953835e+09,1.168813e+08,1.0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1.000000
9,7.953835e+09,1.168813e+08,1.0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0.000000


In [337]:
# CatBoost classifier configured with the Logloss objective.
# NOTE(review): handler() builds the target as CTR = Clicks / Impressions, which
# is a fraction rather than a class label - confirm a classifier with Logloss is
# what is intended for that target.
tr = cb.CatBoostClassifier(loss_function="Logloss")

In [330]:
handler(test, cidcols=list(train["CampaignId"].unique())).head()

Unnamed: 0,AdGroupId,KeywordId,AveragePosition,CID_150950690.0,CID_150950810.0,CID_150950930.0,CID_150951050.0,CID_150951170.0,CID_150951290.0,CID_150951410.0,...,mnth_03,mnth_04,mnth_05,mnth_06,mnth_07,mnth_08,mnth_09,mnth_10,mnth_11,mnth_12
0,15401620000.0,62521740000.0,1.0,0,1,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7953845000.0,191069700.0,1.5,0,0,0.0,0.0,0.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,24573360000.0,570188800.0,1.6,0,1,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,24573360000.0,570188800.0,1.2,0,1,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,15401620000.0,62521740000.0,1.0,0,1,0.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [304]:
def tfidf(train, test):
    """Placeholder for a TF-IDF feature builder; currently always returns 0.

    Neither ``train`` nor ``test`` is read yet.
    """
    # Planned design: documents will be KeywordIds, words will be queries.
    return 0

In [309]:
tfidf(train, test)

0

In [306]:
train[["CampaignId", "Query"]].head()

Unnamed: 0,CampaignId,Query
0,150950690.0,may loan
1,150950690.0,my lloan
2,150950690.0,my loan
3,150950690.0,my loan
4,150950690.0,my loan


In [307]:
train[["CampaignId", "Query"]].head()

Unnamed: 0,CampaignId,Query
0,150950690.0,may loan
1,150950690.0,my lloan
2,150950690.0,my loan
3,150950690.0,my loan
4,150950690.0,my loan


In [308]:
train

Unnamed: 0,CampaignId,AdGroupId,KeywordId,Query,QueryMatchTypeWithVariant,Device,Date,AveragePosition,Clicks,Impressions
0,150950690.0,7.953835e+09,1.168813e+08,may loan,NEAR_EXACT,HIGH_END_MOBILE,2016-08-26,1.0,1.0,1.0
1,150950690.0,7.953835e+09,1.168813e+08,my lloan,NEAR_EXACT,DESKTOP,2016-09-16,1.0,1.0,1.0
2,150950690.0,7.953835e+09,1.168813e+08,my loan,EXACT,DESKTOP,2016-04-05,2.0,1.0,1.0
3,150950690.0,7.953835e+09,1.168813e+08,my loan,EXACT,DESKTOP,2016-04-12,2.0,1.0,1.0
4,150950690.0,7.953835e+09,1.168813e+08,my loan,EXACT,DESKTOP,2016-04-18,1.0,0.0,1.0
5,150950690.0,7.953835e+09,1.168813e+08,my loan,EXACT,DESKTOP,2016-04-21,2.0,0.0,1.0
6,150950690.0,7.953835e+09,1.168813e+08,my loan,EXACT,DESKTOP,2016-04-23,2.0,0.0,1.0
7,150950690.0,7.953835e+09,1.168813e+08,my loan,EXACT,DESKTOP,2016-04-25,2.0,1.0,1.0
8,150950690.0,7.953835e+09,1.168813e+08,my loan,EXACT,DESKTOP,2016-05-09,1.0,1.0,1.0
9,150950690.0,7.953835e+09,1.168813e+08,my loan,EXACT,DESKTOP,2016-05-14,1.0,0.0,1.0


In [292]:
len(set(train["KeywordId"].unique()) | set(test["KeywordId"].unique()))

469

In [291]:
len(set(train["KeywordId"].unique()))

465

In [310]:
# Total, over all distinct queries, of the number of distinct KeywordIds each
# query is paired with.  A single groupby pass replaces re-filtering the whole
# frame once per query; dropna=False keeps a missing KeywordId counted as one
# distinct value, exactly as Series.unique() does.
c = int(train.groupby("Query")["KeywordId"].nunique(dropna=False).sum())

In [312]:
c / len(train["Query"].unique())

1.1231056190254138

In [313]:
len(train["KeywordId"].unique())

465