In [3]:
import pandas as pd
import xgboost as xgb
import catboost as cb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier


In [4]:
def load_data(nrows=100_000):
    chunksize = 10 ** 6
    filename='train.csv'
    chunks = []
    with pd.read_csv(filename, chunksize=chunksize, nrows=nrows) as reader:
        for chunk in reader:
            chunks.append(chunk)
    data = pd.concat(chunks)

    return data

In [5]:
def find_best_estimator(params, clf, X, Y, verbose=0):
    grid_search = GridSearchCV(clf, params, n_jobs=-1, cv=3, scoring='roc_auc')
    if verbose:
        grid_search.fit(X, Y, verbose=100)
    else:
        grid_search.fit(X, Y)
    print(grid_search.best_params_)
    return grid_search.best_estimator_

In [6]:
nrows_data = 300_000
data = load_data(nrows=nrows_data)

In [7]:
data.head()
data.info()
data.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 24 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                300000 non-null  float64
 1   click             300000 non-null  int64  
 2   hour              300000 non-null  int64  
 3   C1                300000 non-null  int64  
 4   banner_pos        300000 non-null  int64  
 5   site_id           300000 non-null  object 
 6   site_domain       300000 non-null  object 
 7   site_category     300000 non-null  object 
 8   app_id            300000 non-null  object 
 9   app_domain        300000 non-null  object 
 10  app_category      300000 non-null  object 
 11  device_id         300000 non-null  object 
 12  device_ip         300000 non-null  object 
 13  device_model      300000 non-null  object 
 14  device_type       300000 non-null  int64  
 15  device_conn_type  300000 non-null  int64  
 16  C14               30

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'],
      dtype='object')

In [8]:
Y = data["click"]

In [9]:
print(f"Clicking rate is: {round(Y.sum()/Y.shape[0]*100, 2)}%")

Clicking rate is: 17.07%


In [10]:
X = data.drop(columns=['id', 'click', 'hour', 'device_id', 'device_ip'])

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

In [12]:
enc = OneHotEncoder(handle_unknown='ignore')

In [13]:
X_train_enc = enc.fit_transform(X_train)
X_test_enc = enc.transform(X_test)

In [14]:
print(X_train_enc[0].shape)

(1, 8385)


In [36]:
DT_parameters = {'max_depth': [5, 10, 20],
                 'max_features': ["sqrt", "log2", 1.0]}

In [37]:
decision_tree = DecisionTreeClassifier(criterion='gini', min_samples_split=30)

In [38]:
decision_tree_best = find_best_estimator(DT_parameters, decision_tree, X_train_enc, Y_train)
pos_prob = decision_tree_best.predict_proba(X_test_enc)[:, 1]
print(f"Pole pod krzywą ROC dla zbioru testowego wynosi: {roc_auc_score(Y_test, pos_prob)}")

{'max_depth': 20, 'max_features': 1.0}
Pole pod krzywą ROC dla zbioru testowego wynosi: 0.7262510234797023


In [33]:
rf_parameters = {'max_depth': [5, 10, 20],
                 'n_estimators': [50, 100, 200]}

In [34]:
random_forest = RandomForestClassifier(n_estimators=100, criterion='gini', min_samples_split=30, n_jobs=-1)

In [35]:
random_forest_best = find_best_estimator(rf_parameters, random_forest, X_train_enc, Y_train)
pos_prob = random_forest_best.predict_proba(X_test_enc)[:, 1]
print(f"Pole pod krzywą ROC dla zbioru testowego wynosi: {roc_auc_score(Y_test, pos_prob)}")



{'max_depth': 20, 'n_estimators': 200}
Pole pod krzywą ROC dla zbioru testowego wynosi: 0.731815144034198


In [27]:
xgb_parameters = {'max_depth': [5, 10, 20],
                 'n_estimators': [50, 100, 200],
                 'learning_rate': [0.01, 0.1, 0.5]}

In [28]:
xgb_model = xgb.XGBClassifier(learning_rate=0.1, max_depth=10, n_estimators=100)

In [29]:
xgb_best = find_best_estimator(xgb_parameters, xgb_model, X_train_enc, Y_train)
pos_prob = xgb_best.predict_proba(X_test_enc)[:, 1]
print(f"Pole pod krzywą ROC dla zbioru testowego wynosi: {roc_auc_score(Y_test, pos_prob)}")



{'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 200}
Pole pod krzywą ROC dla zbioru testowego wynosi: 0.7518017888942463


In [30]:
cb_parameters = {"depth": [5, 10, 20],
            "learning_rate": [0.01, 0.1, 0.5],
            "iterations": [50, 100, 200]}

In [31]:
cb_model = cb.CatBoostClassifier()

In [32]:
cb_best = find_best_estimator(cb_parameters, cb_model, X_train_enc, Y_train, 100)
pos_prob = cb_best.predict_proba(X_test_enc)[:, 1]
print(f"Pole pod krzywą ROC dla zbioru testowego wynosi: {roc_auc_score(Y_test, pos_prob)}")

0:	learn: 0.6360819	total: 740ms	remaining: 36.3s
0:	learn: 0.6361253	total: 441ms	remaining: 21.6s
0:	learn: 0.6870671	total: 464ms	remaining: 22.8s
1:	learn: 0.5917597	total: 1.23s	remaining: 29.6s
1:	learn: 0.6811981	total: 909ms	remaining: 21.8s
0:	learn: 0.6870737	total: 535ms	remaining: 53s
1:	learn: 0.5936791	total: 1s	remaining: 24s
0:	learn: 0.6344868	total: 638ms	remaining: 31.2s
2:	learn: 0.6753337	total: 1.32s	remaining: 20.6s
0:	learn: 0.4795075	total: 378ms	remaining: 18.5s
0:	learn: 0.6869021	total: 428ms	remaining: 42.4s
0:	learn: 0.6870671	total: 315ms	remaining: 31.1s
1:	learn: 0.6811913	total: 974ms	remaining: 47.7s
0:	learn: 0.6361253	total: 705ms	remaining: 1m 9s
2:	learn: 0.5571457	total: 1.85s	remaining: 29s
2:	learn: 0.5583642	total: 1.39s	remaining: 21.9s
0:	learn: 0.6869021	total: 685ms	remaining: 33.6s
2:	learn: 0.6757307	total: 1.2s	remaining: 38.7s
1:	learn: 0.5892666	total: 1.03s	remaining: 24.7s
0:	learn: 0.6870737	total: 2.78s	remaining: 2m 16s
1:	learn:



49:	learn: 0.4010405	total: 33.6s	remaining: 0us
48:	learn: 0.5141595	total: 30.8s	remaining: 642ms
40:	learn: 0.4022226	total: 32.5s	remaining: 51.7s
52:	learn: 0.5065725	total: 29.6s	remaining: 26.8s
48:	learn: 0.5149960	total: 31.2s	remaining: 663ms
56:	learn: 0.4995120	total: 30.7s	remaining: 24s
52:	learn: 0.4117297	total: 27.7s	remaining: 24.6s
49:	learn: 0.5129992	total: 30.9s	remaining: 32.2s
44:	learn: 0.4122580	total: 20.1s	remaining: 24.6s
49:	learn: 0.4124810	total: 28.1s	remaining: 28.7s
49:	learn: 0.5121858	total: 31.1s	remaining: 0us
50:	learn: 0.5111529	total: 31s	remaining: 31s
45:	learn: 0.4120481	total: 20.3s	remaining: 23.8s
49:	learn: 0.5129992	total: 31.4s	remaining: 0us
41:	learn: 0.4021301	total: 32.8s	remaining: 50s
51:	learn: 0.5093496	total: 31.1s	remaining: 29.9s
53:	learn: 0.4116109	total: 27.9s	remaining: 23.8s
53:	learn: 0.5049024	total: 30s	remaining: 26s
46:	learn: 0.4119090	total: 20.4s	remaining: 23s
57:	learn: 0.4977815	total: 31s	remaining: 23.3s
42

27 fits failed out of a total of 81.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
27 fits failed with the following error:
Traceback (most recent call last):
  File "/home/piotr/Desktop/Studia/Ads prediction/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/piotr/Desktop/Studia/Ads prediction/venv/lib/python3.10/site-packages/catboost/core.py", line 5100, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "/home/piotr/Desktop/Studia/Ads prediction/venv/lib/python3.10/site-packages/catboost/core.py", line 2303, in _fit


0:	learn: 0.4778602	total: 124ms	remaining: 24.8s
1:	learn: 0.4313935	total: 198ms	remaining: 19.6s
2:	learn: 0.4188684	total: 277ms	remaining: 18.2s
3:	learn: 0.4146420	total: 2.72s	remaining: 2m 13s
4:	learn: 0.4123359	total: 2.72s	remaining: 2m 13s
5:	learn: 0.4113589	total: 2.76s	remaining: 1m 47s
6:	learn: 0.4104179	total: 2.84s	remaining: 1m 31s
7:	learn: 0.4089634	total: 5.28s	remaining: 2m 24s
8:	learn: 0.4076211	total: 5.36s	remaining: 2m 7s
9:	learn: 0.4061348	total: 5.43s	remaining: 1m 54s
10:	learn: 0.4054400	total: 5.51s	remaining: 1m 44s
11:	learn: 0.4048735	total: 5.58s	remaining: 1m 35s
12:	learn: 0.4045657	total: 5.66s	remaining: 1m 28s
13:	learn: 0.4040223	total: 5.73s	remaining: 1m 22s
14:	learn: 0.4035644	total: 5.81s	remaining: 1m 16s
15:	learn: 0.4031831	total: 5.88s	remaining: 1m 12s
16:	learn: 0.4028221	total: 5.96s	remaining: 1m 8s
17:	learn: 0.4025222	total: 6.04s	remaining: 1m 4s
18:	learn: 0.4021730	total: 6.11s	remaining: 1m 1s
19:	learn: 0.4019537	total: 6