In [None]:
import timeit
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import roc_auc_score, ConfusionMatrixDisplay, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
pd.set_option('display.max_columns', None)

: 

In [None]:
def load_data(nrows=100_000, filename="../input/avazu-ctr-prediction/train.gz", chunksize=10 ** 6):
    """Read the first ``nrows`` rows of a gzip-compressed CSV into one DataFrame.

    The file is read in chunks of ``chunksize`` rows and the chunks are
    concatenated at the end.

    Parameters
    ----------
    nrows : int
        Maximum number of data rows to read.
    filename : str
        Path of the gzip-compressed CSV file.
    chunksize : int
        Number of rows per chunk handed out by the reader.

    Returns
    -------
    pandas.DataFrame
        The rows that were read, in file order.
    """
    with pd.read_csv(filename, chunksize=chunksize, compression='gzip', nrows=nrows) as reader:
        chunks = list(reader)

    if not chunks:
        # pd.concat raises on an empty list; return an empty frame that still
        # carries the header columns instead.
        return pd.read_csv(filename, compression='gzip', nrows=0)

    return pd.concat(chunks)

: 

In [None]:
def find_best_estimator(params, clf, X, Y, verbose=0):
    """Run a 3-fold grid search ranked by ROC AUC and return the refitted best estimator.

    Parameters
    ----------
    params : dict
        Parameter grid passed to ``GridSearchCV``.
    clf : estimator
        Estimator to tune.
    X, Y : array-like
        Training features and labels.
    verbose : int
        Verbosity of the search itself (0 is silent).

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search. The best parameters are
        printed as a side effect.
    """
    # Verbosity belongs to the search object. Extra keyword arguments given to
    # GridSearchCV.fit are forwarded to the estimator's own fit(), which does
    # not accept ``verbose`` for the scikit-learn estimators used here.
    grid_search = GridSearchCV(
        clf, params, n_jobs=-1, cv=3, scoring='roc_auc', verbose=int(verbose or 0)
    )
    grid_search.fit(X, Y)
    print(grid_search.best_params_)
    return grid_search.best_estimator_

: 

In [None]:
# Size of the sample used for the in-memory experiments below.
nrows_data = 300_000
data = load_data(nrows_data)

: 

In [None]:
# Preview the first rows of the loaded sample.
data.head()

: 

In [None]:
# Column dtypes, non-null counts and memory usage of the sample.
data.info()

: 

In [None]:
# Summary statistics of the sample's columns.
data.describe()

: 

In [None]:
# Target column.
Y = data['click']
# Features: every column except the target and the four columns listed here.
X = data.drop(['id', 'click', 'hour', 'device_id', 'device_ip'], axis=1)

: 

In [None]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split stable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)

: 

In [None]:
# Learn the categories on the training split only; categories that appear
# only in the test split are ignored instead of raising an error.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train)
X_train_enc = enc.transform(X_train)
X_test_enc = enc.transform(X_test)

: 

In [None]:
# Unregularised logistic regression trained by SGD with a constant step size.
# random_state fixes the per-epoch shuffling so repeated runs give the same model.
sgdc_clf = SGDClassifier(
    loss='log_loss',
    penalty=None,
    fit_intercept=True,
    max_iter=100,
    learning_rate='constant',
    eta0=0.01,
    random_state=42,
)

: 

In [None]:
# fit() returns the estimator itself, so training and scoring can be chained.
# Column 1 of predict_proba is the probability of class 1.
pos_prob = sgdc_clf.fit(X_train_enc, Y_train).predict_proba(X_test_enc)[:, 1]
print(
    f"Pole pod krzywą ROC dla zbioru testowego wynosi: {roc_auc_score(Y_test, pos_prob)}"
)

: 

In [None]:
# Per-class precision, recall and F1 of the baseline model on the held-out split.
print(
    classification_report(
        Y_test,
        sgdc_clf.predict(X_test_enc),
        target_names=['0', '1'],
    )
)

: 

In [None]:
# Confusion matrix of the baseline model on the held-out split.
ConfusionMatrixDisplay.from_estimator(
    sgdc_clf,
    X_test_enc,
    Y_test,
    display_labels=['0', '1'],
    xticks_rotation="vertical",
)
plt.tight_layout()
plt.show()

: 

Penalty, eta0, alpha

In [None]:
# Search grid: regularisation type, constant learning rate and regularisation strength.
sgdc_params = dict(
    penalty=['l2', 'l1', 'elasticnet', None],
    eta0=[0.001, 0.01, 1.0],
    alpha=[0.01, 0.1, 1.0],
)

: 

In [None]:
# 3-fold cross-validated search over sgdc_params, ranked by ROC AUC, on all cores.
grid_search = GridSearchCV(
    estimator=sgdc_clf,
    param_grid=sgdc_params,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
)
# fit() returns the search object; keep the estimator refitted with the best parameters.
sgdc_best = grid_search.fit(X_train_enc, Y_train).best_estimator_
print(grid_search.best_params_)

: 

In [None]:
# Probability of class 1 from the tuned model, scored by ROC AUC on the held-out split.
pos_prob = sgdc_best.predict_proba(X_test_enc)[:, 1]
print(
    f"Pole pod krzywą ROC dla zbioru testowego wynosi: {roc_auc_score(Y_test, pos_prob)}"
)

: 

In [None]:
# Per-class precision, recall and F1 of the tuned model on the held-out split.
print(
    classification_report(
        Y_test,
        sgdc_best.predict(X_test_enc),
        target_names=['0', '1'],
    )
)

: 

In [None]:
# Confusion matrix of the tuned model on the held-out split.
ConfusionMatrixDisplay.from_estimator(
    sgdc_best,
    X_test_enc,
    Y_test,
    display_labels=['0', '1'],
    xticks_rotation="vertical",
)
plt.tight_layout()
plt.show()

: 

# Online learning

10 000 000 samples

In [None]:
# Load the 10M-row sample used for the online-learning experiment.
big_data = load_data(nrows=10_000_000)

: 

In [None]:
# Column dtypes, non-null counts and memory usage of the large sample.
big_data.info()

: 

In [None]:
# Same feature/target split as for the small sample.
big_Y = big_data['click']
big_X = big_data.drop(['id', 'click', 'hour', 'device_id', 'device_ip'], axis=1)

: 

In [None]:
# Hold out 10% of the large sample for testing, with the same seed as before.
big_X_train, big_X_test, big_Y_train, big_Y_test = train_test_split(
    big_X,
    big_Y,
    test_size=0.1,
    random_state=42,
)

: 

In [None]:
# Only fit the encoder here; the data is transformed chunk by chunk during
# online training.
big_enc = OneHotEncoder(handle_unknown='ignore')
big_enc.fit(X=big_X_train)

: 

In [None]:
# Online logistic regression that reuses the hyper-parameters chosen by the
# grid search. random_state fixes the shuffling inside each partial_fit call
# so the run is reproducible.
sgdc_lr_online = SGDClassifier(
    loss='log_loss',
    fit_intercept=True,
    max_iter=100,
    learning_rate='constant',
    random_state=42,
    **grid_search.best_params_)

: 

In [None]:
# Rows encoded and fed to partial_fit per step.
chunk_rows = 1_000_000

start_time = timeit.default_timer()
# Derive the chunk boundaries from the data itself, so that no rows are
# dropped and no empty chunk is produced when the training split does not
# hold exactly 9 * 1_000_000 rows.
for start in range(0, len(big_X_train), chunk_rows):
    stop = start + chunk_rows
    # .iloc makes the positional slicing explicit (the index is shuffled by the split).
    x_train = big_X_train.iloc[start:stop]
    y_train = big_Y_train.iloc[start:stop]
    x_train_enc = big_enc.transform(x_train)
    # classes must be given because a single chunk is not guaranteed to contain both labels.
    sgdc_lr_online.partial_fit(x_train_enc, y_train, classes=[0, 1])
print(timeit.default_timer() - start_time)

: 

In [None]:
# Encode the held-out rows, then score the probability of class 1 by ROC AUC.
big_X_test_enc = big_enc.transform(big_X_test)
big_pos_prob = sgdc_lr_online.predict_proba(big_X_test_enc)[:, 1]
print(
    f"Pole pod krzywą ROC dla zbioru testowego wynosi: {roc_auc_score(big_Y_test, big_pos_prob)}"
)

: 

If we want to keep working in the same notebook, we must clear memory. Otherwise the message "Your notebook tried to allocate more memory than is available. It has restarted." will occur.

In [None]:
# Rebind every large intermediate to an empty frame so that the objects built
# from the 10M-row sample are no longer referenced.
big_data = big_data.iloc[:0]
big_X = big_data.iloc[:0]
big_Y = big_data.iloc[:0]
big_X_train = big_data.iloc[:0]
big_X_test = big_data.iloc[:0]
big_Y_train = big_data.iloc[:0]
big_Y_test = big_data.iloc[:0]
x_train = big_data.iloc[:0]
y_train = big_data.iloc[:0]
x_train_enc = big_data.iloc[:0]

: 

# Logistic Regression with TensorFlow

In [None]:
# (rows, one-hot columns) of the encoded training matrix.
X_train_enc.shape 

: 

In [None]:
# Number of distinct values in each raw feature column.
X_train.nunique()

: 

Since the shape of X_train_enc is 270_000 x 8385, we can't use the .toarray() method and then .astype('float32'), because the new array would take 270_000 * 8385 * 4 = 9_055_800_000 bytes (about **8.43 GiB**). To reduce the number of features, we are going to use **feature selection with a random forest**.

## Feature importance

In [None]:
# random_state fixes the bootstrap samples and feature sub-sampling, so the
# importances read from this forest are reproducible.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    min_samples_split=30,
    n_jobs=-1,
    random_state=42,
)
rf_clf.fit(X_train_enc, Y_train)

: 

In [None]:
# Importance score of every one-hot column, as reported by the fitted forest.
feature_imp = rf_clf.feature_importances_
print(feature_imp)

: 

In [None]:
# Standard deviation of each feature's importance across the individual trees
# (one row per tree, one column per feature).
std = np.array([tree.feature_importances_ for tree in rf_clf.estimators_]).std(axis=0)

: 

In [None]:
forest_importances = pd.Series(feature_imp, index=enc.get_feature_names_out())
forest_std = pd.Series(std, index=forest_importances.index)

# One bar per one-hot column (thousands of them) is unreadable and slow to
# draw, so only the most important columns are plotted.
top_n = 30
top_importances = forest_importances.nlargest(top_n)
top_std = forest_std[top_importances.index].to_numpy()

fig, ax = plt.subplots(figsize=(12, 5))
top_importances.plot.bar(yerr=top_std, ax=ax)
ax.set_title(f"Top {top_n} feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

: 

In [None]:
# Per-feature standard deviation of the importances across the trees.
print(std)

: 

In [None]:
batch_size=1000
train_data = tf.data.Dataset.from_tensor_slices((dict(X_train), Y_train))
train_data = train_data.repeat().shuffle(5000).batch(batch_size).prefetch(1)

: 

In [None]:
# Display a few examples
for data, label in train_data.take(1):
    for key, value in data.items():
        print("Features:", key, data[key][0].numpy())
    print("Label:", label[0].numpy())
    print("\n")

: 

In [None]:
n_features = X_train.shape[1]
W = tf.Variable(tf.zeros([n_features, 1]))
b = tf.Variable(tf.zeros([1]))

: 

In [None]:
learning_rate = 0.0008
optimizer = tf._optimizers.Adam(learning_rate)

: 

In [None]:
def run_optimization(x, y):
    with tf.GradientTape() as g:
        logits = tf.add(tf.matmul(x, W), b)[:, 0]
        cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits))
        gradients = g.gradient(cost, [W, b])
        optimizer.apply_gradients(zip(gradients, [W, b]))

: 

## Training

In [None]:
training_steps = 6_000
for step, (batch_x, batch_y) in enumerate(train_data.take(training_steps), 1):
    run_optimization(batch_x, batch_y)
    if step % 500 == 0:
        logits = tf.add(tf.matmul(batch_x, W), b)[:, 0]
        loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=batch_y, logits=logits))
        print("Steps: %i, loss: %f" % (step, loss))

: 

## Testing

In [None]:
logits = tf.add(tf.matmul(x, W), b)[:, 0]
pred = tf.nn.sigmoid(logits)
auc_metric = tf.keras.metrics.AUC()
auc_metric.update_state(Y_test, pred)
print(f'Area under curve ROC for testing dataset is: {auc_metric.result().numpy():.3f}')

: 