In [None]:
import os
import csv
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras

In [2]:
def normalize(vals: np.ndarray):
    min_val = np.min(vals)
    max_val = np.max(vals)
    normalized_values = (vals - min_val) / (max_val - min_val)

    return normalized_values


def restore(normalized_values: np.ndarray, vals: np.ndarray):
    original_min = np.min(vals)
    original_max = np.max(vals)
    normalized_values = np.array(normalized_values)
    restored_values = normalized_values * (original_max - original_min) + original_min

    return restored_values

In [9]:
train_full = pd.read_csv('data/train.csv')
train_ids = train_full[['id']].to_numpy(dtype=int)
xs = train_full.drop('id', axis=1).to_numpy()
ys = train_full[['target']].to_numpy()
test_full = pd.read_csv('data/test.csv')
ids_test = test_full[['id']].to_numpy(dtype=int)
test = test_full.drop('id', axis=1).to_numpy()
split = int(xs.shape[0] * .8)
X_train = xs[:split, :-1]
y_train = ys[:split, -1]
y_train_normalized = normalize(y_train)
ids_train = train_ids[:split, 0]
X_valid = xs[split:, :-1]
y_valid = ys[split:, -1]
y_valid_normalized = normalize(y_valid)
ids_valid = train_ids[split:, 0]

In [4]:
class SwapNoise(keras.layers.Layer):
    def __init__(self, ratio=0.15, col_to_apply=None, **kwargs):
        super().__init__(**kwargs)
        self.ratio = ratio
        self.col_to_apply = col_to_apply

    def call(self, inputs, training=None):
        if training:
            noisy_inputs = tf.map_fn(lambda x: SwapNoise._add_swap_noise(x, ratio=self.ratio, col_to_apply=self.col_to_apply), inputs)
            return noisy_inputs
        else:
            return inputs

    @staticmethod
    def _add_swap_noise(X, ratio=.15, col_to_apply=None, return_mask=False):
        if col_to_apply is None:
            col_to_apply = []

        shape = tf.shape(X)
        obfuscation_mask = tf.cast(
            tf.random.stateless_binomial(
                shape=shape,
                seed=(1, 2),
                counts=1,
                probs=tf.fill(shape, ratio)),
            dtype=tf.float32)

        if col_to_apply:
            column_mask = np.zeros(X.shape, dtype=np.float32)
            column_mask[col_to_apply] = 1
            obfuscation_mask *= column_mask

        shuffled_rows = tf.random.shuffle(tf.range(tf.shape(X)[0]))
        obfuscated_X = tf.where(obfuscation_mask == 1, tf.gather(X, shuffled_rows), X)

        if return_mask:
            return obfuscated_X, obfuscation_mask

        return obfuscated_X

NameError: name 'keras' is not defined

In [None]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.metrics import mean_squared_error
from scipy.sparse._csr import csr_matrix

from hyperopt import fmin, hp, tpe, STATUS_OK
from hyperopt.pyll.base import scope
from typing import Callable

In [6]:
def rte_objective(search_space: dict) -> dict:
    rte = RandomTreesEmbedding(**search_space)
    ridge = Ridge(alpha=3000)

    rte.fit(X_train)

    X_train_transformed = rte.transform(X_train)

    ridge.fit(X_train_transformed, y_train_normalized)

    X_valid_transformed = rte.transform(X_valid)
    y_pred = ridge.predict(X_valid_transformed)
    accuracy = mean_squared_error(y_valid_normalized, y_pred)

    return {'loss': accuracy, 'status': STATUS_OK}


def find_params(search_space: dict, get_objective: Callable) -> dict:
    algorithm = tpe.suggest
    best_params = fmin(
        fn=get_objective,
        space=search_space,
        algo=algorithm,
        max_evals=100)

    return best_params

In [7]:
rte_params = find_params({
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 2000, 50)),
    'max_depth': scope.int(hp.quniform('max_depth', 5, 10, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 50, 200, 50)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 25, 200, 25)),
    'n_jobs': -2}, rte_objective)

100%|██████████| 100/100 [6:13:15<00:00, 223.95s/trial, best loss: 0.06101286318967715] 


In [8]:
import xgboost as xgb


def xgb_objective(search_space: dict) -> dict:
    regressor = xgb.XGBRegressor(**search_space)

    regressor.fit(X_train, y_train_normalized)

    y_pred = regressor.predict(X_valid)
    accuracy = mean_squared_error(y_valid_normalized, y_pred)

    return {'loss': accuracy, 'status': STATUS_OK}

In [9]:
xgb_params = find_params({
    'max_depth': scope.int(hp.quniform('max_depth', 1, 10, 1)),
    'gamma': hp.uniform ('gamma', 0, 1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0, 1),
    'min_child_weight' : hp.uniform('min_child_weight', 0, 5),
    'learning_rate': hp.uniform('learning_rate', 0, .15),
    'random_state': 5,
    'max_bin' : scope.int(hp.quniform('max_bin', 200, 550, 1))}, xgb_objective)

100%|██████████| 100/100 [09:48<00:00,  5.88s/trial, best loss: 0.018306310110094284]


In [4]:
regressor = xgb.XGBRegressor(
    colsample_bytree=0.6602379452042961,
    gamma=0.3229199347299238,
    learning_rate=0.0027049502919725735,
    max_bin=491,
    max_depth=1,
    min_child_weight=2.146234118263356)

regressor.fit(X_train, y_train_normalized)

NameError: name 'xgb' is not defined

In [11]:
rte = RandomTreesEmbedding(max_depth=9, min_samples_leaf=50, min_samples_split=50, n_estimators=1600)
ridge = Ridge(alpha=3000)

rte.fit(X_train)

X_train_transformed = rte.transform(X_train)
X_valid_transformed = rte.transform(X_valid)

ridge.fit(X_train_transformed, y_train_normalized)

In [12]:
def create_file(name: str, xs_trans: csr_matrix, xs_old: np.ndarray, ids: np.ndarray, ys: np.ndarray = None):
    col_names = \
        ['id'] +\
        [f'cat{idx}' for idx in range(xs_trans.shape[1])] +\
        [f'cont{idx}' for idx in range(xs_old.shape[1])]

    if ys is not None:
        col_names += ['target']

    with open(os.path.join('data', f'{name}_enhanced.csv'), 'w') as file:
        writer = csv.writer(file, delimiter=',')

        writer.writerow(col_names)

        for i in range(0, xs_trans.shape[0]):
            enhanced_repr = xs_trans.getrow(i).toarray()[0]
            enhanced_repr = enhanced_repr.reshape(enhanced_repr.shape[0], 1)
            old_features = xs_old[i, :]
            old_features = old_features.reshape(old_features.shape[0], 1)
            tall_repr = np.vstack([enhanced_repr, old_features])
            wide_repr = tall_repr.reshape(1, tall_repr.shape[0]).tolist()[0]
            user_id = [ids[i, 0]] if ids.ndim == 2 else [ids[i]]
            full_row = user_id + wide_repr + ([ys[i]] if ys is not None else [])

            writer.writerow(full_row)

In [13]:
create_file('train_normalized', X_train_transformed, X_train, ids_train, y_train_normalized)
create_file('valid_normalized', X_valid_transformed, X_valid, ids_valid, y_valid_normalized)

In [14]:
def get_train_record_defaults():
    zf = tf.zeros(shape=(1,), dtype=tf.float32)

    # the +2 part is because of id and target
    return [zf] * (X_train_transformed.shape[1] + X_train.shape[1] + 2)


def parse_train_batch(tf_string: str):
    data = tf.io.decode_csv(tf_string, get_train_record_defaults())
    features = data[1:-1]
    labels = data[-1]
    features = tf.stack(features, axis=-1)

    return features, labels


def get_train_batched_dataset(batch_size: int, data_path: str) -> tf.data.Dataset:
    return tf.data.TextLineDataset([data_path])\
        .skip(1)\
        .batch(batch_size)\
        .map(parse_train_batch)

In [15]:
train_dataset = get_train_batched_dataset(64, os.path.join('data', 'train_normalized_enhanced.csv'))
valid_dataset = get_train_batched_dataset(64, os.path.join('data', 'valid_normalized_enhanced.csv'))

In [17]:
regressor_enhanced = keras.Sequential([
    SwapNoise(ratio=.1, col_to_apply=[X_train_transformed.shape[1] + idx for idx in range(X_train.shape[1])]),
    keras.layers.Dense(512),
    keras.layers.PReLU(),
    keras.layers.Dropout(.3),
    keras.layers.Dense(512),
    keras.layers.PReLU(),
    keras.layers.Dropout(.3),
    keras.layers.Dense(512),
    keras.layers.PReLU(),
    keras.layers.Dropout(.3),
    keras.layers.Dense(512),
    keras.layers.PReLU(),
    keras.layers.Dropout(.3),
    keras.layers.Dense(1),
    keras.layers.PReLU()
])

early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=30,
                                               min_delta=1e-6)
regressor_enhanced.compile(optimizer='adam', loss='mse')

history_enhanced = regressor_enhanced.fit(
    train_dataset, epochs=300, batch_size=64,
    callbacks=[
        early_stopping,
        keras.callbacks.ModelCheckpoint(
            filepath='saved_models/dnn2_regressor_after_dae_enhanced{epoch}',
            save_best_only=True)],
    validation_data=valid_dataset)

Epoch 1/300
   3750/Unknown - 1522s 405ms/step - loss: 0.0145INFO:tensorflow:Assets written to: saved_models\dnn2_regressor_after_dae_enhanced1\assets
Epoch 2/300
Epoch 3/300
 484/3750 [==>...........................] - ETA: 24:24 - loss: 0.0050

KeyboardInterrupt: 

In [None]:
enhanced_regressor = keras.models.load_model(os.path.join('saved_models', 'dnn2_regressor_after_dae_enhanced6'))

In [None]:
X_test_transformed = rte.transform(test)
ridge_predictions = ridge.predict(X_test_transformed)
xgb_predictions = regressor.predict(test)

create_file('test', X_test_transformed, test, ids_test)

In [None]:
from tensorflow import Tensor


def parse_test_batch(tf_string: Tensor):
    zf = tf.zeros(shape=(1,), dtype=tf.float32)
    defaults = [zf] * (14628 + 14 + 2)
    data = tf.io.decode_csv(tf_string, defaults)
    features = data[1:]
    features = tf.stack(features, axis=-1)

    return features


def get_test_batched_dataset(batch_size: int, data_path: str) -> tf.data.Dataset:
    return tf.data.TextLineDataset([data_path])\
        .skip(1)\
        .batch(batch_size)\
        .map(parse_test_batch)

In [None]:
test_dataset = get_test_batched_dataset(64, os.path.join('data', 'test_enhanced.csv'))
nn_predictions = enhanced_regressor.predict(test_dataset)

In [None]:
def vote():
    reshaped_ridge_predictions = ridge_predictions.reshape((ridge_predictions.shape[0], 1))
    reshaped_xgb_predictions = xgb_predictions.reshape((xgb_predictions.shape[0], 1))
    avg = (reshaped_ridge_predictions + reshaped_xgb_predictions) / 2
    avg_restored = restore(avg, y_train)

    pd\
        .DataFrame(np.column_stack([pd.read_csv('data/test_enhanced.csv')[['id']], avg_restored]), columns=['id', 'target'])\
        .astype({'id': int})\
        .to_csv(os.path.join('submissions', 'ensemble3.csv'), index=False)


vote()