This went quite well - got 0.6995 on private score, 0.70083 on the public one. It probably could be tweaked as the neural network at the end does the difference. The other competitors mentioned scaling the target variable to [0-1] range and were saying it allowed them to go down a bit with their MSE, but given that this task got me tired as no other, for now I'm leaving the solution as is.

In [1]:
import os
import csv
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras
from pandas.io.parsers import TextFileReader

In [2]:
def normalize(vals: np.ndarray):
    min_val = np.min(vals)
    max_val = np.max(vals)
    normalized_values = (vals - min_val) / (max_val - min_val)

    return normalized_values


def restore(normalized_values: np.ndarray, vals: np.ndarray):
    original_min = np.min(vals)
    original_max = np.max(vals)
    normalized_values = np.array(normalized_values)
    restored_values = normalized_values * (original_max - original_min) + original_min

    return restored_values

In [3]:
train_full = pd.read_csv('data/train.csv')
train_ids = train_full[['id']].to_numpy(dtype=int)
xs = train_full.drop('id', axis=1).to_numpy()
ys = train_full[['target']].to_numpy()
test_full = pd.read_csv('data/test.csv')
ids_test = test_full[['id']].to_numpy(dtype=int)
test = test_full.drop('id', axis=1).to_numpy()
split = int(xs.shape[0] * .8)
X_train = xs[:split, :-1]
y_train = ys[:split, -1]
y_train_normalized = normalize(y_train)
ids_train = train_ids[:split, 0]
X_valid = xs[split:, :-1]
y_valid = ys[split:, -1]
y_valid_normalized = normalize(y_valid)
ids_valid = train_ids[split:, 0]

In [4]:
class SwapNoise(keras.layers.Layer):
    def __init__(self, ratio=0.15, col_to_apply=None, **kwargs):
        super().__init__(**kwargs)
        self.ratio = ratio
        self.col_to_apply = col_to_apply

    def call(self, inputs, training=None):
        if training:
            noisy_inputs = tf.map_fn(lambda x: SwapNoise._add_swap_noise(x, ratio=self.ratio, col_to_apply=self.col_to_apply), inputs)
            return noisy_inputs
        else:
            return inputs

    @staticmethod
    def _add_swap_noise(X, ratio=.15, col_to_apply=None, return_mask=False):
        if col_to_apply is None:
            col_to_apply = []

        shape = tf.shape(X)
        obfuscation_mask = tf.cast(
            tf.random.stateless_binomial(
                shape=shape,
                seed=(1, 2),
                counts=1,
                probs=tf.fill(shape, ratio)),
            dtype=tf.float32)

        if col_to_apply:
            column_mask = np.zeros(X.shape, dtype=np.float32)
            column_mask[col_to_apply] = 1
            obfuscation_mask *= column_mask

        shuffled_rows = tf.random.shuffle(tf.range(tf.shape(X)[0]))
        obfuscated_X = tf.where(obfuscation_mask == 1, tf.gather(X, shuffled_rows), X)

        if return_mask:
            return obfuscated_X, obfuscation_mask

        return obfuscated_X

In [5]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.metrics import mean_squared_error
from scipy.sparse._csr import csr_matrix

from hyperopt import fmin, hp, tpe, STATUS_OK
from hyperopt.pyll.base import scope
from typing import Callable

In [6]:
def rte_objective(search_space: dict) -> dict:
    rte = RandomTreesEmbedding(**search_space)
    ridge = Ridge(alpha=3000)

    rte.fit(X_train)

    X_train_transformed = rte.transform(X_train)

    ridge.fit(X_train_transformed, y_train_normalized)

    X_valid_transformed = rte.transform(X_valid)
    y_pred = ridge.predict(X_valid_transformed)
    accuracy = mean_squared_error(y_valid_normalized, y_pred)

    return {'loss': accuracy, 'status': STATUS_OK}


def find_params(search_space: dict, get_objective: Callable, max_evals: int = 100) -> dict:
    algorithm = tpe.suggest
    best_params = fmin(
        fn=get_objective,
        space=search_space,
        algo=algorithm,
        max_evals=max_evals)

    return best_params

In [8]:
rte_params = find_params({
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 2000, 50)),
    'max_depth': scope.int(hp.quniform('max_depth', 5, 15, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 50, 400, 50)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 25, 400, 25))}, rte_objective)

100%|██████████| 150/150 [10:24:44<00:00, 249.90s/trial, best loss: 0.06101731938071909] 


In [12]:
try:
    rte_params['max_depth'] = int(rte_params['max_depth'])
    rte_params['min_samples_leaf'] = int(rte_params['min_samples_leaf'])
    rte_params['min_samples_split'] = int(rte_params['min_samples_split'])
    rte_params['n_estimators'] = int(rte_params['n_estimators'])

    with open(os.path.join('saved_params', 'rte_params.pkl'), 'wb') as fp:
        pickle.dump(rte_params, fp)
except NameError:
    with open(os.path.join('saved_params', 'rte_params.pkl'), 'rb') as fp:
        rte_params = pickle.load(fp)

EOFError: Ran out of input

In [7]:
import xgboost as xgb


def xgb_objective(search_space: dict) -> dict:
    regressor = xgb.XGBRegressor(**search_space)

    regressor.fit(X_train, y_train_normalized)

    y_pred = regressor.predict(X_valid)
    accuracy = mean_squared_error(y_valid_normalized, y_pred)

    return {'loss': accuracy, 'status': STATUS_OK}

In [11]:
xgb_params = find_params({
    'max_depth': scope.int(hp.quniform('max_depth', 1, 15, 1)),
    'gamma': hp.uniform ('gamma', 0, 1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0, 1),
    'min_child_weight' : hp.uniform('min_child_weight', 0, 10),
    'learning_rate': hp.uniform('learning_rate', 0, .3),
    'random_state': 5,
    'max_bin' : scope.int(hp.quniform('max_bin', 200, 650, 1))}, xgb_objective)

100%|██████████| 100/100 [20:51<00:00, 12.51s/trial, best loss: 0.018242681542883835]


In [11]:
try:
    xgb_params['max_bin'] = int(xgb_params['max_bin'])
    xgb_params['max_depth'] = int(xgb_params['max_depth'])

    with open(os.path.join('saved_params', 'xgb_params.pkl'), 'wb') as fp:
        pickle.dump(xgb_params, fp)
except NameError:
    with open(os.path.join('saved_params', 'xgb_params.pkl'), 'rb') as fp:
        xgb_params = pickle.load(fp)

In [None]:
regressor = xgb.XGBRegressor(**xgb_params)

regressor.fit(X_train, y_train_normalized)

In [14]:
rte = RandomTreesEmbedding(**rte_params)
ridge = Ridge(alpha=3000)

rte.fit(X_train)

X_train_transformed = rte.transform(X_train)
X_valid_transformed = rte.transform(X_valid)

ridge.fit(X_train_transformed, y_train_normalized)

In [8]:
def create_file(name: str, xs_trans: csr_matrix, xs_old: np.ndarray, ids: np.ndarray, ys: np.ndarray = None):
    col_names = \
        ['id'] + \
        [f'cat{idx}' for idx in range(xs_trans.shape[1])] + \
        [f'cont{idx}' for idx in range(xs_old.shape[1])]

    if ys is not None:
        col_names += ['target']

    with open(os.path.join('data', f'{name}_enhanced.csv'), 'w') as file:
        writer = csv.writer(file, delimiter=',')

        writer.writerow(col_names)

        for i in range(0, xs_trans.shape[0]):
            enhanced_repr = xs_trans.getrow(i).toarray()[0]
            enhanced_repr = enhanced_repr.reshape(enhanced_repr.shape[0], 1)
            old_features = xs_old[i, :]
            old_features = old_features.reshape(old_features.shape[0], 1)
            tall_repr = np.vstack([enhanced_repr, old_features])
            wide_repr = tall_repr.reshape(1, tall_repr.shape[0]).tolist()[0]
            user_id = [ids[i, 0]] if ids.ndim == 2 else [ids[i]]
            full_row = user_id + wide_repr + ([ys[i]] if ys is not None else [])

            writer.writerow(full_row)

I couldn't find a working way to create tfrecords directly from sparse matrix generated by the RTE, so I'm writing a temporary, huge file that is picked up by the next cells and transformed into multiple tfrecords.

In [16]:
try:
    create_file('train_normalized', X_train_transformed, X_train, ids_train, y_train_normalized)
    create_file('valid_normalized', X_valid_transformed, X_valid, ids_valid, y_valid_normalized)
except NameError:
    pass

In [9]:
def to_tf_records(filename: str, kind: str, categorical_cols: list[str], continuous_cols: list[str]):
    def rows_to_example(rows: pd.DataFrame):
        feature = { 'id': tf.train.Feature(int64_list=tf.train.Int64List(value=rows['id'].astype(int))) }

        for col in categorical_cols:
            feature[col] = tf.train.Feature(int64_list=tf.train.Int64List(value=rows[col].astype(int)))
        for col in continuous_cols:
            feature[col] = tf.train.Feature(float_list=tf.train.FloatList(value=rows[col]))

        feature['target'] = tf.train.Feature(float_list=tf.train.FloatList(value=rows['target']))

        return tf.train.Example(features=tf.train.Features(feature=feature))


    csv_file = os.path.join('data', f'{filename}.csv')
    file_reader = pd.read_csv(csv_file, chunksize=1000, low_memory=False)
    record_options = tf.io.TFRecordOptions(compression_type='GZIP')
    idx = 0

    for frame in file_reader:
        record_name = f'{filename}_{idx}.tfrecord'
        output_file = os.path.join('data', 'records', kind, record_name)

        with tf.io.TFRecordWriter(output_file, record_options) as writer:
            example = rows_to_example(frame)

            writer.write(example.SerializeToString())

        print(f'Processed chunk {idx+1} into {output_file}')

        idx += 1
        break

In [12]:
train_path = os.path.join('data', 'train_normalized_enhanced.csv')
column_names = []

# way faster than doing it with pandas
with open(train_path) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')

    for row in csv_reader:
        column_names = row

        break

categorical_cols = list(filter(lambda n: n.startswith('cat'), column_names))
continuous_cols = list(filter(lambda n: n.startswith('cont'), column_names))

In [11]:
to_tf_records('train_normalized_enhanced', 'train', categorical_cols, continuous_cols)
to_tf_records('valid_normalized_enhanced', 'valid', categorical_cols, continuous_cols)

Processed chunk 1 into data\records\train\train_normalized_enhanced_0.tfrecord
Processed chunk 1 into data\records\valid\valid_normalized_enhanced_0.tfrecord


In [21]:
BATCH_SIZE = 64
feature_description = {'id': tf.io.FixedLenFeature([], dtype=tf.int64)}

for col in categorical_cols:
    feature_description[col] = tf.io.FixedLenFeature([], dtype=tf.int64)
for col in continuous_cols:
    feature_description[col] = tf.io.FixedLenFeature([], dtype=tf.float32)

feature_description['target'] = tf.io.FixedLenFeature([], dtype=tf.float32)


@tf.function
def parse_example(record_bytes: list) -> tf.Tensor:
    example = tf.io.parse_single_example(record_bytes, feature_description)
    tensors = []

    for feature_name in categorical_cols:
        tensors.append(tf.cast(example[feature_name], tf.float32))
    for feature_name in continuous_cols + ['target']:
        tensors.append(example[feature_name])

    data_vector = tf.stack(tensors)

    return data_vector


def get_dataset(files: list[str]) -> tf.data.Dataset:
    return tf.data.TFRecordDataset(files, compression_type='GZIP')\
        .prefetch(tf.data.experimental.AUTOTUNE)\
        .map(parse_example, num_parallel_calls=tf.data.experimental.AUTOTUNE) \
        .batch(BATCH_SIZE) \
        .prefetch(tf.data.experimental.AUTOTUNE)


# .shuffle(1000) \

In [22]:
train_records_folder = os.path.join('data', 'records', 'train')
valid_records_folder = os.path.join('data', 'records', 'valid')
train_files = [
    os.path.join(train_records_folder, name)
    for name in os.listdir(train_records_folder) if name.endswith('.tfrecord')]
valid_files = [
    os.path.join(valid_records_folder, name)
    for name in os.listdir(valid_records_folder) if name.endswith('.tfrecord')]
train_dataset = get_dataset(train_files)
valid_dataset = get_dataset(valid_files)

Location:
  File "C:\ProgramData\anaconda3\envs\datascience\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,

  File "C:\ProgramData\anaconda3\envs\datascience\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)

  File "C:\ProgramData\anaconda3\envs\datascience\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()

  File "C:\ProgramData\anaconda3\envs\datascience\lib\site-packages\traitlets\config\application.py", line 992, in launch_instance
    app.start()

  File "C:\ProgramData\anaconda3\envs\datascience\lib\site-packages\ipykernel\kernelapp.py", line 711, in start
    self.io_loop.start()

  File "C:\ProgramData\anaconda3\envs\datascience\lib\site-packages\tornado\platform\asyncio.py", line 215, in start
    self.asyncio_loop.run_forever()

  File "C:\ProgramData\anaconda3\envs\datascience\lib\asyncio\base_events.py", line 601, in run_forever
    self._run_once()

  File "C:\Program

In [None]:
cat_cols_len = len(categorical_cols)
regressor_enhanced = keras.Sequential([
    SwapNoise(ratio=.1, col_to_apply=[cat_cols_len + idx for idx in range(len(continuous_cols))]),
    keras.layers.Dense(512),
    keras.layers.PReLU(),
    keras.layers.Dropout(.5),
    keras.layers.Dense(512),
    keras.layers.PReLU(),
    keras.layers.Dropout(.5),
    keras.layers.Dense(512),
    keras.layers.PReLU(),
    keras.layers.Dropout(.5),
    keras.layers.Dense(512),
    keras.layers.PReLU(),
    keras.layers.Dropout(.3),
    keras.layers.Dense(1),
    keras.layers.PReLU()
])

early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=30,
                                               min_delta=1e-6)
regressor_enhanced.compile(optimizer='adam', loss='mse')

history_enhanced = regressor_enhanced.fit(
    train_dataset, epochs=1,
    callbacks=[
        early_stopping,
        keras.callbacks.ModelCheckpoint(
            filepath='saved_models/dnn2_regressor_after_dae_enhanced{epoch}',
            save_best_only=True)],
    validation_data=valid_dataset)

In [None]:
enhanced_regressor = keras.models.load_model(os.path.join('saved_models', 'dnn2_regressor_after_dae_enhanced6'))

In [None]:
X_test_transformed = rte.transform(test)
ridge_predictions = ridge.predict(X_test_transformed)
xgb_predictions = regressor.predict(test)

create_file('test', X_test_transformed, test, ids_test)

In [None]:
from tensorflow import Tensor


def parse_test_batch(tf_string: Tensor):
    zf = tf.zeros(shape=(1,), dtype=tf.float32)
    defaults = [zf] * (14628 + 14 + 2)
    data = tf.io.decode_csv(tf_string, defaults)
    features = data[1:]
    features = tf.stack(features, axis=-1)

    return features


def get_test_batched_dataset(batch_size: int, data_path: str) -> tf.data.Dataset:
    return tf.data.TextLineDataset([data_path]) \
        .skip(1) \
        .batch(batch_size) \
        .map(parse_test_batch)

In [None]:
test_dataset = get_test_batched_dataset(64, os.path.join('data', 'test_enhanced.csv'))
nn_predictions = enhanced_regressor.predict(test_dataset)

In [None]:
def vote():
    reshaped_ridge_predictions = ridge_predictions.reshape((ridge_predictions.shape[0], 1))
    reshaped_xgb_predictions = xgb_predictions.reshape((xgb_predictions.shape[0], 1))
    avg = (reshaped_ridge_predictions + reshaped_xgb_predictions) / 2
    avg_restored = restore(avg, y_train)

    pd \
        .DataFrame(np.column_stack([pd.read_csv('data/test_enhanced.csv')[['id']], avg_restored]), columns=['id', 'target']) \
        .astype({'id': int}) \
        .to_csv(os.path.join('submissions', 'ensemble3.csv'), index=False)


vote()