In [1]:
import pandas as pd
import numpy as np
import os.path as path
import numba
import random
from sklearn.model_selection import train_test_split, KFold
from tqdm.notebook import tqdm
from math import ceil

In [6]:
def read_dataset(id, root='data'):
    X_tr = pd.read_csv(path.join(root, f'{id}-X.csv')).values
    y_tr = pd.read_csv(path.join(root, f'{id}-Y.csv')).values
    X_te = pd.read_csv(path.join(root, f'{id}-test.csv')).values
    return X_tr, y_tr[:,0], X_te, None

@numba.njit
def rmse(Y, y):
    return np.sqrt(np.mean((Y - y)**2))

@numba.njit
def best_split(X, y, min_samples_leaf=1):
    # Ugly imperative code for Numba to optimize
    n_rows, n_cols = X.shape
    loss, split = None, (None, None)
    cols = np.arange(n_cols)
    for col in cols:
        vals = np.unique(X[:,col])
        n_vals = len(vals)
        if n_vals > 1000:
            vals = np.random.choice(vals, size=ceil(np.sqrt(n_vals)), replace=False)
        for val in vals:
            mask = X[:,col] < val
            a, b = y[mask], y[~mask]
            if len(a) < min_samples_leaf or len(b) < min_samples_leaf:
                continue
            new_loss = max(rmse(np.mean(a), a), rmse(np.mean(b), b))
            if loss is None or loss < new_loss:
                loss = new_loss
                split = col, val
    return split

class Node:
    def __init__(self):
        self.lhs, self.rhs, self.leaf, self.split = [None]*4
    
    def set_params(self, params):
        for k, x in params.items(): setattr(self, k, x)
        self.params = params
        
    def extend(self):
        tree = Node()
        tree.set_params(self.params)
        return tree
    
    def _fit(self, X, y, depth=0):
        n_rows = len(y)
        mean_y = np.mean(y)
        n_matches = np.sum(np.abs(mean_y - y) < self.eps)
        if self.max_depth == depth or n_matches/n_rows > self.max_samples_split:
            self.leaf = mean_y
            return
        
        self.split = col, val = best_split(X, y, min_samples_leaf=self.min_samples_leaf)
        if col is None:
            self.leaf = mean_y
            return
        
        mask = X[:,col] < val
        self.lhs, self.rhs = self.extend(), self.extend()
        self.lhs._fit(X[ mask], y[ mask], depth=depth+1)
        self.rhs._fit(X[~mask], y[~mask], depth=depth+1)
        
    def fit(self, X, y, **params):
        self.set_params(params)
        self._fit(X, y, depth=0)
    
    def predict_one(self, x):
        if self.leaf is not None: return self.leaf
        col, val = self.split
        node = self.lhs if x[col] < val else self.rhs
        return node.predict_one(x)
    
    def predict(self, X):
        #assert self.lhs and self.rhs and self.split, 'Tree is not trained'
        return np.array([self.predict_one(X[i,:]) for i in range(X.shape[0])])
    
    def depth(self):
        return 0 if self.leaf is not None else 1 + max(self.lhs.depth(), self.rhs.depth())

In [7]:
best_params = {}

In [8]:
default_params = 0, dict(max_depth=8, min_samples_leaf=1, max_samples_split=0.8, eps=0.1)

param_dists = dict(
    max_depth=np.arange(4, 20),
    max_samples_split=np.arange(0.5, 1.05, 0.05),
    min_samples_leaf=[1], #np.arange(1, 20),
    eps=np.maximum(0.001, np.arange(0, 0.105, 0.005)),
)

def random_grid(n, **dists):
    grid = []
    for i in range(n):
        point = dict()
        for name, dist in dists.items():
            point[name] = random.choice(dist)
        grid.append(point)
    return grid

def find_params(X, y, n_iters, cv=5, seed=2021):
    min_loss, best_params = None, None
    for params in tqdm(random_grid(n_iters, **param_dists)):
        losses = []
        for idx_tr, idx_te in KFold(n_splits=cv, random_state=seed, shuffle=True).split(X):
            X_tr, X_te, y_tr, y_te = X[idx_tr], X[idx_te], y[idx_tr], y[idx_te]
            tree = Node()
            tree.fit(X_tr, y_tr, **params)
            Y_te = tree.predict(X_te)
            losses.append(rmse(Y_te, y_te))
        loss = np.mean(losses)
        if min_loss is None or loss < min_loss:
            min_loss, best_params = loss, params
    return min_loss, best_params

def find_params_for_id(id, n_iters):
    X, y, _, _ = read_dataset(id=id, root='data')
    return find_params(X, y, n_iters)
            
def evaluate():
    results = []
    for id in tqdm(range(1, 14)):
        X, y, _, _ = read_dataset(id=id, root='data')
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=2021)
        _, params = best_params.get(id, default_params)
        tree = Node()
        tree.fit(X_tr, y_tr, **params)
        Y_te = tree.predict(X_te)
        loss = rmse(Y_te, y_te)
        result = dict(id=id, loss=loss, depth=tree.depth())
        print(result)
        results.append(result)
    return pd.DataFrame(results)

for id in range(1, 14):
    n = 100
    if id in [2, 6, 7, 8]: n = 1000
    if id in [13]: n = 200
    if id in [1, 10]: n = 40
    if id in [4, 5]: n = 20
    if id in [9, 11]: n = 10
    loss, params = find_params_for_id(id, n)
    if id not in best_params or loss < best_params[id][0]:
        print('new best!')
        best_params[id] = loss, params
    print(id, loss, params)

  0%|          | 0/40 [00:00<?, ?it/s]

new best!
1 0.8789411234104909 {'max_depth': 15, 'max_samples_split': 0.9000000000000004, 'min_samples_leaf': 1, 'eps': 0.065}


  0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [67]:
best_params

{1: (0.6948942587217384,
  {'max_depth': 6,
   'max_samples_split': 0.5,
   'min_samples_leaf': 1,
   'eps': 0.095}),
 2: (0.14009522399417546,
  {'max_depth': 10,
   'max_samples_split': 0.9500000000000004,
   'min_samples_leaf': 1,
   'eps': 0.085}),
 3: (0.618861477123154,
  {'max_depth': 4,
   'max_samples_split': 0.8000000000000003,
   'min_samples_leaf': 1,
   'eps': 0.001}),
 4: (0.5559184827165268,
  {'max_depth': 10,
   'max_samples_split': 0.7500000000000002,
   'min_samples_leaf': 1,
   'eps': 0.085}),
 5: (0.26317137884800296,
  {'max_depth': 14,
   'max_samples_split': 0.6000000000000001,
   'min_samples_leaf': 1,
   'eps': 0.085}),
 6: (0.41156976586654326,
  {'max_depth': 9,
   'max_samples_split': 0.7000000000000002,
   'min_samples_leaf': 1,
   'eps': 0.085}),
 7: (0.06462435590195648,
  {'max_depth': 10,
   'max_samples_split': 0.8000000000000003,
   'min_samples_leaf': 1,
   'eps': 0.001}),
 8: (0.632313367889696,
  {'max_depth': 6,
   'max_samples_split': 0.5,
   'm

In [71]:
results = evaluate()
print(results.loss.mean())
results

  0%|          | 0/13 [00:00<?, ?it/s]

{'id': 1, 'loss': 0.6860439126214415, 'depth': 6}
{'id': 2, 'loss': 0.12717422642783732, 'depth': 10}
{'id': 3, 'loss': 0.6376490115548699, 'depth': 4}
{'id': 4, 'loss': 0.5613702125930704, 'depth': 10}
{'id': 5, 'loss': 0.23572019492693932, 'depth': 14}
{'id': 6, 'loss': 0.4505683200348073, 'depth': 9}
{'id': 7, 'loss': 0.07933034529249529, 'depth': 10}
{'id': 8, 'loss': 1.9584399582673633, 'depth': 6}
{'id': 9, 'loss': 0.7101859502593082, 'depth': 7}
{'id': 10, 'loss': 0.2039162508359682, 'depth': 10}
{'id': 11, 'loss': 0.30664744705349006, 'depth': 9}
{'id': 12, 'loss': 0.9491408548118362, 'depth': 4}
{'id': 13, 'loss': 0.08617258834415233, 'depth': 8}
0.5378737902325831


Unnamed: 0,id,loss,depth
0,1,0.686044,6
1,2,0.127174,10
2,3,0.637649,4
3,4,0.56137,10
4,5,0.23572,14
5,6,0.450568,9
6,7,0.07933,10
7,8,1.95844,6
8,9,0.710186,7
9,10,0.203916,10


In [117]:
results = evaluate()
print(results.loss.mean())
results

  0%|          | 0/13 [00:00<?, ?it/s]

{'id': 1, 'loss': 0.689713814928402, 'depth': 7}
{'id': 2, 'loss': 0.11902723611394904, 'depth': 5}
{'id': 3, 'loss': 0.4993628331790936, 'depth': 4}
{'id': 4, 'loss': 0.557446449204454, 'depth': 9}
{'id': 5, 'loss': 0.30734394556708833, 'depth': 8}
{'id': 6, 'loss': 0.4408157082794413, 'depth': 8}
{'id': 7, 'loss': 0.07933034529249529, 'depth': 10}
{'id': 8, 'loss': 1.929929664512582, 'depth': 4}
{'id': 9, 'loss': 0.706208819294931, 'depth': 10}
{'id': 10, 'loss': 0.2031681855687948, 'depth': 10}
{'id': 11, 'loss': 0.2920150187952806, 'depth': 9}
{'id': 12, 'loss': 0.9541354709246908, 'depth': 4}
{'id': 13, 'loss': 0.08062162715143269, 'depth': 7}
0.5276245476009719


Unnamed: 0,id,loss,depth
0,1,0.689714,7
1,2,0.119027,5
2,3,0.499363,4
3,4,0.557446,9
4,5,0.307344,8
5,6,0.440816,8
6,7,0.07933,10
7,8,1.92993,4
8,9,0.706209,10
9,10,0.203168,10


In [72]:
Y_te = []
for id in tqdm(range(1, 14)):
    X_tr, y_tr, X_te, _ = read_dataset(id=id, root='data')
    _, params = best_params[id]
    tree = Node()
    tree.fit(X_tr, y_tr, **params)
    Y_te = tree.predict(X_te)
    pd.DataFrame(Y_te, columns=['Y']).to_csv(f'data/{id}.csv', index=None)

  0%|          | 0/13 [00:00<?, ?it/s]