In [1]:
import pandas as pd
import numpy as np
import os.path as path
import numba
import random
from sklearn.model_selection import train_test_split, KFold
from tqdm.notebook import tqdm
from math import ceil

In [2]:
def read_dataset(id, root='data'):
    X_tr = pd.read_csv(path.join(root, f'{id}-X.csv')).values
    y_tr = pd.read_csv(path.join(root, f'{id}-Y.csv')).values
    X_te = pd.read_csv(path.join(root, f'{id}-test.csv')).values
    return X_tr, y_tr[:,0], X_te, None

@numba.njit
def rmse(Y, y):
    return np.sqrt(np.mean((Y - y)**2))

@numba.njit
def best_split(X, y, min_samples_leaf=1):
    # Ugly imperative code for Numba to optimize
    n_rows, n_cols = X.shape
    best_loss, split = None, (None, None)
    cols = np.arange(n_cols)
    for col in cols:
        vals = np.unique(X[:,col])
        n_vals = len(vals)
        if n_vals > 1000:
            vals = np.random.choice(vals, size=ceil(np.sqrt(n_vals)), replace=False)
        for val in vals:
            mask = X[:,col] < val
            a, b = y[mask], y[~mask]
            if len(a) < min_samples_leaf or len(b) < min_samples_leaf:
                continue
            loss = max(rmse(np.mean(a), a), rmse(np.mean(b), b))
            if best_loss is None or loss < best_loss:
                best_loss = loss
                split = col, val
    return split

class Node:
    def __init__(self):
        self.lhs, self.rhs, self.leaf, self.split = [None]*4
    
    def set_params(self, params):
        for k, x in params.items(): setattr(self, k, x)
        self.params = params
        
    def extend(self):
        tree = Node()
        tree.set_params(self.params)
        return tree
    
    def _fit(self, X, y, depth=0):
        n_rows = len(y)
        mean_y = np.mean(y)
        n_matches = np.sum(np.abs(mean_y - y) < self.eps)
        if self.max_depth == depth or n_matches/n_rows > self.max_samples_split:
            self.leaf = mean_y
            return
        
        self.split = col, val = best_split(X, y, min_samples_leaf=self.min_samples_leaf)
        if col is None:
            self.leaf = mean_y
            return
        
        mask = X[:,col] < val
        self.lhs, self.rhs = self.extend(), self.extend()
        self.lhs._fit(X[ mask], y[ mask], depth=depth+1)
        self.rhs._fit(X[~mask], y[~mask], depth=depth+1)
        
    def fit(self, X, y, **params):
        self.set_params(params)
        self._fit(X, y, depth=0)
    
    def predict_one(self, x):
        if self.leaf is not None: return self.leaf
        col, val = self.split
        node = self.lhs if x[col] < val else self.rhs
        return node.predict_one(x)
    
    def predict(self, X):
        #assert self.lhs and self.rhs and self.split, 'Tree is not trained'
        return np.array([self.predict_one(X[i,:]) for i in range(X.shape[0])])
    
    def depth(self):
        return 0 if self.leaf is not None else 1 + max(self.lhs.depth(), self.rhs.depth())

In [3]:
best_params = {}

In [4]:
default_params = 0, dict(max_depth=8, min_samples_leaf=1, max_samples_split=0.8, eps=0.1)

param_dists = dict(
    max_depth=np.arange(4, 20),
    max_samples_split=np.arange(0.5, 1.05, 0.05),
    min_samples_leaf=[1], #np.arange(1, 20),
    eps=np.maximum(0.001, np.arange(0, 0.105, 0.005)),
)

def random_grid(n, **dists):
    grid = []
    for i in range(n):
        point = dict()
        for name, dist in dists.items():
            point[name] = random.choice(dist)
        grid.append(point)
    return grid

def find_params(X, y, n_iters, cv=5, seed=2021):
    min_loss, best_params = None, None
    for params in tqdm(random_grid(n_iters, **param_dists)):
        losses = []
        for idx_tr, idx_te in KFold(n_splits=cv, random_state=seed, shuffle=True).split(X):
            X_tr, X_te, y_tr, y_te = X[idx_tr], X[idx_te], y[idx_tr], y[idx_te]
            tree = Node()
            tree.fit(X_tr, y_tr, **params)
            Y_te = tree.predict(X_te)
            losses.append(rmse(Y_te, y_te))
        loss = np.mean(losses)
        if min_loss is None or loss < min_loss:
            min_loss, best_params = loss, params
    return min_loss, best_params

def find_params_for_id(id, n_iters):
    X, y, _, _ = read_dataset(id=id, root='data')
    return find_params(X, y, n_iters)
            
def evaluate():
    results = []
    for id in tqdm(range(1, 14)):
        X, y, _, _ = read_dataset(id=id, root='data')
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=2021)
        _, params = best_params.get(id, default_params)
        tree = Node()
        tree.fit(X_tr, y_tr, **params)
        Y_te = tree.predict(X_te)
        loss = rmse(Y_te, y_te)
        result = dict(id=id, loss=loss, depth=tree.depth())
        print(result)
        results.append(result)
    return pd.DataFrame(results)

for id in range(1, 14):
    if id in [2, 6, 7, 8]: n = 1000
    if id in [13]: n = 200
    if id in [1, 10]: n = 40
    if id in [4, 5]: n = 20
    if id in [9, 11]: n = 10
    loss, params = find_params_for_id(id, n)
    if id not in best_params or loss < best_params[id][0]:
        print('new best!')
        best_params[id] = loss, params
    print(id, loss, params)

  0%|          | 0/40 [00:00<?, ?it/s]

new best!
1 0.9800689318123466 {'max_depth': 19, 'max_samples_split': 0.9500000000000004, 'min_samples_leaf': 1, 'eps': 0.03}


  0%|          | 0/1000 [00:00<?, ?it/s]

new best!
2 0.27790240210236294 {'max_depth': 19, 'max_samples_split': 0.9500000000000004, 'min_samples_leaf': 1, 'eps': 0.07}


  0%|          | 0/1000 [00:00<?, ?it/s]

new best!
3 0.9906855523585231 {'max_depth': 16, 'max_samples_split': 0.9500000000000004, 'min_samples_leaf': 1, 'eps': 0.075}


  0%|          | 0/20 [00:00<?, ?it/s]

new best!
4 0.9980364377987232 {'max_depth': 19, 'max_samples_split': 0.8000000000000003, 'min_samples_leaf': 1, 'eps': 0.005}


  0%|          | 0/20 [00:00<?, ?it/s]

new best!
5 0.4717611555057145 {'max_depth': 16, 'max_samples_split': 1.0000000000000004, 'min_samples_leaf': 1, 'eps': 0.06}


  0%|          | 0/1000 [00:00<?, ?it/s]

new best!
6 0.9979195877723124 {'max_depth': 17, 'max_samples_split': 0.6000000000000001, 'min_samples_leaf': 1, 'eps': 0.09}


  0%|          | 0/1000 [00:00<?, ?it/s]

new best!
7 0.6655638771575338 {'max_depth': 19, 'max_samples_split': 0.7500000000000002, 'min_samples_leaf': 1, 'eps': 0.001}


  0%|          | 0/1000 [00:00<?, ?it/s]

new best!
8 0.7037490192249347 {'max_depth': 4, 'max_samples_split': 0.6500000000000001, 'min_samples_leaf': 1, 'eps': 0.05}


  0%|          | 0/10 [00:00<?, ?it/s]

new best!
9 0.9901353837622405 {'max_depth': 18, 'max_samples_split': 0.9500000000000004, 'min_samples_leaf': 1, 'eps': 0.04}


  0%|          | 0/40 [00:00<?, ?it/s]

new best!
10 0.9990725171207057 {'max_depth': 19, 'max_samples_split': 0.6000000000000001, 'min_samples_leaf': 1, 'eps': 0.025}


  0%|          | 0/10 [00:00<?, ?it/s]

new best!
11 1.0006727410722605 {'max_depth': 5, 'max_samples_split': 0.8500000000000003, 'min_samples_leaf': 1, 'eps': 0.08}


  0%|          | 0/10 [00:00<?, ?it/s]

new best!
12 0.9881537170437238 {'max_depth': 4, 'max_samples_split': 1.0000000000000004, 'min_samples_leaf': 1, 'eps': 0.005}


  0%|          | 0/200 [00:00<?, ?it/s]

new best!
13 0.7141425697812096 {'max_depth': 19, 'max_samples_split': 0.55, 'min_samples_leaf': 1, 'eps': 0.035}


In [5]:
best_params

{1: (0.9800689318123466,
  {'max_depth': 19,
   'max_samples_split': 0.9500000000000004,
   'min_samples_leaf': 1,
   'eps': 0.03}),
 2: (0.27790240210236294,
  {'max_depth': 19,
   'max_samples_split': 0.9500000000000004,
   'min_samples_leaf': 1,
   'eps': 0.07}),
 3: (0.9906855523585231,
  {'max_depth': 16,
   'max_samples_split': 0.9500000000000004,
   'min_samples_leaf': 1,
   'eps': 0.075}),
 4: (0.9980364377987232,
  {'max_depth': 19,
   'max_samples_split': 0.8000000000000003,
   'min_samples_leaf': 1,
   'eps': 0.005}),
 5: (0.4717611555057145,
  {'max_depth': 16,
   'max_samples_split': 1.0000000000000004,
   'min_samples_leaf': 1,
   'eps': 0.06}),
 6: (0.9979195877723124,
  {'max_depth': 17,
   'max_samples_split': 0.6000000000000001,
   'min_samples_leaf': 1,
   'eps': 0.09}),
 7: (0.6655638771575338,
  {'max_depth': 19,
   'max_samples_split': 0.7500000000000002,
   'min_samples_leaf': 1,
   'eps': 0.001}),
 8: (0.7037490192249347,
  {'max_depth': 4,
   'max_samples_split

In [6]:
results = evaluate()
print(results.loss.mean())
results

  0%|          | 0/13 [00:00<?, ?it/s]

{'id': 1, 'loss': 0.9867172569979525, 'depth': 19}
{'id': 2, 'loss': 0.14487445793779094, 'depth': 19}
{'id': 3, 'loss': 0.8467298148201726, 'depth': 16}
{'id': 4, 'loss': 1.002690015487241, 'depth': 19}
{'id': 5, 'loss': 0.47129349502356627, 'depth': 16}
{'id': 6, 'loss': 0.9185717837436407, 'depth': 17}
{'id': 7, 'loss': 0.5079590371102758, 'depth': 19}
{'id': 8, 'loss': 1.9192735539122836, 'depth': 4}
{'id': 9, 'loss': 0.9549503689241841, 'depth': 18}
{'id': 10, 'loss': 0.9958662786403208, 'depth': 19}
{'id': 11, 'loss': 0.9999778982743274, 'depth': 5}
{'id': 12, 'loss': 0.9376731160851814, 'depth': 4}
{'id': 13, 'loss': 0.5722220442449105, 'depth': 19}
0.8660614708616805


Unnamed: 0,id,loss,depth
0,1,0.986717,19
1,2,0.144874,19
2,3,0.84673,16
3,4,1.00269,19
4,5,0.471293,16
5,6,0.918572,17
6,7,0.507959,19
7,8,1.919274,4
8,9,0.95495,18
9,10,0.995866,19


In [7]:
results = evaluate()
print(results.loss.mean())
results

  0%|          | 0/13 [00:00<?, ?it/s]

{'id': 1, 'loss': 0.9818837560424054, 'depth': 19}
{'id': 2, 'loss': 0.14487445793779094, 'depth': 19}
{'id': 3, 'loss': 0.8467298148201726, 'depth': 16}
{'id': 4, 'loss': 1.0030790536936445, 'depth': 19}
{'id': 5, 'loss': 0.47129349502356627, 'depth': 16}
{'id': 6, 'loss': 0.9185717837436407, 'depth': 17}
{'id': 7, 'loss': 0.5079590371102758, 'depth': 19}
{'id': 8, 'loss': 1.9192735539122836, 'depth': 4}
{'id': 9, 'loss': 0.9559495734151032, 'depth': 18}
{'id': 10, 'loss': 0.9958662786403208, 'depth': 19}
{'id': 11, 'loss': 0.9949499619505505, 'depth': 5}
{'id': 12, 'loss': 0.9376731160851814, 'depth': 4}
{'id': 13, 'loss': 0.5722220442449105, 'depth': 19}
0.8654096866630651


Unnamed: 0,id,loss,depth
0,1,0.981884,19
1,2,0.144874,19
2,3,0.84673,16
3,4,1.003079,19
4,5,0.471293,16
5,6,0.918572,17
6,7,0.507959,19
7,8,1.919274,4
8,9,0.95595,18
9,10,0.995866,19


In [8]:
Y_te = []
for id in tqdm(range(1, 14)):
    X_tr, y_tr, X_te, _ = read_dataset(id=id, root='data')
    _, params = best_params[id]
    tree = Node()
    tree.fit(X_tr, y_tr, **params)
    Y_te = tree.predict(X_te)
    pd.DataFrame(Y_te, columns=['Y']).to_csv(f'data/{id}.csv', index=None)

  0%|          | 0/13 [00:00<?, ?it/s]