In [1]:
%load_ext autoreload
%autoreload 2

import torch
import torch.nn.functional as F
from torch import nn
import numpy as np
import pandas as pd

# make beam's relative imports work by adding the parent directory to sys.path
import sys
sys.path.append('..')

from examples.covtype_ssl import CovtypeDataset, get_covtype_parser, CovtypeMaskedDataset, CovtypeDatasetOrg, EmbeddingCovtypeDataset
from src.beam import beam_arguments, as_numpy

In [2]:
# Filesystem locations: the covtype data directory and the root directory
# passed to the experiment through --root-dir.
path_to_data = '/home/shared/data/dataset/covtype'
root_dir = '/home/shared/data/results/covtype'

# Command-line style argument strings, handed to beam_arguments in this order.
experiment_args = f"--project-name=covtype_ssl --root-dir={root_dir} --algorithm=BeamVICReg --device=0"
training_args = "--batch-size=512 --n-epochs=100 --parallel=1 --momentum=0.9 --beta2=0.99"

# Keyword overrides supplied alongside the argument strings.
hparam_overrides = dict(
    weight_factor=.0,
    weight_decay=1e-5,
    path_to_data=path_to_data,
    dropout=.0,
    channels=256,
    n_layers=2,
)

hparams = beam_arguments(get_covtype_parser(), experiment_args, training_args, **hparam_overrides)

In [6]:
# Build the dataset from the parsed hyper-parameters. The commented-out lines
# below are alternative dataset classes that can be swapped in instead.
dataset = EmbeddingCovtypeDataset(hparams)
# dataset = CovtypeDataset(hparams)
# dataset = CovtypeMaskedDataset(hparams)

In [7]:
# Display the per-split index tensors (keys: 'test', 'validation', 'train').
dataset.indices

{'test': tensor([215988, 424155, 274437,  ...,  80871, 169606,  70347]),
 'validation': tensor([290047, 332003, 108670,  ..., 370968, 163051, 423266]),
 'train': tensor([401245, 180638,  69781,  ..., 479426, 213708,  94533])}

In [8]:
# Select each split by indexing the dataset with that split's index tensor.
data_train, data_validation, data_test = (
    dataset[dataset.indices[split_name]]
    for split_name in ('train', 'validation', 'test')
)

In [9]:
# df = pd.DataFrame(as_numpy(data_train.data['x_cat']))

In [11]:
# train_x = as_numpy(data_train.data['x'][0])
# train_y = as_numpy(data_train.data['y'])

# val_x = as_numpy(data_validation.data['x'][0])
# val_y = as_numpy(data_validation.data['y'])

# test_x = as_numpy(data_test.data['x'][0])
# test_y = as_numpy(data_test.data['y'])
# cat_features = []


# train_x = as_numpy(data_train.data['x'][1])
# train_y = as_numpy(data_train.data['y'])

# val_x = as_numpy(data_validation.data['x'][1])
# val_y = as_numpy(data_validation.data['y'])

# test_x = as_numpy(data_test.data['x'][1])
# test_y = as_numpy(data_test.data['y'])

# cat_features = np.arange(train_x.shape[-1])


# train_x = as_numpy(data_train.data['emb'].flatten(start_dim=1))
# train_y = as_numpy(data_train.data['y'])

# val_x = as_numpy(data_validation.data['emb'].flatten(start_dim=1))
# val_y = as_numpy(data_validation.data['y'])

# test_x = as_numpy(data_test.data['emb'].flatten(start_dim=1))
# test_y = as_numpy(data_test.data['y'])
# cat_features = []


# Active feature variant: the 'emb' tensor of each split summed over dim 1,
# converted to NumPy. Labels come from the 'y' entry of the same split.
_splits = (data_train, data_validation, data_test)

train_x, val_x, test_x = (as_numpy(part.data['emb'].sum(dim=1)) for part in _splits)
train_y, val_y, test_y = (as_numpy(part.data['y']) for part in _splits)

# No column is declared categorical for this feature variant.
cat_features = []


## CatBoost classifier

In [12]:
from catboost import CatBoostClassifier

In [13]:
# Only one of the parameters iterations, n_estimators, num_boost_round, num_trees should be set.

# Collect the CatBoost settings in one mapping, then build the classifier from it.
catboost_settings = {
    'learning_rate': .1,
    'depth': 12,
    'task_type': "GPU",
    'devices': '0',
    'loss_function': 'MultiClass',
    'metric_period': 50,
    'verbose': True,
    'n_estimators': 500,
}
model = CatBoostClassifier(**catboost_settings)

In [None]:
# Train the classifier on the training split; cat_features is the list of
# categorical feature columns (empty for the current feature variant).
model.fit(train_x, train_y, cat_features=cat_features)



0:	learn: 1.7852095	total: 1.03s	remaining: 8m 34s


In [11]:
# Score the fitted model on the validation split.
model.score(val_x, val_y)

0.5933929992900325

In [61]:
# Fit the classifier on the training split.
# NOTE(review): the saved log under this cell runs to iteration 999, while the
# constructor cell sets n_estimators=500 - the stored output looks stale and
# should be refreshed by re-running the notebook top to bottom.
model.fit(train_x, train_y, cat_features=cat_features)

0:	learn: 1.7570187	total: 175ms	remaining: 2m 54s
100:	learn: 0.4598375	total: 16.5s	remaining: 2m 26s
200:	learn: 0.3615342	total: 34.2s	remaining: 2m 16s
300:	learn: 0.3101426	total: 52.2s	remaining: 2m 1s
400:	learn: 0.2668925	total: 1m 10s	remaining: 1m 45s
500:	learn: 0.2345253	total: 1m 29s	remaining: 1m 28s
600:	learn: 0.2054078	total: 1m 48s	remaining: 1m 11s
700:	learn: 0.1815197	total: 2m 7s	remaining: 54.4s
800:	learn: 0.1619548	total: 2m 26s	remaining: 36.4s
900:	learn: 0.1441288	total: 2m 45s	remaining: 18.2s
999:	learn: 0.1292784	total: 3m 4s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f9e44249cd0>

In [33]:
# Evaluate the re-fitted model on the validation split.
model.score(val_x, val_y)

0.9094791420150169

## LightGBM classifier

In [34]:
import lightgbm as lgb

In [35]:
# Wrap the NumPy arrays in LightGBM Dataset objects.
train_data = lgb.Dataset(train_x, label=train_y)
# The validation Dataset references the training Dataset, as the LightGBM
# documentation prescribes for validation data, so both are tied together
# regardless of which API later consumes them.
validation_data = lgb.Dataset(val_x, label=val_y, reference=train_data)

In [40]:
# LightGBM training parameters for the multi-class objective.
param = {'objective': 'multiclass',
         'num_leaves': 31,
         'max_depth': 12,
         # NOTE(review): no 'device_type' is set in this dict, so this GPU id is
         # presumably not used - confirm whether GPU training was intended
         # (the other cells in this notebook use device 0).
         'gpu_device_id': 2,
         'verbosity': -1,
         'metric': ['multi_error', 'multiclass'],
         # Largest label + 1; assumes labels are encoded 0..K-1 - TODO confirm.
         # Cast to a plain int so a NumPy scalar (or a float label dtype) does
         # not leak into the parameter dict.
         'num_class': int(np.max(train_y)) + 1}

In [41]:
# Train for 100 boosting rounds while reporting the configured metrics on the
# validation set. The returned Booster is bound to a name so it can be used
# afterwards; previously it was only displayed and then lost.
lgbm_model = lgb.train(param, train_data, num_boost_round=100, valid_sets=[validation_data])
lgbm_model

[1]	valid_0's multi_error: 0.490813	valid_0's multi_logloss: 1.01485
[2]	valid_0's multi_error: 0.371335	valid_0's multi_logloss: 0.945688
[3]	valid_0's multi_error: 0.309664	valid_0's multi_logloss: 0.892926
[4]	valid_0's multi_error: 0.28844	valid_0's multi_logloss: 0.844769
[5]	valid_0's multi_error: 0.27423	valid_0's multi_logloss: 0.8101
[6]	valid_0's multi_error: 0.269766	valid_0's multi_logloss: 0.780868
[7]	valid_0's multi_error: 0.268798	valid_0's multi_logloss: 0.756938
[8]	valid_0's multi_error: 0.266109	valid_0's multi_logloss: 0.736936
[9]	valid_0's multi_error: 0.263161	valid_0's multi_logloss: 0.7197
[10]	valid_0's multi_error: 0.261494	valid_0's multi_logloss: 0.704564
[11]	valid_0's multi_error: 0.259741	valid_0's multi_logloss: 0.690477
[12]	valid_0's multi_error: 0.258267	valid_0's multi_logloss: 0.678435
[13]	valid_0's multi_error: 0.255879	valid_0's multi_logloss: 0.667289
[14]	valid_0's multi_error: 0.253835	valid_0's multi_logloss: 0.656453
[15]	valid_0's multi_e

<lightgbm.basic.Booster at 0x7ef6cc07ebb0>

## XGBoost

In [42]:
# Value handed to XGBClassifier below through its n_estimators argument.
n_estimators = 1000

In [45]:
import xgboost

AttributeError: /opt/conda/lib/libxgboost.so: undefined symbol: path

In [44]:
from xgboost import XGBClassifier

AttributeError: /opt/conda/lib/libxgboost.so: undefined symbol: path

In [43]:
# XGBoost classifier with its hyper-parameters spelled out explicitly.
# The deprecated aliases `silent`, `nthread` and `seed` (all previously passed
# as None) are dropped: `verbosity`, `n_jobs` and `random_state` below already
# carry those settings.
clf_xgb = XGBClassifier(
    max_depth=8,
    learning_rate=0.1,
    n_estimators=n_estimators,
    verbosity=0,
    objective="multi:softmax",
    booster='gbtree',
    n_jobs=-1,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.7,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    random_state=0,
)

AttributeError: /opt/conda/lib/libxgboost.so: undefined symbol: path

In [None]:
# Fit on the training split and evaluate on the validation split during
# training. The arrays are the ones defined earlier in this notebook
# (train_x / train_y / val_x / val_y); the former X_train / y_train /
# X_valid / y_valid names were never defined and raised NameError.
clf_xgb.fit(train_x, train_y,
            eval_set=[(val_x, val_y)],
            early_stopping_rounds=40,
            verbose=10)