# Tabular Playground Series - Nov 2022
  - Practice your ML skills on this approachable dataset!
  - https://www.kaggle.com/competitions/tabular-playground-series-nov-2022

# 0. 패키지

In [1]:
import os
import time
import requests

from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss

In [2]:
import torch

from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

  from .autonotebook import tqdm as notebook_tqdm


## 0.1 파라미터

In [3]:
# Run-wide settings, gathered in one cell so they are easy to tune.
# N_FOLDS, TEST_SIZE and `device` are defined here but are not referenced
# by any of the cells visible in this notebook.
N_THREADS = 2                # passed to torch.set_num_threads and to TabularAutoML
N_FOLDS = 5
RANDOM_STATE = 22            # seed for the random number generators
TEST_SIZE = 0.2
TIMEOUT = 10 * 60 * 60       # AutoML time budget in seconds (36000)
TARGET_NAME = 'label'        # name of the target column

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
# Seed every random number generator this notebook touches so that repeated
# runs are comparable. Previously only numpy was seeded; torch was not.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# Limit the number of CPU threads torch may use.
torch.set_num_threads(N_THREADS)

# 1. 데이터셋

In [5]:
# Root folder of the competition files, and the sub-folder whose files are
# later read one by one as CSVs with a 'pred' column.
INPUT_DIR = 'datasets/'
SUBS_DIR = f'{INPUT_DIR}submission_files/'

In [6]:
# Read the training labels and report the frame's (rows, columns).
labels = pd.read_csv(f'{INPUT_DIR}train_labels.csv')
print(labels.shape)

(20000, 2)


In [7]:
# Read the sample submission (used later as the template for the output file)
# and report its (rows, columns).
submission = pd.read_csv(f'{INPUT_DIR}sample_submission.csv')
print(submission.shape)

(20000, 2)


In [8]:
# Assemble one wide frame: an `id` column, one column per file found in
# SUBS_DIR (named sub_<i> by position in the sorted directory listing), and
# the target column.

# Total row count = labelled rows + rows of the sample submission, instead of
# a hard-coded 40000.
n_rows = len(labels) + len(submission)

frames = [pd.DataFrame(list(range(n_rows)), columns=['id'])]
for i, fname in enumerate(tqdm(sorted(os.listdir(SUBS_DIR)))):
    file_preds = pd.read_csv(SUBS_DIR + fname)['pred'].values
    # Keep every prediction strictly inside (0, 1).
    frames.append(pd.Series(np.clip(file_preds, 1e-6, 1 - 1e-6), name=f'sub_{i}'))

# Labels cover only the first len(labels) rows. The column-wise concat aligns
# on the index, so the remaining rows get NaN in this column and the int32
# cast does not survive (the label values display as 0.0 / 1.0).
frames.append(labels[TARGET_NAME].astype(np.int32))

full_data = pd.concat(frames, axis=1)

100%|██████████| 5000/5000 [01:25<00:00, 58.14it/s]


In [9]:
# Rows that have a label form the training set; all remaining rows are the
# test set.
n_train = labels.shape[0]

# .copy() gives train_data its own storage. Without it train_data is a slice
# of full_data, and adding columns to it later emits SettingWithCopyWarning.
train_data = full_data.iloc[:n_train, :].copy()

# The test rows carry no target, so that column is removed; the index is
# restarted from 0.
test_data = (
    full_data.iloc[n_train:, :]
    .reset_index(drop=True)
    .drop(columns=[TARGET_NAME])
)

print(train_data.shape, test_data.shape)

(20000, 5002) (20000, 5001)


In [9]:
# Preview the first five training rows.
train_data.head(n=5)

Unnamed: 0,id,sub_0,sub_1,sub_2,sub_3,sub_4,sub_5,sub_6,sub_7,sub_8,...,sub_4991,sub_4992,sub_4993,sub_4994,sub_4995,sub_4996,sub_4997,sub_4998,sub_4999,label
0,0,0.709336,0.799007,0.851891,0.537158,0.62393,0.70597,0.503437,0.633185,0.64155,...,0.75025,0.66337,0.739333,0.822384,0.749498,0.7298,0.867847,0.745888,0.787,0.0
1,1,0.452988,0.364453,0.567582,0.354468,0.513818,0.584119,0.454809,0.238501,0.472171,...,0.794052,0.721298,0.804369,0.620626,0.733606,0.816942,0.814229,0.598331,0.547,1.0
2,2,0.675462,0.84226,0.800013,0.525229,0.692071,0.715418,0.651008,0.609124,0.691198,...,0.779859,0.865657,0.828493,0.76301,0.802883,0.806891,0.896058,0.855776,0.667,1.0
3,3,0.481046,0.577118,0.683032,0.541356,0.630088,0.664514,0.413373,0.50821,0.52614,...,0.799698,0.80013,0.716604,0.603779,0.708499,0.844837,0.853057,0.850657,0.622,1.0
4,4,0.957339,0.910337,0.917322,0.874487,0.787595,0.854273,0.843846,0.876749,0.821128,...,0.90015,0.960911,0.906037,0.96124,0.935608,0.889757,0.978505,0.953681,0.934,0.0


In [10]:
# Preview the first five test rows.
test_data.head(n=5)

Unnamed: 0,id,sub_0,sub_1,sub_2,sub_3,sub_4,sub_5,sub_6,sub_7,sub_8,...,sub_4990,sub_4991,sub_4992,sub_4993,sub_4994,sub_4995,sub_4996,sub_4997,sub_4998,sub_4999
0,20000,0.640707,0.611468,0.795757,0.684337,0.700504,0.720927,0.483421,0.635256,0.744267,...,0.890052,0.801783,0.838627,0.80023,0.801438,0.86526,0.779422,0.924805,0.846385,0.839
1,20001,0.636904,0.723449,0.842078,0.678966,0.67866,0.709361,0.861326,0.860177,0.593145,...,0.72253,0.785676,0.713539,0.743692,0.832114,0.83346,0.801346,0.81349,0.802177,0.507
2,20002,0.392496,0.330492,0.542698,0.391343,0.498312,0.560324,0.33706,0.293192,0.456379,...,0.591669,0.599184,0.569485,0.512785,0.679277,0.503172,0.55577,0.56436,0.583173,0.555
3,20003,0.588658,0.5588,0.591969,0.289922,0.523674,0.630457,0.551588,0.387767,0.535254,...,0.767875,0.70179,0.691961,0.62583,0.709976,0.628978,0.626589,0.676782,0.677209,0.513
4,20004,0.783603,0.416023,0.04305,0.244914,0.614647,0.600091,0.348334,0.203046,0.542674,...,0.869603,0.821132,0.742231,0.891385,0.716356,0.804243,0.882422,0.790921,0.699345,0.803


In [10]:
def _with_row_stats(frame):
    """Return a copy of ``frame`` with per-row statistics of the sub_* columns.

    Two columns are appended: ``mean_pred`` (row-wise mean) and ``std_pred``
    (row-wise standard deviation). Only columns whose name starts with
    ``sub_`` feed the statistics, so ``id``, the target and any previously
    added ``mean_pred`` / ``std_pred`` are ignored. This makes the cell safe
    to re-run.
    """
    sub_cols = [col for col in frame.columns if str(col).startswith('sub_')]
    sub_values = frame[sub_cols]
    # assign() builds a new frame instead of writing into a possible slice,
    # which avoids SettingWithCopyWarning.
    return frame.assign(
        mean_pred=sub_values.mean(axis=1),
        std_pred=sub_values.std(axis=1),
    )


train_data = _with_row_stats(train_data)
test_data = _with_row_stats(test_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['mean_pred'] = tmp.mean(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['std_pred'] = tmp.std(1)


In [11]:
# Fixed subset of submission columns (299 names) kept as model inputs.
# How this subset was chosen is not shown in this notebook.
useful = ['sub_100', 'sub_1083', 'sub_1108', 'sub_114', 'sub_1165', 'sub_1174', 'sub_121', 'sub_1217', 'sub_122', 'sub_1242', 'sub_131', 'sub_132', 'sub_1359', 'sub_136', 'sub_1367', 
          'sub_1394', 'sub_1396', 'sub_1414', 'sub_1419', 'sub_144', 'sub_1447', 'sub_145', 'sub_1473', 'sub_1477', 'sub_1480', 'sub_151', 'sub_1511', 'sub_152', 'sub_1524', 'sub_1526', 'sub_1556', 
          'sub_158', 'sub_1584', 'sub_1598', 'sub_1609', 'sub_1614', 'sub_1694', 'sub_1702', 'sub_1719', 'sub_1722', 'sub_1735', 'sub_175', 'sub_1753', 'sub_1759', 'sub_1785', 'sub_180', 'sub_1803', 
          'sub_181', 'sub_1811', 'sub_1813', 'sub_1850', 'sub_1861', 'sub_1874', 'sub_189', 'sub_1891', 'sub_190', 'sub_1905', 'sub_1945', 'sub_1963', 'sub_197', 'sub_1974', 'sub_1975', 'sub_1984', 
          'sub_1987', 'sub_2', 'sub_2020', 'sub_2044', 'sub_2055', 'sub_2124', 'sub_2140', 'sub_2148', 'sub_2163', 'sub_2176', 'sub_226', 'sub_2279', 'sub_2292', 'sub_230', 'sub_2316', 'sub_2327', 
          'sub_2334', 'sub_2338', 'sub_2349', 'sub_2355', 'sub_2398', 'sub_2411', 'sub_2444', 'sub_2452', 'sub_2454', 'sub_2464', 'sub_2473', 'sub_2474', 'sub_2492', 'sub_25', 'sub_2500', 'sub_2511', 
          'sub_2521', 'sub_2554', 'sub_2566', 'sub_2572', 'sub_270', 'sub_2717', 'sub_2720', 'sub_2723', 'sub_2755', 'sub_277', 'sub_2771', 'sub_2785', 'sub_2791', 'sub_2816', 'sub_2821', 'sub_2847', 
          'sub_2876', 'sub_2909', 'sub_297', 'sub_2999', 'sub_301', 'sub_3025', 'sub_305', 'sub_3053', 'sub_3056', 'sub_3061', 'sub_3093', 'sub_3099', 'sub_3101', 'sub_3113', 'sub_3165', 'sub_3173', 
          'sub_3195', 'sub_3198', 'sub_3207', 'sub_3211', 'sub_323', 'sub_3254', 'sub_3292', 'sub_3306', 'sub_3308', 'sub_332', 'sub_3349', 'sub_3389', 'sub_3403', 'sub_3434', 'sub_344', 'sub_3444', 
          'sub_3455', 'sub_3485', 'sub_3491', 'sub_3494', 'sub_3501', 'sub_353', 'sub_3532', 'sub_3596', 'sub_3628', 'sub_364', 'sub_3643', 'sub_3655', 'sub_3661', 'sub_3757', 'sub_3772', 'sub_3790', 
          'sub_38', 'sub_381', 'sub_3841', 'sub_3852', 'sub_3853', 'sub_3885', 'sub_3902', 'sub_3920', 'sub_3927', 'sub_3933', 'sub_3945', 'sub_395', 'sub_3956', 'sub_396', 'sub_3960', 'sub_3962', 
          'sub_3964', 'sub_398', 'sub_3987', 'sub_4009', 'sub_4023', 'sub_4045', 'sub_4054', 'sub_4061', 'sub_408', 'sub_4093', 'sub_4135', 'sub_415', 'sub_4159', 'sub_4173', 'sub_4176', 'sub_4182', 
          'sub_4226', 'sub_4231', 'sub_4269', 'sub_4299', 'sub_43', 'sub_434', 'sub_4344', 'sub_4362', 'sub_4364', 'sub_4382', 'sub_4384', 'sub_4387', 'sub_4389', 'sub_44', 'sub_440', 'sub_4401', 
          'sub_4406', 'sub_4407', 'sub_4408', 'sub_4409', 'sub_4439', 'sub_4441', 'sub_4461', 'sub_4464', 'sub_4480', 'sub_4499', 'sub_451', 'sub_4524', 'sub_453', 'sub_4565', 'sub_4576', 'sub_4585', 
          'sub_4616', 'sub_4618', 'sub_4636', 'sub_4649', 'sub_4653', 'sub_4659', 'sub_4679', 'sub_468', 'sub_4686', 'sub_4687', 'sub_47', 'sub_4701', 'sub_4708', 'sub_4734', 'sub_4740', 'sub_4755', 
          'sub_4766', 'sub_478', 'sub_4782', 'sub_4811', 'sub_4825', 'sub_4826', 'sub_4835', 'sub_4841', 'sub_4851', 'sub_4890', 'sub_4892', 'sub_49', 'sub_4906', 'sub_4929', 'sub_4933', 'sub_4936', 
          'sub_4966', 'sub_4974', 'sub_4975', 'sub_4984', 'sub_4991', 'sub_4992', 'sub_4999', 'sub_501', 'sub_510', 'sub_52', 'sub_522', 'sub_526', 'sub_56', 'sub_561', 'sub_562', 'sub_6', 'sub_62', 
          'sub_628', 'sub_63', 'sub_638', 'sub_65', 'sub_655', 'sub_66', 'sub_664', 'sub_666', 'sub_67', 'sub_676', 'sub_685', 'sub_69', 'sub_70', 'sub_717', 'sub_72', 'sub_73', 'sub_741', 'sub_788', 
          'sub_80', 'sub_807', 'sub_813', 'sub_871', 'sub_903', 'sub_91', 'sub_959', 'sub_964', 'sub_986']

# Narrow both frames to the columns named in `useful`; only the training
# frame also keeps the target. Every other column is dropped at this point,
# including 'id', 'mean_pred' and 'std_pred', since none of them is in `useful`.
train_data = train_data.loc[:, [*useful, TARGET_NAME]]
test_data = test_data.loc[:, useful]

# Show the resulting shapes as the cell output.
(train_data.shape, test_data.shape)

((20000, 300), (20000, 299))

# 2. AutoML

## 2.1 Setup

### 2.1.1 Task

In [12]:
# LightAutoML task definition: a 'binary' task with 'logloss' as the metric.
task = Task('binary', metric='logloss')

### 2.1.2 Feature Role

In [13]:
# Column roles handed to fit_predict: only the target column is named.
roles = dict(target=TARGET_NAME)

### 2.1.3 Model

In [14]:
# Algorithm keys that are passed to TabularAutoML through
# selection_params['select_algos'].
models = [
    'linear_l2',
    'gbm',
    'lgb',
    'dense',
    'mlp',
]

In [15]:
# Configure the AutoML preset with the time budget and CPU limit set in the
# parameters cell.
#
# NOTE(review): `models` is passed as the feature selector's `select_algos`.
# The fit log only reports Selector_LightGBM and the final blend lists only
# LightGBM / CatBoost models, so 'dense' and 'mlp' do not appear to be trained.
# If the intent was to choose which models get trained, that is presumably
# configured through a different option (general_params / use_algos) - confirm
# against the LightAutoML documentation.
automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    selection_params=dict(mode=2, feature_group_size=50, select_algos=models),
    reader_params=dict(n_jobs=N_THREADS),
)

In [16]:
# Fit the AutoML pipeline on the training frame. Judging by the variable name
# the return value holds out-of-fold predictions - TODO confirm; it is not
# used again in the visible cells.
oof_pred = automl.fit_predict(train_data, roles=roles, verbose=1)

[15:34:12] Stdout logging level is INFO.
[15:34:12] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[15:34:12] Task: binary

[15:34:12] Start automl preset with listed constraints:
[15:34:13] - time: 36000.00 seconds
[15:34:13] - CPU: 2 cores
[15:34:13] - memory: 16 GB

[15:34:13] [1mTrain data shape: (20000, 300)[0m

[15:34:26] Layer [1m1[0m train process start. Time left 35986.65 secs
[15:34:50] [1mSelector_LightGBM[0m fitting and predicting completed
[15:35:15] [1mSelector_LightGBM[0m fitting and predicting completed
[15:35:31] [1mSelector_LightGBM[0m fitting and predicting completed
[15:35:39] [1mSelector_LightGBM[0m fitting and predicting completed
[15:35:50] [1mSelector_LightGBM[0m fitting and predicting completed
[15:36:01] [1mSelector_LightGBM[0m fitting and predicting completed
[15:36:13] [1mSelector_LightGBM[0m fitting and predicting completed
[15:36:25] [1mSelector_LightGBM[0m fitting and predicting compl

In [17]:
# Print the textual description of the fitted model (the weighted blend).
model_description = automl.create_model_str_desc()
print(model_description)

Final prediction for new objects (level 0) = 
	 0.47057 * (5 averaged models Lvl_0_Pipe_1_Mod_0_LightGBM) +
	 0.30124 * (5 averaged models Lvl_0_Pipe_1_Mod_1_Tuned_LightGBM) +
	 0.14078 * (5 averaged models Lvl_0_Pipe_1_Mod_2_CatBoost) +
	 0.08742 * (5 averaged models Lvl_0_Pipe_1_Mod_3_Tuned_CatBoost) 


In [18]:
# Score the test rows with the fitted pipeline.
preds = automl.predict(test_data)

# Show the prediction shape as the cell output: one column, one row per test row.
preds.shape

(20000, 1)

In [20]:
# The prediction array has a single column; use it as the 'pred' values of
# the submission frame.
test_pred_values = preds.data[:, 0]
submission['pred'] = test_pred_values

In [23]:
# Write the submission with 'id' as the index column, so the CSV contains no
# separate row-number column.
submission_by_id = submission.set_index('id')
submission_by_id.to_csv('submission_automl.csv')