In [None]:
import numpy as np
import pandas as pd

from data import preprocess_data, postprocessing
from func import deviation_metric, get_timestamp

from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML
from lightautoml.tasks import Task

# Raw train/test splits, read from the data/ directory relative to the working directory.
train = pd.read_csv(filepath_or_buffer='data/train.csv')
test = pd.read_csv(filepath_or_buffer='data/test.csv')

In [None]:
# Preprocessing options, forwarded as keyword arguments to the project-local preprocess_data helper.
kwargs = {
    'cluster': 0.025,
    'clean_floor_num': True,
    'clean_region_city': True,
    'remove_type_0': True,
}
train_pre, test_pre, num_columns, cat_columns, target = preprocess_data(train, test, **kwargs)

# Numeric features, then categorical features, then the target name;
# later cells use this list to select the model inputs.
columns = num_columns + cat_columns + [target]

# Preview only the first rows rather than rendering the whole test frame.
# NOTE(review): `columns` includes the target, so this selection assumes test_pre
# carries that column — confirm against preprocess_data.
test_pre[columns].head()

In [None]:
# AutoML run configuration.
N_THREADS = 12       # number of threads
N_FOLDS = 5          # number of folds used by AutoML
RANDOM_STATE = 55    # fixed seed
TIMEOUT = 30 * 60    # AutoML time budget in seconds (30 minutes)

In [None]:
# Regression task using the 'rmsle' loss, evaluated with the project-local deviation_metric.
# NOTE(review): deviation_metric is a custom callable; confirm whether lower is better
# and, if so, whether Task needs an explicit greater_is_better=False.
task = Task('reg', loss='rmsle', metric=deviation_metric)

# Column roles passed to AutoML: the target name plus the categorical and
# numeric feature lists returned by preprocess_data.
roles = {
    'target': target,
    'category': cat_columns,
    'numeric': num_columns,
}

In [None]:
# Configure the AutoML preset with the time budget, CPU limit and reader settings
# (parallel jobs, fold count, seed) taken from the configuration constants.
automl = TabularUtilizedAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params=dict(
        n_jobs=N_THREADS,
        cv=N_FOLDS,
        random_state=RANDOM_STATE,
    ),
)

# Fit on the preprocessed training columns and keep the predictions returned by
# fit_predict (presumably out-of-fold, going by the variable name — confirm).
oof_pred = automl.fit_predict(train_pre[columns], roles=roles)

In [None]:
# Build the submission frame: test ids alongside the first column of the
# prediction array returned by automl.predict.
output = pd.DataFrame(
    {
        'id': test_pre['id'],
        'per_square_meter_price': automl.predict(test_pre[columns]).data[:, 0],
    }
)

# `postprocessing` is the helper imported from the project-local `data` module;
# no `postprocess` name is defined in this notebook.
output = postprocessing(output, target)

# Write the submission to a timestamped CSV in the working directory, without the index column.
output.to_csv(f'lam-{get_timestamp()}.csv', index=False)