In [1]:
# Uncomment to install LightAutoML into the kernel's environment:
# %pip install -U lightautoml

In [2]:
# Third-party imports: scikit-learn, XGBoost, LightGBM and LightAutoML
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, auc, roc_curve, roc_auc_score

from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier, plot_importance

from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from sklearn.metrics import log_loss

In [3]:
# Run configuration shared by the cells below.
TIMEOUT = 3600        # AutoML time budget in seconds (one hour)
TEST_SIZE = 0.2       # hold-out fraction intended for a metric check
RANDOM_STATE = 2021   # seed kept fixed for reproducibility
N_FOLDS = 5           # number of folds intended for AutoML
N_THREADS = 4         # thread count handed to AutoML (cpu_limit) and its reader (n_jobs)

In [4]:
import pandas as pd

# Load the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [5]:
# Preview the first five rows of the training table (head() defaults to n=5).
train.head()

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,target
0,0,0,0,1,0,1,0,0,0,0,...,0,0,21,0,0,0,0,0,0,Class_2
1,1,0,0,0,0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,2,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,13,2,0,Class_1
3,3,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,1,0,Class_4
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,Class_2


In [6]:
# Add `target1`: an integer copy of the string label ('Class_1'..'Class_4' -> 0..3).
# Values of `target` that are not one of these four keys become NaN under Series.map.
# NOTE(review): no later visible cell reads `target1`; the training cell excludes it.
train['target1'] = train['target'].map(
    dict(zip(('Class_1', 'Class_2', 'Class_3', 'Class_4'), range(4)))
)

In [7]:
from sklearn.preprocessing import LabelEncoder
# Replace the string labels in `target` with integer codes, overwriting the column.
# NOTE(review): because the column is overwritten, re-running this cell refits the
# encoder on the already-encoded integers, so `le` no longer holds the original names.
le = LabelEncoder()
le.fit(train['target'])
train['target'] = le.transform(train['target'])

In [8]:
%%time

# Configure the LightAutoML preset for a multiclass task, bounded by the
# TIMEOUT budget and N_THREADS (used for both cpu_limit and the reader's n_jobs).
automl = TabularUtilizedAutoML(
    task=Task('multiclass'),
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params={'n_jobs': N_THREADS},
)

Wall time: 19 ms


In [9]:
# Column roles for LightAutoML: `target` is the label and `id` is dropped
# so the row identifier is not used as a feature.
target_column = 'target'

roles = {
    'target': target_column,
    'drop': ['id'],
}

# Exclude the helper label copy `target1` by name rather than slicing off
# "whatever column happens to be last". The positional slice dropped `target`
# itself when `target1` had not been created, and would leak `target1` into the
# features if the columns were ever reordered. errors='ignore' keeps the cell
# runnable when `target1` is absent.
lightml_pred = automl.fit_predict(
    train.drop(columns=['target1'], errors='ignore'),
    roles = roles,
)
# Show the first ten prediction rows and the overall shape as a sanity check.
print('lightml_pred:\n{}\nShape = {}'.format(lightml_pred[:10], lightml_pred.shape))

Current random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
Found reader_params in kwargs, need to combine
Merged variant for reader_params = {'n_jobs': 4, 'random_state': 42}
Start automl preset with listed constraints:
- time: 3599.9990022182465 seconds
- cpus: 4 cores
- memory: 16 gb

Train data shape: (100000, 52)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 3586.889063835144 secs
Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Linear model: C = 1e-05 score = -1.1128897009730339
Linear model: C = 5e-05 score = -1.1089370454207064
Linear model: C = 0.0001 score = -1.108363511864096
Linear model: C = 0.0005 score = -1.108386828507483
Linear model: C = 0.001 score = -1.1084940301463009

===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Linear model: C = 1e-05 score = -1.1124130284383893
Lin

Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's multi_logloss: 1.09969
[200]	valid's multi_logloss: 1.09874
[300]	valid's multi_logloss: 1.10109
Early stopping, best iteration is:
[167]	valid's multi_logloss: 1.09846
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's multi_logloss: 1.09915
[200]	valid's multi_logloss: 1.09679
[300]	valid's multi_logloss: 1.09856
Early stopping, best iteration is:
[190]	valid's multi_logloss: 1.0967
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores do

In [10]:
%%time

# Score the test table with the fitted AutoML object, then print the first
# five prediction rows together with the overall shape.
test_pred = automl.predict(test)
print(
    'Prediction for test set:\n{}\nShape = {}'.format(
        test_pred[:5],
        test_pred.shape,
    )
)

Prediction for test set:
array([[0.09141267, 0.62699586, 0.1675651 , 0.1140264 ],
       [0.09418982, 0.6937283 , 0.12687196, 0.08520998],
       [0.08322127, 0.6442443 , 0.17260253, 0.09993196],
       [0.09169274, 0.538226  , 0.27060184, 0.09947935],
       [0.07784009, 0.6259129 , 0.1869208 , 0.10932626]], dtype=float32)
Shape = (50000, 4)
Wall time: 37.5 s


In [12]:
# Load the submission template whose prediction columns are filled in below.
sub = pd.read_csv(filepath_or_buffer='sample_submission.csv')

In [13]:
# Overwrite every column after the first with the test predictions.
# NOTE(review): assumes the template rows are in the same order as `test` and that
# its columns after the first match the prediction column order — TODO confirm.
sub.iloc[:, 1:] = test_pred.data
# Save the filled template without writing the DataFrame index.
sub.to_csv(path_or_buf='Submission1.csv', index=False)