In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from catboost import Pool
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold, StratifiedKFold
import gensim
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv("/kaggle/input/ykc-2nd/train.csv")
test = pd.read_csv("/kaggle/input/ykc-2nd/test.csv")
sub = pd.read_csv("/kaggle/input/ykc-2nd/sample_submission.csv")

In [3]:
df = pd.concat([train, test])
df = df.reset_index(drop=True)

In [4]:
target = "department_id" 
n_split = 5
features = ["product_name", "order_rate", "order_dow_mode", "order_hour_of_day_mode"]
text_features = ["product_name"]

In [5]:
train = df[~df[target].isna()]
test = df[df[target].isna()]

In [6]:
X = train.drop(target, axis=1)
y = train[target]

In [7]:
from sklearn.model_selection import train_test_split
from catboost import Pool
import sklearn.metrics
def objective(trial):
    # トレーニングデータとテストデータを分割
    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2)
    train_pool = Pool(train_x, train_y, text_features=text_features)
    test_pool = Pool(test_x, test_y, text_features=text_features)

    # パラメータの指定
    params = {
        'iterations' : trial.suggest_int('iterations', 50, 300),                         
        'depth' : trial.suggest_int('depth', 4, 10),                                       
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.3),               
        'random_strength' :trial.suggest_int('random_strength', 0, 100),                       
        'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00), 
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait' :trial.suggest_int('od_wait', 10, 50),
        "verbose": False
    }
    # 学習
    model = CatBoostClassifier(**params)
    model.fit(train_pool)

        ## predict on valid
    pred_val = model.predict_proba(test_x)

    ## evaluate
    score = f1_score(test_y, np.argmax(pred_val, axis = 1), average = "micro")
    print(f'F1={score}')
    return 1.0 - score

In [8]:
import optuna
study = optuna.create_study()
study.optimize(objective, n_trials=20)
print("Best trial:")
print(study.best_trial)

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

F1=0.7556852485409539


[I 2020-07-10 00:18:12,134] Finished trial#0 with value: 0.24431475145904613 with parameters: {'iterations': 135, 'depth': 5, 'learning_rate': 0.20059424301783854, 'random_strength': 88, 'bagging_temperature': 5.577381570223931, 'od_type': 'Iter', 'od_wait': 21}. Best is trial#0 with value: 0.24431475145904613.


F1=0.767558864962769


[I 2020-07-10 00:23:37,856] Finished trial#1 with value: 0.23244113503723096 with parameters: {'iterations': 118, 'depth': 6, 'learning_rate': 0.2674695924319836, 'random_strength': 1, 'bagging_temperature': 0.2143866582908958, 'od_type': 'IncToDec', 'od_wait': 29}. Best is trial#1 with value: 0.23244113503723096.


F1=0.4775608774401288


[I 2020-07-10 00:42:14,725] Finished trial#2 with value: 0.5224391225598712 with parameters: {'iterations': 80, 'depth': 9, 'learning_rate': 0.026054629337761922, 'random_strength': 29, 'bagging_temperature': 0.18666630039822943, 'od_type': 'IncToDec', 'od_wait': 32}. Best is trial#1 with value: 0.23244113503723096.


F1=0.7572952304286577


[I 2020-07-10 00:47:30,532] Finished trial#3 with value: 0.24270476957134235 with parameters: {'iterations': 169, 'depth': 5, 'learning_rate': 0.07991346645472824, 'random_strength': 23, 'bagging_temperature': 2.469346056388567, 'od_type': 'IncToDec', 'od_wait': 49}. Best is trial#1 with value: 0.23244113503723096.


F1=0.7667538740189173


[I 2020-07-10 00:50:28,495] Finished trial#4 with value: 0.23324612598108274 with parameters: {'iterations': 264, 'depth': 4, 'learning_rate': 0.1692988757406398, 'random_strength': 22, 'bagging_temperature': 0.01716839970032632, 'od_type': 'Iter', 'od_wait': 11}. Best is trial#1 with value: 0.23244113503723096.


F1=0.7468303481585832


[I 2020-07-10 00:57:26,543] Finished trial#5 with value: 0.2531696518414168 with parameters: {'iterations': 151, 'depth': 6, 'learning_rate': 0.11475914254228946, 'random_strength': 15, 'bagging_temperature': 21.16281682038129, 'od_type': 'Iter', 'od_wait': 31}. Best is trial#1 with value: 0.23244113503723096.


F1=0.5403501710605756


[I 2020-07-10 01:42:10,004] Finished trial#6 with value: 0.4596498289394244 with parameters: {'iterations': 191, 'depth': 9, 'learning_rate': 0.03042039964141004, 'random_strength': 83, 'bagging_temperature': 0.09036267691872883, 'od_type': 'Iter', 'od_wait': 14}. Best is trial#1 with value: 0.23244113503723096.


F1=0.7574964781646206


[I 2020-07-10 02:31:22,023] Finished trial#7 with value: 0.24250352183537938 with parameters: {'iterations': 210, 'depth': 9, 'learning_rate': 0.07730303082777534, 'random_strength': 90, 'bagging_temperature': 0.2634718300332031, 'od_type': 'IncToDec', 'od_wait': 21}. Best is trial#1 with value: 0.23244113503723096.


F1=0.7369692090963975


[I 2020-07-10 02:45:46,909] Finished trial#8 with value: 0.2630307909036025 with parameters: {'iterations': 61, 'depth': 9, 'learning_rate': 0.08812899156233642, 'random_strength': 94, 'bagging_temperature': 5.768706414586887, 'od_type': 'Iter', 'od_wait': 11}. Best is trial#1 with value: 0.23244113503723096.


F1=0.7456228617428053


[I 2020-07-10 04:23:22,646] Finished trial#9 with value: 0.25437713825719466 with parameters: {'iterations': 223, 'depth': 10, 'learning_rate': 0.014387871311614767, 'random_strength': 14, 'bagging_temperature': 4.19439974622849, 'od_type': 'IncToDec', 'od_wait': 17}. Best is trial#1 with value: 0.23244113503723096.


F1=0.7653451398671766


[I 2020-07-10 04:31:31,942] Finished trial#10 with value: 0.23465486013282344 with parameters: {'iterations': 108, 'depth': 7, 'learning_rate': 0.2954613748624662, 'random_strength': 55, 'bagging_temperature': 0.012787379965864503, 'od_type': 'IncToDec', 'od_wait': 41}. Best is trial#1 with value: 0.23244113503723096.


F1=0.7673576172268062


[I 2020-07-10 04:34:54,734] Finished trial#11 with value: 0.23264238277319382 with parameters: {'iterations': 295, 'depth': 4, 'learning_rate': 0.2774446385846101, 'random_strength': 0, 'bagging_temperature': 0.011831297165004155, 'od_type': 'Iter', 'od_wait': 39}. Best is trial#1 with value: 0.23244113503723096.


F1=0.7713825719460656


[I 2020-07-10 04:57:29,984] Finished trial#12 with value: 0.22861742805393437 with parameters: {'iterations': 299, 'depth': 7, 'learning_rate': 0.2837328369248532, 'random_strength': 0, 'bagging_temperature': 0.06067029031572031, 'od_type': 'IncToDec', 'od_wait': 39}. Best is trial#12 with value: 0.22861742805393437.


F1=0.7587039645803985


[I 2020-07-10 05:06:13,173] Finished trial#13 with value: 0.24129603541960154 with parameters: {'iterations': 114, 'depth': 7, 'learning_rate': 0.2965031066650733, 'random_strength': 3, 'bagging_temperature': 0.06639664578514376, 'od_type': 'IncToDec', 'od_wait': 39}. Best is trial#12 with value: 0.22861742805393437.


F1=0.7643389011873617


[I 2020-07-10 05:18:11,342] Finished trial#14 with value: 0.2356610988126383 with parameters: {'iterations': 253, 'depth': 6, 'learning_rate': 0.1590229784984046, 'random_strength': 44, 'bagging_temperature': 0.6660517214192673, 'od_type': 'IncToDec', 'od_wait': 49}. Best is trial#12 with value: 0.22861742805393437.


F1=0.7554840008049908


[I 2020-07-10 05:29:48,457] Finished trial#15 with value: 0.24451599919500921 with parameters: {'iterations': 90, 'depth': 8, 'learning_rate': 0.048800490582603905, 'random_strength': 2, 'bagging_temperature': 0.037649161620677295, 'od_type': 'IncToDec', 'od_wait': 25}. Best is trial#12 with value: 0.22861742805393437.


F1=0.7444153753270276


[I 2020-07-10 05:32:31,412] Finished trial#16 with value: 0.2555846246729724 with parameters: {'iterations': 57, 'depth': 6, 'learning_rate': 0.23981067119555272, 'random_strength': 40, 'bagging_temperature': 0.6016176045792309, 'od_type': 'IncToDec', 'od_wait': 34}. Best is trial#12 with value: 0.22861742805393437.


F1=0.7737975447776213


[I 2020-07-10 06:11:05,694] Finished trial#17 with value: 0.2262024552223787 with parameters: {'iterations': 296, 'depth': 8, 'learning_rate': 0.12981336602827964, 'random_strength': 64, 'bagging_temperature': 0.19418721946609516, 'od_type': 'IncToDec', 'od_wait': 44}. Best is trial#17 with value: 0.2262024552223787.


F1=0.7727913060978064


[I 2020-07-10 06:50:20,185] Finished trial#18 with value: 0.22720869390219356 with parameters: {'iterations': 299, 'depth': 8, 'learning_rate': 0.13507569370285255, 'random_strength': 68, 'bagging_temperature': 0.03489233315457894, 'od_type': 'IncToDec', 'od_wait': 43}. Best is trial#17 with value: 0.2262024552223787.


F1=0.13624471724693096


[I 2020-07-10 06:51:39,675] Finished trial#19 with value: 0.863755282753069 with parameters: {'iterations': 272, 'depth': 8, 'learning_rate': 0.12379085320090004, 'random_strength': 69, 'bagging_temperature': 88.17761525804818, 'od_type': 'IncToDec', 'od_wait': 45}. Best is trial#17 with value: 0.2262024552223787.


Best trial:
FrozenTrial(number=17, value=0.2262024552223787, datetime_start=datetime.datetime(2020, 7, 10, 5, 32, 31, 416088), datetime_complete=datetime.datetime(2020, 7, 10, 6, 11, 5, 694436), params={'iterations': 296, 'depth': 8, 'learning_rate': 0.12981336602827964, 'random_strength': 64, 'bagging_temperature': 0.19418721946609516, 'od_type': 'IncToDec', 'od_wait': 44}, distributions={'iterations': IntUniformDistribution(high=300, low=50, step=1), 'depth': IntUniformDistribution(high=10, low=4, step=1), 'learning_rate': LogUniformDistribution(high=0.3, low=0.01), 'random_strength': IntUniformDistribution(high=100, low=0, step=1), 'bagging_temperature': LogUniformDistribution(high=100.0, low=0.01), 'od_type': CategoricalDistribution(choices=('IncToDec', 'Iter')), 'od_wait': IntUniformDistribution(high=50, low=10, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=17, state=TrialState.COMPLETE)


NameError: name 'trial' is not defined