In [None]:
!pip install dask-cuda

In [None]:
# !pip install xgboost==1.0.0


In [1]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [1]:
!nvidia-smi

Wed Jan 19 00:40:57 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0    30W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# !pip install dask[dataframe]

In [None]:
!pip install optuna

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, roc_curve

from sklearn.preprocessing import RobustScaler
from tqdm import tqdm

import optuna
import gc
xgb.__version__

'0.90'

In [3]:
train = pd.read_csv('/content/gdrive/MyDrive/song-popularity-prediction/train.csv')
test = pd.read_csv('/content/gdrive/MyDrive/song-popularity-prediction/test.csv')
columns = [col for col in train.columns.to_list() if col not in ['id','song_popularity']]
data=train[columns]
target=train['song_popularity']

In [4]:
def objective(trial):
  train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)
  params = {
        'objective': trial.suggest_categorical('objective',['multi:softprob']), 
        'num_class': trial.suggest_categorical('num_class',[2]), 
        'tree_method': trial.suggest_categorical('tree_method',['gpu_hist']),  # 'gpu_hist','hist'
        'lambda': trial.suggest_loguniform('lambda',1e-3,10.0),
        'alpha': trial.suggest_loguniform('alpha',1e-3,10.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3,1.0),
        'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001,0.1),
        'max_depth': trial.suggest_categorical('max_depth', [3,5,7,9,11,13,15,17,20]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1,300),
        'eval_metric': trial.suggest_categorical('eval_metric',['mlogloss']),
  }
  model = xgb.XGBClassifier(**params)
  model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=300,verbose=False)
  preds = model.predict(test_x)
  roc = accuracy_score(test_y, preds)
  return roc

In [5]:
study = optuna.create_study(direction='maximize',study_name='song-pop_trials_200')
study.optimize(objective, n_trials=200)

[32m[I 2022-01-19 00:41:15,052][0m A new study created in memory with name: song-pop_trials_200[0m
[32m[I 2022-01-19 00:41:16,721][0m Trial 0 finished with value: 0.639 and parameters: {'objective': 'multi:softprob', 'num_class': 2, 'tree_method': 'gpu_hist', 'lambda': 4.7576826089951565, 'alpha': 0.2682149956035418, 'colsample_bytree': 0.4444014996642621, 'subsample': 0.8649538168826012, 'learning_rate': 0.00319701542242079, 'max_depth': 11, 'min_child_weight': 278, 'eval_metric': 'mlogloss'}. Best is trial 0 with value: 0.639.[0m
[32m[I 2022-01-19 00:41:18,178][0m Trial 1 finished with value: 0.6388333333333334 and parameters: {'objective': 'multi:softprob', 'num_class': 2, 'tree_method': 'gpu_hist', 'lambda': 0.005383932209330874, 'alpha': 0.001982556210023471, 'colsample_bytree': 0.6420565156195042, 'subsample': 0.5915557024466769, 'learning_rate': 0.0105604458216122, 'max_depth': 9, 'min_child_weight': 138, 'eval_metric': 'mlogloss'}. Best is trial 0 with value: 0.639.[0m

In [6]:
study.best_value

0.6461666666666667

In [7]:
study.best_params

{'alpha': 0.002165284049749127,
 'colsample_bytree': 0.41961032521264147,
 'eval_metric': 'mlogloss',
 'lambda': 0.06732506334605969,
 'learning_rate': 0.0718988345727017,
 'max_depth': 17,
 'min_child_weight': 158,
 'num_class': 2,
 'objective': 'multi:softprob',
 'subsample': 0.7270128671619758,
 'tree_method': 'gpu_hist'}