In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Read the labelled Kepler light-curve tables: one CSV for training, one for testing.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/kepler-labelled-time-series-data/exoTrain.csv',
        '../input/kepler-labelled-time-series-data/exoTest.csv',
    )
)

In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [4]:
# Summarise the training frame (index, column dtypes, memory usage).
# `info` is a method: without the call parentheses the cell only echoes the
# bound-method repr instead of producing the summary.
train.info()

In [5]:
# Class-balance check: count how many training rows carry each LABEL value.
train.loc[:, 'LABEL'].value_counts()

In [6]:
# Separate features from the target for both the train and the test frame.
# Features are every column except LABEL; the target is kept as a one-column
# DataFrame (list selector) so it can still be indexed by column name later.
x_train = train.drop(columns='LABEL')
y_train = train.loc[:, ['LABEL']]
x_test = test.drop(columns='LABEL')
y_test = test.loc[:, ['LABEL']]

In [7]:
# Rebalance the training set by randomly duplicating minority-class rows until
# the minority/majority ratio reaches 0.8.
# - `sampling_strategy` is passed by keyword: recent imbalanced-learn releases
#   make the constructor arguments keyword-only, so a bare positional 0.8 fails.
# - `random_state` is fixed so the same rows are duplicated on every run and
#   the downstream scores are reproducible.
oversample = RandomOverSampler(sampling_strategy=0.8, random_state=42)
x_train_new, y_train_new = oversample.fit_resample(x_train, y_train)

In [8]:
# Sanity check on the resampled target: per-class row counts after oversampling.
y_train_new.loc[:, 'LABEL'].value_counts()

In [9]:
# Baseline model: an XGBoost classifier with library-default hyper-parameters,
# fitted on the oversampled training set.
# NOTE(review): recent XGBoost releases expect class labels encoded as
# 0..n_classes-1 — confirm the LABEL encoding before upgrading the library.
xgb_class = XGBClassifier()
xgb_class.fit(X=x_train_new, y=y_train_new)

In [10]:
# Score the baseline model on the untouched test set: the share of test rows
# whose predicted label equals the true LABEL.
# NOTE(review): the training labels needed rebalancing, so accuracy alone may
# flatter the model; the random search below scores with ROC AUC instead.
y_pred = xgb_class.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy is {accuracy}.')

In [11]:
# Randomised hyper-parameter search for the XGBoost classifier.

# Candidate values for each tuned hyper-parameter; the search samples
# combinations from this grid.
xgb_params = {
    "learning_rate": [0.05, 0.15, 0.4],
    "max_depth": [3, 5, 12],
    "min_child_weight": [1, 3, 6],
    "gamma": [0.1, 0.2, 0.4],
    "colsample_bytree": [0.3, 0.5, 0.7],
}

# Fresh, unfitted estimator that the search clones for every candidate.
xgb_class = XGBClassifier()

# Candidates are ranked by cross-validated ROC AUC; the number of sampled
# candidates and the CV scheme are left at scikit-learn's defaults.
# `random_state` pins which combinations are sampled, so `best_params_` is the
# same on every Restart & Run All instead of changing from run to run.
xgb_random_search = RandomizedSearchCV(
    xgb_class,
    param_distributions=xgb_params,
    scoring='roc_auc',
    random_state=42,
    n_jobs=-1,   # use every available core
    verbose=3,
)

# NOTE(review): x_train_new already contains duplicated minority rows from the
# oversampler, so copies of one row can land in both the fitting and the
# validation fold and inflate the CV score — consider resampling inside each
# fold (e.g. an imblearn Pipeline) instead.
xgb_random_search.fit(x_train_new, y_train_new)

In [12]:
# best-found params
# `best_params_` is read from the random search object fitted in the previous
# cell; it holds the winning value for each key of the searched grid.
params = xgb_random_search.best_params_
print(params)

In [14]:
# Retrain with the best parameters found by the random search.
# The values are unpacked straight from the fitted search object instead of
# being pasted in by hand, so this cell cannot drift out of sync with the
# search result when the search is re-run or its grid is edited.
tuned_xgb_classifier = XGBClassifier(**xgb_random_search.best_params_)
tuned_xgb_classifier.fit(x_train_new, y_train_new)

In [15]:
# Score the tuned model on the untouched test set, using the same accuracy
# measure as the baseline so the two numbers are directly comparable.
y_pred = tuned_xgb_classifier.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy is {accuracy}.')