In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

In [2]:
np.random.seed(42)

In [3]:
test_df = pd.read_table('data/test.tsv')
train_df = pd.read_table('data/train.tsv')

In [4]:
binary_valiable = {
    'id',
    'bot',
    'default_profile',
    'default_profile_image',
    'geo_enabled',
}

In [5]:
scale_stats = {
    col: {
        'mean': train_df[col].mean(),
        'std': train_df[col].std()
    }
    for col in train_df.columns
    if col not in binary_valiable
} 

In [6]:
std_train_df = train_df.copy()
for col, stats in scale_stats.items():
    std_train_df[col] = (std_train_df[col] - stats['mean']) / stats['std']

In [7]:
train_y = std_train_df.bot.values
train_X = std_train_df[
    [
        'statuses_count',
        'default_profile',
        'default_profile_image',
        'friends_count',
        'followers_count',
        'favourites_count',
        'geo_enabled',
        'listed_count',
        'account_age_hours',
        'diversity',
        'mean_mins_between_tweets',
        'mean_tweet_length',
        'mean_retweets',
        'reply_rate'
    ]
].values

In [8]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

In [9]:
warnings.filterwarnings('ignore')
from skopt import gp_minimize
from skopt.space import Integer
from skopt.space import Real
from skopt.space import Space

In [10]:
clf = LGBMClassifier(n_jobs=-1)
skf = StratifiedKFold(n_splits=5, shuffle=True)
def objective(params):
    n_estimators, learning_rate, num_leaves, \
    colsample_bytree, subsample, max_depth, reg_alpha, \
    reg_lambda, min_split_gain, min_child_weight = params
    
    clf.set_params(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        num_leaves=num_leaves,
        colsample_bytree=colsample_bytree,
        subsample=subsample,
        max_depth=max_depth,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        min_split_gain=min_split_gain,
        min_child_weight=min_child_weight,
    )
    return -np.mean(
        cross_val_score(
            estimator=clf,
            X=train_X,
            y=train_y,
            cv=skf,
            scoring='f1'
        ))

In [11]:
space = Space(
    [
        Integer(low=100, high=5000),
        Real(low=10**-5, high=10**5, prior='log-uniform'),
        Integer(low=10, high=500),
        Real(low=10**-5, high=10**0, prior='log-uniform'),
        Real(low=10**-5, high=10**0, prior='log-uniform'),
        Integer(low=3, high=50),
        Real(low=10**-5, high=10**1, prior='log-uniform'),
        Real(low=10**-5, high=10**1, prior='log-uniform'),
        Real(low=10**-5, high=10**1, prior='log-uniform'),
        Real(low=10**-5, high=10**3, prior='log-uniform'),
    ]
)

x0 = [
    10000, 0.02, 34, 0.94, 0.87, 8,
    0.0412, 0.073, 0.02, 39.0
]

In [12]:
warnings.filterwarnings('ignore')
local_opt_params = gp_minimize(
    func=objective, dimensions=space, n_calls=100, n_jobs=-1)
print(local_opt_params.x)

[5000, 0.027804822136363046, 500, 1.0, 1e-05, 3, 0.000134362299164214, 1.4464428056955005, 1e-05, 0.36777894364189095]


In [13]:
local_opt_params.fun

-0.7428035533026927

In [16]:
local_opt_params.fun

-0.9436995802038288

In [10]:
clf = LGBMClassifier(
    n_estimators=10000,
    learning_rate=0.02,
    num_leaves=34,
    colsample_bytree=0.9497036,
    subsample=0.8715623,
    max_depth=8,
    reg_alpha=0.041545473,
    reg_lambda=0.0735294,
    min_split_gain=0.0222415,
    min_child_weight=39.3259775,
    silent=-1,
    verbose=-1,
    random_state=1234,
)

In [14]:
clf.set_params(
    n_estimators=5000,
    learning_rate=0.027804822136363046,
    num_leaves=500,
    colsample_bytree=1.0,
    subsample=1e-05,
    max_depth=3,
    reg_alpha=0.000134362299164214,
    reg_lambda=1.4464428056955005,
    min_split_gain=1e-05,
    min_child_weight=0.36777894364189095,
)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.027804822136363046, max_depth=3,
        min_child_samples=20, min_child_weight=0.36777894364189095,
        min_split_gain=1e-05, n_estimators=5000, n_jobs=-1, num_leaves=500,
        objective=None, random_state=None, reg_alpha=0.000134362299164214,
        reg_lambda=1.4464428056955005, silent=True, subsample=1e-05,
        subsample_for_bin=200000, subsample_freq=0)

In [15]:
cv_metrics = {'p': 'precision', 'r': 'recall', 'f': 'f1'}
cross_validate(X=train_X, y=train_y, estimator=clf, scoring=cv_metrics, cv=skf)

{'fit_time': array([2.23458743, 1.7927947 , 1.91203189, 1.94395232, 1.84266639]),
 'score_time': array([0.10409212, 0.11355686, 0.10776258, 0.12798905, 0.10982537]),
 'test_p': array([0.85      , 0.725     , 0.8       , 0.86666667, 0.825     ]),
 'train_p': array([0.99487179, 1.        , 0.99487179, 1.        , 0.99489796]),
 'test_r': array([0.69387755, 0.59183673, 0.57142857, 0.54166667, 0.6875    ]),
 'train_r': array([1., 1., 1., 1., 1.]),
 'test_f': array([0.76404494, 0.65168539, 0.66666667, 0.66666667, 0.75      ]),
 'train_f': array([0.99742931, 1.        , 0.99742931, 1.        , 0.99744246])}

In [16]:
std_test_df = test_df.copy()

In [17]:
for col, stats in scale_stats.items():
    std_test_df[col] = (std_test_df[col] - stats['mean']) / stats['std']

In [18]:
estimator = clf.fit(X=train_X, y=train_y)
test_X = std_test_df[
    [
        'statuses_count',
        'default_profile',
        'default_profile_image',
        'friends_count',
        'followers_count',
        'favourites_count',
        'geo_enabled',
        'listed_count',
        'account_age_hours',
        'diversity',
        'mean_mins_between_tweets',
        'mean_tweet_length',
        'mean_retweets',
        'reply_rate'
    ]
].values

In [19]:
predict_y = estimator.predict(X=test_X)

In [20]:
result = pd.concat(
    [test_df.id, pd.Series(predict_y).rename('predict')],
    axis=1
)

In [21]:
result.to_csv('data/submit.csv', index=False, header=False)