In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from scipy import stats
import gc
import warnings
from tqdm import tqdm
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from itertools import product
from mlxtend.feature_selection import SequentialFeatureSelector

warnings.filterwarnings('ignore')

pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option("display.max_colwidth", 100)
pd.set_option('display.width', 1000)

In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
class MeanEncoder:
    def __init__(self, categorical_features, n_splits=10, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode
 
        :param n_splits: the number of splits used in mean encoding
        
        :param target_type: str, 'regression' or 'classification'
 
        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """
 
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}
 
        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None
 
        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))
 
    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()
 
        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()
 
        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg(['mean', 'size'])
        col_avg_y['size'] = prior_weight_func(col_avg_y['size'])
        col_avg_y[nf_name] = col_avg_y['size'] * prior + (1 - col_avg_y['size']) * col_avg_y['mean']
        col_avg_y.drop(['size', 'mean'], axis=1, inplace=True)
 
        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values
 
        return nf_train, nf_test, prior, col_avg_y
 
    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)
 
        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new
 
    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
 
        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
 
        return X_new

In [4]:
train_user = reduce_mem_usage(pd.read_csv('../input/train/train_user.csv', usecols=['phone_no_m', 'label']))
test_user = reduce_mem_usage(pd.read_csv('../input/test/test_user.csv', usecols=['phone_no_m']))

train_voc = reduce_mem_usage(pd.read_csv('../input/train/train_voc.csv'))
test_voc = reduce_mem_usage(pd.read_csv('../input/test/test_voc.csv'))

Memory usage of dataframe is 97824.00 MB
Memory usage after optimization is: 55082.00 MB
Decreased by 43.7%
Memory usage of dataframe is 16488.00 MB
Memory usage after optimization is: 16488.00 MB
Decreased by 0.0%
Memory usage of dataframe is 320987648.00 MB
Memory usage after optimization is: 255787058.00 MB
Decreased by 20.3%
Memory usage of dataframe is 17697536.00 MB
Memory usage after optimization is: 14102750.00 MB
Decreased by 20.3%


In [5]:
df_user = pd.concat([train_user, test_user])

In [6]:
# 只取最后一个月的数据
train_voc = train_voc[train_voc['start_datetime'] >= '2020-03-01 00:00:00']

In [7]:
train_voc.columns

Index(['phone_no_m', 'opposite_no_m', 'calltype_id', 'start_datetime', 'call_dur', 'city_name', 'county_name', 'imei_m'], dtype='object')

In [8]:
df_voc = pd.concat([train_voc, test_voc])

del train_voc, test_voc
gc.collect()

20

In [9]:
df_voc['voc_day'] = df_voc['start_datetime'].astype('datetime64').dt.day
df_voc['voc_hour'] = df_voc['start_datetime'].astype('datetime64').dt.hour
df_voc['voc_dayofweek'] = df_voc['start_datetime'].astype('datetime64').dt.dayofweek

In [10]:
phone_no_m = df_voc[['phone_no_m']].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')

In [11]:
# 通话次数，通话人数
tmp = df_voc.groupby('phone_no_m')['opposite_no_m'].agg(opposite_cnt='count', opposite_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, how='left', on='phone_no_m')

del tmp
gc.collect()

20

In [12]:
"""
主叫通话
"""

df_calltype_id_1 = df_voc.loc[df_voc['calltype_id'] == 1, :].copy()

# 主叫通话次数，主叫通话使用的手机个数
tmp = df_calltype_id_1.groupby('phone_no_m')['imei_m'].agg(voc_calltype_id_1_cnt='count', imeis='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时长
tmp = df_calltype_id_1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_call_dur_sum='sum')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时长小于30s的次数
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] < 30]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_30s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长小于60s的次数
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] < 60]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_60s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长大于300s（5分钟）的次数
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] > 300]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_300s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长小于30s的次数的占比
phone_no_m['voc_calltype_id_1_30s_rate'] = phone_no_m['voc_calltype_id_1_30s_cnt'] / phone_no_m['voc_calltype_id_1_cnt']

# 主叫通话时长小于60s的次数的占比
phone_no_m['voc_calltype_id_1_60s_rate'] = phone_no_m['voc_calltype_id_1_60s_cnt'] / phone_no_m['voc_calltype_id_1_cnt']

# 主叫通话时长小于60s的次数的占比
phone_no_m['voc_calltype_id_1_300s_rate'] = phone_no_m['voc_calltype_id_1_300s_cnt'] / phone_no_m['voc_calltype_id_1_cnt']


# 主叫通话次数/通话次数，
phone_no_m["call_type_id_1_rate"] = phone_no_m['voc_calltype_id_1_cnt'] / phone_no_m['opposite_cnt']

# 主叫通话时所在地市的个数
tmp = df_calltype_id_1.groupby('phone_no_m')['city_name'].agg(city_name_call='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时所在区县的个数
tmp = df_calltype_id_1.groupby("phone_no_m")['county_name'].agg(county_name_call='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
del tmp
gc.collect()

0

In [13]:
"""
与对端通话统计
"""

# 与对端通话次数，与对端总通话时长
tmp = df_voc.groupby(['phone_no_m', 'opposite_no_m'])['call_dur'].agg(call_count='count', call_sum='sum')

# 与对端通话次数的统计量
phone2opposite = tmp.groupby('phone_no_m')['call_count'].agg(phone2opposite_cnt_mean='mean',
                                                             phone2opposite_cnt_median='median',
                                                             phone2opposite_cnt_min='min',
                                                             phone2opposite_cnt_max='max',
                                                             phone2opposite_cnt_std='std')
phone_no_m = phone_no_m.merge(phone2opposite, on='phone_no_m', how='left')
del phone2opposite
gc.collect()


# 与对端总通话时长的统计量
phone2opposite = tmp.groupby('phone_no_m')['call_sum'].agg(phone2opposite_call_dur_mean='mean',
                                                           phone2opposite_call_dur_median='median',
                                                           phone2opposite_call_dur_min='min',
                                                           phone2opposite_call_dur_max='max',
                                                           phone2opposite_call_dur_std='std')
phone_no_m = phone_no_m.merge(phone2opposite, on='phone_no_m', how='left')
del phone2opposite, tmp
gc.collect()

0

In [14]:
"""
通话时长的统计
"""

# 通话时长的统计量
tmp = df_voc.groupby('phone_no_m')['call_dur'].agg(call_dur_mean='mean',
                                                   call_dur_median='median',
                                                   call_dur_max='max',
                                                   call_dur_min='min',
                                                   call_dur_std='std')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

20

In [15]:
# 收费号码所在地市的个数
tmp = df_voc.groupby('phone_no_m')['city_name'].agg(city_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 收费号码所在区县的个数
tmp = df_voc.groupby('phone_no_m')['county_name'].agg(county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how='left')

# 收费号码通话类型的个数
tmp = df_voc.groupby('phone_no_m')['calltype_id'].agg(calltype_id_unique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [16]:
"""
通话时间点的偏好
"""
tmp = df_voc.groupby('phone_no_m')['voc_hour'].agg(voc_hour_mode=lambda x: stats.mode(x)[0][0],        # 频次最高的元素
                                                   voc_hour_mode_count=lambda x: stats.mode(x)[1][0],  # 频次最高的元素的频次
                                                   voc_hour_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')


tmp = df_voc.groupby('phone_no_m')['voc_day'].agg(voc_day_mode=lambda x: stats.mode(x)[0][0],
                                                  voc_day_mode_count=lambda x: stats.mode(x)[1][0],
                                                  voc_day_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [17]:
phone_no_m.columns

Index(['phone_no_m', 'opposite_cnt', 'opposite_nunique', 'voc_calltype_id_1_cnt', 'imeis', 'voc_calltype_id_1_call_dur_sum', 'voc_calltype_id_1_30s_cnt', 'voc_calltype_id_1_60s_cnt', 'voc_calltype_id_1_300s_cnt', 'voc_calltype_id_1_30s_rate', 'voc_calltype_id_1_60s_rate', 'voc_calltype_id_1_300s_rate', 'call_type_id_1_rate', 'city_name_call', 'county_name_call', 'phone2opposite_cnt_mean', 'phone2opposite_cnt_median', 'phone2opposite_cnt_min', 'phone2opposite_cnt_max', 'phone2opposite_cnt_std', 'phone2opposite_call_dur_mean', 'phone2opposite_call_dur_median', 'phone2opposite_call_dur_min', 'phone2opposite_call_dur_max', 'phone2opposite_call_dur_std', 'call_dur_mean', 'call_dur_median', 'call_dur_max', 'call_dur_min', 'call_dur_std', 'city_name_nunique', 'county_name_nunique', 'calltype_id_unique', 'voc_hour_mode', 'voc_hour_mode_count', 'voc_hour_nunique', 'voc_day_mode', 'voc_day_mode_count', 'voc_day_nunique'], dtype='object')

In [18]:
df_voc = df_user.merge(phone_no_m, how='left', on='phone_no_m')

del df_user, phone_no_m
gc.collect()

40

In [19]:
df_voc_train = df_voc[df_voc.label.notna()]
df_voc_test = df_voc[df_voc.label.isna()]

df_voc_train.shape, df_voc_test.shape

((6106, 40), (2045, 40))

In [20]:
X_train, X_valid, y_train, y_valid = train_test_split(df_voc_train.drop('label', axis=1), df_voc_train['label'],
                                                      test_size=0.2,
                                                      random_state=2020)

In [21]:
train_cols = [i for i in X_train if i not in ['phone_no_m', 'label']]

In [22]:
params = {'objective': 'binary',
          'boosting': 'gbdt',
          'metric': 'auc',
          'learning_rate': 0.1,
          'num_leaves': 31,
          'lambda_l1': 0,
          'lambda_l2': 1,
          'min_data_in_leaf': 20,
          'is_unbalance': True,
          'max_depth': -1,
          'seed': 2020}

In [23]:
useful_cols = []
useless_cols = []

for i in train_cols:
    print(i)
    lgb_train = lgb.Dataset(X_train[[i]].values, y_train) 
    lgb_eval= lgb.Dataset(X_valid[[i]].values, y_valid, reference=lgb_train)
    lgb_test = lgb.train(params,
                         lgb_train,
                         num_boost_round=1000,
                         valid_sets=[lgb_eval, lgb_train],
                         early_stopping_rounds=50,
                         verbose_eval=20)
    
    print('*' * 5)
    print(lgb_test.best_score['valid_0']['auc'])
    if lgb_test.best_score['valid_0']['auc'] > 0.50:
        useful_cols.append(i)
    else:
        useless_cols.append(i)
    print('*' * 20)
    print('\n')

opposite_cnt
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.874224	valid_0's auc: 0.834843
[40]	training's auc: 0.877919	valid_0's auc: 0.831199
Early stopping, best iteration is:
[5]	training's auc: 0.870285	valid_0's auc: 0.838273
*****
0.8382733653800163
********************


opposite_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.904897	valid_0's auc: 0.873224
[40]	training's auc: 0.906297	valid_0's auc: 0.873087
[60]	training's auc: 0.907017	valid_0's auc: 0.872573
Early stopping, best iteration is:
[10]	training's auc: 0.904134	valid_0's auc: 0.873933
*****
0.8739327737121538
********************


voc_calltype_id_1_cnt
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.866163	valid_0's auc: 0.83139
[40]	training's auc: 0.869541	valid_0's auc: 0.828081
[60]	training's auc: 0.871562	valid_0's auc: 0.827922
Early stopping, best iteration is:
[18]	training's auc: 0.865

[40]	training's auc: 0.863595	valid_0's auc: 0.78828
[60]	training's auc: 0.866085	valid_0's auc: 0.786771
Early stopping, best iteration is:
[13]	training's auc: 0.854489	valid_0's auc: 0.792704
*****
0.7927039986607356
********************


phone2opposite_call_dur_std
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.858365	valid_0's auc: 0.790863
[40]	training's auc: 0.86129	valid_0's auc: 0.794028
[60]	training's auc: 0.862985	valid_0's auc: 0.795716
[80]	training's auc: 0.863998	valid_0's auc: 0.796097
[100]	training's auc: 0.864723	valid_0's auc: 0.796459
[120]	training's auc: 0.865349	valid_0's auc: 0.796689
[140]	training's auc: 0.865742	valid_0's auc: 0.796752
[160]	training's auc: 0.866047	valid_0's auc: 0.79639
[180]	training's auc: 0.866295	valid_0's auc: 0.796276
Early stopping, best iteration is:
[132]	training's auc: 0.865588	valid_0's auc: 0.796859
*****
0.796859305495767
********************


call_dur_mean
Training until validation s

In [24]:
print(useful_cols)
print(len(useful_cols))

['opposite_cnt', 'opposite_nunique', 'voc_calltype_id_1_cnt', 'imeis', 'voc_calltype_id_1_call_dur_sum', 'voc_calltype_id_1_30s_cnt', 'voc_calltype_id_1_60s_cnt', 'voc_calltype_id_1_300s_cnt', 'voc_calltype_id_1_30s_rate', 'voc_calltype_id_1_60s_rate', 'voc_calltype_id_1_300s_rate', 'call_type_id_1_rate', 'city_name_call', 'county_name_call', 'phone2opposite_cnt_mean', 'phone2opposite_cnt_median', 'phone2opposite_cnt_min', 'phone2opposite_cnt_max', 'phone2opposite_cnt_std', 'phone2opposite_call_dur_mean', 'phone2opposite_call_dur_median', 'phone2opposite_call_dur_min', 'phone2opposite_call_dur_max', 'phone2opposite_call_dur_std', 'call_dur_mean', 'call_dur_median', 'call_dur_max', 'call_dur_min', 'call_dur_std', 'city_name_nunique', 'county_name_nunique', 'calltype_id_unique', 'voc_hour_mode', 'voc_hour_mode_count', 'voc_hour_nunique', 'voc_day_mode', 'voc_day_mode_count', 'voc_day_nunique']
38


In [25]:
print(useless_cols)
print(len(useless_cols))

[]
0


In [26]:
lgb_train = lgb.Dataset(X_train[useful_cols].values, y_train) 

lgb_eval= lgb.Dataset(X_valid[useful_cols].values, y_valid, reference=lgb_train)  

print('Start training...')

lgb_valid = lgb.train(params,
                      lgb_train,
                      num_boost_round=10000,
                      valid_sets=[lgb_eval, lgb_train],
                      early_stopping_rounds=100,
                      verbose_eval=10)

Start training...
Training until validation scores don't improve for 100 rounds
[10]	training's auc: 0.966572	valid_0's auc: 0.920698
[20]	training's auc: 0.976932	valid_0's auc: 0.919152
[30]	training's auc: 0.983382	valid_0's auc: 0.920974
[40]	training's auc: 0.98606	valid_0's auc: 0.919749
[50]	training's auc: 0.987515	valid_0's auc: 0.917118
[60]	training's auc: 0.988128	valid_0's auc: 0.915791
[70]	training's auc: 0.988405	valid_0's auc: 0.914194
[80]	training's auc: 0.988534	valid_0's auc: 0.912174
[90]	training's auc: 0.988673	valid_0's auc: 0.910954
[100]	training's auc: 0.988734	valid_0's auc: 0.910054
[110]	training's auc: 0.988766	valid_0's auc: 0.909764
[120]	training's auc: 0.988809	valid_0's auc: 0.909148
[130]	training's auc: 0.988867	valid_0's auc: 0.908787
Early stopping, best iteration is:
[34]	training's auc: 0.984624	valid_0's auc: 0.92203


In [27]:
# 验证集结果
X_valid['prob'] = lgb_valid.predict(X_valid[useful_cols])
X_valid['pred'] = np.where(X_valid['prob'] > 0.5, 1, 0)

f1_05 = np.round(f1_score(y_valid, X_valid['pred']), 4)
auc_05 = roc_auc_score(y_valid, X_valid['prob'])

print('f1_05: ', f1_05)
print('auc_05: ', auc_05)

f1_05:  0.846
auc_05:  0.9220297029702971


In [28]:
lgb_train_all = lgb.Dataset(df_voc_train[useful_cols].values, df_voc_train['label'])   

print('Start training...')

lgb_train = lgb.train(params,
                      lgb_train_all,
                      num_boost_round=lgb_valid.best_iteration + 20,
                      verbose_eval=10)

Start training...


In [29]:
df_voc_test['label'] = np.where(lgb_train.predict(df_voc_test[useful_cols]) > 0.5, 1, 0)
df_voc_test[['phone_no_m', 'label']].to_csv('../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), f1_05), index=False)