# lib导入

In [19]:
# export
import os
from code.config import * 
from loguru import logger
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)  # 设置显示数据的最大列数，防止出现省略号…，导致数据显示不全
pd.set_option('expand_frame_repr', False)  # 当列太多时不自动换行

# utils

In [12]:
#export
def aggregate_features(df_, feat, agg_func_dict, prefix=''):
    """
    """
    from loguru import logger
    df = df_.copy()
    logger.info(f'gen 特征 for {",".join(feat)}...')
    

    agg_df = df.groupby(feat).agg(agg_func_dict)
#     print(agg_df.columns.values)
    agg_df.columns = [prefix + '_'.join(col).strip() for col in agg_df.columns.values]
    logger.info(f'gen 特征 for {",".join(feat)}...end')
    
    return agg_df

# user

In [60]:
def get_user_feats(user_raw, mode='train'):
    _df = user_raw.copy()
    if mode == 'train':
        
        train_users = pd.DataFrame()
        for i in '1908 1909 1910 1911 1912 2001 2002 2003'.split():
            _tmp = _df[['phone_no_m', 'city_name', 'county_name', 'idcard_cnt', f'arpu_20{i}', 'label']].copy()
            _tmp.columns = ['phone_no_m', 'city_name', 'county_name', 'idcard_cnt', f'arpu', 'label']
            _tmp['month_id'] = f'20{i}'
            train_users = train_users.append(_tmp)

        return train_users
    if mode == 'test':
        _df['month_id'] = '202004'
        _df['arpu'] = _df.arpu_202004
        return _df
        
        
        
        
        
        
        

# voc

In [51]:
def get_voc_feats(voc_raw, user_raw):
    _df = voc_raw.copy()
    _tmp = user_raw[['phone_no_m', 'city_name']]
    _tmp.columns = ['phone_no_m', 'city_name_src']
    _df = pd.merge(_df, _tmp)
    logger.info('Original features derivation')
    _df['city_name_flag'] = 1 - \
        (_df.city_name == _df.city_name_src).map(int, na_action='ignore')

    _df["start_datetime"] = pd.to_datetime(_df['start_datetime'])
    _df["hour"] = _df['start_datetime'].dt.hour
    _df["day"] = _df['start_datetime'].dt.day
    _df["weekday"] = _df['start_datetime'].dt.weekday
    _df["month_id"] = _df['start_datetime'].dt.year.map(
        str)+_df['start_datetime'].dt.month.map(lambda x: f'0{x}' if x < 10 else str(x))
    _df["start_datetime_timestamp"] = _df['start_datetime'].map(
        lambda t: int(t.timestamp()))
    _df = _df.sort_values('phone_no_m start_datetime_timestamp'.split())
    # 每次voc时间间隔
    _df['t'] = _df['start_datetime_timestamp']  # + df['call_dur']
    _df['t'] = _df.groupby('phone_no_m month_id'.split())['t'].shift(1)
    _df['time_interval'] = _df['start_datetime_timestamp'] - _df['t']

    logger.info('一阶统计特征')
    # 一阶统计特征

    agg_func = {
        'opposite_no_m': ['count', 'nunique'],  # 通话次数、有过通话的人数
        'city_name': ['nunique'],
        'county_name': ['nunique'],
        'city_name_flag': ['sum', 'mean'],
        'imei_m': ['nunique'],
        'call_dur': ['median', 'min'],
        'time_interval': ['median'],
#         'hour': [_t],  # 通话偏好: 通话次数最多的hour
#         'day': [_t],
#         'weekday': [_t],
    }
    def _t(s):
        return s.mode().iloc[0]
    _dfg = aggregate_features(_df, 'phone_no_m month_id'.split(), agg_func)
    _dfg['hour_mode'] = _df['hour'].agg(_t)
    _dfg['day_mode'] = _df['day'].agg(_t)
    _dfg['weekday_mode'] = _df['weekday'].agg(_t)
    # 一阶统计特征衍生
    _dfg['opposite_no_m_count_nunique_ratio'] = _dfg.opposite_no_m_count / \
        _dfg.opposite_no_m_nunique

    logger.info('2阶统计特征calltype_id')
    # 2阶统计特征calltype_id
    agg_func = {
        'opposite_no_m': ['count'],  # 有过通话的人数
    }
    dfg2 = aggregate_features(
        _df, ['phone_no_m', 'month_id', 'calltype_id'], agg_func)

    dfg2 = dfg2.unstack()

    dfg2.columns = [f'{i[0]}_calltype_id{i[1]}' for i in dfg2.columns]

    dfg2 = dfg2.fillna(0)
    # 2阶统计特征衍生
    dfg2['opposite_no_m_count_calltype_id1_2_ratio'] = dfg2['opposite_no_m_count_calltype_id1'] / \
        (dfg2['opposite_no_m_count_calltype_id2']+0.1)

    dfg2['opposite_no_m_count_calltype_id1_ratio'] = dfg2['opposite_no_m_count_calltype_id1'] / \
        (_dfg['opposite_no_m_count']+0.1)

    # 2阶统计特征hour

    agg_func = {
        'opposite_no_m': ['count'],  # 有过通话的人数
    }
    dfg_hour2 = aggregate_features(
        _df, ['phone_no_m', 'month_id', 'hour'], agg_func)

    dfg_hour2 = dfg_hour2.unstack()

    dfg_hour2.columns = [f'{i[0]}_hour{i[1]}' for i in dfg_hour2.columns]

    dfg_hour2 = dfg_hour2.fillna(0)

    for c in dfg_hour2.columns:
        dfg_hour2[f'{c}_ratio'] = dfg_hour2[c] / \
            (_dfg['opposite_no_m_count']+0.1)
    # 2阶统计特征weekday

    agg_func = {
        'opposite_no_m': ['count'],  # 有过通话的人数
    }
    dfg_weekday2 = aggregate_features(
        _df, ['phone_no_m', 'month_id', 'weekday'], agg_func)
    dfg_weekday2 = dfg_weekday2.unstack()

    dfg_weekday2.columns = [
        f'{i[0]}_weekday{i[1]}' for i in dfg_weekday2.columns]

    dfg_weekday2 = dfg_weekday2.fillna(0)
    for c in dfg_weekday2.columns:
        dfg_weekday2[f'{c}_ratio'] = dfg_weekday2[c] / \
            (_dfg['opposite_no_m_count']+0.1)
    # merge
    df_voc_feats = pd.merge(pd.merge(pd.merge(_dfg, dfg2, left_index=True, right_index=True),
                                     dfg_hour2, left_index=True, right_index=True),
                            dfg_weekday2, left_index=True, right_index=True)

    return df_voc_feats

# sms

In [27]:
def get_sms_feats(sms_raw):
    _df = sms_raw.copy()

    _df['request_datetime'] = pd.to_datetime(_df['request_datetime'])

    _df["hour"] = _df['request_datetime'].dt.hour
    _df["day"] = _df['request_datetime'].dt.day
    _df["weekday"] = _df['request_datetime'].dt.weekday
    _df["month_id"] = _df['request_datetime'].dt.year.map(
        str)+_df['request_datetime'].dt.month.map(lambda x: f'0{x}' if x < 10 else str(x))

    # 发短信的间隔时间
    _df = _df.sort_values('phone_no_m request_datetime'.split())

    _df['t'] = _df.groupby('phone_no_m month_id'.split())[
        'request_datetime'].shift(1)

    _df['time_interval'] = (_df.request_datetime-_df.t).dt.total_seconds()

    # 统计特征
    # 一阶统计特征
    agg_func = {
        'opposite_no_m': ['count', 'nunique'],  # 有过通话的人数
        'hour': ['nunique'],
        'day': ['nunique'],
        'weekday': ['nunique'],
        'time_interval': ['median'],
    }
    dfg = aggregate_features(_df, 'phone_no_m month_id'.split(), agg_func)
    # 一阶统计特征衍生
    dfg['opposite_no_m_count_nunique_ratio'] = dfg.opposite_no_m_count / \
        dfg.opposite_no_m_nunique
    def _t(s):
        return s.mode().iloc[0]
    dfg['hour_mode'] = _df['hour'].agg(_t)
    dfg['day_mode'] = _df['day'].agg(_t)
    dfg['weekday_mode'] = _df['weekday'].agg(_t)
    # 2阶统计特征calltype_id
    agg_func = {
        'opposite_no_m': ['count'],  # 有过通话的人数
    }
    dfg2 = aggregate_features(
        _df, ['phone_no_m', 'month_id', 'calltype_id'], agg_func)

    dfg2 = dfg2.unstack()

    dfg2.columns = [f'{i[0]}_calltype_id{i[1]}' for i in dfg2.columns]

    dfg2 = dfg2.fillna(0)
    # 2阶统计特征衍生
    dfg2['opposite_no_m_count_calltype_id1_2_ratio'] = dfg2['opposite_no_m_count_calltype_id1'] / \
        (dfg2['opposite_no_m_count_calltype_id2']+0.1)
    dfg2['opposite_no_m_count_calltype_id1_ratio'] = dfg2['opposite_no_m_count_calltype_id1'] / \
        (dfg['opposite_no_m_count']+0.1)

    
    # 2阶统计特征hour

    agg_func = {
        'opposite_no_m': ['count'],  # 有过通话的人数
    }
    dfg_hour2 = aggregate_features(
        _df, ['phone_no_m', 'month_id', 'hour'], agg_func)

    dfg_hour2 = dfg_hour2.unstack()

    dfg_hour2.columns = [f'{i[0]}_hour{i[1]}' for i in dfg_hour2.columns]

    dfg_hour2 = dfg_hour2.fillna(0)

    for c in dfg_hour2.columns:
        dfg_hour2[f'{c}_ratio'] = dfg_hour2[c] / \
            (dfg['opposite_no_m_count']+0.1)
    # 2阶统计特征weekday

    agg_func = {
        'opposite_no_m': ['count'],  # 有过通话的人数
    }
    dfg_weekday2 = aggregate_features(
        _df, ['phone_no_m', 'month_id', 'weekday'], agg_func)
    dfg_weekday2 = dfg_weekday2.unstack()

    dfg_weekday2.columns = [
        f'{i[0]}_weekday{i[1]}' for i in dfg_weekday2.columns]

    dfg_weekday2 = dfg_weekday2.fillna(0)
    for c in dfg_weekday2.columns:
        dfg_weekday2[f'{c}_ratio'] = dfg_weekday2[c] / \
            (dfg['opposite_no_m_count']+0.1)
    # merge
    df_sms_feats = pd.merge(pd.merge(pd.merge(dfg, dfg2, left_index=True, right_index=True),
                                     dfg_hour2, left_index=True, right_index=True),
                            dfg_weekday2, left_index=True, right_index=True)


    return df_sms_feats

# app

In [35]:
def get_app_feats(app_raw, mode='train'):
    
    _df = app_raw.copy()
    if mode == 'train':
        # 因为train_app中month_id缺失很少，删掉
        _df = _df[_df.month_id.notnull()]
    else:
        # test_app中month_id填充
        _df.loc[_df.month_id.isnull(), 'month_id'] = '2020-04'
    # 用众数填充
    _df.loc[_df.month_id.isnull(), 'busi_name'] = '<UNK>'
    
#     _df['busi_name_te'] = _df.busi_name.map(busi_name_te_series, na_action='ignore')
    _df['month_id'] = _df['month_id'].map(lambda x: x[:4]+x[-2:])

    # 统计特征
    # 一阶统计特征
    agg_func = {
        'busi_name': ['count','nunique'],  # 有过通话的人数
        'flow': ['sum','mean','median','max','min','std'],
#         'busi_name_te': ['sum','mean','median','max','min','std'],
    }
    dfg = aggregate_features(_df, 'phone_no_m month_id'.split(), agg_func)
    # 一阶统计特征衍生
    dfg['busi_name_count_nunique_ratio'] = dfg.busi_name_count/dfg.busi_name_nunique
    dfg['flow_sum_nunique_ratio'] = dfg.flow_sum/dfg.busi_name_nunique
    
    # merge
    df_app_feats = dfg
    
#     df_app_feats = df_app_feats.reset_index()
#     del df_app_feats['month_id']
#     df_app_feats = df_app_feats.groupby('phone_no_m').max().reset_index()
    
    return df_app_feats

# main

In [2]:
train_user = pd.read_csv(os.path.join(args.DATA_DIR, 'train/train_user.csv'))
test_user = pd.read_csv(os.path.join(args.DATA_DIR, 'test/test_user.csv'))
train_voc = pd.read_csv(os.path.join(args.DATA_DIR, 'train/train_voc.csv'))

  interactivity=interactivity, compiler=compiler, result=result)


In [23]:
train_sms = pd.read_csv(os.path.join(args.DATA_DIR, 'train/train_sms.csv'))

In [30]:
train_app = pd.read_csv(os.path.join(args.DATA_DIR, 'train/train_app.csv'))

In [39]:
test_voc = pd.read_csv(os.path.join(args.DATA_DIR, 'test/test_voc.csv'))
test_sms = pd.read_csv(os.path.join(args.DATA_DIR, 'test/test_sms.csv'))
test_app = pd.read_csv(os.path.join(args.DATA_DIR, 'test/test_app.csv'))

In [55]:
df_user_feats = get_user_feats(train_user)

In [61]:
df_user_feats_test = get_user_feats(test_user, mode='test')

In [56]:
df_user_feats.head()

Unnamed: 0,phone_no_m,city_name,county_name,idcard_cnt,arpu,label,month_id
0,672ddbf02a5544d32e4ecc9433b1981bffe23bf912273a...,绵阳,江油分公司,1,46.06,0,201908
1,5e1272273e041e82cb275ae877710be98cdaf5b0a8f34d...,德阳,旌阳分公司,1,79.0,0,201908
2,eaab3472ec87b076e69e6e8bb62b14341638fc63661a6c...,成都,金堂分公司,2,18.85,0,201908
3,0ce1bb415704178bf44e9c9b431a39b083a132c8e6d99f...,成都,高新分公司,2,19.2,0,201908
4,28b87f35f63f65096a53e3a4c97eaffd4a6c43ffa7e92d...,德阳,旌阳分公司,1,50.0,0,201908


In [57]:
df_user_feats[['phone_no_m', 'city_name', 'county_name', 'idcard_cnt',
                   'arpu', 'month_id', 'label']].to_pickle(os.path.join(args.DATA_DIR,'data_gen/train_user_feat2.pkl'))
        

In [62]:
df_user_feats_test[['phone_no_m', 'city_name', 'county_name', 'idcard_cnt',
     'arpu', 'month_id']].to_pickle(os.path.join(args.DATA_DIR, 'data_gen/test_user_feat2.pkl'))

## voc

In [17]:
df_voc_feats = get_voc_feats(train_voc, train_user)

2020-07-05 09:31:49.183 | INFO     | __main__:get_voc_feats:6 - Original features derivation
2020-07-05 09:32:26.467 | INFO     | __main__:get_voc_feats:24 - 一阶统计特征
2020-07-05 09:32:27.051 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no_m,month_id...
2020-07-05 09:32:41.344 | INFO     | __main__:aggregate_features:13 - gen 特征 for phone_no_m,month_id...end
2020-07-05 09:32:41.967 | INFO     | __main__:get_voc_feats:49 - 2阶统计特征calltype_id
2020-07-05 09:32:42.494 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no_m,month_id,calltype_id...
2020-07-05 09:32:44.625 | INFO     | __main__:aggregate_features:13 - gen 特征 for phone_no_m,month_id,calltype_id...end
2020-07-05 09:32:45.356 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no_m,month_id,hour...
2020-07-05 09:32:47.606 | INFO     | __main__:aggregate_features:13 - gen 特征 for phone_no_m,month_id,hour...end
2020-07-05 09:32:48.598 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no

In [20]:
df_voc_feats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,opposite_no_m_count,opposite_no_m_nunique,city_name_nunique,county_name_nunique,city_name_flag_sum,city_name_flag_mean,imei_m_nunique,call_dur_median,call_dur_min,time_interval_median,hour_mode,day_mode,weekday_mode,opposite_no_m_count_nunique_ratio,opposite_no_m_count_calltype_id1,opposite_no_m_count_calltype_id2,opposite_no_m_count_calltype_id3,opposite_no_m_count_calltype_id1_2_ratio,opposite_no_m_count_calltype_id1_ratio,opposite_no_m_count_hour0,opposite_no_m_count_hour1,opposite_no_m_count_hour2,opposite_no_m_count_hour3,opposite_no_m_count_hour4,opposite_no_m_count_hour5,opposite_no_m_count_hour6,opposite_no_m_count_hour7,opposite_no_m_count_hour8,opposite_no_m_count_hour9,opposite_no_m_count_hour10,opposite_no_m_count_hour11,opposite_no_m_count_hour12,opposite_no_m_count_hour13,opposite_no_m_count_hour14,opposite_no_m_count_hour15,opposite_no_m_count_hour16,opposite_no_m_count_hour17,opposite_no_m_count_hour18,opposite_no_m_count_hour19,opposite_no_m_count_hour20,opposite_no_m_count_hour21,opposite_no_m_count_hour22,opposite_no_m_count_hour23,opposite_no_m_count_hour0_ratio,opposite_no_m_count_hour1_ratio,opposite_no_m_count_hour2_ratio,opposite_no_m_count_hour3_ratio,opposite_no_m_count_hour4_ratio,opposite_no_m_count_hour5_ratio,opposite_no_m_count_hour6_ratio,opposite_no_m_count_hour7_ratio,opposite_no_m_count_hour8_ratio,opposite_no_m_count_hour9_ratio,opposite_no_m_count_hour10_ratio,opposite_no_m_count_hour11_ratio,opposite_no_m_count_hour12_ratio,opposite_no_m_count_hour13_ratio,opposite_no_m_count_hour14_ratio,opposite_no_m_count_hour15_ratio,opposite_no_m_count_hour16_ratio,opposite_no_m_count_hour17_ratio,opposite_no_m_count_hour18_ratio,opposite_no_m_count_hour19_ratio,opposite_no_m_count_hour20_ratio,opposite_no_m_count_hour21_ratio,opposite_no_m_count_hour22_ratio,opposite_no_m_count_hour23_ratio,opposite_no_m_count_weekday0,opposite_no_m_count_weekday1,opposite_no_m_count_weekday2,opposite_no_m_count_weekday3,opposite_no_m_count_weekday4,opposite_no_m_count_weekday5,opposite_no_m_count_weekday6,opposite_no_m_count_weekday0_ratio,opposite_no_m_count_weekday1_ratio,opposite_no_m_count_weekday2_ratio,opposite_no_m_count_weekday3_ratio,opposite_no_m_count_weekday4_ratio,opposite_no_m_count_weekday5_ratio,opposite_no_m_count_weekday6_ratio
phone_no_m,month_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1
00073ceecc0f7220a440580ac5dea410c90d14b666945839292d187d300bca49f991e0ee9f1c477b28d30b8f656a4421d236a19d67a78e17d25bd0935763076a,201908,9,4,1,2,0,0.0,2,246.0,8,85293.0,11,20,3,2.25,9.0,0.0,0.0,90.0,0.989011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10989,0.0,0.10989,0.0,0.0,0.0,0.0,0.32967,0.10989,0.0,0.0,0.21978,0.10989,0.0,0.0,0.0,0.0,1.0,2.0,0.0,3.0,0.0,3.0,0.0,0.10989,0.21978,0.0,0.32967,0.0,0.32967,0.0
00073ceecc0f7220a440580ac5dea410c90d14b666945839292d187d300bca49f991e0ee9f1c477b28d30b8f656a4421d236a19d67a78e17d25bd0935763076a,201909,31,8,1,2,0,0.0,4,36.0,9,8300.0,11,20,3,3.875,31.0,0.0,0.0,310.0,0.996785,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,5.0,0.0,5.0,2.0,4.0,1.0,2.0,4.0,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032154,0.032154,0.160772,0.0,0.160772,0.064309,0.128617,0.032154,0.064309,0.128617,0.128617,0.064309,0.0,0.0,0.0,11.0,0.0,2.0,2.0,1.0,7.0,8.0,0.353698,0.0,0.064309,0.064309,0.032154,0.22508,0.257235
00073ceecc0f7220a440580ac5dea410c90d14b666945839292d187d300bca49f991e0ee9f1c477b28d30b8f656a4421d236a19d67a78e17d25bd0935763076a,201910,39,17,1,2,0,0.0,4,55.0,9,17430.5,11,20,3,2.294118,39.0,0.0,0.0,390.0,0.997442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,4.0,2.0,4.0,6.0,1.0,4.0,1.0,4.0,1.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.102302,0.076726,0.102302,0.051151,0.102302,0.153453,0.025575,0.102302,0.025575,0.102302,0.025575,0.051151,0.0,0.076726,0.0,0.0,2.0,11.0,7.0,8.0,3.0,5.0,3.0,0.051151,0.28133,0.179028,0.204604,0.076726,0.127877,0.076726
00073ceecc0f7220a440580ac5dea410c90d14b666945839292d187d300bca49f991e0ee9f1c477b28d30b8f656a4421d236a19d67a78e17d25bd0935763076a,201911,28,14,1,2,0,0.0,1,96.5,8,83928.0,11,20,3,2.0,28.0,0.0,0.0,280.0,0.996441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,2.0,4.0,2.0,4.0,1.0,4.0,3.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071174,0.106762,0.071174,0.142349,0.071174,0.142349,0.035587,0.142349,0.106762,0.035587,0.071174,0.0,0.0,0.0,6.0,4.0,4.0,4.0,5.0,4.0,1.0,0.213523,0.142349,0.142349,0.142349,0.177936,0.142349,0.035587
00073ceecc0f7220a440580ac5dea410c90d14b666945839292d187d300bca49f991e0ee9f1c477b28d30b8f656a4421d236a19d67a78e17d25bd0935763076a,201912,20,10,1,2,0,0.0,1,89.5,2,36469.0,11,20,3,2.0,20.0,0.0,0.0,200.0,0.995025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,7.0,3.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049751,0.049751,0.099502,0.348259,0.149254,0.0,0.149254,0.0,0.0,0.0,0.049751,0.0,0.049751,0.0,0.049751,2.0,1.0,2.0,3.0,3.0,1.0,8.0,0.099502,0.049751,0.099502,0.149254,0.149254,0.049751,0.39801


In [42]:
df_voc_feats_test = get_voc_feats(test_voc, test_user)

2020-07-05 10:25:38.520 | INFO     | __main__:get_voc_feats:6 - Original features derivation
2020-07-05 10:25:40.507 | INFO     | __main__:get_voc_feats:24 - 一阶统计特征
2020-07-05 10:25:40.530 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no_m,month_id...
2020-07-05 10:25:41.092 | INFO     | __main__:aggregate_features:13 - gen 特征 for phone_no_m,month_id...end
2020-07-05 10:25:41.134 | INFO     | __main__:get_voc_feats:49 - 2阶统计特征calltype_id
2020-07-05 10:25:41.149 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no_m,month_id,calltype_id...
2020-07-05 10:25:41.264 | INFO     | __main__:aggregate_features:13 - gen 特征 for phone_no_m,month_id,calltype_id...end
2020-07-05 10:25:41.310 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no_m,month_id,hour...
2020-07-05 10:25:41.426 | INFO     | __main__:aggregate_features:13 - gen 特征 for phone_no_m,month_id,hour...end
2020-07-05 10:25:41.509 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no

In [26]:
df_voc_feats.to_pickle(os.path.join(args.DATA_DIR, 'data_gen/df_train_voc2.pkl'))

In [43]:
df_voc_feats_test.to_pickle(os.path.join(args.DATA_DIR, 'data_gen/df_test_voc2.pkl'))

## sms

In [28]:
df_sms_feats = get_sms_feats(train_sms)

2020-07-05 10:16:00.712 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no_m,month_id...
2020-07-05 10:16:17.795 | INFO     | __main__:aggregate_features:13 - gen 特征 for phone_no_m,month_id...end
2020-07-05 10:16:19.142 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no_m,month_id,calltype_id...
2020-07-05 10:16:22.903 | INFO     | __main__:aggregate_features:13 - gen 特征 for phone_no_m,month_id,calltype_id...end
2020-07-05 10:16:23.603 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no_m,month_id,hour...
2020-07-05 10:16:27.606 | INFO     | __main__:aggregate_features:13 - gen 特征 for phone_no_m,month_id,hour...end
2020-07-05 10:16:28.876 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no_m,month_id,weekday...
2020-07-05 10:16:32.716 | INFO     | __main__:aggregate_features:13 - gen 特征 for phone_no_m,month_id,weekday...end


In [29]:
df_sms_feats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,opposite_no_m_count,opposite_no_m_nunique,hour_nunique,day_nunique,weekday_nunique,time_interval_median,opposite_no_m_count_nunique_ratio,hour_mode,day_mode,weekday_mode,opposite_no_m_count_calltype_id1,opposite_no_m_count_calltype_id2,opposite_no_m_count_calltype_id1_2_ratio,opposite_no_m_count_calltype_id1_ratio,opposite_no_m_count_hour0,opposite_no_m_count_hour1,opposite_no_m_count_hour2,opposite_no_m_count_hour3,opposite_no_m_count_hour4,opposite_no_m_count_hour5,opposite_no_m_count_hour6,opposite_no_m_count_hour7,opposite_no_m_count_hour8,opposite_no_m_count_hour9,opposite_no_m_count_hour10,opposite_no_m_count_hour11,opposite_no_m_count_hour12,opposite_no_m_count_hour13,opposite_no_m_count_hour14,opposite_no_m_count_hour15,opposite_no_m_count_hour16,opposite_no_m_count_hour17,opposite_no_m_count_hour18,opposite_no_m_count_hour19,opposite_no_m_count_hour20,opposite_no_m_count_hour21,opposite_no_m_count_hour22,opposite_no_m_count_hour23,opposite_no_m_count_hour0_ratio,opposite_no_m_count_hour1_ratio,opposite_no_m_count_hour2_ratio,opposite_no_m_count_hour3_ratio,opposite_no_m_count_hour4_ratio,opposite_no_m_count_hour5_ratio,opposite_no_m_count_hour6_ratio,opposite_no_m_count_hour7_ratio,opposite_no_m_count_hour8_ratio,opposite_no_m_count_hour9_ratio,opposite_no_m_count_hour10_ratio,opposite_no_m_count_hour11_ratio,opposite_no_m_count_hour12_ratio,opposite_no_m_count_hour13_ratio,opposite_no_m_count_hour14_ratio,opposite_no_m_count_hour15_ratio,opposite_no_m_count_hour16_ratio,opposite_no_m_count_hour17_ratio,opposite_no_m_count_hour18_ratio,opposite_no_m_count_hour19_ratio,opposite_no_m_count_hour20_ratio,opposite_no_m_count_hour21_ratio,opposite_no_m_count_hour22_ratio,opposite_no_m_count_hour23_ratio,opposite_no_m_count_weekday0,opposite_no_m_count_weekday1,opposite_no_m_count_weekday2,opposite_no_m_count_weekday3,opposite_no_m_count_weekday4,opposite_no_m_count_weekday5,opposite_no_m_count_weekday6,opposite_no_m_count_weekday0_ratio,opposite_no_m_count_weekday1_ratio,opposite_no_m_count_weekday2_ratio,opposite_no_m_count_weekday3_ratio,opposite_no_m_count_weekday4_ratio,opposite_no_m_count_weekday5_ratio,opposite_no_m_count_weekday6_ratio
phone_no_m,month_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1
00073ceecc0f7220a440580ac5dea410c90d14b666945839292d187d300bca49f991e0ee9f1c477b28d30b8f656a4421d236a19d67a78e17d25bd0935763076a,201908,77,14,14,17,7,0.0,5.5,10,18,3,0.0,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,1.0,6.0,3.0,2.0,6.0,9.0,0.0,6.0,1.0,6.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.428016,0.01297,0.077821,0.038911,0.02594,0.077821,0.116732,0.0,0.077821,0.01297,0.077821,0.01297,0.01297,0.01297,0.01297,0.0,11.0,12.0,12.0,7.0,4.0,24.0,7.0,0.142672,0.155642,0.155642,0.090791,0.051881,0.311284,0.090791
00073ceecc0f7220a440580ac5dea410c90d14b666945839292d187d300bca49f991e0ee9f1c477b28d30b8f656a4421d236a19d67a78e17d25bd0935763076a,201909,91,12,15,19,7,0.0,7.583333,10,18,3,1.0,90.0,0.011099,0.010977,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,8.0,4.0,4.0,1.0,11.0,9.0,11.0,2.0,9.0,16.0,6.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.021954,0.0,0.0,0.0,0.0,0.010977,0.087816,0.043908,0.043908,0.010977,0.120746,0.098793,0.120746,0.021954,0.098793,0.175631,0.065862,0.054885,0.021954,0.0,0.0,27.0,5.0,18.0,12.0,1.0,14.0,14.0,0.296378,0.054885,0.197585,0.131723,0.010977,0.153677,0.153677
00073ceecc0f7220a440580ac5dea410c90d14b666945839292d187d300bca49f991e0ee9f1c477b28d30b8f656a4421d236a19d67a78e17d25bd0935763076a,201910,52,14,10,14,6,68.0,3.714286,10,18,3,0.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7.0,2.0,4.0,4.0,0.0,4.0,16.0,7.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019194,0.0,0.134357,0.038388,0.076775,0.076775,0.0,0.076775,0.307102,0.134357,0.0,0.0,0.076775,0.057582,0.0,0.0,0.0,1.0,11.0,6.0,17.0,9.0,0.0,8.0,0.019194,0.211132,0.115163,0.326296,0.172745,0.0,0.153551
00073ceecc0f7220a440580ac5dea410c90d14b666945839292d187d300bca49f991e0ee9f1c477b28d30b8f656a4421d236a19d67a78e17d25bd0935763076a,201911,64,20,11,17,7,941.0,3.2,10,18,3,0.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,10.0,4.0,19.0,0.0,3.0,9.0,8.0,7.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015601,0.015601,0.156006,0.062402,0.296412,0.0,0.046802,0.140406,0.124805,0.109204,0.015601,0.015601,0.0,0.0,0.0,6.0,4.0,12.0,8.0,12.0,17.0,5.0,0.093604,0.062402,0.187207,0.124805,0.187207,0.265211,0.078003
00073ceecc0f7220a440580ac5dea410c90d14b666945839292d187d300bca49f991e0ee9f1c477b28d30b8f656a4421d236a19d67a78e17d25bd0935763076a,201912,71,18,13,18,7,236.5,3.944444,10,18,3,0.0,71.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,4.0,1.0,9.0,0.0,5.0,5.0,25.0,4.0,9.0,2.0,0.0,2.0,0.0,0.0,0.0,0.014065,0.0,0.0,0.0,0.0,0.0,0.0,0.028129,0.028129,0.056259,0.014065,0.126582,0.0,0.070323,0.070323,0.351617,0.056259,0.126582,0.028129,0.0,0.028129,0.0,0.0,11.0,12.0,19.0,9.0,10.0,4.0,6.0,0.154712,0.168776,0.267229,0.126582,0.140647,0.056259,0.084388


In [44]:
df_sms_feats_test = get_sms_feats(test_sms)

2020-07-05 10:26:33.962 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no_m,month_id...
2020-07-05 10:26:34.556 | INFO     | __main__:aggregate_features:13 - gen 特征 for phone_no_m,month_id...end
2020-07-05 10:26:34.618 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no_m,month_id,calltype_id...
2020-07-05 10:26:34.782 | INFO     | __main__:aggregate_features:13 - gen 特征 for phone_no_m,month_id,calltype_id...end
2020-07-05 10:26:34.823 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no_m,month_id,hour...
2020-07-05 10:26:35.002 | INFO     | __main__:aggregate_features:13 - gen 特征 for phone_no_m,month_id,hour...end
2020-07-05 10:26:35.078 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no_m,month_id,weekday...
2020-07-05 10:26:35.251 | INFO     | __main__:aggregate_features:13 - gen 特征 for phone_no_m,month_id,weekday...end


In [45]:
df_sms_feats_test.to_pickle(os.path.join(args.DATA_DIR, 'data_gen/df_test_sms2.pkl'))

## app

In [36]:
df_app_feats = get_app_feats(train_app)

2020-07-05 10:22:40.562 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no_m,month_id...
2020-07-05 10:22:42.768 | INFO     | __main__:aggregate_features:13 - gen 特征 for phone_no_m,month_id...end


In [49]:
df_app_feats_test = get_app_feats(test_app, mode='test')

2020-07-05 10:28:56.533 | INFO     | __main__:aggregate_features:7 - gen 特征 for phone_no_m,month_id...
2020-07-05 10:28:56.614 | INFO     | __main__:aggregate_features:13 - gen 特征 for phone_no_m,month_id...end


In [37]:
df_app_feats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,busi_name_count,busi_name_nunique,flow_sum,flow_mean,flow_median,flow_max,flow_min,flow_std,busi_name_count_nunique_ratio,flow_sum_nunique_ratio
phone_no_m,month_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
00073ceecc0f7220a440580ac5dea410c90d14b666945839292d187d300bca49f991e0ee9f1c477b28d30b8f656a4421d236a19d67a78e17d25bd0935763076a,201908,62,57,4189.107661,49.28362,0.089949,3163.689316,0.000164,348.520432,1.087719,73.493117
00073ceecc0f7220a440580ac5dea410c90d14b666945839292d187d300bca49f991e0ee9f1c477b28d30b8f656a4421d236a19d67a78e17d25bd0935763076a,201909,63,63,3127.770451,48.119545,0.070505,2964.370479,0.000352,367.551885,1.0,49.64715
00073ceecc0f7220a440580ac5dea410c90d14b666945839292d187d300bca49f991e0ee9f1c477b28d30b8f656a4421d236a19d67a78e17d25bd0935763076a,201910,15,15,0.318534,0.019908,0.012991,0.086149,0.000145,0.023334,1.0,0.021236
00073ceecc0f7220a440580ac5dea410c90d14b666945839292d187d300bca49f991e0ee9f1c477b28d30b8f656a4421d236a19d67a78e17d25bd0935763076a,201911,13,13,76.028239,5.068549,0.070173,61.185415,0.003056,15.669846,1.0,5.848326
00073ceecc0f7220a440580ac5dea410c90d14b666945839292d187d300bca49f991e0ee9f1c477b28d30b8f656a4421d236a19d67a78e17d25bd0935763076a,202001,2,2,0.00436,0.00218,0.00218,0.003231,0.001129,0.001486,1.0,0.00218


In [38]:
df_app_feats.to_pickle(os.path.join(args.DATA_DIR, 'data_gen/df_train_app2.pkl'))

In [50]:
df_app_feats_test.to_pickle(os.path.join(args.DATA_DIR, 'data_gen/df_test_app2.pkl'))