In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from collections import Counter
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')
def _read_log(path, columns, dtypes):
    """Read one header-less, tab-separated log file into a DataFrame.

    ``columns`` names the fields in file order; ``dtypes`` lists the dtype of
    each field in the same order and is passed to ``pd.read_csv`` as a
    ``{position: dtype}`` mapping.
    """
    return pd.read_csv(path, sep='\t', names=columns,
                       dtype=dict(enumerate(dtypes)))


# Registration log: one (uid, rday, rtype, dtype) record per line.
reguser = _read_log('/mnt/datasets/fusai/user_register_log.txt',
                    ['uid','rday','rtype','dtype'],
                    [np.uint32, np.uint8, np.uint8, np.uint16])
# App-launch log: (uid, logday).
applog = _read_log('/mnt/datasets/fusai/app_launch_log.txt',
                   ['uid','logday'],
                   [np.uint32, np.uint8])
# Video-creation log: (uid, pday).
vidlog = _read_log('/mnt/datasets/fusai/video_create_log.txt',
                   ['uid','pday'],
                   [np.uint32, np.uint8])
# Activity log: (uid, aday, page, vid, aid, atype).
useract = _read_log('/mnt/datasets/fusai/user_activity_log.txt',
                    ['uid','aday','page','vid','aid','atype'],
                    [np.uint32, np.uint8, np.uint8, np.uint32, np.uint32, np.uint8])

In [None]:
def prereg(reguser,d1,d2):
    """Build registration-derived features for users registered on or before ``d2``.

    Parameters
    ----------
    reguser : DataFrame
        Registration log with columns ``uid``, ``rday``, ``rtype``, ``dtype``.
    d1, d2 : int
        First and last day (inclusive) of the feature window.  ``d2`` bounds
        which registrations are kept; ``d1`` and ``d2`` together bound the
        slice of the module-level ``useract`` log used for the author
        statistics.

    Returns
    -------
    DataFrame
        The kept registration rows with ``rday`` replaced by
        ``d2 + 1 - rday`` plus the columns ``rtype_0..rtype_5``,
        ``dtype_0..dtype_19``, ``vidcnts``, ``watch``, ``ave_watch`` and
        ``dtypecnt``.  Missing values are filled with 0.
    """
    # Copy the slice so the column assignment below never writes through to
    # (or warns about) the caller's frame.
    reguser1 = reguser.loc[reguser['rday'] <= d2].copy()
    # Days between registration and the day after the window ends.
    reguser1['rday'] = d2 + 1 - reguser1['rday']

    # Per-user row counts for each rtype value.  ``reset_index(name=...)``
    # names the count column explicitly instead of relying on it being
    # called ``0``, which depends on the pandas version.
    for i in range(0,6):
        reguser2 = reguser1.loc[reguser1['rtype'] == i]
        reguser2 = reguser2.groupby('uid')['rtype'].size().reset_index(name='rtype_' + str(i))
        reguser1 = pd.merge(reguser1,reguser2,on = 'uid', how = 'left')

    # Same for each dtype value.  The column is read with ['dtype'] because
    # attribute access would break if DataFrame ever grew a ``dtype`` attribute.
    for i in range(0,20):
        reguser2 = reguser1.loc[reguser1['dtype'] == i]
        reguser2 = reguser2.groupby('uid')['rtype'].size().reset_index(name='dtype_' + str(i))
        reguser1 = pd.merge(reguser1,reguser2,on = 'uid', how = 'left')

    # Author-side statistics from the activity log inside [d1, d2]; ``aid``
    # is matched against ``uid`` of the registered users.
    useract1 = useract.loc[(useract.aday >= d1) & (useract.aday <= d2),['uid','vid','aid']]

    # Number of distinct videos per author that received any action.
    useract2 = useract1.loc[:,['aid','vid']].drop_duplicates()
    useract2 = useract2.groupby(['aid'])['vid'].size().reset_index(name='vidcnts')
    useract2 = useract2.rename(columns = {'aid':'uid'})
    reguser1 = pd.merge(reguser1,useract2,on = 'uid',how = 'left')

    # Number of distinct users that acted on the author's videos.
    useract2 = useract1.loc[:,['aid','uid']].drop_duplicates()
    useract2 = useract2.groupby(['aid'])['uid'].size().reset_index(name='watch')
    useract2 = useract2.rename(columns = {'aid':'uid'})
    reguser1 = pd.merge(reguser1,useract2,on = 'uid',how = 'left')

    reguser1['ave_watch'] = reguser1['watch']/reguser1['vidcnts']

    # Number of kept registrations sharing the same dtype value.
    reguser2 = reguser1.groupby('dtype')['uid'].size().reset_index(name='dtypecnt')
    reguser1 = pd.merge(reguser1,reguser2,on = 'dtype',how = 'left')

    reguser1 = reguser1.fillna(value=0)
    print('reg done')
    return reguser1

import itertools
def test1(row):
    t = np.diff(row)
    temp_list = [0]
    if(len(t)):
        for k,v in itertools.groupby(t):
            if k == 1:
                temp_list.append(len(list(v)))
        num = max(temp_list) + 1
        return num
    else:
        return 1

def preapp(d1,d2):
    """Build per-user app-launch features from the module-level ``applog``.

    All launches with ``logday <= d2`` are used (there is no lower bound);
    days are re-expressed relative to ``d1``, so launches before the window
    get negative relative days.

    Returns a DataFrame with one row per ``uid`` and the columns ``maxlog``
    (longest streak of consecutive launch days, via ``test1``), ``lastlog``
    (latest relative launch day), ``log_gap`` (``d2 - d1 + 1 - lastlog``),
    ``logsum`` (number of launch rows) and ``log0day..log<d2-d1>day``
    (launch rows on each relative day of the window).  Missing values are 0.
    """
    applog1 = applog.loc[applog['logday'] <= d2].copy()
    # ``logday`` is loaded as uint8; cast to a signed type before subtracting
    # so days earlier than ``d1`` become negative instead of wrapping around
    # to values near 255 (which would corrupt ``lastlog`` / ``log_gap``).
    applog1['logday'] = applog1['logday'].astype(np.int16) - d1

    # Longest streak of consecutive launch days.  The aggregation result
    # carries ``uid`` itself, so nothing depends on positional alignment.
    applog1 = applog1.sort_values(by = ['uid','logday'])
    applog2 = applog1.groupby(by = 'uid',as_index=False).agg({'logday':test1})
    applog2 = applog2.rename(columns = {'logday':'maxlog'})
    applog1 = pd.merge(applog1,applog2,on = 'uid',how = 'right')

    # Most recent launch day and its distance to the end of the window.
    applog2 = applog1.groupby('uid')['logday'].max().reset_index().rename(columns = {'logday': 'lastlog'})
    applog1 = pd.merge(applog1,applog2,on = 'uid',how = 'right')
    applog1['log_gap'] = d2 - d1 + 1 - applog1['lastlog']

    # Total number of launch rows up to ``d2``.
    applog2 = applog1.groupby('uid')['logday'].count().reset_index().rename(columns = {'logday': 'logsum'})
    applog1 = pd.merge(applog1,applog2,on = 'uid',how = 'right')

    # One count column per relative day inside the window.
    for i in range(0,d2-d1+1):
        applog2 = applog1.loc[applog1['logday'] == i]
        applog2 = applog2.groupby('uid')['logday'].count().reset_index().rename(columns = {'logday':'log'+ str(i) + 'day'})
        applog1 = pd.merge(applog1,applog2,on = 'uid',how = 'left')

    # Every remaining column is constant per user, so dropping the day column
    # and de-duplicating collapses the frame to one row per uid.
    applog1.pop('logday')
    applog1 = applog1.drop_duplicates()
    applog1 = applog1.fillna(value=0)
    print('app done')
    return applog1

def previd(d1,d2):
    """Build per-user video-creation features from the module-level ``vidlog``.

    All records with ``pday <= d2`` are used (there is no lower bound); days
    are re-expressed relative to ``d1``.

    Returns a DataFrame with one row per ``uid`` and the columns ``maxp``
    (longest streak of consecutive creation days, via ``test1``), ``mostp``
    (largest number of records on a single day), ``pdaycnt`` (number of
    distinct creation days), ``lastp`` (latest relative day), ``pday_gap``
    (``d2 - d1 + 1 - lastp``), ``sump`` (number of records),
    ``p0day..p<d2-d1>day`` (records on each relative day of the window) and
    ``ave_p`` (``sump / pdaycnt``).  Missing values are 0.
    """
    vidlog1 = vidlog.loc[vidlog['pday'] <= d2].copy()
    # ``pday`` is loaded as uint8; cast to a signed type first so days before
    # ``d1`` become negative rather than wrapping around to values near 255.
    vidlog1['pday'] = vidlog1['pday'].astype(np.int16) - d1

    # Longest streak of consecutive creation days (on distinct uid/day pairs).
    vidlog2 = vidlog1.sort_values(by = ['uid','pday']).drop_duplicates()
    vidlog3 = vidlog2.groupby(by = 'uid',as_index=False).agg({'pday':test1})
    vidlog3 = vidlog3.rename(columns = {'pday':'maxp'})
    vidlog1 = pd.merge(vidlog1,vidlog3,on = 'uid',how = 'right')

    # Largest number of records on any single day.
    vidlog2 = vidlog1.groupby(['uid','pday'])['maxp'].count().reset_index().rename(columns = {'maxp':'pdaysum'})
    vidlog2 = vidlog2.groupby('uid')['pdaysum'].max().reset_index().rename(columns = {'pdaysum':'mostp'})
    vidlog1 = pd.merge(vidlog1,vidlog2,on = 'uid',how = 'right')

    # Number of distinct creation days.  ``reset_index(name=...)`` names the
    # count column explicitly instead of assuming it is called ``0``.
    vidlog2 = vidlog1.loc[:,['uid','pday']].drop_duplicates()
    vidlog2 = vidlog2.groupby(['uid'])['pday'].size().reset_index(name='pdaycnt')
    vidlog1 = pd.merge(vidlog1,vidlog2,on = 'uid',how = 'right')

    # Most recent creation day and its distance to the end of the window.
    vidlog2 = vidlog1.groupby('uid')['pday'].max().reset_index().rename(columns = {'pday': 'lastp'})
    vidlog1 = pd.merge(vidlog1,vidlog2,on = 'uid',how = 'right')
    vidlog1['pday_gap'] = d2 - d1 + 1 - vidlog1['lastp']

    # Total number of records up to ``d2``.
    vidlog2 = vidlog1.groupby('uid')['pday'].count().reset_index().rename(columns = {'pday': 'sump'})
    vidlog1 = pd.merge(vidlog1,vidlog2,on = 'uid',how = 'right')

    # One count column per relative day inside the window.
    for i in range(0,d2-d1+1):
        vidlog2 = vidlog1.loc[vidlog1['pday'] == i]
        vidlog2 = vidlog2.groupby('uid')['pday'].size().reset_index(name='p'+ str(i) + 'day')
        vidlog1 = pd.merge(vidlog1,vidlog2,on = 'uid',how = 'left')

    # Average number of records per active day.
    vidlog1['ave_p'] = vidlog1['sump']/vidlog1['pdaycnt']

    # Collapse to one row per uid: all remaining columns are per-user constants.
    vidlog1.pop('pday')
    vidlog1 = vidlog1.fillna(value=0)
    vidlog1 = vidlog1.drop_duplicates()

    print('vid done')
    return vidlog1

def preact(d1,d2):
    """Return each user's most recent activity day from the module-level ``useract``.

    All activity rows with ``aday <= d2`` are used (there is no lower bound);
    days are re-expressed relative to ``d1``.

    Returns a DataFrame with one row per ``uid`` and the columns ``lasta``
    (latest relative activity day) and ``aday_gap``
    (``d2 - d1 + 1 - lasta``).
    """
    acts = useract.loc[useract['aday'] <= d2, ['uid','aday']].copy()
    # ``aday`` is loaded as uint8; cast to a signed type before subtracting so
    # days before ``d1`` become negative instead of wrapping around.
    acts['aday'] = acts['aday'].astype(np.int16) - d1

    # Aggregate straight to one row per user instead of merging the result
    # back onto the full activity log and de-duplicating afterwards.
    result = acts.groupby('uid')['aday'].max().reset_index().rename(columns = {'aday': 'lasta'})
    result['aday_gap'] = d2 - d1 + 1 - result['lasta']

    result = result.fillna(value=0)
    print('act done')
    return result

def atype_cnt(d1,d2,i):
    """Count, per user, the activity rows whose ``atype`` equals ``i``.

    Rows of the module-level ``useract`` with ``aday <= d2`` are considered;
    ``d1`` is accepted for signature symmetry but not used.  Returns a
    DataFrame with columns ``uid`` and ``atype<i>cnt``.
    """
    wanted = (useract.aday <= d2) & (useract.atype == i)
    per_user = useract.loc[wanted, ['uid','atype']].groupby(['uid'])['atype'].count()
    counts = per_user.reset_index().rename(columns = {'atype':'atype'+str(i)+'cnt'})
    counts = counts.drop_duplicates().fillna(value=0)
    print('用户各种type总数')
    return counts

def page_cnt(d1,d2,i):
    """Count, per user, the activity rows whose ``page`` equals ``i``.

    Rows of the module-level ``useract`` with ``aday <= d2`` are considered;
    ``d1`` is accepted for signature symmetry but not used.  Returns a
    DataFrame with columns ``uid`` and ``page<i>cnt``.
    """
    wanted = (useract.aday <= d2) & (useract.page == i)
    per_user = useract.loc[wanted, ['uid','page']].groupby(['uid'])['page'].count()
    counts = per_user.reset_index().rename(columns = {'page':'page'+str(i)+'cnt'})
    counts = counts.drop_duplicates().fillna(value=0)
    print('不同page下总操作数')
    return counts

def aidhot(d1,d2):
    """Return, per user, the mean row count of the authors they acted on.

    Uses rows of the module-level ``useract`` with ``aday <= d2`` (``d1`` is
    not used).  Each row is tagged with the number of rows sharing its
    ``aid`` and that number is averaged per ``uid``.  Returns a DataFrame
    with columns ``uid`` and ``aidhot``.
    """
    useract1 = useract.loc[(useract.aday <= d2),['uid','vid','aid']]

    # Name the count column explicitly; the label of an unnamed count
    # column (``0``) is not stable across pandas versions.
    useract2 = useract1.groupby(['aid'])['vid'].size().reset_index(name='aidcnt')
    useract1 = pd.merge(useract1,useract2,on = 'aid',how = 'left')
    useract2 = useract1.groupby(['uid'])['aidcnt'].mean().reset_index(name='aidhot')
    useract2 = useract2.fillna(value=0)
    print('aidhot done')
    return useract2

def vidhot(d1,d2):
    """Return, per user, the mean row count of the videos they acted on.

    Uses rows of the module-level ``useract`` with ``aday <= d2`` (``d1`` is
    not used).  Each row is tagged with the number of rows sharing its
    ``vid`` and that number is averaged per ``uid``.  Returns a DataFrame
    with columns ``uid`` and ``vidhot``.
    """
    useract1 = useract.loc[(useract.aday <= d2),['uid','vid','aid']]

    # Name the count column explicitly; the label of an unnamed count
    # column (``0``) is not stable across pandas versions.
    useract2 = useract1.groupby(['vid'])['aid'].size().reset_index(name='vidcnt')
    useract1 = pd.merge(useract1,useract2,on = 'vid',how = 'left')
    useract2 = useract1.groupby(['uid'])['vidcnt'].mean().reset_index(name='vidhot')
    useract2 = useract2.fillna(value=0)
    print('vidhot done')
    return useract2
    
def ave_act(d1,d2):
    """Return per-user activity totals and the average since first activity.

    Uses rows of the module-level ``useract`` with ``aday <= d2`` (no lower
    bound); days are re-expressed relative to ``d1``.

    Returns a DataFrame with one row per ``uid`` and the columns ``act_sum``
    (number of activity rows), ``firsta`` (earliest relative activity day),
    ``first_gap`` (``d2 - d1 + 1 - firsta``) and ``ave_act``
    (``act_sum / first_gap``).
    """
    acts = useract.loc[useract['aday'] <= d2, ['uid','aday']].copy()
    # ``aday`` is loaded as uint8; cast to a signed type before subtracting so
    # days before ``d1`` become negative instead of wrapping around (which
    # would make ``firsta`` miss any activity earlier than the window).
    acts['aday'] = acts['aday'].astype(np.int16) - d1

    # Aggregate directly per user; no need to merge back onto the full log.
    per_user = acts.groupby('uid')['aday']
    result = per_user.size().reset_index(name='act_sum')
    first_day = per_user.min().reset_index(name='firsta')
    result = pd.merge(result,first_day,on = 'uid',how = 'left')

    result['first_gap'] = d2 - d1 + 1 - result['firsta']
    result['ave_act'] = result['act_sum']/result['first_gap']
    result = result.fillna(value=0)
    print('ave_act done')
    return result

def adaycnt(d1,d2):
    """Return, per user, the number of distinct days with any activity.

    Uses rows of the module-level ``useract`` with ``aday <= d2`` (``d1`` is
    not used).  Returns a DataFrame with columns ``uid`` and ``adaycnt``.
    """
    active_days = useract.loc[(useract.aday <= d2),['uid','aday']].drop_duplicates()
    # Name the count column explicitly; the label of an unnamed count
    # column (``0``) is not stable across pandas versions.
    result = active_days.groupby(['uid'])['aday'].size().reset_index(name='adaycnt')
    result = result.fillna(value=0)
    print('用户有act的总天数')
    return result

def asum_n(d1,d2):
    """Return log-scaled daily activity counts, one column per window day.

    Uses rows of the module-level ``useract`` with ``aday <= d2``; days are
    re-expressed relative to ``d1``.  For every (user, day) the number of
    rows ``c`` is mapped to ``int(log(c + 1) / log(2))``.

    Returns a DataFrame with one row per ``uid`` that has any activity up to
    ``d2`` and the columns ``asum_0..asum_<d2-d1>``; days without activity
    are 0.
    """
    acts = useract.loc[useract['aday'] <= d2, ['uid','aday']].copy()
    # Cast away from uint8 so the subtraction is well defined for any ``d1``
    # (including negative ones) and never wraps around.
    acts['aday'] = acts['aday'].astype(np.int16) - d1

    # Activity count per (user, relative day), bucketed on a log2 scale.
    daily = acts.groupby(['uid','aday']).size().reset_index(name='asum')
    daily['asum'] = (np.log(daily['asum'] + 1)/np.log(2)).astype(int)

    # Start from one row per user and attach each day's bucket; merging onto
    # the per-user frame avoids repeatedly widening the per-day table.
    result = daily[['uid']].drop_duplicates()
    for i in range(0,d2-d1+1):
        day_i = daily.loc[daily['aday'] == i,['uid','asum']]
        day_i = day_i.rename(columns = {'asum':'asum_' + str(i)})
        result = pd.merge(result,day_i,on = 'uid',how = 'left')

    result = result.fillna(value=0)
    print('act_sum_n done')
    return result

def addlabel(d1,d2):
    """Return the users seen in any log between ``d1`` and ``d2`` inclusive.

    A user counts as seen when they have at least one row in the module-level
    ``applog``, ``vidlog`` or ``useract`` inside the day range.  Returns a
    DataFrame with columns ``uid`` and ``label`` (always 1).
    """
    seen = pd.concat([
        applog.loc[(applog.logday >= d1) & (applog.logday <= d2), 'uid'],
        vidlog.loc[(vidlog.pday >= d1) & (vidlog.pday <= d2), 'uid'],
        useract.loc[(useract.aday >= d1) & (useract.aday <= d2), 'uid'],
    ])
    labelled = pd.DataFrame({'uid': seen.drop_duplicates()})
    labelled['label'] = 1
    return labelled.fillna(value=0)

def predata(d1,d2):
    """Assemble the full per-user feature table for the window ``[d1, d2]``.

    Starts from ``prereg`` and left-joins every feature builder on ``uid``,
    then adds derived columns: ``ave_act_1``, the weighted sums ``asum1`` /
    ``psum1`` over relative days 9..15 (weights 1..7) and the per-type /
    per-page shares of ``act_sum``.  Duplicated rows are dropped and missing
    values are filled with 0.

    The derived columns reference ``asum_9..asum_15`` and ``p9day..p15day``,
    so the window must span at least 16 days.
    """
    window = (d1, d2)
    # (builder, arguments) pairs, joined in exactly this order.
    steps = [
        (atype_cnt, window + (0,)),
        (page_cnt, window + (0,)),
        (preapp, window),
        (atype_cnt, window + (1,)),
        (page_cnt, window + (1,)),
        (previd, window),
        (atype_cnt, window + (2,)),
        (page_cnt, window + (2,)),
        (preact, window),
        (atype_cnt, window + (3,)),
        (page_cnt, window + (3,)),
        (atype_cnt, window + (4,)),
        (page_cnt, window + (4,)),
        (atype_cnt, window + (5,)),
        (asum_n, window),
        (ave_act, window),
        (adaycnt, window),
    ]

    test = prereg(reguser,d1,d2)
    for build, args in steps:
        test = pd.merge(test,build(*args),on = 'uid',how = 'left')

    # Average number of actions per active day.
    test['ave_act_1'] = test['act_sum']/test['adaycnt']

    # Recency-weighted sums over the last seven relative days (9..15).
    for target, template in (('asum1', 'asum_%d'), ('psum1', 'p%dday')):
        weighted = test[template % 9]
        for weight, day in enumerate(range(10, 16), 2):
            weighted = weighted + test[template % day] * weight
        test[target] = weighted

    # Share of each action type / page among all actions.
    for prefix, how_many in (('atype', 6), ('page', 5)):
        for k in range(how_many):
            test['%s%d_ratio' % (prefix, k)] = test['%s%dcnt' % (prefix, k)]/test['act_sum']

    test = test.drop_duplicates()
    test = test.fillna(value=0)
    return test

def pretrain():
    """Build the labelled training table from two feature/label window pairs.

    Features from days 8-23 are labelled with activity on days 24-30, and
    features from days 1-16 with activity on days 17-23.  The two tables are
    stacked, duplicated rows dropped, and users without a label row get 0.
    """
    window_pairs = (((8, 23), (24, 30)), ((1, 16), (17, 23)))
    pieces = []
    for (f_start, f_end), (l_start, l_end) in window_pairs:
        features = predata(f_start, f_end)
        labels = addlabel(l_start, l_end)
        pieces.append(pd.merge(features,labels,on = 'uid',how = 'left'))

    train = pd.concat(pieces).drop_duplicates()
    train['label'] = train['label'].fillna(value=0)
    return train

def pretest():
    """Build the unlabelled feature table for days 15-30 (the prediction window)."""
    features = predata(15,30)
    return features.fillna(value=0).drop_duplicates()
    
# Materialise the prediction-time and training feature tables.
test = pretest()
train = pretrain()

# Quick sanity check of the resulting shapes and column sets.
print(train.shape)
print(train.columns)
print(test.shape)
print(test.columns)

# Extra labelled sample: features up to day 9, labelled with activity on
# days 10-16.
data_p = pd.merge(predata(-6,9),addlabel(10,16),on = 'uid',how = 'left')
data_p = data_p.fillna(value=0).drop_duplicates()

In [None]:
# Separate the target from the training features.  Note that ``pop`` mutates
# ``train``, so this cell cannot be re-run without rebuilding ``train``.
y = train.pop('label')
# ``res`` collects the per-model predictions for the test users.
res = pd.DataFrame()
res['uid'] = test['uid']
import lightgbm as lgb
# ``sklearn.cross_validation`` was removed from scikit-learn; the same
# function lives in ``sklearn.model_selection``, which this notebook
# already imports from elsewhere.
from sklearn.model_selection import train_test_split

In [None]:
# Base feature columns selected from the train / data_p / test tables before
# the pairwise ratio features are added.
feature = ['rday', 'rtype', 'dtype', 'dtypecnt', 'rtype_0', 'rtype_1', 'rtype_2',
       'rtype_3', 'rtype_4', 'rtype_5', 'dtype_0', 'dtype_1', 'dtype_2',
       'dtype_3', 'dtype_4', 'dtype_5', 'dtype_6', 'dtype_7', 'dtype_8',
       'dtype_9', 'dtype_10', 'dtype_11', 'dtype_12', 'dtype_13',
       'dtype_14', 'dtype_15', 'dtype_16', 'dtype_17', 'dtype_18',
       'dtype_19', 'vidcnts', 'watch', 'ave_watch', 'atype0cnt',
       'page0cnt', 'maxlog', 'lastlog', 'log_gap', 'logsum', 'log0day',
       'log1day', 'log2day', 'log3day', 'log4day', 'log5day', 'log6day',
       'log7day', 'log8day', 'log9day', 'log10day', 'log11day',
       'log12day', 'log13day', 'log14day', 'log15day', 'atype1cnt',
       'page1cnt', 'maxp', 'mostp', 'pdaycnt', 'lastp', 'pday_gap',
       'sump', 'p0day', 'p1day', 'p2day', 'p3day', 'p4day', 'p5day',
       'p6day', 'p7day', 'p8day', 'p9day', 'p10day', 'p11day', 'p12day',
       'p13day', 'p14day', 'p15day', 'ave_p', 'atype2cnt', 'page2cnt',
       'lasta', 'aday_gap', 'atype3cnt', 'page3cnt', 'atype4cnt',
       'page4cnt', 'atype5cnt', 'asum_0', 'asum_1', 'asum_2', 'asum_3',
       'asum_4', 'asum_5', 'asum_6', 'asum_7', 'asum_8', 'asum_9',
       'asum_10', 'asum_11', 'asum_12', 'asum_13', 'asum_14', 'asum_15',
       'act_sum', 'firsta', 'first_gap', 'ave_act', 'adaycnt',
       'ave_act_1', 'asum1', 'psum1', 'atype0_ratio', 'atype1_ratio',
       'atype2_ratio', 'atype3_ratio', 'atype4_ratio', 'atype5_ratio',
       'page0_ratio', 'page1_ratio', 'page2_ratio', 'page3_ratio',
       'page4_ratio'
      ]

In [None]:
# Feature-only views of the three tables.  Explicit copies are taken because
# new columns are added to these frames afterwards; writing into a bare
# column selection is a chained assignment.
temp_1 = train[feature].copy()
temp_2 = data_p[feature].copy()
temp_3 = test[feature].copy()

In [None]:
# Columns combined pairwise into ratio features.  A feature named "<i>X<j>"
# is (cross[i] + 1) / (cross[j] + 1), so the names depend on list positions.
# NOTE(review): 'page1cnt' is listed twice (positions 4 and 5), which makes
# "4X5" constant; left untouched because reordering would rename every
# feature after it.
cross = ['dtype', 'page0cnt', 'page3cnt', 'ave_act', 'page1cnt', 'page1cnt', 'atype0cnt',
        'page2cnt', 'atype1cnt', 'act_sum', 'rday', 'rtype', 'lastlog', 'logsum', 'maxlog',
        'atype2cnt', 'lasta', 'firsta', 'page4cnt', 'asum_15', 'asum_14', 'asum_12', 'asum_13',
        'log_gap', 'atype3cnt', 'asum_11', 'asum_10', 'lastp', 'asum_9', 'aday_gap', 'asum_8',
        'first_gap', 'asum_5', 'asum_7', 'asum_6', 'pday_gap']

i = 0
while i < len(cross):
    print(i)
    j = i + 1
    while j < len(cross):
        pair_name = str(i) + 'X' + str(j)
        temp_1[pair_name] = (temp_1[cross[i]]+1) / (temp_1[cross[j]]+1)
        temp_2[pair_name] = (temp_2[cross[i]]+1) / (temp_2[cross[j]]+1)
        temp_3[pair_name] = (temp_3[cross[i]]+1) / (temp_3[cross[j]]+1)
        j += 1
    i += 1
del pair_name
# Leave the loop indices as the original for-loops would.
i = len(cross) - 1
j = len(cross) - 1
        # print cross[i],cross[j]

In [None]:
# Columns fed to the tree models in the following cells: a subset of the
# base features plus a handful of the "<i>X<j>" pairwise ratio features.
feature_lgb = ['rday', 'rtype', 'dtype', 'dtypecnt', 'rtype_0', 'rtype_1', 'rtype_2',
       'rtype_3', 'rtype_4', 'rtype_5', 'maxlog', 'lastlog', 'log_gap',
       'logsum', 'log0day', 'log1day', 'log2day', 'log3day', 'log4day',
       'log5day', 'log6day', 'log7day', 'log8day', 'log9day', 'log10day',
       'log11day', 'log12day', 'log13day', 'log14day', 'log15day',
       'lastp', 'pday_gap', 'sump', 
       'p0day', 'p1day', 'p2day', 'p3day', 'p4day', 'p5day', 'p6day', 'p7day',
       'p8day', 'p9day', 'p10day', 'p11day', 'p12day', 'p13day',
       'p14day', 'p15day', 'lasta', 'aday_gap',  
        'act_sum', 'firsta', 'first_gap', 'ave_act', 
        'asum_9', 'asum_10','asum_11', 'asum_12', 'asum_13', 'asum_14', 'asum_15', 
        'atype0cnt', 'atype1cnt', 'atype2cnt', 'atype3cnt', 'atype4cnt', 'atype5cnt', 
        'page0cnt', 'page1cnt', 'page2cnt', 'page3cnt', 'page4cnt', 'vidcnts', 'watch', 'ave_watch',
      'dtype_0', 'dtype_1', 'dtype_2','dtype_3', 'dtype_4', 'dtype_5', 'dtype_6', 'dtype_7', 'dtype_8','dtype_9',
       'asum_1', 'asum_2', 'asum_3', 'asum_4', 'asum_5','asum_6', 'asum_7', 'asum_8',
       'maxp','10X13','0X3','0X10','0X11','0X13',
       '0X14','0X6','1X15','0X15','10X29','8X9','4X15',
      ]

In [None]:
# Split the training rows into five parts: a 20% hold-out first, then the
# remaining 80% halved twice (always with random_state=2018).
train_x, evals_x_0, train_y, evals_y_0 = train_test_split(temp_1[feature_lgb], y, test_size=0.2,

                                                      random_state=2018)
train_x, train_x1, train_y, train_y1 = train_test_split(train_x, train_y, test_size=0.5,
                                                      random_state=2018)
train_x_2, train_x_3, train_y_2, train_y_3 = train_test_split(train_x, train_y, test_size=0.5,
                                                      random_state=2018)
train_x_4, train_x_5, train_y_4, train_y_5 = train_test_split(train_x1, train_y1, test_size=0.5,
                                                      random_state=2018)

train_x_0 = [evals_x_0,train_x_2,train_x_3,train_x_4,train_x_5]
train_y_0 = [evals_y_0,train_y_2,train_y_3,train_y_4,train_y_5]
# Five rounds: part i is the evaluation set, the other four parts plus the
# whole data_p sample form the training set.
for i in range(0,5):    
    evals_x = train_x_0[i]
    evals_y = train_y_0[i]
    train_x = pd.DataFrame()
    # NOTE(review): the label Series are concatenated onto an empty DataFrame,
    # so train_y ends up as a DataFrame rather than a Series - confirm the
    # resulting column label against the installed pandas version.
    train_y = pd.DataFrame()
    for j in range(0,5):
        if j != i:
            train_x = pd.concat([train_x,train_x_0[j]])
            train_y = pd.concat([train_y,train_y_0[j]])
    
    train_x = pd.concat([train_x,temp_2[feature_lgb]])
    train_y = pd.concat([train_y,data_p['label']])
    
    print train_x.shape
    print evals_x.shape
    
    print("LGB test")
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=55, reg_alpha=0.0, reg_lambda=1,
        max_depth=-1, n_estimators=10000, objective='binary',
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs=-1
    )
    # AUC is tracked on both the training and the evaluation set, with an
    # early-stopping patience of 500 rounds.
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y),(evals_x, evals_y)], 
            eval_metric={'auc'},early_stopping_rounds=500, categorical_feature=['rtype', ])
    
    
    # Positive-class probability for the test users from this round's model.
    res['lgb' + str(i)] = clf.predict_proba(temp_3[feature_lgb])[:,1]

# Average of the five per-round predictions.
res['lgb_1'] = (res['lgb0']+res['lgb1']+res['lgb2']+res['lgb3']+res['lgb4'])/5

In [None]:
# ``xgb`` is used below but was never imported anywhere earlier in the
# notebook, so a fresh kernel would stop here with a NameError.
import xgboost as xgb

# Split the training rows into five parts: a 20% hold-out first, then the
# remaining 80% halved twice (always with random_state=2018).
train_x, evals_x_0, train_y, evals_y_0 = train_test_split(temp_1[feature_lgb], y, test_size=0.2,
                                                      random_state=2018)
train_x, train_x1, train_y, train_y1 = train_test_split(train_x, train_y, test_size=0.5,
                                                      random_state=2018)
train_x_2, train_x_3, train_y_2, train_y_3 = train_test_split(train_x, train_y, test_size=0.5,
                                                      random_state=2018)
train_x_4, train_x_5, train_y_4, train_y_5 = train_test_split(train_x1, train_y1, test_size=0.5,
                                                      random_state=2018)

train_x_0 = [evals_x_0,train_x_2,train_x_3,train_x_4,train_x_5]
train_y_0 = [evals_y_0,train_y_2,train_y_3,train_y_4,train_y_5]
# Five rounds: part i is the evaluation set, the other four parts plus the
# whole data_p sample form the training set.
for i in range(0,5):
    evals_x = train_x_0[i]
    evals_y = train_y_0[i]
    train_x = pd.DataFrame()
    # NOTE(review): the label Series are concatenated onto an empty DataFrame,
    # so train_y ends up as a DataFrame rather than a Series - kept as is.
    train_y = pd.DataFrame()
    for j in range(0,5):
        if j != i:
            train_x = pd.concat([train_x,train_x_0[j]])
            train_y = pd.concat([train_y,train_y_0[j]])

    train_x = pd.concat([train_x,temp_2[feature_lgb]])
    train_y = pd.concat([train_y,data_p['label']])

    print(train_x.shape)
    print(evals_x.shape)

    print("XGB test")
    dtrain = xgb.DMatrix(train_x, label=train_y)
    devals = xgb.DMatrix(evals_x, label=evals_y)
    dtest = xgb.DMatrix(temp_3[feature_lgb])
    params = {'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth': 7,
        'lambda': 0,
        'subsample': 1.0,
        'colsample_bytree': 0.75,
        'min_child_weight': 50,
        'eta': 0.01,
        'seed': 2018,
        # 'nthread': 4,
        'silent': 1}
    # The evaluation set is listed last in the watchlist passed to xgb.train.
    watchlist = [(dtrain, 'train'),(devals,'evals')]
    bst = xgb.train(params, dtrain, num_boost_round=10000, evals=watchlist,early_stopping_rounds=300)

    # Prediction for the test users from this round's booster.
    res['xgb' + str(i)] = bst.predict(dtest)

# Average of the five per-round predictions.
res['xgb_1'] = (res['xgb0']+res['xgb1']+res['xgb2']+res['xgb3']+res['xgb4'])/5

In [None]:
# Split the training rows into five parts: a 20% hold-out first, then the
# remaining 80% halved twice (always with random_state=2018).
train_x, evals_x_0, train_y, evals_y_0 = train_test_split(temp_1[feature_lgb], y, test_size=0.2,

                                                      random_state=2018)
train_x, train_x1, train_y, train_y1 = train_test_split(train_x, train_y, test_size=0.5,
                                                      random_state=2018)
train_x_2, train_x_3, train_y_2, train_y_3 = train_test_split(train_x, train_y, test_size=0.5,
                                                      random_state=2018)
train_x_4, train_x_5, train_y_4, train_y_5 = train_test_split(train_x1, train_y1, test_size=0.5,
                                                      random_state=2018)

train_x_0 = [evals_x_0,train_x_2,train_x_3,train_x_4,train_x_5]
train_y_0 = [evals_y_0,train_y_2,train_y_3,train_y_4,train_y_5]
# Five rounds: part i is the evaluation set, the other four parts plus the
# whole data_p sample form the training set.
for i in range(0,5):    
    evals_x = train_x_0[i]
    evals_y = train_y_0[i]
    train_x = pd.DataFrame()
    train_y = pd.DataFrame()
    for j in range(0,5):
        if j != i:
            train_x = pd.concat([train_x,train_x_0[j]])
            train_y = pd.concat([train_y,train_y_0[j]])
    
    train_x = pd.concat([train_x,temp_2[feature_lgb]])
    train_y = pd.concat([train_y,data_p['label']])
    # train_y = np.array(train_y.tolist())
    print train_x.shape
    print evals_x.shape
    # print train_y[0]
    print("LGB test")
    # DART booster trained with a regression objective but monitored with AUC.
    params = {
        'boosting':'dart',
        'colsample_bytree':0.7,
        'learning_rate':0.01,
        'metric':'auc',
        'min_child_samples':50,
        'num_leaves':55,
        'objective':'regression',
        'reg_alpha':0,
        'reg_lambda':1,
        'seed':2018,
        'silent':1,
        'subsample':0.7,
        'verbose':1
    }
    # NOTE(review): train_y[0] selects the column labelled 0 of the
    # concatenated label frame - confirm pd.concat names it that way on the
    # installed pandas version.
    train_part = lgb.Dataset(train_x,label=train_y[0])
    evals = lgb.Dataset(evals_x,label=evals_y)
    bst = lgb.train(params,train_part, 
                  num_boost_round=20000, valid_sets=[train_part,evals], 
                  valid_names=['train','evals'], fobj=None,feval=None,feature_name='auto', 
                  categorical_feature='auto', early_stopping_rounds=1000,
                  evals_result=None, verbose_eval=True, learning_rates=None, 
                  keep_training_booster=False, callbacks=None)
    
    
    # Prediction for the test users from this round's booster.
    res['reg' + str(i)] = bst.predict(temp_3[feature_lgb])
    
# Average of the five per-round predictions.
res['reg_1'] = (res['reg0']+res['reg1']+res['reg2']+res['reg3']+res['reg4'])/5

In [None]:
# Second feature set for the tree models: the same list as the earlier
# feature_lgb except that 'dtypecnt' and 'log_gap' are left out.  Rebinding
# the name means the cells below train on this reduced set.
feature_lgb = ['rday', 'rtype', 'dtype', 'rtype_0', 'rtype_1', 'rtype_2',
       'rtype_3', 'rtype_4', 'rtype_5', 'maxlog', 'lastlog', 
       'logsum', 'log0day', 'log1day', 'log2day', 'log3day', 'log4day',
       'log5day', 'log6day', 'log7day', 'log8day', 'log9day', 'log10day',
       'log11day', 'log12day', 'log13day', 'log14day', 'log15day',
       'lastp', 'pday_gap', 'sump', 
       'p0day', 'p1day', 'p2day', 'p3day', 'p4day', 'p5day', 'p6day', 'p7day',
       'p8day', 'p9day', 'p10day', 'p11day', 'p12day', 'p13day',
       'p14day', 'p15day', 'lasta', 'aday_gap',  
        'act_sum', 'firsta', 'first_gap', 'ave_act', 
        'asum_9', 'asum_10','asum_11', 'asum_12', 'asum_13', 'asum_14', 'asum_15', 
        'atype0cnt', 'atype1cnt', 'atype2cnt', 'atype3cnt', 'atype4cnt', 'atype5cnt', 
        'page0cnt', 'page1cnt', 'page2cnt', 'page3cnt', 'page4cnt', 'vidcnts', 'watch', 'ave_watch',
      'dtype_0', 'dtype_1', 'dtype_2','dtype_3', 'dtype_4', 'dtype_5', 'dtype_6', 'dtype_7', 'dtype_8','dtype_9',
       'asum_1', 'asum_2', 'asum_3', 'asum_4', 'asum_5','asum_6', 'asum_7', 'asum_8',
       'maxp','10X13','0X3','0X10','0X11','0X13',
       '0X14','0X6','1X15','0X15','10X29','8X9','4X15',
      ]

In [None]:
# Same five-part split and training loop as the first LightGBM cell, run on
# the current (re-defined) feature_lgb.  The per-round columns
# res['lgb0'..'lgb4'] are overwritten; only the average differs in name.
train_x, evals_x_0, train_y, evals_y_0 = train_test_split(temp_1[feature_lgb], y, test_size=0.2,

                                                      random_state=2018)
train_x, train_x1, train_y, train_y1 = train_test_split(train_x, train_y, test_size=0.5,
                                                      random_state=2018)
train_x_2, train_x_3, train_y_2, train_y_3 = train_test_split(train_x, train_y, test_size=0.5,
                                                      random_state=2018)
train_x_4, train_x_5, train_y_4, train_y_5 = train_test_split(train_x1, train_y1, test_size=0.5,
                                                      random_state=2018)

train_x_0 = [evals_x_0,train_x_2,train_x_3,train_x_4,train_x_5]
train_y_0 = [evals_y_0,train_y_2,train_y_3,train_y_4,train_y_5]
# Five rounds: part i is the evaluation set, the other four parts plus the
# whole data_p sample form the training set.
for i in range(0,5):    
    evals_x = train_x_0[i]
    evals_y = train_y_0[i]
    train_x = pd.DataFrame()
    train_y = pd.DataFrame()
    for j in range(0,5):
        if j != i:
            train_x = pd.concat([train_x,train_x_0[j]])
            train_y = pd.concat([train_y,train_y_0[j]])
    
    train_x = pd.concat([train_x,temp_2[feature_lgb]])
    train_y = pd.concat([train_y,data_p['label']])
    
    print train_x.shape
    print evals_x.shape
    
    print("LGB test")
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=55, reg_alpha=0.0, reg_lambda=1,
        max_depth=-1, n_estimators=10000, objective='binary',
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs=-1
    )
    # AUC is tracked on both the training and the evaluation set, with an
    # early-stopping patience of 500 rounds.
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y),(evals_x, evals_y)], 
            eval_metric={'auc'},early_stopping_rounds=500, categorical_feature=['rtype', ])
    
    
    # Positive-class probability for the test users from this round's model.
    res['lgb' + str(i)] = clf.predict_proba(temp_3[feature_lgb])[:,1]

# Average of the five per-round predictions for the second feature set.
res['lgb_2'] = (res['lgb0']+res['lgb1']+res['lgb2']+res['lgb3']+res['lgb4'])/5

In [None]:
# ``xgb`` is used below but is not imported by any earlier cell of the
# notebook; importing it here keeps this cell runnable on its own.
import xgboost as xgb

# Same five-part split and training loop as the first XGBoost cell, run on
# the current (re-defined) feature_lgb.  The per-round columns
# res['xgb0'..'xgb4'] are overwritten; only the average differs in name.
train_x, evals_x_0, train_y, evals_y_0 = train_test_split(temp_1[feature_lgb], y, test_size=0.2,
                                                      random_state=2018)
train_x, train_x1, train_y, train_y1 = train_test_split(train_x, train_y, test_size=0.5,
                                                      random_state=2018)
train_x_2, train_x_3, train_y_2, train_y_3 = train_test_split(train_x, train_y, test_size=0.5,
                                                      random_state=2018)
train_x_4, train_x_5, train_y_4, train_y_5 = train_test_split(train_x1, train_y1, test_size=0.5,
                                                      random_state=2018)

train_x_0 = [evals_x_0,train_x_2,train_x_3,train_x_4,train_x_5]
train_y_0 = [evals_y_0,train_y_2,train_y_3,train_y_4,train_y_5]
# Five rounds: part i is the evaluation set, the other four parts plus the
# whole data_p sample form the training set.
for i in range(0,5):
    evals_x = train_x_0[i]
    evals_y = train_y_0[i]
    train_x = pd.DataFrame()
    # NOTE(review): the label Series are concatenated onto an empty DataFrame,
    # so train_y ends up as a DataFrame rather than a Series - kept as is.
    train_y = pd.DataFrame()
    for j in range(0,5):
        if j != i:
            train_x = pd.concat([train_x,train_x_0[j]])
            train_y = pd.concat([train_y,train_y_0[j]])

    train_x = pd.concat([train_x,temp_2[feature_lgb]])
    train_y = pd.concat([train_y,data_p['label']])

    print(train_x.shape)
    print(evals_x.shape)

    print("XGB test")
    dtrain = xgb.DMatrix(train_x, label=train_y)
    devals = xgb.DMatrix(evals_x, label=evals_y)
    dtest = xgb.DMatrix(temp_3[feature_lgb])
    params = {'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth': 7,
        'lambda': 0,
        'subsample': 1.0,
        'colsample_bytree': 0.75,
        'min_child_weight': 50,
        'eta': 0.01,
        'seed': 2018,
        # 'nthread': 4,
        'silent': 1}
    # The evaluation set is listed last in the watchlist passed to xgb.train.
    watchlist = [(dtrain, 'train'),(devals,'evals')]
    bst = xgb.train(params, dtrain, num_boost_round=10000, evals=watchlist,early_stopping_rounds=300)

    # Prediction for the test users from this round's booster.
    res['xgb' + str(i)] = bst.predict(dtest)

# Average of the five per-round predictions for the second feature set.
res['xgb_2'] = (res['xgb0']+res['xgb1']+res['xgb2']+res['xgb3']+res['xgb4'])/5

In [None]:
# Same five-part split and DART training loop as the earlier 'reg' cell, run
# on the current (re-defined) feature_lgb.  The per-round columns
# res['reg0'..'reg4'] are overwritten; only the average differs in name.
train_x, evals_x_0, train_y, evals_y_0 = train_test_split(temp_1[feature_lgb], y, test_size=0.2,

                                                      random_state=2018)
train_x, train_x1, train_y, train_y1 = train_test_split(train_x, train_y, test_size=0.5,
                                                      random_state=2018)
train_x_2, train_x_3, train_y_2, train_y_3 = train_test_split(train_x, train_y, test_size=0.5,
                                                      random_state=2018)
train_x_4, train_x_5, train_y_4, train_y_5 = train_test_split(train_x1, train_y1, test_size=0.5,
                                                      random_state=2018)

train_x_0 = [evals_x_0,train_x_2,train_x_3,train_x_4,train_x_5]
train_y_0 = [evals_y_0,train_y_2,train_y_3,train_y_4,train_y_5]
# Five rounds: part i is the evaluation set, the other four parts plus the
# whole data_p sample form the training set.
for i in range(0,5):    
    evals_x = train_x_0[i]
    evals_y = train_y_0[i]
    train_x = pd.DataFrame()
    train_y = pd.DataFrame()
    for j in range(0,5):
        if j != i:
            train_x = pd.concat([train_x,train_x_0[j]])
            train_y = pd.concat([train_y,train_y_0[j]])
    
    train_x = pd.concat([train_x,temp_2[feature_lgb]])
    train_y = pd.concat([train_y,data_p['label']])
    # train_y = np.array(train_y.tolist())
    print train_x.shape
    print evals_x.shape
    # print train_y[0]
    print("LGB test")
    # DART booster trained with a regression objective but monitored with AUC.
    params = {
        'boosting':'dart',
        'colsample_bytree':0.7,
        'learning_rate':0.01,
        'metric':'auc',
        'min_child_samples':50,
        'num_leaves':55,
        'objective':'regression',
        'reg_alpha':0,
        'reg_lambda':1,
        'seed':2018,
        'silent':1,
        'subsample':0.7,
        'verbose':1
    }
    # NOTE(review): train_y[0] selects the column labelled 0 of the
    # concatenated label frame - confirm pd.concat names it that way on the
    # installed pandas version.
    train_part = lgb.Dataset(train_x,label=train_y[0])
    evals = lgb.Dataset(evals_x,label=evals_y)
    bst = lgb.train(params,train_part, 
                  num_boost_round=20000, valid_sets=[train_part,evals], 
                  valid_names=['train','evals'], fobj=None,feval=None,feature_name='auto', 
                  categorical_feature='auto', early_stopping_rounds=1000,
                  evals_result=None, verbose_eval=True, learning_rates=None, 
                  keep_training_booster=False, callbacks=None)
    
    
    # Prediction for the test users from this round's booster.
    res['reg' + str(i)] = bst.predict(temp_3[feature_lgb])
    
# Average of the five per-round predictions for the second feature set.
res['reg_2'] = (res['reg0']+res['reg1']+res['reg2']+res['reg3']+res['reg4'])/5

In [None]:
import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from keras.layers import LSTM, Embedding
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, Callback, ModelCheckpoint
# from keras import callbacks
from keras import optimizers

In [None]:
# Input columns for the neural network. Each name appears exactly once:
# 'maxp' used to be listed a second time at the end of the list, which fed the
# same column to the network twice and inflated `len(feature_nn)`.
feature_nn = ['rday', 'rtype', 'dtype', 'rtype_0', 'rtype_1', 'rtype_2',
       'rtype_3', 'rtype_4', 'rtype_5', 'dtype_0', 'dtype_1', 'dtype_2',
       'dtype_3', 'dtype_4', 'dtype_5', 'dtype_6', 'dtype_7', 'dtype_8',
       'dtype_9', 'dtype_10', 'dtype_11', 'dtype_12', 'dtype_13',
       'dtype_14', 'dtype_15', 'dtype_16', 'dtype_17', 'dtype_18',
       'dtype_19', 'vidcnts', 'watch', 'ave_watch', 'atype0cnt',
       'page0cnt', 'maxlog', 'lastlog', 'log_gap', 'logsum', 'log0day',
       'log1day', 'log2day', 'log3day', 'log4day', 'log5day', 'log6day',
       'log7day', 'log8day', 'log9day', 'log10day', 'log11day',
       'log12day', 'log13day', 'log14day', 'log15day', 'atype1cnt',
       'page1cnt', 'maxp', 'mostp', 'pdaycnt', 'lastp', 'pday_gap',
       'sump', 'p0day', 'p1day', 'p2day', 'p3day', 'p4day', 'p5day',
       'p6day', 'p7day', 'p8day', 'p9day', 'p10day', 'p11day', 'p12day',
       'p13day', 'p14day', 'p15day', 'ave_p', 'atype2cnt', 'page2cnt',
       'lasta', 'aday_gap', 'atype3cnt', 'page3cnt', 'atype4cnt',
       'page4cnt', 'atype5cnt', 'asum_0', 'asum_1', 'asum_2', 'asum_3',
       'asum_4', 'asum_5', 'asum_6', 'asum_7', 'asum_8', 'asum_9',
       'asum_10', 'asum_11', 'asum_12', 'asum_13', 'asum_14', 'asum_15',
       'act_sum', 'firsta', 'first_gap', 'ave_act', 'adaycnt',
       'ave_act_1', 'asum1', 'psum1', 'atype0_ratio', 'atype1_ratio',
       'atype2_ratio', 'atype3_ratio', 'atype4_ratio', 'atype5_ratio',
       'page0_ratio', 'page1_ratio', 'page2_ratio', 'page3_ratio',
       'page4_ratio',
      '10X13','0X3','0X10','0X11','0X13','dtypecnt',
      ]

# Hold out 20% of `temp_1` for validation (fixed seed for repeatability).
train_x, evals_x, train_y, evals_y = train_test_split(temp_1[feature_nn], y, test_size=0.2,
                                                      random_state=2018)

# The `temp_2` rows and their `data_p` labels are appended to the training
# part only; the validation part stays untouched.
train_x = pd.concat([train_x,temp_2[feature_nn]])
train_y = pd.concat([train_y,data_p['label']])

class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a fixed validation set.

    Parameters
    ----------
    validation_data : tuple
        ``(x_val, y_val)`` pair. It must contain exactly two items; the
        default empty tuple cannot be unpacked and raises ``ValueError``.
    interval : int
        The score is computed on epochs whose zero-based index is a multiple
        of ``interval``.
    """
    def __init__(self, validation_data=(), interval=1):
        # super() must name this class: super(Callback, self) would start the
        # lookup *after* Callback and skip Callback.__init__ entirely.
        super(RocAucEvaluation, self).__init__()
        self.interval = interval
        self.x_val,self.y_val = validation_data

    def on_epoch_end(self, epoch, log=None):
        """Print the validation ROC-AUC; ``log`` is accepted but not used."""
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.x_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # epoch is zero-based, so report it one-based.
            print('\n ROC_AUC - epoch:%d - score:%.6f \n' % (epoch+1, score))

# Dense binary classifier over the `feature_nn` columns; its test predictions
# are stored in res['nn_1'].
max_features = len(feature_nn)

# Three 128-unit dense layers followed by a single sigmoid output. The first
# dense layer has no activation of its own; the other two use PReLU.
model = Sequential([
    Dense(128, input_shape=(max_features,)),
    BatchNormalization(),
    Dropout(.2),

    Dense(128),
    PReLU(),
    BatchNormalization(),
    Dropout(.05),

    Dense(128),
    PReLU(),
    BatchNormalization(),
    Dropout(.05),

    Dense(1, activation='sigmoid'),
])

adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['acc'])

# Validation AUC is only printed; early stopping and the learning-rate
# schedule both watch the training 'loss'.
auc_printer = RocAucEvaluation(validation_data=(evals_x,evals_y), interval=1)
early_stop = EarlyStopping(monitor='loss', patience=3, verbose=0)
lr_schedule = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=1,
                                verbose=1, epsilon=1e-4, mode='min')
callbacks = [auc_printer, early_stop, lr_schedule]

model.fit(
    train_x, train_y,
    batch_size=256 * 4,
    epochs=100,
    validation_data=(evals_x,evals_y),
    class_weight = {0:1,1:1},   # both classes weighted equally
    callbacks=callbacks,
)

res['nn_1'] = model.predict(test[feature_nn], verbose=0)

In [None]:
# Final score: a weighted mix of three inner blends of the model outputs.
# The outer weights (0.42, 0.2, 0.28) add up to 0.9, not 1.
blend_a = res['xgb_1']*0.4 + res['lgb_1']*0.6
blend_b = res['nn_1']*0.1 + res['xgb_2']*0.45 + res['lgb_2']*0.45
blend_c = res['reg_1']*0.6 + res['reg_2']*0.4
res['score'] = blend_a*0.42 + blend_b*0.2 + blend_c*0.28

In [None]:
# Working frame for post-processing: a few columns of `test` plus the blended
# score (aligned on the index, as with a plain column assignment).
temp = test.loc[:, ['uid', 'rday', 'lastlog', 'rtype']].assign(score=res['score'])

In [None]:
# For every distinct score value, count how many rows share it and attach the
# count to `temp` as 'scorecnt'.
# groupby(...).size() does not depend on which columns are selected, so the
# deprecated tuple-style column selection is dropped.
temp1 = temp.groupby('score').size().reset_index(name='scorecnt')
# Drop a 'scorecnt' left over from an earlier run of this cell; otherwise the
# merge would produce scorecnt_x / scorecnt_y and the sort below would fail.
temp = pd.merge(temp.drop('scorecnt', axis=1, errors='ignore'), temp1,
                on='score', how='right')
# Displayed only (the sorted frame is not stored). sort_values replaces the
# long-deprecated sort_index(by=...).
temp.sort_values(by='scorecnt', ascending=False)

In [None]:
# Force the score to 0 for rows whose exact score value is shared by at least
# 50 rows, is at most 0.05, and whose `rday` is at most 15.
zero_mask = (temp.scorecnt >= 50) & (temp.score <= 0.05) & (temp.rday <= 15)
temp.loc[zero_mask, 'score'] = 0

# Submission file: uid,score per line, no header and no index column.
temp.to_csv('mysubmission.csv', columns=['uid', 'score'], index=False, header=False)

In [None]:
!wget -nv -O kesci_submit https://cdn.kesci.com/submit_tool/v1/kesci_submit&&chmod +x kesci_submit
!./kesci_submit -token 0e0bfea8c2a86f26 -file mysubmission.csv

In [None]:
import pandas as pd
# Reload the cached prediction table into `res`.
# NOTE(review): the cell that writes this file appears further down, so on a
# fresh environment this read presumably fails until that cell has been run.
cached_res_path = '/home/kesci/work/lgb+xgb.csv'
res = pd.read_csv(cached_res_path)

In [None]:
# Persist the prediction table `res`; index=None means no index column is written.
res_cache_path = '/home/kesci/work/lgb+xgb.csv'
res.to_csv(res_cache_path, index = None)

In [None]:
# Rebuild the working frame from `test`. It has no 'score' or 'scorecnt'
# column after this line.
kept_columns = ['uid', 'rday', 'lastlog', 'rtype']
temp = test.loc[:, kept_columns]

In [None]:
# Post-process the blended score and write the submission file.
#
# Rules tried earlier and currently disabled (kept as a record):
#   * score = 0 where lastlog == 9 and 0.0468785 < score < 0.0468795
#   * score = 0 where (rday + lastlog == 16 and rday >= 7) or lastlog == 0
#   * score = 0 where scorecnt >= 10 and score <= 0.087 and rday <= 15
#   * score = 0 where scorecnt >= 50 and score <= 0.087
#   * score = 0 where scorecnt >= 10000
# A seaborn bar plot of the number of rows per `rday` was also drawn here.

# The preceding cell rebuilds `temp` from `test` without these two columns, so
# they are attached here when missing. Without this the rule below fails on a
# top-to-bottom run.
# NOTE(review): assumes `res` carries a 'score' column at this point -- confirm.
if 'score' not in temp.columns:
    temp['score'] = res['score']
if 'scorecnt' not in temp.columns:
    # Number of rows sharing each exact score value.
    temp['scorecnt'] = temp['score'].map(temp['score'].value_counts())

# Active rule: zero the score where the exact value is shared by at least 50
# rows, is at most 0.05, and `rday` is at most 15.
temp.loc[((temp.scorecnt >= 50) & (temp.score <= 0.05 ) & (temp.rday <= 15 )),'score'] = 0

temp[['uid','score']].to_csv('mysubmission.csv', index=False, header=False)
temp.describe()

In [None]:
# For every distinct score value, count how many rows share it and attach the
# count to `temp` as 'scorecnt'.
# groupby(...).size() does not depend on which columns are selected, so the
# deprecated tuple-style column selection is dropped.
temp1 = temp.groupby('score').size().reset_index(name='scorecnt')
# Drop a 'scorecnt' that is already present (for example from an earlier run
# of this cell); otherwise the merge would produce scorecnt_x / scorecnt_y and
# the sort below would fail.
temp = pd.merge(temp.drop('scorecnt', axis=1, errors='ignore'), temp1,
                on='score', how='right')
# Displayed only (the sorted frame is not stored). sort_values replaces the
# long-deprecated sort_index(by=...).
temp.sort_values(by='scorecnt', ascending=False)

In [None]:
# Bar chart of the number of `reguser` rows for each `rday` value.
# NOTE(review): `plt`, `sns` and `color` are only set up in commented-out lines
# in the visible part of the notebook -- confirm they are defined earlier.
temp1 = reguser.loc[:,['uid','rday']]
cnt = temp1.groupby('rday').size()
plt.figure(figsize=(12,6))
# x / y are passed by keyword: newer seaborn releases no longer accept them
# positionally, and older releases accept the keywords as well.
sns.barplot(x=cnt.index, y=cnt.values, alpha=0.8, color=color[0])
# The bars are row counts per day, so the y axis is labelled as a count
# (it used to read 'CVR').
plt.ylabel('register count', fontsize=12)
plt.xlabel('day', fontsize=12)
plt.show()