In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

In [2]:
pd.set_option('display.max_columns',None)

In [3]:
#读取数据
age_train = pd.read_csv("age_train.csv", names=['uid','age_group'])
age_test = pd.read_csv("age_test.csv", names=['uid'])
user_basic_info = pd.read_csv("user_basic_info.csv", names=['uid','gender','city','prodName','ramCapacity','ramLeftRation','romCapacity','romLeftRation','color','fontSize','ct','carrier','os'])
user_behavior_info = pd.read_csv("user_behavior_info.csv", names=['uid','bootTimes','AFuncTimes','BFuncTimes','CFuncTimes','DFuncTimes','EFuncTimes','FFuncTimes','FFuncSum'])
user_app_actived = pd.read_csv("user_app_actived.csv", names=['uid','appId'])
#user_app_usage = pd.read_csv("user_app_usage.csv")
app_info = pd.read_csv("app_info.csv", names=['appId', 'category'])

In [None]:
def f(x):
    s = x.value_counts()
    return np.nan if len(s) == 0 else s.index[0]
#处理数据量较大的user_app_usage.csv，结合app_info.csv简单统计得到appuseProcessed.csv作为特征
def processUserAppUsage():
    resTable = pd.DataFrame()
    reader = pd.read_csv("user_app_usage.csv", names=['uid','appId','duration','times','use_date'], iterator=True)
    last_df = pd.DataFrame()
    
    app_info = pd.read_csv("app_info.csv", names=['appId','category'])
    cats = list(set(app_info['category']))
    category2id = dict(zip(sorted(cats), range(0,len(cats))))
    id2category = dict(zip(range(0,len(cats)), sorted(cats)))
    app_info['category'] = app_info['category'].apply(lambda x: category2id[x])
    i = 1
    
    while True:
        try:
            print("index: {}".format(i))
            i+=1
            df = reader.get_chunk(1000000)
            df = pd.concat([last_df, df])
            idx = df.shape[0]-1
            last_user = df.iat[idx,0]
            while(df.iat[idx,0]==last_user):
                idx-=1
            last_df = df[idx+1:]
            df = df[:idx+1]

            now_df = pd.DataFrame()
            now_df['uid'] = df['uid'].unique()
            now_df = now_df.merge(df.groupby('uid')['appId'].count().to_frame(), how='left', on='uid')
            now_df = now_df.merge(df.groupby('uid')['appId','use_date'].agg(['nunique']), how='left', on='uid')
            now_df = now_df.merge(df.groupby('uid')['duration','times'].agg(['mean','max','std']), how='left', on='uid')    

            now_df.columns = ['uid','usage_cnt','usage_appid_cnt','usage_date_cnt','duration_mean','duration_max','duration_std','times_mean','times_max','times_std']


            df = df.merge(app_info, how='left', on='appId')
            now_df = now_df.merge(df.groupby('uid')['category'].nunique().to_frame(), how='left', on='uid')
            print(df.groupby(['uid'])['category'].value_counts().index[0])
            now_df['usage_most_used_category'] = df.groupby(['uid'])['category'].transform(f)
            resTable = pd.concat([resTable, now_df])
        except StopIteration:
            break
    
    resTable.to_csv("appuseProcessed.csv",index=0)
    
    print("Iterator is stopped")

In [4]:
#将user_basic_info.csv 和 user_behavior_info.csv中的字符值编码成可以训练的数值类型，合并
class2id = {}
id2class = {}
def mergeBasicTables(baseTable):
    resTable = baseTable.merge(user_basic_info, how='left', on='uid', suffixes=('_base0', '_ubaf'))
    resTable = resTable.merge(user_behavior_info, how='left', on='uid', suffixes=('_base1', '_ubef'))
    cat_columns = ['city','prodName','color','carrier','os','ct']
    for c in cat_columns:
        resTable[c] = resTable[c].apply(lambda x: x if type(x)==str else str(x))
        sort_temp = sorted(list(set(resTable[c])))  
        class2id[c+'2id'] = dict(zip(sort_temp, range(1, len(sort_temp)+1)))
        id2class['id2'+c] = dict(zip(range(1,len(sort_temp)+1), sort_temp))
        resTable[c] = resTable[c].apply(lambda x: class2id[c+'2id'][x])
        
    return resTable

In [5]:
#处理app使用相关数据
#对user_app_actived.csv简单统计
#将之前训练的appuseProcess.csv进行合并
def mergeAppData(baseTable):
    resTable = baseTable.merge(user_app_actived, how='left', on='uid')
    resTable['appId'] = resTable['appId'].apply(lambda x: len(list(x.split('#'))))
    appusedTable = pd.read_csv("appuseProcessed.csv")
    resTable = resTable.merge(appusedTable, how='left', on='uid')
    resTable[['category', 'usage_most_used_category']] = resTable[['category', 'usage_most_used_category']].fillna(41)
    resTable = resTable.fillna(0)
    #print(resTable[:5])
    return resTable

In [6]:
#合并用户基本特征以及app使用相关特征，作为训练集和测试集
df_train = mergeAppData(mergeBasicTables(age_train))
df_test = mergeAppData(mergeBasicTables(age_test))
print(df_train.shape)
print(df_test.shape)

(2010000, 34)
(502500, 33)


In [7]:
#训练模型

from sklearn.feature_selection import SelectFromModel, VarianceThreshold, SelectKBest, chi2, mutual_info_classif, f_classif
from sklearn.preprocessing import Imputer
from sklearn.ensemble import ExtraTreesClassifier

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold

In [8]:
print("训练模型：")
param = {
        'learning_rate': 0.1,
        'lambda_l1': 0.1,
        'lambda_l2': 0.2,
        'max_depth': 20,
        'objective': 'multiclass',
        'num_class': 7,
        'num_leaves': 31,
        'min_data_in_leaf': 50,
        'max_bin': 230,
        'feature_fraction': 0.8,
        'metric': 'multi_error'
        }

X = df_train.drop(['age_group','uid'], axis=1)
y = df_train['age_group']
uid = df_test['uid']
test = df_test.drop('uid', axis=1)

xx_score = []
cv_pred = []
skf = StratifiedKFold(n_splits=3, random_state=1030, shuffle=True)
for index, (train_index, vali_index) in enumerate(skf.split(X, y)):
    print(index)
    x_train, y_train, x_vali, y_vali = np.array(X)[train_index], np.array(y)[train_index], np.array(X)[vali_index], np.array(y)[vali_index]
    train = lgb.Dataset(x_train, y_train)
    vali =lgb.Dataset(x_vali, y_vali)
    print("training start...")
    model = lgb.train(param, train, num_boost_round=1000, valid_sets=[vali], early_stopping_rounds=50)
    xx_pred = model.predict(x_vali,num_iteration=model.best_iteration)
    xx_pred = [np.argmax(x) for x in xx_pred]
    xx_score.append(f1_score(y_vali,xx_pred,average='weighted'))
    y_test = model.predict(test,num_iteration=model.best_iteration)
    y_test = [np.argmax(x) for x in y_test]
    if index == 0:
        cv_pred = np.array(y_test).reshape(-1, 1)
    else:
        cv_pred = np.hstack((cv_pred, np.array(y_test).reshape(-1, 1)))
        
submit = []
for line in cv_pred:
    submit.append(np.argmax(np.bincount(line)))
df = pd.DataFrame({'id':uid.as_matrix(),'label':submit})
df.to_csv('submission.csv',index=False)

训练模型：
0
training start...
[1]	valid_0's multi_error: 0.701493
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's multi_error: 0.701493
[3]	valid_0's multi_error: 0.692941
[4]	valid_0's multi_error: 0.680643
[5]	valid_0's multi_error: 0.669613
[6]	valid_0's multi_error: 0.662776
[7]	valid_0's multi_error: 0.654668
[8]	valid_0's multi_error: 0.646763
[9]	valid_0's multi_error: 0.63844
[10]	valid_0's multi_error: 0.633157
[11]	valid_0's multi_error: 0.627631
[12]	valid_0's multi_error: 0.623586
[13]	valid_0's multi_error: 0.61987
[14]	valid_0's multi_error: 0.6176
[15]	valid_0's multi_error: 0.61462
[16]	valid_0's multi_error: 0.612762
[17]	valid_0's multi_error: 0.611176
[18]	valid_0's multi_error: 0.610086
[19]	valid_0's multi_error: 0.6086
[20]	valid_0's multi_error: 0.607284
[21]	valid_0's multi_error: 0.606153
[22]	valid_0's multi_error: 0.605087
[23]	valid_0's multi_error: 0.604142
[24]	valid_0's multi_error: 0.603017
[25]	valid_0's multi_error: 0.602194
[26

[218]	valid_0's multi_error: 0.562927
[219]	valid_0's multi_error: 0.562921
[220]	valid_0's multi_error: 0.562901
[221]	valid_0's multi_error: 0.562761
[222]	valid_0's multi_error: 0.562711
[223]	valid_0's multi_error: 0.562676
[224]	valid_0's multi_error: 0.562612
[225]	valid_0's multi_error: 0.562512
[226]	valid_0's multi_error: 0.562454
[227]	valid_0's multi_error: 0.562496
[228]	valid_0's multi_error: 0.562403
[229]	valid_0's multi_error: 0.562339
[230]	valid_0's multi_error: 0.562306
[231]	valid_0's multi_error: 0.562204
[232]	valid_0's multi_error: 0.562206
[233]	valid_0's multi_error: 0.562128
[234]	valid_0's multi_error: 0.562104
[235]	valid_0's multi_error: 0.562036
[236]	valid_0's multi_error: 0.561971
[237]	valid_0's multi_error: 0.56192
[238]	valid_0's multi_error: 0.561882
[239]	valid_0's multi_error: 0.561882
[240]	valid_0's multi_error: 0.561852
[241]	valid_0's multi_error: 0.561824
[242]	valid_0's multi_error: 0.561763
[243]	valid_0's multi_error: 0.561704
[244]	valid_0

[435]	valid_0's multi_error: 0.556868
[436]	valid_0's multi_error: 0.556801
[437]	valid_0's multi_error: 0.556808
[438]	valid_0's multi_error: 0.556819
[439]	valid_0's multi_error: 0.556783
[440]	valid_0's multi_error: 0.55674
[441]	valid_0's multi_error: 0.556738
[442]	valid_0's multi_error: 0.556749
[443]	valid_0's multi_error: 0.556752
[444]	valid_0's multi_error: 0.556717
[445]	valid_0's multi_error: 0.55672
[446]	valid_0's multi_error: 0.55672
[447]	valid_0's multi_error: 0.556638
[448]	valid_0's multi_error: 0.556632
[449]	valid_0's multi_error: 0.556583
[450]	valid_0's multi_error: 0.556554
[451]	valid_0's multi_error: 0.556588
[452]	valid_0's multi_error: 0.556569
[453]	valid_0's multi_error: 0.556541
[454]	valid_0's multi_error: 0.556513
[455]	valid_0's multi_error: 0.556506
[456]	valid_0's multi_error: 0.556489
[457]	valid_0's multi_error: 0.556508
[458]	valid_0's multi_error: 0.55652
[459]	valid_0's multi_error: 0.556529
[460]	valid_0's multi_error: 0.556495
[461]	valid_0's 

[652]	valid_0's multi_error: 0.554878
[653]	valid_0's multi_error: 0.554894
[654]	valid_0's multi_error: 0.55488
[655]	valid_0's multi_error: 0.554858
[656]	valid_0's multi_error: 0.554856
[657]	valid_0's multi_error: 0.554829
[658]	valid_0's multi_error: 0.554839
[659]	valid_0's multi_error: 0.554857
[660]	valid_0's multi_error: 0.554844
[661]	valid_0's multi_error: 0.55483
[662]	valid_0's multi_error: 0.554815
[663]	valid_0's multi_error: 0.554804
[664]	valid_0's multi_error: 0.554798
[665]	valid_0's multi_error: 0.554781
[666]	valid_0's multi_error: 0.554802
[667]	valid_0's multi_error: 0.554771
[668]	valid_0's multi_error: 0.554785
[669]	valid_0's multi_error: 0.554776
[670]	valid_0's multi_error: 0.554758
[671]	valid_0's multi_error: 0.554783
[672]	valid_0's multi_error: 0.55479
[673]	valid_0's multi_error: 0.55477
[674]	valid_0's multi_error: 0.554757
[675]	valid_0's multi_error: 0.554769
[676]	valid_0's multi_error: 0.554767
[677]	valid_0's multi_error: 0.554798
[678]	valid_0's 

[869]	valid_0's multi_error: 0.554057
[870]	valid_0's multi_error: 0.554055
[871]	valid_0's multi_error: 0.554061
[872]	valid_0's multi_error: 0.554049
[873]	valid_0's multi_error: 0.554047
[874]	valid_0's multi_error: 0.554059
[875]	valid_0's multi_error: 0.554079
[876]	valid_0's multi_error: 0.554037
[877]	valid_0's multi_error: 0.554026
[878]	valid_0's multi_error: 0.554028
[879]	valid_0's multi_error: 0.554028
[880]	valid_0's multi_error: 0.554063
[881]	valid_0's multi_error: 0.554056
[882]	valid_0's multi_error: 0.554064
[883]	valid_0's multi_error: 0.55405
[884]	valid_0's multi_error: 0.554052
[885]	valid_0's multi_error: 0.554033
[886]	valid_0's multi_error: 0.554031
[887]	valid_0's multi_error: 0.554051
[888]	valid_0's multi_error: 0.554032
[889]	valid_0's multi_error: 0.554026
[890]	valid_0's multi_error: 0.554047
[891]	valid_0's multi_error: 0.554019
[892]	valid_0's multi_error: 0.554024
[893]	valid_0's multi_error: 0.554014
[894]	valid_0's multi_error: 0.554004
[895]	valid_0

[84]	valid_0's multi_error: 0.577507
[85]	valid_0's multi_error: 0.57735
[86]	valid_0's multi_error: 0.57716
[87]	valid_0's multi_error: 0.57693
[88]	valid_0's multi_error: 0.576682
[89]	valid_0's multi_error: 0.576526
[90]	valid_0's multi_error: 0.576374
[91]	valid_0's multi_error: 0.57614
[92]	valid_0's multi_error: 0.576009
[93]	valid_0's multi_error: 0.575835
[94]	valid_0's multi_error: 0.575632
[95]	valid_0's multi_error: 0.575404
[96]	valid_0's multi_error: 0.575258
[97]	valid_0's multi_error: 0.574957
[98]	valid_0's multi_error: 0.574843
[99]	valid_0's multi_error: 0.574622
[100]	valid_0's multi_error: 0.574526
[101]	valid_0's multi_error: 0.574359
[102]	valid_0's multi_error: 0.574054
[103]	valid_0's multi_error: 0.573895
[104]	valid_0's multi_error: 0.573638
[105]	valid_0's multi_error: 0.573465
[106]	valid_0's multi_error: 0.5732
[107]	valid_0's multi_error: 0.573018
[108]	valid_0's multi_error: 0.572897
[109]	valid_0's multi_error: 0.572655
[110]	valid_0's multi_error: 0.572

[301]	valid_0's multi_error: 0.559998
[302]	valid_0's multi_error: 0.559963
[303]	valid_0's multi_error: 0.559942
[304]	valid_0's multi_error: 0.559866
[305]	valid_0's multi_error: 0.559822
[306]	valid_0's multi_error: 0.559763
[307]	valid_0's multi_error: 0.559746
[308]	valid_0's multi_error: 0.559724
[309]	valid_0's multi_error: 0.559719
[310]	valid_0's multi_error: 0.559669
[311]	valid_0's multi_error: 0.559638
[312]	valid_0's multi_error: 0.559572
[313]	valid_0's multi_error: 0.559572
[314]	valid_0's multi_error: 0.559599
[315]	valid_0's multi_error: 0.559605
[316]	valid_0's multi_error: 0.559522
[317]	valid_0's multi_error: 0.559499
[318]	valid_0's multi_error: 0.559454
[319]	valid_0's multi_error: 0.559325
[320]	valid_0's multi_error: 0.559322
[321]	valid_0's multi_error: 0.559249
[322]	valid_0's multi_error: 0.559211
[323]	valid_0's multi_error: 0.559195
[324]	valid_0's multi_error: 0.559186
[325]	valid_0's multi_error: 0.559169
[326]	valid_0's multi_error: 0.559105
[327]	valid_

[517]	valid_0's multi_error: 0.556239
[518]	valid_0's multi_error: 0.556224
[519]	valid_0's multi_error: 0.556217
[520]	valid_0's multi_error: 0.556213
[521]	valid_0's multi_error: 0.556211
[522]	valid_0's multi_error: 0.556226
[523]	valid_0's multi_error: 0.556218
[524]	valid_0's multi_error: 0.556206
[525]	valid_0's multi_error: 0.55617
[526]	valid_0's multi_error: 0.556148
[527]	valid_0's multi_error: 0.556111
[528]	valid_0's multi_error: 0.556062
[529]	valid_0's multi_error: 0.556064
[530]	valid_0's multi_error: 0.556036
[531]	valid_0's multi_error: 0.555994
[532]	valid_0's multi_error: 0.555983
[533]	valid_0's multi_error: 0.55597
[534]	valid_0's multi_error: 0.555955
[535]	valid_0's multi_error: 0.555946
[536]	valid_0's multi_error: 0.555913
[537]	valid_0's multi_error: 0.555914
[538]	valid_0's multi_error: 0.555898
[539]	valid_0's multi_error: 0.555896
[540]	valid_0's multi_error: 0.55586
[541]	valid_0's multi_error: 0.555882
[542]	valid_0's multi_error: 0.555918
[543]	valid_0's

[734]	valid_0's multi_error: 0.554719
[735]	valid_0's multi_error: 0.554728
[736]	valid_0's multi_error: 0.554703
[737]	valid_0's multi_error: 0.554706
[738]	valid_0's multi_error: 0.554678
[739]	valid_0's multi_error: 0.554709
[740]	valid_0's multi_error: 0.554721
[741]	valid_0's multi_error: 0.554701
[742]	valid_0's multi_error: 0.554698
[743]	valid_0's multi_error: 0.554691
[744]	valid_0's multi_error: 0.554696
[745]	valid_0's multi_error: 0.5547
[746]	valid_0's multi_error: 0.55468
[747]	valid_0's multi_error: 0.554658
[748]	valid_0's multi_error: 0.554663
[749]	valid_0's multi_error: 0.5547
[750]	valid_0's multi_error: 0.554686
[751]	valid_0's multi_error: 0.554659
[752]	valid_0's multi_error: 0.554671
[753]	valid_0's multi_error: 0.554677
[754]	valid_0's multi_error: 0.554689
[755]	valid_0's multi_error: 0.554693
[756]	valid_0's multi_error: 0.554689
[757]	valid_0's multi_error: 0.554697
[758]	valid_0's multi_error: 0.55471
[759]	valid_0's multi_error: 0.554716
[760]	valid_0's mu

[951]	valid_0's multi_error: 0.553951
[952]	valid_0's multi_error: 0.553957
[953]	valid_0's multi_error: 0.553994
[954]	valid_0's multi_error: 0.553987
[955]	valid_0's multi_error: 0.553994
[956]	valid_0's multi_error: 0.553997
[957]	valid_0's multi_error: 0.554013
[958]	valid_0's multi_error: 0.554009
[959]	valid_0's multi_error: 0.554007
[960]	valid_0's multi_error: 0.553974
[961]	valid_0's multi_error: 0.553953
[962]	valid_0's multi_error: 0.553943
[963]	valid_0's multi_error: 0.553931
[964]	valid_0's multi_error: 0.553909
[965]	valid_0's multi_error: 0.553916
[966]	valid_0's multi_error: 0.553897
[967]	valid_0's multi_error: 0.553886
[968]	valid_0's multi_error: 0.553881
[969]	valid_0's multi_error: 0.553882
[970]	valid_0's multi_error: 0.553872
[971]	valid_0's multi_error: 0.553853
[972]	valid_0's multi_error: 0.553858
[973]	valid_0's multi_error: 0.553868
[974]	valid_0's multi_error: 0.553869
[975]	valid_0's multi_error: 0.55389
[976]	valid_0's multi_error: 0.553895
[977]	valid_0



In [12]:
age_train['age_group'].nunique()

6