In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
pd.set_option('display.max_columns',None)

In [3]:
#读取数据
age_train = pd.read_csv("age_train.csv", names=['uid','age_group'])
age_test = pd.read_csv("age_test.csv", names=['uid'])
user_basic_info = pd.read_csv("user_basic_info.csv", names=['uid','gender','city','prodName','ramCapacity','ramLeftRation','romCapacity','romLeftRation','color','fontSize','ct','carrier','os'])
user_behavior_info = pd.read_csv("user_behavior_info.csv", names=['uid','bootTimes','AFuncTimes','BFuncTimes','CFuncTimes','DFuncTimes','EFuncTimes','FFuncTimes','FFuncSum'])
user_app_actived = pd.read_csv("user_app_actived.csv", names=['uid','appId'])
#user_app_usage = pd.read_csv("user_app_usage.csv")
app_info = pd.read_csv("app_info.csv", names=['appId', 'category'])

In [4]:
#处理数据量较大的user_app_usage.csv，结合app_info.csv简单统计得到appuseProcessed.csv作为特征
def f(x):
    s = x.value_counts()
    return np.nan if len(s) == 0 else s.index[0]
def processUserAppUsage():
    resTable = pd.DataFrame()
    reader = pd.read_csv("user_app_usage.csv", names=['uid','appId','duration','times','use_date'], iterator=True)
    last_df = pd.DataFrame()
    
    app_info = pd.read_csv("app_info.csv", names=['appId','category'])
    cats = list(set(app_info['category']))
    category2id = dict(zip(sorted(cats), range(0,len(cats))))
    id2category = dict(zip(range(0,len(cats)), sorted(cats)))
    app_info['category'] = app_info['category'].apply(lambda x: category2id[x])
    i = 1
    
    while True:
        try:
            print("index: {}".format(i))
            i+=1
            df = reader.get_chunk(1000000)
            df = pd.concat([last_df, df])
            idx = df.shape[0]-1
            last_user = df.iat[idx,0]
            while(df.iat[idx,0]==last_user):
                idx-=1
            last_df = df[idx+1:]
            df = df[:idx+1]

            now_df = pd.DataFrame()
            now_df['uid'] = df['uid'].unique()
            now_df = now_df.merge(df.groupby('uid')['appId'].count().to_frame(), how='left', on='uid')
            now_df = now_df.merge(df.groupby('uid')['appId','use_date'].agg(['nunique']), how='left', on='uid')
            now_df = now_df.merge(df.groupby('uid')['duration','times'].agg(['mean','max','std']), how='left', on='uid')    

            now_df.columns = ['uid','usage_cnt','usage_appid_cnt','usage_date_cnt','duration_mean','duration_max','duration_std','times_mean','times_max','times_std']


            df = df.merge(app_info, how='left', on='appId')
            now_df = now_df.merge(df.groupby('uid')['category'].nunique().to_frame(), how='left', on='uid')
            #print(df.groupby(['uid'])['category'].value_counts().index[0])
            now_df['usage_most_used_category'] = df.groupby(['uid'])['category'].transform(f)
            resTable = pd.concat([resTable, now_df])
        except StopIteration:
            break
    
    resTable.to_csv("appuseProcessed.csv",index=0)
    
    print("Iterator is stopped")

In [5]:
processUserAppUsage()

index: 1




index: 2
index: 3
index: 4
index: 5
index: 6
index: 7
index: 8
index: 9
index: 10
index: 11
index: 12
index: 13
index: 14
index: 15
index: 16
index: 17
index: 18
index: 19
index: 20
index: 21
index: 22
index: 23
index: 24
index: 25
index: 26
index: 27
index: 28
index: 29
index: 30
index: 31
index: 32
index: 33
index: 34
index: 35
index: 36
index: 37
index: 38
index: 39
index: 40
index: 41
index: 42
index: 43
index: 44
index: 45
index: 46
index: 47
index: 48
index: 49
index: 50
index: 51
index: 52
index: 53
index: 54
index: 55
index: 56
index: 57
index: 58
index: 59
index: 60
index: 61
index: 62
index: 63
index: 64
index: 65
index: 66
index: 67
index: 68
index: 69
index: 70
index: 71
index: 72
index: 73
index: 74
index: 75
index: 76
index: 77
index: 78
index: 79
index: 80
index: 81
index: 82
index: 83
index: 84
index: 85
index: 86
index: 87
index: 88
index: 89
index: 90
index: 91
index: 92
index: 93
index: 94
index: 95
index: 96
index: 97
index: 98
index: 99
index: 100
index: 101
index:

In [15]:
# 将user_basic_info.csv 和 processed_behavior.csv中的字符值编码成可以训练的数值类型，合并
class2id = {}
id2class = {}
processed_behavior = pd.read_csv("processed_behavior.csv", names=['uid', 'bootTimes', 'AllFuncSum', 'MaxFunc', 'FuncCnt', 'FFuncSum'])
def mergeBasicTables(baseTable):
    resTable = baseTable.merge(user_basic_info, how='left', on='uid', suffixes=('_base0', '_ubaf'))
    resTable = resTable.merge(processed_behavior, how='left', on='uid', suffixes=('_base1', '_ubef'))
    cat_columns = ['city','prodName','color','carrier','os','ct']
    for c in cat_columns:
        resTable[c] = resTable[c].apply(lambda x: x if type(x)==str else str(x))
        sort_temp = sorted(list(set(resTable[c])))  
        class2id[c+'2id'] = dict(zip(sort_temp, range(1, len(sort_temp)+1)))
        id2class['id2'+c] = dict(zip(range(1,len(sort_temp)+1), sort_temp))
        resTable[c] = resTable[c].apply(lambda x: class2id[c+'2id'][x])
        
    return resTable

In [18]:
# 对user_app_actived.csv简单统计
# 将之前训练的appuseProcess.csv进行合并
def mergeAppData(baseTable):
    resTable = baseTable.merge(user_app_actived, how='left', on='uid')
    resTable['appId'] = resTable['appId'].apply(lambda x: len(list(x.split('#'))))
    appusedTable = pd.read_csv("appuseProcessed.csv")
    resTable = resTable.merge(appusedTable, how='left', on='uid')
    resTable[['category', 'usage_most_used_category']] = resTable[['category', 'usage_most_used_category']].fillna(41)
    resTable = resTable.fillna(0)
    return resTable

In [19]:
#合并用户基本特征以及app使用相关特征，作为训练集和测试集
df_train = mergeAppData(mergeBasicTables(age_train))
df_test = mergeAppData(mergeBasicTables(age_test))
df_train.to_csv("train_data.csv", index = 0)
print(df_train.shape)
print(df_test.shape)

(2010000, 31)
(502500, 30)


In [20]:
#训练模型
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
from sklearn.feature_selection import SelectFromModel, VarianceThreshold, SelectKBest, chi2, mutual_info_classif, f_classif
from sklearn.preprocessing import Imputer
from sklearn.ensemble import ExtraTreesClassifier

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold

In [21]:
print("训练模型：")
param = {
        'learning_rate': 0.08,
        'lambda_l1': 0.1,
        'lambda_l2': 0.2,
        'max_depth': 20,
        'objective': 'multiclass',
        'num_class': 7,
        'num_leaves': 31,
        'min_data_in_leaf': 50,
        'max_bin': 230,
        'feature_fraction': 0.8,
        'metric': 'multi_error'
        }

X = df_train.drop(['age_group','uid'], axis=1)
y = df_train['age_group']
uid = df_test['uid']
test = df_test.drop('uid', axis=1)

xx_score = []
cv_pred = []
skf = StratifiedKFold(n_splits=3, random_state=1030, shuffle=True)
for index, (train_index, vali_index) in enumerate(skf.split(X, y)):
    print(index)
    x_train, y_train, x_vali, y_vali = np.array(X)[train_index], np.array(y)[train_index], np.array(X)[vali_index], np.array(y)[vali_index]
    train = lgb.Dataset(x_train, y_train)
    vali =lgb.Dataset(x_vali, y_vali)
    print("training start...")
    model = lgb.train(param, train, num_boost_round=1000, valid_sets=[vali], early_stopping_rounds=50)
    xx_pred = model.predict(x_vali,num_iteration=model.best_iteration)
    xx_pred = [np.argmax(x) for x in xx_pred]
    xx_score.append(f1_score(y_vali,xx_pred,average='weighted'))
    y_test = model.predict(test,num_iteration=model.best_iteration)
    y_test = [np.argmax(x) for x in y_test]
    if index == 0:
        cv_pred = np.array(y_test).reshape(-1, 1)
    else:
        cv_pred = np.hstack((cv_pred, np.array(y_test).reshape(-1, 1)))
        
submit = []
for line in cv_pred:
    submit.append(np.argmax(np.bincount(line)))
df = pd.DataFrame({'id':uid.as_matrix(),'label':submit})
df.to_csv('submission.csv',index=False)

训练模型：
0
training start...
[1]	valid_0's multi_error: 0.701493
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's multi_error: 0.701493
[3]	valid_0's multi_error: 0.701493
[4]	valid_0's multi_error: 0.696723
[5]	valid_0's multi_error: 0.690597
[6]	valid_0's multi_error: 0.682768
[7]	valid_0's multi_error: 0.675212
[8]	valid_0's multi_error: 0.66898
[9]	valid_0's multi_error: 0.662759
[10]	valid_0's multi_error: 0.656592
[11]	valid_0's multi_error: 0.650705
[12]	valid_0's multi_error: 0.645209
[13]	valid_0's multi_error: 0.640651
[14]	valid_0's multi_error: 0.636717
[15]	valid_0's multi_error: 0.63409
[16]	valid_0's multi_error: 0.631468
[17]	valid_0's multi_error: 0.628668
[18]	valid_0's multi_error: 0.626318
[19]	valid_0's multi_error: 0.623714
[20]	valid_0's multi_error: 0.622507
[21]	valid_0's multi_error: 0.620935
[22]	valid_0's multi_error: 0.61961
[23]	valid_0's multi_error: 0.618318
[24]	valid_0's multi_error: 0.617087
[25]	valid_0's multi_error: 0.616013

[218]	valid_0's multi_error: 0.571178
[219]	valid_0's multi_error: 0.57108
[220]	valid_0's multi_error: 0.571081
[221]	valid_0's multi_error: 0.57106
[222]	valid_0's multi_error: 0.570892
[223]	valid_0's multi_error: 0.57085
[224]	valid_0's multi_error: 0.570814
[225]	valid_0's multi_error: 0.570595
[226]	valid_0's multi_error: 0.570548
[227]	valid_0's multi_error: 0.570532
[228]	valid_0's multi_error: 0.570465
[229]	valid_0's multi_error: 0.570429
[230]	valid_0's multi_error: 0.570398
[231]	valid_0's multi_error: 0.570325
[232]	valid_0's multi_error: 0.570341
[233]	valid_0's multi_error: 0.570357
[234]	valid_0's multi_error: 0.570195
[235]	valid_0's multi_error: 0.570101
[236]	valid_0's multi_error: 0.570086
[237]	valid_0's multi_error: 0.570038
[238]	valid_0's multi_error: 0.570035
[239]	valid_0's multi_error: 0.569872
[240]	valid_0's multi_error: 0.569798
[241]	valid_0's multi_error: 0.56979
[242]	valid_0's multi_error: 0.569671
[243]	valid_0's multi_error: 0.569636
[244]	valid_0's 

[435]	valid_0's multi_error: 0.563508
[436]	valid_0's multi_error: 0.563459
[437]	valid_0's multi_error: 0.563399
[438]	valid_0's multi_error: 0.563326
[439]	valid_0's multi_error: 0.563283
[440]	valid_0's multi_error: 0.563342
[441]	valid_0's multi_error: 0.563293
[442]	valid_0's multi_error: 0.563266
[443]	valid_0's multi_error: 0.563239
[444]	valid_0's multi_error: 0.563295
[445]	valid_0's multi_error: 0.563226
[446]	valid_0's multi_error: 0.563219
[447]	valid_0's multi_error: 0.563181
[448]	valid_0's multi_error: 0.563192
[449]	valid_0's multi_error: 0.563228
[450]	valid_0's multi_error: 0.563211
[451]	valid_0's multi_error: 0.563144
[452]	valid_0's multi_error: 0.563147
[453]	valid_0's multi_error: 0.563077
[454]	valid_0's multi_error: 0.563099
[455]	valid_0's multi_error: 0.563062
[456]	valid_0's multi_error: 0.563072
[457]	valid_0's multi_error: 0.563068
[458]	valid_0's multi_error: 0.56305
[459]	valid_0's multi_error: 0.563056
[460]	valid_0's multi_error: 0.563057
[461]	valid_0

[651]	valid_0's multi_error: 0.560893
[652]	valid_0's multi_error: 0.560877
[653]	valid_0's multi_error: 0.560845
[654]	valid_0's multi_error: 0.560808
[655]	valid_0's multi_error: 0.560828
[656]	valid_0's multi_error: 0.56085
[657]	valid_0's multi_error: 0.560831
[658]	valid_0's multi_error: 0.56081
[659]	valid_0's multi_error: 0.560807
[660]	valid_0's multi_error: 0.560783
[661]	valid_0's multi_error: 0.560796
[662]	valid_0's multi_error: 0.560811
[663]	valid_0's multi_error: 0.560819
[664]	valid_0's multi_error: 0.560828
[665]	valid_0's multi_error: 0.560781
[666]	valid_0's multi_error: 0.560784
[667]	valid_0's multi_error: 0.560787
[668]	valid_0's multi_error: 0.560814
[669]	valid_0's multi_error: 0.560831
[670]	valid_0's multi_error: 0.560856
[671]	valid_0's multi_error: 0.560831
[672]	valid_0's multi_error: 0.560831
[673]	valid_0's multi_error: 0.560771
[674]	valid_0's multi_error: 0.560753
[675]	valid_0's multi_error: 0.560732
[676]	valid_0's multi_error: 0.560732
[677]	valid_0'

[868]	valid_0's multi_error: 0.55962
[869]	valid_0's multi_error: 0.559614
[870]	valid_0's multi_error: 0.559653
[871]	valid_0's multi_error: 0.559651
[872]	valid_0's multi_error: 0.559638
[873]	valid_0's multi_error: 0.559639
[874]	valid_0's multi_error: 0.559599
[875]	valid_0's multi_error: 0.559602
[876]	valid_0's multi_error: 0.559581
[877]	valid_0's multi_error: 0.55958
[878]	valid_0's multi_error: 0.559584
[879]	valid_0's multi_error: 0.559565
[880]	valid_0's multi_error: 0.559595
[881]	valid_0's multi_error: 0.559628
[882]	valid_0's multi_error: 0.559633
[883]	valid_0's multi_error: 0.559614
[884]	valid_0's multi_error: 0.55962
[885]	valid_0's multi_error: 0.559639
[886]	valid_0's multi_error: 0.559595
[887]	valid_0's multi_error: 0.559622
[888]	valid_0's multi_error: 0.559626
[889]	valid_0's multi_error: 0.559633
[890]	valid_0's multi_error: 0.559628
[891]	valid_0's multi_error: 0.559586
[892]	valid_0's multi_error: 0.559548
[893]	valid_0's multi_error: 0.559544
[894]	valid_0's

[83]	valid_0's multi_error: 0.587325
[84]	valid_0's multi_error: 0.587143
[85]	valid_0's multi_error: 0.586882
[86]	valid_0's multi_error: 0.586666
[87]	valid_0's multi_error: 0.586485
[88]	valid_0's multi_error: 0.586296
[89]	valid_0's multi_error: 0.586148
[90]	valid_0's multi_error: 0.585896
[91]	valid_0's multi_error: 0.585606
[92]	valid_0's multi_error: 0.585331
[93]	valid_0's multi_error: 0.585207
[94]	valid_0's multi_error: 0.584881
[95]	valid_0's multi_error: 0.584497
[96]	valid_0's multi_error: 0.584519
[97]	valid_0's multi_error: 0.584193
[98]	valid_0's multi_error: 0.583957
[99]	valid_0's multi_error: 0.583903
[100]	valid_0's multi_error: 0.583603
[101]	valid_0's multi_error: 0.583501
[102]	valid_0's multi_error: 0.583236
[103]	valid_0's multi_error: 0.583028
[104]	valid_0's multi_error: 0.58279
[105]	valid_0's multi_error: 0.582443
[106]	valid_0's multi_error: 0.582409
[107]	valid_0's multi_error: 0.582239
[108]	valid_0's multi_error: 0.582101
[109]	valid_0's multi_error: 0

[300]	valid_0's multi_error: 0.567522
[301]	valid_0's multi_error: 0.567515
[302]	valid_0's multi_error: 0.567415
[303]	valid_0's multi_error: 0.567336
[304]	valid_0's multi_error: 0.567319
[305]	valid_0's multi_error: 0.567349
[306]	valid_0's multi_error: 0.56731
[307]	valid_0's multi_error: 0.567234
[308]	valid_0's multi_error: 0.567191
[309]	valid_0's multi_error: 0.56717
[310]	valid_0's multi_error: 0.567175
[311]	valid_0's multi_error: 0.567106
[312]	valid_0's multi_error: 0.567054
[313]	valid_0's multi_error: 0.567069
[314]	valid_0's multi_error: 0.567003
[315]	valid_0's multi_error: 0.567012
[316]	valid_0's multi_error: 0.567003
[317]	valid_0's multi_error: 0.566884
[318]	valid_0's multi_error: 0.566915
[319]	valid_0's multi_error: 0.566848
[320]	valid_0's multi_error: 0.566769
[321]	valid_0's multi_error: 0.566743
[322]	valid_0's multi_error: 0.566745
[323]	valid_0's multi_error: 0.566663
[324]	valid_0's multi_error: 0.566643
[325]	valid_0's multi_error: 0.566615
[326]	valid_0'

[517]	valid_0's multi_error: 0.563252
[518]	valid_0's multi_error: 0.563227
[519]	valid_0's multi_error: 0.563225
[520]	valid_0's multi_error: 0.563236
[521]	valid_0's multi_error: 0.563191
[522]	valid_0's multi_error: 0.563172
[523]	valid_0's multi_error: 0.563182
[524]	valid_0's multi_error: 0.563155
[525]	valid_0's multi_error: 0.563088
[526]	valid_0's multi_error: 0.563076
[527]	valid_0's multi_error: 0.563063
[528]	valid_0's multi_error: 0.563075
[529]	valid_0's multi_error: 0.563066
[530]	valid_0's multi_error: 0.563093
[531]	valid_0's multi_error: 0.563073
[532]	valid_0's multi_error: 0.563063
[533]	valid_0's multi_error: 0.563052
[534]	valid_0's multi_error: 0.563013
[535]	valid_0's multi_error: 0.563003
[536]	valid_0's multi_error: 0.563
[537]	valid_0's multi_error: 0.562939
[538]	valid_0's multi_error: 0.562912
[539]	valid_0's multi_error: 0.56294
[540]	valid_0's multi_error: 0.562906
[541]	valid_0's multi_error: 0.562916
[542]	valid_0's multi_error: 0.562897
[543]	valid_0's 

[734]	valid_0's multi_error: 0.561161
[735]	valid_0's multi_error: 0.561106
[736]	valid_0's multi_error: 0.561093
[737]	valid_0's multi_error: 0.561039
[738]	valid_0's multi_error: 0.561025
[739]	valid_0's multi_error: 0.561015
[740]	valid_0's multi_error: 0.561003
[741]	valid_0's multi_error: 0.561009
[742]	valid_0's multi_error: 0.561046
[743]	valid_0's multi_error: 0.560996
[744]	valid_0's multi_error: 0.560993
[745]	valid_0's multi_error: 0.560963
[746]	valid_0's multi_error: 0.560954
[747]	valid_0's multi_error: 0.560964
[748]	valid_0's multi_error: 0.560949
[749]	valid_0's multi_error: 0.560928
[750]	valid_0's multi_error: 0.560916
[751]	valid_0's multi_error: 0.560901
[752]	valid_0's multi_error: 0.560916
[753]	valid_0's multi_error: 0.560931
[754]	valid_0's multi_error: 0.560928
[755]	valid_0's multi_error: 0.560901
[756]	valid_0's multi_error: 0.560907
[757]	valid_0's multi_error: 0.560919
[758]	valid_0's multi_error: 0.560951
[759]	valid_0's multi_error: 0.56093
[760]	valid_0

[951]	valid_0's multi_error: 0.560279
[952]	valid_0's multi_error: 0.560318
[953]	valid_0's multi_error: 0.560319
[954]	valid_0's multi_error: 0.560324
[955]	valid_0's multi_error: 0.560297
[956]	valid_0's multi_error: 0.560327
[957]	valid_0's multi_error: 0.560252
[958]	valid_0's multi_error: 0.560272
[959]	valid_0's multi_error: 0.56023
[960]	valid_0's multi_error: 0.560252
[961]	valid_0's multi_error: 0.560263
[962]	valid_0's multi_error: 0.560264
[963]	valid_0's multi_error: 0.560243
[964]	valid_0's multi_error: 0.560284
[965]	valid_0's multi_error: 0.5603
[966]	valid_0's multi_error: 0.560282
[967]	valid_0's multi_error: 0.560318
[968]	valid_0's multi_error: 0.560303
[969]	valid_0's multi_error: 0.560325
[970]	valid_0's multi_error: 0.5603
[971]	valid_0's multi_error: 0.560281
[972]	valid_0's multi_error: 0.560306
[973]	valid_0's multi_error: 0.560304
[974]	valid_0's multi_error: 0.560303
[975]	valid_0's multi_error: 0.560291
[976]	valid_0's multi_error: 0.560278
[977]	valid_0's m

[166]	valid_0's multi_error: 0.575956
[167]	valid_0's multi_error: 0.575862
[168]	valid_0's multi_error: 0.575808
[169]	valid_0's multi_error: 0.575732
[170]	valid_0's multi_error: 0.575655
[171]	valid_0's multi_error: 0.575564
[172]	valid_0's multi_error: 0.57542
[173]	valid_0's multi_error: 0.575262
[174]	valid_0's multi_error: 0.575086
[175]	valid_0's multi_error: 0.57504
[176]	valid_0's multi_error: 0.574975
[177]	valid_0's multi_error: 0.574887
[178]	valid_0's multi_error: 0.574825
[179]	valid_0's multi_error: 0.57481
[180]	valid_0's multi_error: 0.574732
[181]	valid_0's multi_error: 0.57467
[182]	valid_0's multi_error: 0.574464
[183]	valid_0's multi_error: 0.574416
[184]	valid_0's multi_error: 0.57432
[185]	valid_0's multi_error: 0.574259
[186]	valid_0's multi_error: 0.574135
[187]	valid_0's multi_error: 0.574074
[188]	valid_0's multi_error: 0.573999
[189]	valid_0's multi_error: 0.573923
[190]	valid_0's multi_error: 0.573865
[191]	valid_0's multi_error: 0.573799
[192]	valid_0's m

[383]	valid_0's multi_error: 0.56498
[384]	valid_0's multi_error: 0.56492
[385]	valid_0's multi_error: 0.564944
[386]	valid_0's multi_error: 0.564887
[387]	valid_0's multi_error: 0.564847
[388]	valid_0's multi_error: 0.564784
[389]	valid_0's multi_error: 0.564811
[390]	valid_0's multi_error: 0.564747
[391]	valid_0's multi_error: 0.564774
[392]	valid_0's multi_error: 0.564743
[393]	valid_0's multi_error: 0.564693
[394]	valid_0's multi_error: 0.564675
[395]	valid_0's multi_error: 0.564756
[396]	valid_0's multi_error: 0.564734
[397]	valid_0's multi_error: 0.56468
[398]	valid_0's multi_error: 0.564635
[399]	valid_0's multi_error: 0.56471
[400]	valid_0's multi_error: 0.56471
[401]	valid_0's multi_error: 0.564734
[402]	valid_0's multi_error: 0.564744
[403]	valid_0's multi_error: 0.56469
[404]	valid_0's multi_error: 0.564641
[405]	valid_0's multi_error: 0.564646
[406]	valid_0's multi_error: 0.564608
[407]	valid_0's multi_error: 0.564583
[408]	valid_0's multi_error: 0.564583
[409]	valid_0's mu

[600]	valid_0's multi_error: 0.561853
[601]	valid_0's multi_error: 0.561877
[602]	valid_0's multi_error: 0.561822
[603]	valid_0's multi_error: 0.561814
[604]	valid_0's multi_error: 0.561792
[605]	valid_0's multi_error: 0.561799
[606]	valid_0's multi_error: 0.561792
[607]	valid_0's multi_error: 0.561783
[608]	valid_0's multi_error: 0.561768
[609]	valid_0's multi_error: 0.561741
[610]	valid_0's multi_error: 0.561746
[611]	valid_0's multi_error: 0.561717
[612]	valid_0's multi_error: 0.561732
[613]	valid_0's multi_error: 0.561732
[614]	valid_0's multi_error: 0.561732
[615]	valid_0's multi_error: 0.561672
[616]	valid_0's multi_error: 0.561684
[617]	valid_0's multi_error: 0.561692
[618]	valid_0's multi_error: 0.561723
[619]	valid_0's multi_error: 0.561737
[620]	valid_0's multi_error: 0.561777
[621]	valid_0's multi_error: 0.561789
[622]	valid_0's multi_error: 0.56175
[623]	valid_0's multi_error: 0.561738
[624]	valid_0's multi_error: 0.56171
[625]	valid_0's multi_error: 0.561671
[626]	valid_0'

[817]	valid_0's multi_error: 0.560159
[818]	valid_0's multi_error: 0.560183
[819]	valid_0's multi_error: 0.560214
[820]	valid_0's multi_error: 0.560253
[821]	valid_0's multi_error: 0.560246
[822]	valid_0's multi_error: 0.560249
[823]	valid_0's multi_error: 0.560275
[824]	valid_0's multi_error: 0.560269
[825]	valid_0's multi_error: 0.560229
[826]	valid_0's multi_error: 0.560187
[827]	valid_0's multi_error: 0.560193
[828]	valid_0's multi_error: 0.560181
[829]	valid_0's multi_error: 0.560189
[830]	valid_0's multi_error: 0.560161
[831]	valid_0's multi_error: 0.560135
[832]	valid_0's multi_error: 0.560159
[833]	valid_0's multi_error: 0.560177
[834]	valid_0's multi_error: 0.560156
[835]	valid_0's multi_error: 0.560189
[836]	valid_0's multi_error: 0.560172
[837]	valid_0's multi_error: 0.560193
[838]	valid_0's multi_error: 0.560213
[839]	valid_0's multi_error: 0.560211
[840]	valid_0's multi_error: 0.560181
[841]	valid_0's multi_error: 0.560137
[842]	valid_0's multi_error: 0.560146
[843]	valid_



In [10]:
age_train['age_group'].nunique()

6

In [14]:
history

import pandas as pd
import numpy as np
import lightgbm as lgb
pd.set_option('display.max_columns',None)
#读取数据
age_train = pd.read_csv("age_train.csv", names=['uid','age_group'])
age_test = pd.read_csv("age_test.csv", names=['uid'])
user_basic_info = pd.read_csv("user_basic_info.csv", names=['uid','gender','city','prodName','ramCapacity','ramLeftRation','romCapacity','romLeftRation','color','fontSize','ct','carrier','os'])
user_behavior_info = pd.read_csv("user_behavior_info.csv", names=['uid','bootTimes','AFuncTimes','BFuncTimes','CFuncTimes','DFuncTimes','EFuncTimes','FFuncTimes','FFuncSum'])
user_app_actived = pd.read_csv("user_app_actived.csv", names=['uid','appId'])
#user_app_usage = pd.read_csv("user_app_usage.csv")
app_info = pd.read_csv("app_info.csv", names=['appId', 'category'])
#处理数据量较大的user_app_usage.csv，结合app_info.csv简单统计得到appuseProcessed.csv作为特征
def f(x):
    s = x.value_counts()
    return np.nan if len(s) == 0 else s.index[0]
def processUserAppUsage():
    resTable = pd.