In [1]:
import pandas as pd 
import numpy as np
import sklearn
import os
import gc
# parallel apply!
import swifter
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas()



In [2]:
data_dir = './data'

In [3]:
age_test_df = pd.read_csv(os.path.join(data_dir, "age_test.csv"), header=None)
age_train_df = pd.read_csv(os.path.join(data_dir, "age_train.csv"), header=None)
app_info_df = pd.read_csv(os.path.join(data_dir, "app_info.csv"), header=None)
user_app_actived_df = pd.read_csv(os.path.join(data_dir, "user_app_actived.csv"), header=None)
user_basic_info_df = pd.read_csv(os.path.join(data_dir, "user_basic_info.csv"), header=None)
user_behavior_info_df = pd.read_csv(os.path.join(data_dir, "user_behavior_info.csv"), header=None)
usage_summary_df = pd.read_csv(os.path.join(data_dir, "usages_summary.csv"))
user_app_usage = None

In [4]:
age_train_df.columns = ['uid', 'age_group']
age_test_df.columns = ['uid']
user_basic_info_df.columns = ['uid', 'gender', 'city', 'prodName', 'ramCap', 'ramLeft', 
                            'romCap', 'romLeft', 'color', 'fontSize', 'ct', 'carrier', 'os']
user_behavior_info_df.columns = ['uid', 'bootTimes', 'aTimes', 'bTimes', 'cTimes', 'dTimes', 
                                 'eTimes', 'fTimes', 'fSum']
user_app_actived_df.columns = ['uid', 'appId']
app_info_df.columns = ['appId', 'category']

In [5]:
import sklearn.preprocessing
def encodeCategory(df):
    encoder = sklearn.preprocessing.LabelEncoder()
    return encoder.fit_transform(df)

### process basic info

In [6]:
basic_cates = ['city', 'prodName', 'color', 'carrier']
for field in basic_cates:
    user_basic_info_df[field] = encodeCategory(user_basic_info_df[field].astype(str))

In [7]:
# [4g,3g,2g,wifi]
ct_onehot_dict = {
    "4g": [1,0,0,0],
    'wifi': [0,0,0,1],
    '4g#wifi': [1,0,0,1],
    0: [0,0,0,0], #nan
    '3g#wifi': [0,1,0,1],
    '3g': [0,1,0,0],
    '2g#wifi': [0,0,1,1],
    '2g': [0,0,1,0]
}
user_basic_info_df['4g'], user_basic_info_df['3g'], user_basic_info_df['2g'], user_basic_info_df['wifi'] = zip(*user_basic_info_df['ct'].fillna(0).apply(lambda c: ct_onehot_dict[c]))
user_basic_info_df.drop('ct', axis=1, inplace=True)

### process user_app_actived and app_info

In [8]:
app_info_df['category_id'] = encodeCategory(app_info_df['category'])
app_info_map = {r[1]['appId']: r[1]['category'] for r in tqdm(app_info_df.iterrows(), 
                                                              total=app_info_df.shape[0], leave=False)}

HBox(children=(IntProgress(value=0, max=188864), HTML(value='')))



In [9]:
user_app_actived_df['app_category'] = user_app_actived_df['appId'].progress_apply(
    lambda a: ' '.join([app_info_map.get(appId, 'unknown') for appId in a.split('#')]))

HBox(children=(IntProgress(value=0, max=2512500), HTML(value='')))




In [10]:
import sklearn.feature_extraction.text
# vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()
w2v_vectorizer = sklearn.feature_extraction.text.CountVectorizer()
w2v_app_vectors = w2v_vectorizer.fit_transform(user_app_actived_df['app_category'])
tf_idf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()
tf_idf_vectors = tf_idf_vectorizer.fit_transform(user_app_actived_df['app_category'])

In [11]:
user_app_vec = pd.DataFrame(user_app_actived_df['uid'])
for i in range(len(w2v_vectorizer.vocabulary_)):
    user_app_vec['app_w2v_%s'%list(w2v_vectorizer.vocabulary_.keys())[i]] = w2v_app_vectors[:, i].toarray()
for i in range(len(tf_idf_vectorizer.vocabulary_)):
    user_app_vec['app_tfidf_%s'%list(tf_idf_vectorizer.vocabulary_.keys())[i]] = tf_idf_vectors[:, i].toarray()
user_app_vec['total_app'] = user_app_actived_df['appId'].progress_apply(
    lambda a: len(a.split('#')))

HBox(children=(IntProgress(value=0, max=2512500), HTML(value='')))




## Prepare training features

In [12]:
user_info_df = user_basic_info_df.merge(user_behavior_info_df, on='uid')

In [13]:
features_df = user_info_df.merge(user_app_vec, on='uid', how='left') 

In [14]:
features_df = features_df.merge(usage_summary_df, on='uid', how='left') 

In [15]:
features_df.sort_values(by='uid', inplace=True)
features_df.reset_index(drop=True, inplace=True)

In [16]:
features_df.columns, len(features_df.columns)

(Index(['uid', 'gender', 'city', 'prodName', 'ramCap', 'ramLeft', 'romCap',
        'romLeft', 'color', 'fontSize',
        ...
        'duration_旅游住宿', 'usage_动作冒险', 'times_动作冒险', 'duration_动作冒险',
        'usage_策略游戏', 'times_策略游戏', 'duration_策略游戏', 'usage_合作壁纸*',
        'times_合作壁纸*', 'duration_合作壁纸*'],
       dtype='object', length=204), 204)

In [17]:
categorical_features = ['gender', 'city', 'prodName', 'color', 'carrier']

In [18]:
# def one_hot_category(df, feature_name):
#     dum = pd.get_dummies(df[feature_name])
#     for col in dum.columns:
#         df['%s_%s'%(feature_name, col)] = dum[col]
#     df.drop(feature_name, axis=1, inplace=True)
# for cate in categorical_features:
#     one_hot_category(features_df, cate)

In [19]:
del user_basic_info_df
del user_behavior_info_df
del user_app_vec
del user_info_df
del usage_summary_df
gc.collect()

90

In [20]:
train_data = features_df.merge(age_train_df, on='uid', how='right')
test_x = features_df.merge(age_test_df, on='uid', how='right')

In [21]:
del features_df
gc.collect()

68

In [22]:
train_x = train_data.drop(['uid', 'age_group'], axis=1)
train_y = train_data['age_group']

In [23]:
del train_data
gc.collect()

47

In [24]:
train_x.fillna(train_x.mean(), inplace=True)
test_x.fillna(train_x.mean(), inplace=True)

## Train Model


In [25]:
import sklearn.model_selection
import sklearn.metrics
import lightgbm as lgb
import sklearn.ensemble
import xgboost as xgb

def cv(clf, x, y, params={}, splits=3, fit_params={}):
    kfold = sklearn.model_selection.StratifiedKFold(splits, shuffle=True)
    cv_score = sklearn.model_selection.cross_validate(clf, x, y, cv=kfold, scoring={
        'accuracy': 'accuracy',
    }, return_train_score=True, verbose=5, fit_params=fit_params)
    return cv_score

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [26]:

params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        "metric": 'multi_logloss',
        'n_jobs': -1,
    
        #better
#         'n_estimators': 600,
#         "num_leaves": 900,
#         "subsample_for_bin": 50000,
#         'max_bin': 750,
    
        #fast
        'n_estimators': 100,
        "num_leaves": 200,
        "subsample_for_bin": 20000,
        'max_bin': 512,
    
    
        "max_depth": -1,
        'min_child_samples': 100,
        'learning_rate': 0.06,
        'boost_from_average': True,
        'min_child_weight': 1e-3,
        'reg_alpha': 2,
        'reg_lambda': 5,
        'colsample_bytree':0.7, 
}
def run_cross_validation(clf, x, y, fit_params={}):
    cv_result = cv(clf, x, y, params=params, splits=3, fit_params=fit_params)
    for scorer, score in cv_result.items():
        print('%s: %s' % (scorer, score))
        print('Average %s: %f' % (scorer, score.mean()))

In [27]:
gc.collect()
clf = lgb.LGBMClassifier(**params)
run_cross_validation(clf, train_x, train_y, {'categorical_feature': categorical_features})

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................


New categorical_feature is ['carrier', 'city', 'color', 'gender', 'prodName']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[CV] .................... , accuracy=0.5097887913600129, total= 7.7min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  8.4min remaining:    0.0s


[CV]  ................................................................
[CV] ..................... , accuracy=0.510410447761194, total= 7.6min


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 16.6min remaining:    0.0s


[CV]  ................................................................
[CV] .................... , accuracy=0.5094097155368889, total= 8.2min
fit_time: [441.84404588 437.93021297 476.13990712]
Average fit_time: 451.971389
score_time: [19.64626932 15.80504704 15.8221848 ]
Average score_time: 17.091167
test_accuracy: [0.50978879 0.51041045 0.50940972]
Average test_accuracy: 0.509870
train_accuracy: [0.56870863 0.5667791  0.56837047]
Average train_accuracy: 0.567953


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 25.4min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 25.4min finished


In [28]:
def train(x, y, params={}):
    clf = lgb.LGBMClassifier(**params)
#     clf.fit(x, y, categorical_feature=categorical_features)
    clf.fit(x, y, categorical_feature=categorical_features)
    return clf


In [29]:
model = train(train_x, train_y, params)

In [30]:
pred_y = model.predict(test_x.drop('uid', axis=1))

In [31]:
result = pd.DataFrame(test_x['uid'])
result.columns = ['id']
result['label'] = pred_y

In [32]:
result.to_csv('submission.csv', index=False)

In [33]:
for i in range(len(train_x.columns)):
    print(train_x.columns[i], model.feature_importances_[i])

gender 1004
city 33746
prodName 12197
ramCap 28
ramLeft 44
romCap 95
romLeft 812
color 5023
fontSize 1826
carrier 73
os 59
4g 240
3g 0
2g 0
wifi 360
bootTimes 701
aTimes 666
bTimes 85
cTimes 354
dTimes 2088
eTimes 2478
fTimes 2
fSum 1273
app_w2v_社交通讯 598
app_w2v_教育 11
app_w2v_运动健康 15
app_w2v_便捷生活 118
app_w2v_实用工具 100
app_w2v_金融理财 593
app_w2v_购物比价 208
app_w2v_儿童 479
app_w2v_新闻阅读 224
app_w2v_商务 0
app_w2v_汽车 767
app_w2v_unknown 414
app_w2v_出行导航 463
app_w2v_影音娱乐 1101
app_w2v_美食 1130
app_w2v_旅游住宿 230
app_w2v_拍摄美化 18
app_w2v_角色扮演 277
app_w2v_动作射击 0
app_w2v_经营策略 502
app_w2v_棋牌桌游 372
app_w2v_休闲益智 0
app_w2v_主题个性 96
app_w2v_体育竞速 0
app_w2v_学习办公 61
app_w2v_网络游戏 106
app_w2v_益智棋牌 29
app_w2v_表盘个性 641
app_tfidf_社交通讯 922
app_tfidf_教育 67
app_tfidf_运动健康 82
app_tfidf_便捷生活 387
app_tfidf_实用工具 282
app_tfidf_金融理财 871
app_tfidf_购物比价 727
app_tfidf_儿童 941
app_tfidf_新闻阅读 952
app_tfidf_商务 1
app_tfidf_汽车 1600
app_tfidf_unknown 1355
app_tfidf_出行导航 955
app_tfidf_影音娱乐 1536
app_tfidf_美食 1326
app_tfidf_旅游住宿 755
app_tfid

In [34]:
gc.collect()

168