In [1]:
import pandas as pd 
import numpy as np
import sklearn
import os
import gc
# parallel apply!
import swifter
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas()



In [2]:
def parallel_apply(df, apply_func):
    ddf = dd.from_pandas(df, npartitions=24)
    return ddf.map_partitions(lambda df: df.apply(apply_func))

In [3]:
data_dir = './data'

In [4]:
age_test_df = pd.read_csv(os.path.join(data_dir, "age_test.csv"), header=None)
age_train_df = pd.read_csv(os.path.join(data_dir, "age_train.csv"), header=None)
app_info_df = pd.read_csv(os.path.join(data_dir, "app_info.csv"), header=None)
user_app_actived_df = pd.read_csv(os.path.join(data_dir, "user_app_actived.csv"), header=None)
user_basic_info_df = pd.read_csv(os.path.join(data_dir, "user_basic_info.csv"), header=None)
user_behavior_info_df = pd.read_csv(os.path.join(data_dir, "user_behavior_info.csv"), header=None)
user_app_usage = None

In [5]:
age_train_df.columns = ['uid', 'age_group']
age_test_df.columns = ['uid']
user_basic_info_df.columns = ['uid', 'gender', 'city', 'prodName', 'ramCap', 'ramLeft', 
                            'romCap', 'romLeft', 'color', 'fontSize', 'ct', 'carrier', 'os']
user_behavior_info_df.columns = ['uid', 'bootTimes', 'aTimes', 'bTimes', 'cTimes', 'dTimes', 
                                 'eTimes', 'fTimes', 'fSum']
user_app_actived_df.columns = ['uid', 'appId']
app_info_df.columns = ['appId', 'category']

In [6]:
import sklearn.preprocessing
def encodeCategory(df):
    encoder = sklearn.preprocessing.LabelEncoder()
    return encoder.fit_transform(df)

### process basic info

In [7]:
basic_cates = ['city', 'prodName', 'color', 'carrier']
for field in basic_cates:
    user_basic_info_df[field] = encodeCategory(user_basic_info_df[field].astype(str))

In [8]:
# [4g,3g,2g,wifi]
ct_onehot_dict = {
    "4g": [1,0,0,0],
    'wifi': [0,0,0,1],
    '4g#wifi': [1,0,0,1],
    0: [0,0,0,0], #nan
    '3g#wifi': [0,1,0,1],
    '3g': [0,1,0,0],
    '2g#wifi': [0,0,1,1],
    '2g': [0,0,1,0]
}
user_basic_info_df['4g'], user_basic_info_df['3g'], user_basic_info_df['2g'], user_basic_info_df['wifi'] = zip(*user_basic_info_df['ct'].fillna(0).apply(lambda c: ct_onehot_dict[c]))
user_basic_info_df.drop('ct', axis=1, inplace=True)

### process user_app_actived and app_info

In [9]:
app_info_df['category_id'] = encodeCategory(app_info_df['category'])
app_info_map = {r[1]['appId']: r[1]['category'] for r in tqdm(app_info_df.iterrows(), 
                                                              total=app_info_df.shape[0], leave=False)}

HBox(children=(IntProgress(value=0, max=188864), HTML(value='')))



In [10]:
user_app_actived_df['app_category'] = user_app_actived_df['appId'].progress_apply(
    lambda a: ' '.join([app_info_map.get(appId, 'unknown') for appId in a.split('#')]))

HBox(children=(IntProgress(value=0, max=2512500), HTML(value='')))




In [11]:
import sklearn.feature_extraction.text
# vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()
w2v_vectorizer = sklearn.feature_extraction.text.CountVectorizer()
w2v_app_vectors = w2v_vectorizer.fit_transform(user_app_actived_df['app_category'])
tf_idf_vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()
tf_idf_vectors = tf_idf_vectorizer.fit_transform(user_app_actived_df['app_category'])

In [12]:
user_app_vec = pd.DataFrame(user_app_actived_df['uid'])
for i in range(len(w2v_vectorizer.vocabulary_)):
    user_app_vec['app_w2v_%s'%list(w2v_vectorizer.vocabulary_.keys())[i]] = w2v_app_vectors[:, i].toarray()
for i in range(len(tf_idf_vectorizer.vocabulary_)):
    user_app_vec['app_tfidf_%s'%list(tf_idf_vectorizer.vocabulary_.keys())[i]] = tf_idf_vectors[:, i].toarray()
user_app_vec['total_app'] = user_app_actived_df['appId'].progress_apply(
    lambda a: len(a.split('#')))

HBox(children=(IntProgress(value=0, max=2512500), HTML(value='')))




## Prepare training features

In [13]:
user_info_df = user_basic_info_df.merge(user_behavior_info_df, on='uid')

In [14]:
features_df = user_info_df.merge(user_app_vec, on='uid', how='left') 

In [15]:
features_df.sort_values(by='uid', inplace=True)
features_df.reset_index(drop=True, inplace=True)

In [16]:
features_df.columns

Index(['uid', 'gender', 'city', 'prodName', 'ramCap', 'ramLeft', 'romCap',
       'romLeft', 'color', 'fontSize', 'carrier', 'os', '4g', '3g', '2g',
       'wifi', 'bootTimes', 'aTimes', 'bTimes', 'cTimes', 'dTimes', 'eTimes',
       'fTimes', 'fSum', 'app_w2v_社交通讯', 'app_w2v_教育', 'app_w2v_运动健康',
       'app_w2v_便捷生活', 'app_w2v_实用工具', 'app_w2v_金融理财', 'app_w2v_购物比价',
       'app_w2v_儿童', 'app_w2v_新闻阅读', 'app_w2v_商务', 'app_w2v_汽车',
       'app_w2v_unknown', 'app_w2v_出行导航', 'app_w2v_影音娱乐', 'app_w2v_美食',
       'app_w2v_旅游住宿', 'app_w2v_拍摄美化', 'app_w2v_角色扮演', 'app_w2v_动作射击',
       'app_w2v_经营策略', 'app_w2v_棋牌桌游', 'app_w2v_休闲益智', 'app_w2v_主题个性',
       'app_w2v_体育竞速', 'app_w2v_学习办公', 'app_w2v_网络游戏', 'app_w2v_益智棋牌',
       'app_w2v_表盘个性', 'app_tfidf_社交通讯', 'app_tfidf_教育', 'app_tfidf_运动健康',
       'app_tfidf_便捷生活', 'app_tfidf_实用工具', 'app_tfidf_金融理财', 'app_tfidf_购物比价',
       'app_tfidf_儿童', 'app_tfidf_新闻阅读', 'app_tfidf_商务', 'app_tfidf_汽车',
       'app_tfidf_unknown', 'app_tfidf_出行导航', 'app_tfi

In [17]:
categorical_features = ['gender', 'city', 'prodName', 'color', 'carrier']

In [18]:
# def one_hot_category(df, feature_name):
#     dum = pd.get_dummies(df[feature_name])
#     for col in dum.columns:
#         df['%s_%s'%(feature_name, col)] = dum[col]
#     df.drop(feature_name, axis=1, inplace=True)
# for cate in categorical_features:
#     one_hot_category(features_df, cate)

In [19]:
train_data = features_df.merge(age_train_df, on='uid', how='right')
test_x = features_df.merge(age_test_df, on='uid', how='right')

In [20]:
train_x = train_data.drop(['uid', 'age_group'], axis=1)
train_y = train_data['age_group']

In [21]:
train_x.fillna(train_x.mean(), inplace=True)
test_x.fillna(train_x.mean(), inplace=True)

In [22]:
del features_df
del user_basic_info_df
del user_behavior_info_df
del user_app_vec
gc.collect()

56

## Train Model


In [26]:
import sklearn.model_selection
import sklearn.metrics
import lightgbm as lgb
import sklearn.ensemble
import xgboost as xgb

def cv(clf, x, y, params={}, splits=3, fit_params={}):
    kfold = sklearn.model_selection.StratifiedKFold(splits, shuffle=True)
    cv_score = sklearn.model_selection.cross_validate(clf, x, y, cv=kfold, scoring={
        'accuracy': 'accuracy',
    }, return_train_score=True, verbose=5, fit_params=fit_params)
    return cv_score

In [33]:

params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        "metric": 'multi_logloss',
        'n_jobs': -1,
    
        "num_leaves": 180,
        "max_depth": -1,
        'min_child_samples': 100,
        'n_estimators': 100,
        'learning_rate': 0.05,
        'boost_from_average': True,
        'min_child_weight': 1e-3,
        "subsample_for_bin": 50000,
        'max_bin': 510,
        'reg_alpha': 3,
        'reg_lambda': 5,
        'subsample': 0.9,
        'colsample_bytree':0.7, 
        'subsample_freq': 1,
}
def run_cross_validation(clf, x, y, fit_params={}):
    cv_result = cv(clf, x, y, params=params, splits=3, fit_params=fit_params)
    for scorer, score in cv_result.items():
        print('%s: %s' % (scorer, score))
        print('Average %s: %f' % (scorer, score.mean()))

In [34]:
clf = lgb.LGBMClassifier(**params)
run_cross_validation(clf, train_x, train_y, {'categorical_feature': categorical_features})

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................


New categorical_feature is ['carrier', 'city', 'color', 'gender', 'prodName']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[CV] ..................... , accuracy=0.485750021268625, total= 4.0min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.6min remaining:    0.0s


[CV] .................... , accuracy=0.4857373134328358, total= 3.5min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  8.7min remaining:    0.0s


[CV] ..................... , accuracy=0.484461917107339, total= 3.4min
fit_time: [222.33386493 197.4687469  189.05100322]
Average fit_time: 202.951205
score_time: [16.71494985 15.12748599 13.94717073]
Average score_time: 15.263202
test_accuracy: [0.48575002 0.48573731 0.48446192]
Average test_accuracy: 0.485316
train_accuracy: [0.53195189 0.53182164 0.53244139]
Average train_accuracy: 0.532072


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 12.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 12.6min finished


In [None]:
def train(x, y, params={}):
    clf = lgb.LGBMClassifier(**params)
#     clf.fit(x, y, categorical_feature=categorical_features)
    clf.fit(x, y, categorical_feature=categorical_features)
    return clf


In [None]:
model = train(train_x, train_y, params)

In [None]:
pred_y = model.predict(test_x.drop('uid', axis=1))

In [None]:
result = pd.DataFrame(test_x['uid'])
result.columns = ['id']
result['label'] = pred_y

In [None]:
result.to_csv('submission.csv', index=False)

In [None]:
for i in range(len(train_x.columns)):
    print(train_x.columns[i], model.feature_importances_[i])