In [1]:
import pandas as pd 
import numpy as np
import sklearn
import os
import gc
# parallel apply!
import swifter
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas()



In [2]:
def parallel_apply(df, apply_func):
    """Apply ``apply_func`` to ``df`` one dask partition at a time.

    ``df`` is converted to a dask collection with 24 partitions and
    ``.apply(apply_func)`` is mapped over every partition.

    Returns the object produced by ``map_partitions``; no ``.compute()`` is
    called here.

    NOTE(review): ``dd`` is not imported in any cell visible in this notebook
    (presumably ``dask.dataframe``) - confirm it is in scope before calling.
    """
    partitioned = dd.from_pandas(df, npartitions=24)
    return partitioned.map_partitions(lambda part: part.apply(apply_func))

In [3]:
# Directory holding the input CSV files that are loaded below.
data_dir = './data'

In [4]:
# All six tables are read with header=None (the first row is data, not column
# names); the files are read in the order listed.
(
    age_test_df,
    age_train_df,
    app_info_df,
    user_app_actived_df,
    user_basic_info_df,
    user_behavior_info_df,
) = (
    pd.read_csv(os.path.join(data_dir, csv_name), header=None)
    for csv_name in (
        "age_test.csv",
        "age_train.csv",
        "app_info.csv",
        "user_app_actived.csv",
        "user_basic_info.csv",
        "user_behavior_info.csv",
    )
)
# Placeholder for the app-usage table, which is filled in by a later cell.
user_app_usage = None

In [5]:
# The CSVs were read with header=None, so the column names are set by hand.
age_test_df.columns = ['uid']
age_train_df.columns = ['uid', 'age_group']
app_info_df.columns = ['appId', 'category']
user_app_actived_df.columns = ['uid', 'appId']
user_basic_info_df.columns = [
    'uid', 'gender', 'city', 'prodName',
    'ramCap', 'ramLeft', 'romCap', 'romLeft',
    'color', 'fontSize', 'ct', 'carrier', 'os',
]
user_behavior_info_df.columns = [
    'uid', 'bootTimes',
    'aTimes', 'bTimes', 'cTimes', 'dTimes', 'eTimes', 'fTimes',
    'fSum',
]

In [6]:
import sklearn.preprocessing
def encodeCategory(df):
    """Label-encode ``df`` with a freshly fitted ``LabelEncoder``.

    A new encoder is created on every call, fitted on ``df`` and used to
    transform it right away. The encoder is not kept, so the mapping back to
    the original labels is lost after the call.

    Returns whatever ``LabelEncoder.fit_transform`` yields for ``df``.
    """
    return sklearn.preprocessing.LabelEncoder().fit_transform(df)

### Process basic info

In [7]:
# Columns of user_basic_info_df that are overwritten with label-encoded values.
basic_cates = ['city', 'prodName', 'color', 'carrier']
for field in basic_cates:
    # Cast to str first so every value handed to the encoder is a string
    # (missing values become a string too and get their own label).
    user_basic_info_df[field] = encodeCategory(user_basic_info_df[field].astype(str))

In [8]:
# Multi-hot encoding of the 'ct' column; flag order is [4g, 3g, 2g, wifi].
ct_onehot_dict = {
    0: [0, 0, 0, 0],  # missing value (NaN is replaced by 0 before the lookup)
    '2g': [0, 0, 1, 0],
    '3g': [0, 1, 0, 0],
    "4g": [1, 0, 0, 0],
    'wifi': [0, 0, 0, 1],
    '2g#wifi': [0, 0, 1, 1],
    '3g#wifi': [0, 1, 0, 1],
    '4g#wifi': [1, 0, 0, 1],
}
# Look each row up in the table (a value absent from it raises KeyError), then
# transpose the per-row flag lists into one sequence per new column.
(
    user_basic_info_df['4g'],
    user_basic_info_df['3g'],
    user_basic_info_df['2g'],
    user_basic_info_df['wifi'],
) = zip(*user_basic_info_df['ct'].fillna(0).apply(lambda ct_value: ct_onehot_dict[ct_value]))
# The raw column is dropped once the four flag columns exist.
user_basic_info_df.drop(columns='ct', inplace=True)

### Process user_app_actived and app_info

In [9]:
# Integer label for every app category, stored as an extra column.
app_info_df['category_id'] = encodeCategory(app_info_df['category'])
# appId -> category name (the name, not category_id). The mapping is built
# directly from the two columns instead of walking the frame with iterrows(),
# which constructs a Series object per row. If an appId occurs more than once,
# the last row wins - the same outcome as inserting row by row.
app_info_map = dict(zip(app_info_df['appId'], app_info_df['category']))

HBox(children=(IntProgress(value=0, max=188864), HTML(value='')))



In [10]:
# Turn each user's '#'-separated appId string into a space-separated string of
# category names; apps missing from app_info_map are written as 'unknown'.
user_app_actived_df['app_category'] = user_app_actived_df['appId'].progress_apply(
    lambda app_ids: ' '.join(
        app_info_map.get(app_id, 'unknown') for app_id in app_ids.split('#')
    )
)

HBox(children=(IntProgress(value=0, max=2512500), HTML(value='')))




In [11]:
import sklearn.feature_extraction.text
# TF-IDF over the per-user category strings, with the vectorizer's defaults.
# NOTE(review): the default tokenizer decides what counts as a term; category
# names that contain spaces or punctuation may be split into several terms -
# confirm the resulting vocabulary is the intended one.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()
app_vectors = vectorizer.fit_transform(user_app_actived_df['app_category'])

In [12]:
# One row per user: 'uid' followed by one dense column per TF-IDF term
# ('app_vec_0', 'app_vec_1', ...).
# The sparse matrix is densified once and wrapped in a single DataFrame rather
# than being sliced and inserted one column at a time, which repeats the sparse
# slicing for every term and grows the frame column by column.
user_app_vec = pd.DataFrame(
    app_vectors.toarray(),
    index=user_app_actived_df.index,
    columns=['app_vec_%d' % term_idx for term_idx in range(app_vectors.shape[1])],
)
# Keep 'uid' as the first column so the frame can be merged on it.
user_app_vec.insert(0, 'uid', user_app_actived_df['uid'])

## Prepare training features

In [13]:
# Join the basic-info and behaviour tables on 'uid'. merge() defaults to an
# inner join, so only uids present in both tables are kept.
user_info_df = user_basic_info_df.merge(user_behavior_info_df, on='uid')

In [14]:
# Left join: every row of user_info_df is kept; users without a row in
# user_app_vec get NaN in the app_vec_* columns.
features_df = user_info_df.merge(user_app_vec, on='uid', how='left')

In [15]:
# Order the feature table by uid and renumber the rows from 0.
features_df = features_df.sort_values(by='uid').reset_index(drop=True)

In [16]:
# Right joins keep the uids listed in the label / test files; a uid that has
# no row in features_df ends up with NaN in every feature column.
train_data = features_df.merge(age_train_df, on='uid', how='right')
# test_x still carries the 'uid' column after this merge.
test_x = features_df.merge(age_test_df, on='uid', how='right')

In [17]:
# Target column first, then the feature matrix without the id and the target.
train_y = train_data['age_group']
train_x = train_data.drop(columns=['uid', 'age_group'])

In [26]:
# Impute missing values with the per-column means of the training features.
# The means are computed once, before train_x is modified, and the very same
# values are reused for test_x, so both sets are imputed identically and the
# full-column mean is not recomputed on the already-filled frame.
# NOTE(review): this also fills label-encoded columns with a fractional mean -
# confirm that is acceptable for the columns later passed as categorical.
train_feature_means = train_x.mean()
train_x.fillna(train_feature_means, inplace=True)
test_x.fillna(train_feature_means, inplace=True)

## Train Model


In [19]:
# Display the feature column names of the training matrix.
train_x.columns

Index(['gender', 'city', 'prodName', 'ramCap', 'ramLeft', 'romCap', 'romLeft',
       'color', 'fontSize', 'carrier', 'os', '4g', '3g', '2g', 'wifi',
       'bootTimes', 'aTimes', 'bTimes', 'cTimes', 'dTimes', 'eTimes', 'fTimes',
       'fSum', 'app_vec_0', 'app_vec_1', 'app_vec_2', 'app_vec_3', 'app_vec_4',
       'app_vec_5', 'app_vec_6', 'app_vec_7', 'app_vec_8', 'app_vec_9',
       'app_vec_10', 'app_vec_11', 'app_vec_12', 'app_vec_13', 'app_vec_14',
       'app_vec_15', 'app_vec_16', 'app_vec_17', 'app_vec_18', 'app_vec_19',
       'app_vec_20', 'app_vec_21', 'app_vec_22', 'app_vec_23', 'app_vec_24',
       'app_vec_25', 'app_vec_26', 'app_vec_27'],
      dtype='object')

In [20]:
# Columns handed to LightGBM through the categorical_feature fit argument.
# NOTE(review): 'gender' is listed here but is not one of the label-encoded
# basic_cates columns - presumably it is already numeric; confirm.
categorical_features = ['gender', 'city', 'prodName', 'color', 'carrier']

In [21]:
import sklearn.model_selection
import sklearn.metrics
import lightgbm as lgb
import sklearn.ensemble
import xgboost as xgb

def cv(x, y, params=None, splits=3, random_state=None):
    """Cross-validate a LightGBM classifier on stratified, shuffled folds.

    Parameters
    ----------
    x, y
        Feature matrix and target, passed straight to ``cross_validate``.
    params : dict or None
        Keyword arguments for ``lgb.LGBMClassifier``. ``None`` (the default)
        means no overrides.
    splits : int
        Number of stratified folds.
    random_state : int or None
        Seed for the fold shuffling. ``None`` keeps the earlier behaviour
        (unseeded shuffling); pass an int to get repeatable folds.

    Returns
    -------
    dict
        The result of ``sklearn.model_selection.cross_validate`` with accuracy
        scoring and train scores included.
    """
    # A None default avoids sharing one mutable dict between calls.
    clf = lgb.LGBMClassifier(**(params or {}))
    kfold = sklearn.model_selection.StratifiedKFold(
        splits, shuffle=True, random_state=random_state)
    cv_score = sklearn.model_selection.cross_validate(
        clf, x, y, cv=kfold,
        scoring={
            'accuracy': 'accuracy',
        },
        return_train_score=True,
        # The module-level categorical_features list is forwarded to fit().
        fit_params={'categorical_feature': categorical_features},
    )
    return cv_score

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [22]:

# LightGBM hyper-parameters used for both cross-validation and final training.
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    "num_leaves": 200,
    "max_depth": -1,
    'min_child_samples': 100,
    'n_estimators': 100,
    # The literal used to list learning_rate twice (0.1, then 0.05). The later
    # entry silently won, so 0.05 is the value that was always in effect; the
    # dead 0.1 entry has been removed.
    'learning_rate': 0.05,
    'boost_from_average': True,
    'min_child_weight': 1e-3,
    "subsample_for_bin": 20000,
    'max_bin': 512,
    "metric": 'multi_logloss',
    'reg_alpha': 3,
    'reg_lambda': 5,
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'subsample_freq': 1,
    'n_jobs': -1,
}
def run_cross_validation(x, y):
    """Run 3-fold ``cv`` with the module-level ``params`` and print the scores.

    For every entry in the dict returned by ``cv`` the per-fold values are
    printed, followed by their mean. Nothing is returned.
    """
    for name, values in cv(x, y, params=params, splits=3).items():
        print('%s: %s' % (name, values))
        print('Average %s: %f' % (name, values.mean()))

In [23]:
# Cross-validate on the full training set; the scores are printed.
run_cross_validation(train_x, train_y)

New categorical_feature is ['carrier', 'city', 'color', 'gender', 'prodName']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


fit_time: [158.46991014 184.35367703 180.13871121]
Average fit_time: 174.320766
score_time: [10.08137679 14.20033097 12.07854986]
Average score_time: 12.120086
test_accuracy: [0.48316644 0.48291045 0.48329624]
Average test_accuracy: 0.483124
train_accuracy: [0.53406906 0.5357694  0.53474288]
Average train_accuracy: 0.534860


In [24]:
def train(x, y, params=None):
    """Fit a LightGBM classifier on ``x`` / ``y`` and return it.

    Parameters
    ----------
    x, y
        Training features and target, passed to ``LGBMClassifier.fit``.
    params : dict or None
        Keyword arguments for ``lgb.LGBMClassifier``. ``None`` (the default)
        means no overrides.

    Returns
    -------
    The fitted ``lgb.LGBMClassifier``.
    """
    # A None default avoids sharing one mutable dict between calls.
    clf = lgb.LGBMClassifier(**(params or {}))
    # The module-level categorical_features list tells LightGBM which columns
    # to treat as categorical.
    clf.fit(x, y, categorical_feature=categorical_features)
    return clf


In [27]:
# Fit the final model on the whole training set with the shared params.
model = train(train_x, train_y, params)

In [31]:
# test_x still contains 'uid'; drop it so only feature columns are predicted on.
pred_y = model.predict(test_x.drop('uid', axis=1))

In [39]:
# Submission table: the test uid (renamed to 'id') next to its predicted label.
result = test_x[['uid']].rename(columns={'uid': 'id'}).assign(label=pred_y)

In [41]:
# Write the predictions to disk without the DataFrame index.
result.to_csv('submission.csv', index=False)

## Load user_app_usage

In [None]:
# Hard-coded line count for the usage data and the number of pieces it is
# divided into.
pieces = 10
user_app_usage_lines = 651007719
user_app_usage_path = '/Volumes/nvsd/user_app_usage.csv'
# Whole number of lines per piece (the remainder of the division is dropped).
interval = user_app_usage_lines // pieces
# One range per piece k covering rows 0 .. k*interval - 1 (presumably meant to
# be used as a skiprows argument - it is not used in the visible cells).
uau_skiprows = [range(0, k * interval) for k in range(pieces)]

In [None]:
def process_date(d):
    """Map a ``'-'``-separated date string to a small integer.

    The string is split on ``'-'``. When the second field is ``'02'`` the
    result is 0; otherwise the third field is returned as an ``int``.
    """
    parts = d.split('-')
    return 0 if parts[1] == '02' else int(parts[2])

def read_user_app_usage(pieces_num, path_template='/Volumes/miguch/data/user_app_usage%d.csv'):
    """Load one piece of the app-usage data into the global ``user_app_usage``.

    Parameters
    ----------
    pieces_num : int
        Number of the piece to load; it is substituted into ``path_template``.
    path_template : str
        ``%``-style template with a single ``%d`` placeholder for the piece
        number. Defaults to the location that was previously hard-coded.

    The loaded frame gets the columns ``uid, appId, duration, times,
    use_date``; ``duration`` is cast to int32, ``times`` to int16 and
    ``use_date`` is mapped through ``process_date``.

    Returns nothing; the result is stored in the global ``user_app_usage``.
    """
    global user_app_usage
    if user_app_usage is not None:
        # Release the previous piece before reading the next one. Rebinding to
        # None (rather than `del`) keeps the global defined even if the read
        # below fails, so a later call does not hit a NameError.
        user_app_usage = None
        gc.collect()
    # Format the path with the requested piece number. It used to be formatted
    # with an unrelated name `i`, so the argument was ignored.
    usage_part = pd.read_csv(path_template % pieces_num, header=None)
    usage_part.columns = ['uid', 'appId', 'duration', 'times', 'use_date']
    # Narrow integer dtypes to reduce the memory footprint of the piece.
    usage_part['duration'] = usage_part['duration'].astype(np.int32)
    usage_part['times'] = usage_part['times'].astype(np.int16)
    usage_part['use_date'] = usage_part['use_date'].swifter.allow_dask_on_strings().apply(process_date)
    # Publish only the fully processed piece.
    user_app_usage = usage_part

In [None]:
# Call the loader with piece number 6.
read_user_app_usage(6)