In [None]:
import pandas as pd 
import numpy as np
import sklearn
import os
import gc
# parallel apply!
import swifter
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas()

In [None]:
def parallel_apply(df, apply_func):
    """Run ``apply_func`` over ``df`` one partition at a time.

    ``df`` is split into 24 partitions with ``dd.from_pandas`` and each
    partition has ``.apply(apply_func)`` called on it via ``map_partitions``.
    The object produced by ``map_partitions`` is returned as-is; this function
    does not call ``.compute()`` on it.

    NOTE(review): ``dd`` is not imported in any cell shown here — presumably
    ``dask.dataframe``; confirm and import it before calling this helper.
    """
    def _apply_on_partition(partition):
        return partition.apply(apply_func)

    partitioned = dd.from_pandas(df, npartitions=24)
    return partitioned.map_partitions(_apply_on_partition)

In [None]:
# Directory containing the input CSV files; joined with each file name when the tables are read.
data_dir = './data'

In [None]:
def _read_headerless_csv(file_name):
    """Read ``file_name`` from ``data_dir`` as a CSV that has no header row."""
    return pd.read_csv(os.path.join(data_dir, file_name), header=None)


age_test_df = _read_headerless_csv("age_test.csv")
age_train_df = _read_headerless_csv("age_train.csv")
app_info_df = _read_headerless_csv("app_info.csv")
user_app_actived_df = _read_headerless_csv("user_app_actived.csv")
user_basic_info_df = _read_headerless_csv("user_basic_info.csv")
user_behavior_info_df = _read_headerless_csv("user_behavior_info.csv")
# Placeholder for the usage log; it is filled later by read_user_app_usage().
user_app_usage = None

In [None]:
# The CSVs were read without a header row, so give every table its column names.
for frame, names in (
    (age_train_df, ['uid', 'age_group']),
    (age_test_df, ['uid']),
    (user_basic_info_df, ['uid', 'gender', 'city', 'prodName', 'ramCap', 'ramLeft',
                          'romCap', 'romLeft', 'color', 'fontSize', 'ct', 'carrier', 'os']),
    (user_behavior_info_df, ['uid', 'bootTimes', 'aTimes', 'bTimes', 'cTimes', 'dTimes',
                             'eTimes', 'fTimes', 'fSum']),
    (user_app_actived_df, ['uid', 'appId']),
    (app_info_df, ['appId', 'category']),
):
    frame.columns = names

In [None]:
import sklearn.preprocessing
def encodeCategory(df):
    """Return the label codes for the values in ``df``.

    A new ``LabelEncoder`` is fitted on every call and only the transformed
    values are returned; the fitted encoder itself is not kept.
    """
    return sklearn.preprocessing.LabelEncoder().fit_transform(df)

### process basic info

In [None]:
# Columns of user_basic_info_df that get replaced by their label codes.
basic_cates = ['city', 'prodName', 'color', 'carrier']
for cate_name in basic_cates:
    # Values are cast to str before encoding — presumably so missing values
    # receive a label of their own; confirm.
    cate_as_text = user_basic_info_df[cate_name].astype(str)
    user_basic_info_df[cate_name] = encodeCategory(cate_as_text)

In [None]:
# Every flag list below is ordered as [4g, 3g, 2g, wifi].
ct_onehot_dict = {
    "4g": [1, 0, 0, 0],
    'wifi': [0, 0, 0, 1],
    '4g#wifi': [1, 0, 0, 1],
    0: [0, 0, 0, 0],  # key used for missing values (NaN is replaced by 0 below)
    '3g#wifi': [0, 1, 0, 1],
    '3g': [0, 1, 0, 0],
    '2g#wifi': [0, 0, 1, 1],
    '2g': [0, 0, 1, 0]
}
# Look up the flag list of every row; a 'ct' value missing from the dict raises KeyError.
ct_flag_rows = [ct_onehot_dict[ct_value] for ct_value in user_basic_info_df['ct'].fillna(0)]
# Transpose the per-row lists into one tuple per flag column.
flags_4g, flags_3g, flags_2g, flags_wifi = zip(*ct_flag_rows)
user_basic_info_df['4g'] = flags_4g
user_basic_info_df['3g'] = flags_3g
user_basic_info_df['2g'] = flags_2g
user_basic_info_df['wifi'] = flags_wifi
# The raw column is no longer needed once the four flags exist.
user_basic_info_df.drop(columns='ct', inplace=True)

### process user_app_actived and app_info

In [None]:
# Store an integer label code for every category next to the original category string.
app_info_df['category_id'] = encodeCategory(app_info_df['category'])
# appId -> category string lookup (the string, not category_id, is what gets mapped).
# Built straight from the two columns instead of iterating rows with iterrows(),
# which creates a Series per row; when an appId repeats, the last row still wins.
app_info_map = dict(zip(app_info_df['appId'], app_info_df['category']))

In [None]:
def _app_ids_to_categories(app_ids):
    """Turn a '#'-separated appId string into space-separated category names.

    An appId that is absent from ``app_info_map`` contributes 'unknown'.
    """
    return ' '.join(app_info_map.get(app_id, 'unknown') for app_id in app_ids.split('#'))


user_app_actived_df['app_category'] = user_app_actived_df['appId'].progress_apply(
    _app_ids_to_categories)

In [None]:
import sklearn.feature_extraction.text
# Fit a TF-IDF model (default settings) on the per-user category strings.
# app_vectors holds one row per row of user_app_actived_df and is indexed
# column-wise by the positions stored in vectorizer.vocabulary_.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer()
app_vectors = vectorizer.fit_transform(user_app_actived_df['app_category'])

In [None]:
# One column per TF-IDF dimension, named app_vec_<i>, preceded by the uid column.
app_vec_names = ['app_vec_%d' % i for i in range(len(vectorizer.vocabulary_))]
# Densify the matrix once and wrap it in a frame, rather than slicing the sparse
# matrix one column at a time and inserting each slice into a growing frame.
user_app_vec = pd.DataFrame(app_vectors.toarray(),
                            index=user_app_actived_df.index,
                            columns=app_vec_names)
# uid goes first so the column order matches the previous layout.
user_app_vec.insert(0, 'uid', user_app_actived_df['uid'])

## Load user_app_usage

In [None]:
# Number of lines in the usage log and the number of pieces it is divided into.
user_app_usage_lines = 651007719
pieces = 10
# Lines per piece, rounded down.
interval = user_app_usage_lines // pieces
# For piece k: the range of leading rows (0 .. k*interval) that precede it.
# NOTE(review): neither uau_skiprows nor user_app_usage_path is referenced by
# the other cells shown here — confirm they are still needed.
uau_skiprows = [range(0, piece_idx * interval) for piece_idx in range(pieces)]
user_app_usage_path = '/Volumes/nvsd/user_app_usage.csv'

In [None]:
def process_date(d):
    """Map a '-'-separated date string to an integer day value.

    Returns 0 when the second field (the month) is '02'; otherwise returns
    the third field (the day) converted to ``int``.
    """
    parts = d.split('-')
    return 0 if parts[1] == '02' else int(parts[2])

def read_user_app_usage(pieces_num):
    """Load one piece of the usage log into the global ``user_app_usage``.

    Args:
        pieces_num: number substituted into the hard-coded file name
            ``/Volumes/miguch/data/user_app_usage%d.csv``.

    The piece is read without a header row, given the columns
    ``uid, appId, duration, times, use_date``, has ``duration`` cast to int32
    and ``times`` to int16, and has ``use_date`` converted with
    ``process_date``. Nothing is returned; the result is published through
    the module-level ``user_app_usage``.

    NOTE(review): the path is hard-coded and ignores ``data_dir`` and
    ``user_app_usage_path`` — confirm which location is the right one.
    """
    global user_app_usage
    if user_app_usage is not None:
        # Release the previous piece before reading the next one. Rebind to None
        # rather than `del`: deleting the global would leave the name undefined
        # if the read below fails, and the next call would raise NameError on
        # the check above.
        user_app_usage = None
        gc.collect()
    usage_part = pd.read_csv('/Volumes/miguch/data/user_app_usage%d.csv' % pieces_num, header=None)
    usage_part.columns = ['uid', 'appId', 'duration', 'times', 'use_date']
    usage_part['duration'] = usage_part['duration'].astype(np.int32)
    usage_part['times'] = usage_part['times'].astype(np.int16)
    usage_part['use_date'] = usage_part['use_date'].swifter.allow_dask_on_strings().apply(process_date)
    # Publish only the fully converted frame, so a failure above never leaves a
    # half-processed table in the global.
    user_app_usage = usage_part

In [None]:
# Load piece number 6 of the usage log into the global user_app_usage.
# NOTE(review): user_app_usage is not referenced by the later cells shown here — confirm this load is still needed.
read_user_app_usage(6)

## Prepare training features

In [None]:
# Join basic and behaviour info on uid (pandas' default inner join: only uids present in both tables are kept).
user_info_df = pd.merge(user_basic_info_df, user_behavior_info_df, on='uid')

In [None]:
# Attach the app TF-IDF columns; the left join keeps every row of user_info_df.
features_df = pd.merge(user_info_df, user_app_vec, how='left', on='uid')

In [None]:
# Order the feature table by uid and renumber the rows from 0.
features_df = features_df.sort_values(by='uid').reset_index(drop=True)

In [None]:
# Right joins: every uid from the label / test tables is kept, whether or not
# it has a matching row in features_df.
train_data = pd.merge(features_df, age_train_df, how='right', on='uid')
test_x = pd.merge(features_df, age_test_df, how='right', on='uid')

In [None]:
# Target column, and the feature matrix without the identifier and the target.
train_y = train_data['age_group']
train_x = train_data.drop(columns=['uid', 'age_group'])

In [None]:
# Impute missing values with the training-set column means. The means are
# computed once, before train_x is modified, and reused for the test set, so
# the full-frame reduction is not repeated on the already-filled data.
# NOTE(review): the mean is also what fills label-encoded columns (city,
# prodName, ...) should they contain NaN — confirm that is intended.
train_feature_means = train_x.mean()
train_x.fillna(train_feature_means, inplace=True)
test_x.fillna(train_feature_means, inplace=True)

## Train Model


In [None]:
# Display the feature names that will be passed to the model.
train_x.columns

In [None]:
# Columns passed to LightGBM as `categorical_feature` when fitting.
# NOTE(review): 'gender' is listed here but is not among the label-encoded
# basic_cates columns — confirm its values are already suitable category codes.
categorical_features = ['gender', 'city', 'prodName', 'color', 'carrier']

In [None]:
import sklearn.model_selection
import sklearn.metrics
import lightgbm as lgb
import sklearn.ensemble
import xgboost as xgb

def cv(x, y, params=None, splits=3, random_state=None):
    """Cross-validate a LightGBM classifier with shuffled stratified folds.

    Args:
        x: feature matrix.
        y: target labels.
        params: keyword arguments for ``lgb.LGBMClassifier``; ``None`` (the
            default) means no extra arguments.
        splits: number of folds.
        random_state: seed for the fold shuffling. The default ``None`` keeps
            the previous unseeded behaviour; pass an int for repeatable folds.

    Returns:
        The dict produced by ``sklearn.model_selection.cross_validate``, with
        accuracy scored on both the train and test side of every fold.
    """
    # `params or {}` replaces the former mutable `{}` default argument.
    clf = lgb.LGBMClassifier(**(params or {}))
    kfold = sklearn.model_selection.StratifiedKFold(
        splits, shuffle=True, random_state=random_state)
    # NOTE(review): newer scikit-learn releases rename `fit_params` to
    # `params` — confirm against the installed version.
    cv_score = sklearn.model_selection.cross_validate(
        clf, x, y, cv=kfold,
        scoring={
            'accuracy': 'accuracy',
        },
        return_train_score=True,
        fit_params={'categorical_feature': categorical_features})
    return cv_score

In [None]:

# Keyword arguments handed to lgb.LGBMClassifier by cv() and train().
# The dict used to list "learning_rate" twice (0.1, then 0.05); the later entry
# is the one Python keeps, so the dead 0.1 entry was removed and the effective
# value stays 0.05.
params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        "num_leaves": 200,
        "max_depth": -1,
        'min_child_samples': 100,
        'n_estimators': 100,
        'learning_rate': 0.05,
        'boost_from_average': True,
        'min_child_weight': 1e-3,
        "subsample_for_bin": 20000,
        'max_bin': 512,
        "metric": 'multi_logloss',
        'reg_alpha': 3,
        'reg_lambda': 5,
        'subsample': 0.9,
        'colsample_bytree':0.7, 
        'subsample_freq': 1,
        'n_jobs': -1,
}
def run_cross_validation(x, y):
    """Run 3-fold cross-validation with the module-level ``params`` and print the scores.

    For every entry of the result returned by ``cv`` this prints the raw
    per-fold values followed by their mean. Nothing is returned.
    """
    scores_by_name = cv(x, y, params=params, splits=3)
    for name in scores_by_name:
        values = scores_by_name[name]
        print('%s: %s' % (name, values))
        print('Average %s: %f' % (name, values.mean()))

In [None]:
# Cross-validate on the full training set and print the per-fold and average scores.
run_cross_validation(train_x, train_y)

In [None]:
def train(x, y, params=None):
    """Fit a LightGBM classifier on ``x`` / ``y`` and return it.

    Args:
        x: feature matrix.
        y: target labels.
        params: keyword arguments for ``lgb.LGBMClassifier``; ``None`` (the
            default) means no extra arguments.

    Returns:
        The fitted ``lgb.LGBMClassifier``. The module-level
        ``categorical_features`` list is passed as ``categorical_feature``.
    """
    # `params or {}` replaces the former mutable `{}` default argument.
    clf = lgb.LGBMClassifier(**(params or {}))
    clf.fit(x, y, categorical_feature=categorical_features)
    return clf


In [None]:
# Fit the final model on the whole training set with the shared params dict.
model = train(train_x, train_y, params)

In [None]:
# uid is an identifier, not a feature, so it is removed before predicting.
test_features = test_x.drop(columns='uid')
pred_y = model.predict(test_features)

In [None]:
# Two-column submission table: the test uid (as 'id') and the predicted label.
result = pd.DataFrame({'id': test_x['uid'], 'label': pred_y})

In [None]:
# Write the predictions to submission.csv in the working directory, without the index column.
result.to_csv('submission.csv', index=False)

In [None]:
# Print every feature name next to the importance the fitted model reports for it.
for feature_name, importance in zip(train_x.columns, model.feature_importances_):
    print(feature_name, importance)
