# In [None]:
import gc
import os
import sys
import time
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.decomposition import TruncatedSVD,SparsePCA
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

# Directory holding the intermediate pickled frames.
pickle_path = "../pickle"

# Raw inputs. Column names are supplied through `names=` and no `header=` is
# given, so pandas reads the first line of every CSV as data, not as a header.
# Per-user frames are sorted by uid.
train = pd.read_csv("../data/age_train.csv",names=['uid','age_group']).sort_values(by=['uid'])
test = pd.read_csv("../data/age_test.csv",names=['uid']).sort_values(by=['uid'])
info = pd.read_csv("../data/app_info.csv",names=['appid','category'])
usage = pd.read_pickle("{}/user_app_usage.pickle".format(pickle_path))
user_basic_info = pd.read_csv("../data/user_basic_info.csv",names=['uid','gender','city','prodname','ramcapacity','ramleftration','romcapacity','romleftration','color','fontsize','ct','carrier','os']).sort_values(by=['uid'])
behavior_info = pd.read_csv("../data/user_behavior_info.csv",names=['uid','boottimes','a','b','c','d','e','f','g']).sort_values(by=['uid'])

# Shapes of every frame loaded above. No `active` frame is loaded in this
# script, so it is not part of the report.
print((train.shape,test.shape),(info.shape,user_basic_info.shape,behavior_info.shape,usage.shape))

# Stack the labelled (train) and unlabelled (test) uids into one uid-ordered
# frame. pd.concat is used because DataFrame.append no longer exists in
# pandas >= 2.0; test rows get NaN for the age_group column they lack.
all_data = pd.concat([train, test])
all_data = all_data.sort_values(by=['uid']).reset_index(drop=True)
print(all_data.shape)

# Label-encode every object-dtype column of user_basic_info with its own
# encoder. Values are cast to str first so that each column is a uniform
# string column before it is handed to the encoder.
for col in user_basic_info.select_dtypes('object').columns:
    lbl = LabelEncoder()
    user_basic_info[col] = lbl.fit_transform(user_basic_info[col].astype('str'))

# Map app ids to dense integer codes; `appid` keeps the fitted encoder.
appid = LabelEncoder()
usage['appid'] = appid.fit_transform(usage['appid'])

# Attach the per-user behaviour and profile columns to every usage row.
# Left joins keep all usage rows even when a uid is missing from a side table.
usage = usage.merge(behavior_info,how='left',on='uid').merge(user_basic_info,how='left',on='uid')
print(usage.head())

# Row-count features: number of usage rows (non-null appid) per user, and
# number of usage rows (non-null uid) per app, broadcast back to every row.
usage['uid_appid_count'] = usage.groupby('uid')['appid'].transform('count')
usage['appid_uid_count'] = usage.groupby('appid')['uid'].transform('count')

# Per-user spread and mean of fontsize, broadcast back to every row.
usage['uid_fontsize_std'] = usage.groupby('uid')['fontsize'].transform('std')
usage['uid_fontsize_mean'] = usage.groupby('uid')['fontsize'].transform('mean')

# Calendar parts of the usage date.
# Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is its
# replacement and returns a nullable UInt32 column, hence the int64 cast.
# NOTE(review): the int64 cast assumes use_date contains no NaT -- confirm.
usage['woy'] = usage['use_date'].dt.isocalendar().week.astype('int64')
usage['doy'] = usage['use_date'].dt.dayofyear
usage['wday'] = usage['use_date'].dt.dayofweek
usage['weekend'] = (usage.use_date.dt.weekday >=5).astype(int)
usage['day'] = usage['use_date'].dt.day

# Per-app mean and standard deviation of duration and times. The std is
# stored in its own `appid_<col>_std` column so it does not overwrite the mean.
for col in ['duration','times']:
    usage['appid_{}_mean'.format(col)] = usage.groupby('appid')[col].transform('mean')
    usage['appid_{}_std'.format(col)] = usage.groupby('appid')[col].transform('std')
    
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/
    float64 are considered; every other column (strings, datetimes, int8,
    unsigned ints, ...) is left untouched.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive, so a column
    spanning exactly -128..127 becomes int8). Float columns are cast to the
    first of float16/float32 whose finite range contains the min and max,
    otherwise to float64. The choice is made on range only, so values stored
    as float16/float32 are rounded to that type's precision.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        verbose: When true, print the final footprint and the percentage saved.

    Returns:
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        # min()/max() are NaN for empty or all-NaN columns; every comparison
        # below is then False, so ints stay as they are and floats fall
        # through to float64.
        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting footprint (empty frame),
        # which would otherwise be reported as nan.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Downcast the numeric feature columns to shrink the frame before training.
usage = reduce_mem_usage(usage)
# Replace the timestamp with a scaled float feature. This runs after the
# downcast above, so the resulting float column is not narrowed.
# 'int64' is requested explicitly because plain 'int' is platform-dependent
# and can resolve to a 32-bit type that cannot hold a timestamp.
# Presumably use_date is datetime64[ns], making the integer nanoseconds since
# the epoch -- confirm against the pickle.
usage['use_date'] = usage['use_date'].astype('int64') * 1e-16
# Attach the age_group label; uids absent from train get NaN.
usage = usage.merge(train,how='left',on='uid')

from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold,RepeatedKFold

def rmse(y_true, y_pred):
    """Return the root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return mse ** .5

# Split the usage rows by label availability: rows with an age_group are
# used for training, rows without one are the rows to be scored.
has_label = usage['age_group'].notna()
train_df = usage[has_label]
test_df = usage[~has_label]

# Columns that must not be fed to the model, and columns treated as categorical.
drop_features = ['age_group', 'uid',]
cat_features = ['appid']

# Every remaining column of usage is a model feature, in frame order.
feats = [name for name in usage.columns if name not in drop_features]

# Group-wise cross-validation plus the buffers that collect the out-of-fold
# predictions (one per training row) and the fold-averaged test predictions.
n_splits= 3
folds = GroupKFold(n_splits=n_splits)
oof_preds = np.zeros(len(train_df))
sub_preds = np.zeros(len(test_df))

print ('feats:{}'.format(len(feats)))

# Booster configuration. It is identical for every fold, so it is built once
# outside the loop.
# NOTE(review): LightGBM documents that bagging (`subsample`) only takes
# effect when a bagging frequency (`subsample_freq` / `bagging_freq`) > 0 is
# set, which is not the case here -- confirm whether row subsampling was intended.
# NOTE(review): `num_threads` is hard-coded to 40 -- confirm it matches the host.
params = {
           "objective" : "regression",
           "boosting" : "gbdt",
           "metric" : "rmse",
           "max_depth": 7,
           "num_leaves" : 31,
           "max_bin" : 255,
           "learning_rate" : 0.1,
           "subsample" : 0.8,
           "colsample_bytree" : 0.8,
           "verbosity": -1,
           "num_threads" : 40,
}

# One model per fold. The split is grouped by uid, so all rows of a user land
# on the same side of the train/validation split.
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['age_group'],groups=train_df['uid'])):
    train_x, train_y = train_df[feats].iloc[train_idx], train_df['age_group'].iloc[train_idx]
    valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['age_group'].iloc[valid_idx]

    print("Train Index:",train_idx,",Val Index:",valid_idx)

    dtrain = lgb.Dataset(
        train_x, label=train_y,categorical_feature=cat_features)
    dval = lgb.Dataset(
        valid_x, label=valid_y, reference=dtrain,categorical_feature=cat_features)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments were removed
    # from lgb.train in LightGBM 4.0.
    bst = lgb.train(
        params, dtrain, num_boost_round=30000,
        valid_sets=[dval],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=20),
        ],
    )

    # Print the features ranked by total gain, most important first.
    new_list = sorted(zip(feats, bst.feature_importance('gain')),key=lambda x: x[1], reverse=True)
    for item in new_list:
        print (item)

    # Out-of-fold predictions for this fold's validation rows.
    oof_preds[valid_idx] = bst.predict(valid_x, num_iteration=bst.best_iteration)

    # Each fold contributes an equal share of the averaged test prediction.
    sub_preds += bst.predict(test_df[feats], num_iteration=bst.best_iteration) / folds.n_splits

# Overall out-of-fold error across all training rows.
cv = rmse(train_df['age_group'],  oof_preds)
print('Full OOF RMSE %.6f' % cv)

# Pair every row's uid with its prediction. `.assign` builds new frames, so
# no column is written onto a slice of train_df / test_df (which would raise
# SettingWithCopyWarning and may not stick).
a = train_df[['uid']].assign(age_pred=oof_preds)
b = test_df[['uid']].assign(age_pred=sub_preds)

# Collapse the row-level predictions to per-user summary statistics.
a1 = a.groupby(['uid'])['age_pred'].agg(['mean','std','min','max','median',])
b1 = b.groupby(['uid'])['age_pred'].agg(['mean','std','min','max','median',])

# Stack train and test users, prefix the statistic columns, and persist the
# uid-sorted meta-feature table. pd.concat replaces DataFrame.append, which
# was removed in pandas 2.0.
pd.concat([a1, b1]).add_prefix("usage_GROUPKFOLD_agg_pred_").reset_index().sort_values(by=['uid']).reset_index(drop=True).to_pickle("../pickle/Meta_usage_GROUP_Regeress.pickle")