In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.decomposition import TruncatedSVD,SparsePCA
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score

import gc
import time
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Directory holding the pre-built pickles, and the two app tables read from it.
pickle_path = "../pickle"
active_pickle = "{}/user_app_active_flatten.pickle".format(pickle_path)
usage_pickle = "{}/user_app_usage.pickle".format(pickle_path)

# NOTE(review): read_pickle can execute arbitrary code; only load files produced by this project.
active = pd.read_pickle(active_pickle)
usage = pd.read_pickle(usage_pickle)

In [None]:
# Load the labelled (train) and unlabelled (test) uid lists; column names are
# supplied explicitly through names=.
train = pd.read_csv("../data/age_train.csv",names=['uid','age_group']).sort_values(by=['uid'])
test = pd.read_csv("../data/age_test.csv",names=['uid']).sort_values(by=['uid'])

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the same rows (test rows get NaN for the missing 'age_group' column).
all_data = pd.concat([train, test])
all_data = all_data.sort_values(by=['uid']).reset_index(drop=True)
print(all_data.shape)

In [None]:
from tqdm import tqdm

In [None]:
from collections import Counter

def Gini(pr):
    """Gini impurity of the values in ``pr``: 1 minus the sum of squared frequencies.

    ``pr`` is any sized iterable of hashable values. Returns 0 when every
    value is identical and approaches 1 as the values become more diverse.
    An empty input yields 1, because there are no frequencies to subtract.
    """
    n_items = len(pr)
    impurity = 1
    # Subtract each distinct value's squared relative frequency in turn.
    for freq in Counter(pr).values():
        impurity -= (freq / n_items) ** 2
    return impurity

import math
# Calc Entropy
def entropy(pr):
    """Shannon entropy, in bits, of the empirical distribution of the values in ``pr``.

    ``pr`` is any sized iterable of hashable values. Returns 0 for an empty
    input or when every value is identical; the maximum for k distinct,
    equally frequent values is log2(k).
    """
    total = len(pr)
    ent = 0
    # Counter only holds values that occur at least once, so every p is > 0
    # and log2(p) is always defined. The former ``p == 0`` guard could never
    # fire, and would have wiped the accumulated entropy if it had.
    for freq in Counter(pr).values():
        p = freq / total
        ent = ent - p * math.log2(p)
    return ent

def get_small(x,a,b):
    """Return 1 if ``x`` lies strictly between ``a`` and ``b``, otherwise 0.

    Both bounds are exclusive, so ``x == a`` and ``x == b`` give 0.
    """
    above_lower = x > a
    below_upper = x < b
    return int(above_lower & below_upper)

def get_feature_flatten(df):
    """Build per-uid statistics from a flattened (uid, appid) table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'uid' and 'appid' columns. NOTE: it is modified in
        place; the helper columns 'appid_count', 'appid_count_0_1e3',
        'appid_count_1e3_1e4' and 'appid_count_1e4_2e5' are added to it.

    Returns
    -------
    list
        Four objects, each indexed by uid, in this order:
        1. DataFrame with columns mean/std/min/max/median of 'appid_count';
        2. Series named 'appid' holding the Gini impurity of the uid's appids;
        3. Series named 'appid' holding the entropy of the uid's appids;
        4. DataFrame with mean/sum/std of the three popularity-bin indicators,
           columns named '<indicator>_<stat>'.
    """
    fea = []
    t1 = time.time()

    # Count encoding: for every row, how many rows (with a non-null uid) share its appid.
    df['appid_count'] = df.groupby(['appid'])['uid'].transform('count')

    # Use a list, not a set: a set has no defined order, so the order of the
    # resulting columns could change from one run to the next.
    count_stats = ['mean','std','min','max','median']
    fea.append(df[['uid','appid_count']].groupby(['uid'])['appid_count'].agg(count_stats))

    # Diversity of each uid's app list. Both Series keep the name 'appid', so a
    # caller that merges them into one frame gets suffixed columns (appid_x / appid_y).
    fea.append(df[['uid','appid']].groupby(['uid'])['appid'].apply(Gini))
    fea.append(df[['uid','appid']].groupby(['uid'])['appid'].apply(entropy))

    # Popularity-bin indicators (1 inside the bin, else 0). Both bounds are
    # exclusive, so a count of exactly 1e3 or 1e4 (or >= 2e5) lands in no bin.
    # Vectorized comparisons give the same 0/1 values as a row-wise map.
    bins = {
        'appid_count_0_1e3': (0, 1e3),
        'appid_count_1e3_1e4': (1e3, 1e4),
        'appid_count_1e4_2e5': (1e4, 2e5),
    }
    counts = df['appid_count']
    for col, (low, high) in bins.items():
        df[col] = ((counts > low) & (counts < high)).astype('int64')

    tmp = df.groupby(['uid']).agg({col: ['mean','sum','std'] for col in bins})

    # Flatten the (column, stat) MultiIndex into 'column_stat' names.
    tmp.columns = ['_'.join(col).strip() for col in tmp.columns.values]
    fea.append(tmp)
    print("Count Active Finish... :",time.time()-t1)

    return fea


def _merge_features(base, features, prefix):
    """Left-join each feature table onto ``base`` by uid, then prefix every non-uid column."""
    merged = base
    for table in tqdm(features):
        merged = merged.merge(table,how='left',on='uid')
    return merged.set_index('uid').add_prefix(prefix).reset_index()


# Usage-based statistics for every uid in train + test.
# The Gini and entropy Series are both named 'appid', so after merging they
# show up as '<prefix>appid_x' and '<prefix>appid_y' respectively.
fea0 = get_feature_flatten(usage)
uid0 = _merge_features(all_data[['uid']], fea0, "usage_")

# Same statistics computed from the active-app table.
fea1 = get_feature_flatten(active)
uid1 = _merge_features(all_data[['uid']], fea1, "active_")

# Persist both feature tables.
uid0.to_pickle("../pickle/usage_flatten_stat.pickle")
uid1.to_pickle("../pickle/active_flatten_stat.pickle")

In [10]:
# Display the merged active-app feature table (one row per uid, columns prefixed 'active_').
uid1

Unnamed: 0,uid,active_max,active_std,active_min,active_median,active_mean,active_appid_x,active_appid_y,active_appid_count_0_1e3_mean,active_appid_count_0_1e3_sum,active_appid_count_0_1e3_std,active_appid_count_1e3_1e4_mean,active_appid_count_1e3_1e4_sum,active_appid_count_1e3_1e4_std,active_appid_count_1e4_2e5_mean,active_appid_count_1e4_2e5_sum,active_appid_count_1e4_2e5_std
0,1000006,4882205.0,1.264451e+06,750.0,604620.0,1.154490e+06,0.978723,5.554589,0.042553,2.0,0.204030,0.085106,4.0,0.282057,0.191489,9.0,0.397727
1,1000009,4882205.0,1.149125e+06,2430.0,349023.0,8.769154e+05,0.986301,6.189825,0.000000,0.0,0.000000,0.068493,5.0,0.254338,0.342466,25.0,0.477818
2,1000010,4882205.0,1.106938e+06,1029.0,202722.5,7.977411e+05,0.989583,6.584963,0.000000,0.0,0.000000,0.125000,12.0,0.332455,0.364583,35.0,0.483840
3,1000011,4882205.0,1.499393e+06,2123.0,164864.0,1.017023e+06,0.952381,4.392317,0.000000,0.0,0.000000,0.142857,3.0,0.358569,0.380952,8.0,0.497613
4,1000012,4882205.0,1.249960e+06,2750.0,432174.0,9.964622e+05,0.969697,5.044394,0.000000,0.0,0.000000,0.030303,1.0,0.174078,0.303030,10.0,0.466694
5,1000014,4882205.0,1.554756e+06,5835.0,1407021.0,1.651767e+06,0.960000,4.643856,0.000000,0.0,0.000000,0.040000,1.0,0.200000,0.280000,7.0,0.458258
6,1000020,4882205.0,1.598320e+06,10275.0,863964.0,1.675255e+06,0.941176,4.087463,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.117647,2.0,0.332106
7,1000027,4882205.0,1.517398e+06,23032.0,1131031.0,1.537524e+06,0.950000,4.321928,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.250000,5.0,0.444262
8,1000033,4882205.0,1.272768e+06,10264.0,1022596.0,1.416801e+06,0.976190,5.392317,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.166667,7.0,0.377195
9,1000034,4882205.0,1.356017e+06,1458.0,652594.0,1.183765e+06,0.974359,5.285402,0.000000,0.0,0.000000,0.128205,5.0,0.338688,0.179487,7.0,0.388776
