In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.decomposition import TruncatedSVD,SparsePCA
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score

In [2]:
import gc
import time
import os
import sys
import warnings
warnings.filterwarnings('ignore')

In [5]:
train = pd.read_csv("../data/age_train.csv",names=['uid','age_group']).sort_values(by=['uid'])
test = pd.read_csv("../data/age_test.csv",names=['uid']).sort_values(by=['uid'])
info = pd.read_csv("../data/app_info.csv",names=['appid','category'])
active = pd.read_csv("../data/user_app_actived.csv",names=['uid','appid']).sort_values(by=['uid'])
usage = pd.read_csv("../data/user_app_usage.csv",names=['uid','appid','duration','times','use_date'],parse_dates=['use_date'])
user_basic_info = pd.read_csv("../data/user_basic_info.csv",names=['uid','gender','city','prodname','ramcapacity','ramleftration','romcapacity','romleftration','color','fontsize','ct','carrier','os']).sort_values(by=['uid'])
behavior_info = pd.read_csv("../data/user_behavior_info.csv",names=['uid','boottimes','a','b','c','d','e','f','g']).sort_values(by=['uid'])
print((train.shape,test.shape),(info.shape,active.shape,user_basic_info.shape,behavior_info.shape))#usage.shape,


In [8]:
pickle_path = "../pickle"
if not os.path.exists(pickle_path):
    os.mkdir(pickle_path)

In [13]:
if not os.path.exists("{}/user_app_usage.pickle".format(pickle_path)):
    t1 = time.time()
    usage.to_pickle("{}/user_app_usage.pickle".format(pickle_path))
    print('USAGE TO PICKLE: ',time.time()-t1)

usage_app_seq = usage[['uid','appid']].groupby(['uid'])['appid'].apply(lambda x:list(x)).reset_index()
usage_app_seq.to_pickle("{}/user_app_seq.pickle".format(pickle_path))

USAGE TO PICKLE:  109.59347701072693


In [14]:
from tqdm import tqdm

def flatten_active(df):    
    u = []
    a = []
    for i in tqdm(range(len(df['appid'].values))):
        u += [df['uid'].values[i]]*df['app_len'].values[i]
        a += list(df['appid'].values[i])
        
    new_df = pd.DataFrame()
    new_df['uid'] = u
    new_df['appid'] = a
        
    return new_df

In [15]:
active['appid'] = active['appid'].map(lambda x:x.split('#'))
active['app_len'] = active['appid'].map(lambda x:len(x))
active = active.reset_index(drop=True)
deal_active = flatten_active(active)

100%|██████████| 4999341/4999341 [00:51<00:00, 96820.22it/s]


In [16]:
if not os.path.exists("{}/user_app_active.pickle".format(pickle_path)):
    t1 = time.time()
    active.to_pickle("{}/user_app_active.pickle".format(pickle_path))
    print('ACTIVE TO PICKLE: ',time.time()-t1)

ACTIVE TO PICKLE:  54.16705322265625


In [17]:
if not os.path.exists("{}/user_app_active_flatten.pickle".format(pickle_path)):
    t1 = time.time()
    deal_active.to_pickle("{}/user_app_active_flatten.pickle".format(pickle_path))
    print('Deal ACTIVE TO PICKLE: ',time.time()-t1)

Deal ACTIVE TO PICKLE:  59.198060512542725


In [27]:
active = pd.read_pickle("../pickle/user_app_active.pickle")

In [28]:
active.head()

Unnamed: 0,uid,appid,app_len
0,1000006,"[a001012, a001036, a001062, a001172, a001275, ...",47
1,1000009,"[a001012, a001015, a001055, a001062, a00107, a...",73
2,1000010,"[a001012, a001036, a001050, a001055, a001062, ...",96
3,1000011,"[a001012, a001063, a002450, a003083, a00326, a...",21
4,1000012,"[a001036, a001062, a001580, a001583, a003570, ...",33


In [16]:
all_data = train.append(test)
all_data = all_data.sort_values(by=['uid']).reset_index(drop=True)

In [29]:
active_train = active.merge(train,how='right',on='uid')
active_test = active.merge(test,how='right',on='uid')
active_train.to_pickle("../pickle/active_text_train.pickle")
active_test.to_pickle("../pickle/active_text_test.pickle")

In [30]:
usage_train = usage.merge(train,how='right',on='uid')
usage_test = usage.merge(test,how='right',on='uid')
usage_train.to_pickle("../pickle/usage_text_train.pickle")
usage_test.to_pickle("../pickle/usage_text_test.pickle")