In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.decomposition import TruncatedSVD,SparsePCA
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score

import gc
import time
import os
import sys
import warnings
# Silence all library warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Directory holding the pre-built pickles read below.
pickle_path = "../pickle"

# `names=` is passed to every read_csv call, so pandas reads the first line of
# each file as data and uses the supplied names as column labels.
train = pd.read_csv("../data/age_train.csv",names=['uid','age_group']).sort_values(by=['uid'])
test = pd.read_csv("../data/age_test.csv",names=['uid']).sort_values(by=['uid'])
info = pd.read_csv("../data/app_info.csv",names=['appid','category'])
active = pd.read_pickle("{}/user_app_active.pickle".format(pickle_path))
usage = pd.read_pickle("{}/user_app_usage.pickle".format(pickle_path))
user_basic_info = pd.read_csv("../data/user_basic_info.csv",names=['uid','gender','city','prodname','ramcapacity','ramleftration','romcapacity','romleftration','color','fontsize','ct','carrier','os']).sort_values(by=['uid'])
behavior_info = pd.read_csv("../data/user_behavior_info.csv",names=['uid','boottimes','a','b','c','d','e','f','g']).sort_values(by=['uid'])
print((train.shape,test.shape),(info.shape,active.shape,user_basic_info.shape,behavior_info.shape,usage.shape))

# Stack train and test rows into one frame; test rows have no `age_group`, so
# that column is NaN for them. `pd.concat` replaces `DataFrame.append`, which
# was deprecated in pandas 1.4 and removed in pandas 2.0.
all_data = pd.concat([train, test])
all_data = all_data.sort_values(by=['uid']).reset_index(drop=True)
print(all_data.shape)


((4000000, 2), (1000000, 1)) ((12460, 2), (4999341, 3), (5000000, 13), (5000000, 9), (840560515, 5))
(5000000, 2)


In [2]:
from gensim import corpora, models, similarities
from gensim.models.doc2vec import TaggedDocument
from glove import *

def get_w2c_feature(df,load_model,model,prefix):
    """Aggregate per-app embedding vectors into per-user statistics.

    Every row of ``df`` is joined (left join on ``appid``) with the embedding
    of its app, then each embedding dimension is summarised per ``uid`` with
    mean, std, max and min.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``uid`` and ``appid``.
    load_model : object
        Either an object exposing ``.wv`` (whose ``.vocab`` mapping and
        ``[token]`` lookup are used) or an object that itself provides
        ``.vocab`` and ``[token]``.
    model : str
        Label embedded in the generated column names.
    prefix : str
        Text inserted verbatim (no separator is added) into the output column
        names right after ``'<model>_uid_'``.

    Returns
    -------
    pandas.DataFrame
        One row per ``uid``, sorted ascending by ``uid``, with a ``uid``
        column followed by four statistic columns per embedding dimension.
    """
    # Read the vocabulary and the vectors from the same object; the original
    # mixed `load_model.vocab` with `load_model.wv[...]`.
    keyed = getattr(load_model, 'wv', load_model)
    tokens = list(keyed.vocab.keys())

    embeddings = pd.DataFrame([list(keyed[token]) for token in tokens])
    # The embedding width is taken from the vectors themselves; the original
    # referenced an undefined name `size` here.
    size = embeddings.shape[1]
    emb_cols = ['appid_{}'.format(model) + '_embedding_' + str(i) for i in range(size)]
    embeddings.columns = emb_cols

    # Lookup table: one row per known app id followed by its embedding.
    df_w2c = pd.concat([pd.DataFrame({'appid': tokens}), embeddings], axis=1)
    # Apps missing from the vocabulary get NaN embeddings via the left join.
    df_w2c_feat = df[['uid', 'appid']].merge(df_w2c, on='appid', how='left')

    agg = {col: ['mean', 'std', 'max', 'min'] for col in emb_cols}
    df_agg = df_w2c_feat.groupby('uid').agg(agg)
    # Flatten the (column, statistic) MultiIndex into single string labels.
    df_agg.columns = pd.Index(['{}_uid_'.format(model) + prefix  + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
    df_agg = df_agg.reset_index().sort_values(by=['uid'],ascending=True)
    return df_agg

def _mean_token_vectors(docs, keyed_vectors, size):
    """Return a ``(len(docs), size)`` array of averaged token embeddings.

    Tokens missing from ``keyed_vectors.vocab`` contribute a zero vector but
    still count towards the divisor. Empty documents yield an all-zero row.
    """
    # Set lookup keeps the per-token membership test O(1).
    known_tokens = set(keyed_vectors.vocab.keys())
    averaged = np.zeros((len(docs), size))
    for row, tokens in enumerate(tqdm(docs)):
        if len(tokens) == 0:
            continue
        total = np.zeros(size)
        for token in tokens:
            if token in known_tokens:
                total += keyed_vectors[token]
        averaged[row, :] = total / len(tokens)
    return averaged


def _load_or_train_vectors(path, train):
    """Load word vectors from ``path`` if it exists, else train and save them."""
    if os.path.exists(path):
        return models.KeyedVectors.load_word2vec_format(path, binary=False).wv
    trained = train()
    trained.wv.save_word2vec_format(path)
    return trained.wv


def get_gensim_feature(now=None,model='word2vec',size=5,window=10,prefix='active'):
    """Build one fixed-length feature row per row of ``now``.

    The ``appid`` column of ``now`` is treated as a corpus (one token sequence
    per row) and embedded with the requested ``model``. The result is cached
    at ``../pickle/<prefix>_<model>_emb.pickle`` and returned from there on
    later calls.

    Parameters
    ----------
    now : pandas.DataFrame
        Must contain ``uid`` and ``appid`` columns; each ``appid`` value is
        iterated as a sequence of tokens. Not needed when the cache exists.
    model : str
        One of ``'word2vec'``, ``'lda'``, ``'fasttext'``, ``'doc2vec'``,
        ``'lsi'``, ``'glove'``.
    size : int
        Embedding width / number of topics.
    window : int
        Context window for the word2vec, fasttext and doc2vec trainers.
    prefix : str
        Tag used in the cache file name and the output column names.

    Returns
    -------
    pandas.DataFrame
        ``size`` feature columns plus a ``uid`` column, in the row order of
        ``now``.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.

    Notes
    -----
    NOTE(review): the pickle cache key does not include ``size`` or
    ``window``, and the ``../vector/*.model`` files are shared by every
    ``prefix``, so a later call can silently reuse vectors trained on a
    different frame or with a different width. Confirm this is intended.
    """
    cache_path = "../pickle/{}_{}_emb.pickle".format(prefix,model)
    # Check the cache before touching `now`, so a hit copies and computes nothing.
    if os.path.exists(cache_path):
        return pd.read_pickle(cache_path)

    df = now
    docs = df['appid'].values.tolist()
    n_docs = len(docs)

    if model=='word2vec':
        vectors = _load_or_train_vectors(
            "../vector/w2v.model",
            lambda: models.Word2Vec(docs, size=size, window=window, workers=40))
        df_agg = pd.DataFrame(_mean_token_vectors(docs, vectors, size))
        df_agg = df_agg.add_prefix("W2V_AVG_{}_".format(prefix))

    elif model=='fasttext':
        vectors = _load_or_train_vectors(
            "../vector/fasttext.model",
            lambda: models.FastText(docs, size=size, window=window, workers=40))
        df_agg = pd.DataFrame(_mean_token_vectors(docs, vectors, size))
        df_agg = df_agg.add_prefix("FASTTEXT_AVG_{}".format(prefix))

    elif model=='doc2vec':
        # Only the word vectors of the Doc2Vec model are saved and averaged;
        # the per-document tag vectors are not used.
        vectors = _load_or_train_vectors(
            "../vector/d2v.model",
            lambda: models.Doc2Vec(
                [TaggedDocument(words=tokens, tags=[str(uid)])
                 for uid, tokens in zip(df['uid'].tolist(), docs)],
                size=size, window=window, workers=40))
        # Unknown tokens now count as zero vectors; the original appended a
        # scalar 0, which cannot be averaged together with vectors.
        df_agg = pd.DataFrame(_mean_token_vectors(docs, vectors, size))
        df_agg = df_agg.add_prefix("d2v_AVG_{}".format(prefix))

    elif model in ('lda', 'lsi'):
        # The bag-of-words corpus is only needed by the two topic models.
        dictionary = corpora.Dictionary(docs)
        corpus = [dictionary.doc2bow(tokens) for tokens in docs]
        # One column per topic; the LDA branch previously hard-coded 20.
        topic_weights = np.zeros((n_docs, size))

        if model=='lda':
            lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=size)
            doc_topics = lda.get_document_topics(corpus)
            for row in tqdm(range(n_docs)):
                for topic_id, weight in doc_topics[row]:
                    topic_weights[row][topic_id] = weight
            column_prefix = "LDA_TOPIC_{}_".format(prefix)
        else:
            lsi = models.LsiModel(corpus=corpus, id2word=dictionary, num_topics=size)
            for row, bow in enumerate(tqdm(corpus)):
                # Project each document once. The original fed the LSI output
                # back into the model as if it were a bag of words.
                for topic_id, weight in lsi[bow]:
                    topic_weights[row, topic_id] = weight
            column_prefix = "LSI_TOPIC_{}_".format(prefix)

        df_agg = pd.DataFrame(topic_weights).add_prefix(column_prefix)

    elif model=='glove':
        matrix = Corpus()
        matrix.fit(docs)
        glove = Glove(no_components=size, learning_rate=0.05)
        glove.fit(matrix.matrix,epochs=10,no_threads=30,verbose=1)
        glove.add_dictionary(matrix.dictionary)
        averaged = np.zeros((n_docs, size))
        for row, tokens in enumerate(tqdm(docs)):
            if len(tokens) == 0:
                # Empty documents keep an all-zero row.
                continue
            averaged[row, :] = np.mean(
                [glove.word_vectors[glove.dictionary[token]] for token in tokens],
                axis=0)
        df_agg = pd.DataFrame(averaged).add_prefix("Glove_AVG_{}".format(prefix))

    else:
        raise ValueError("unsupported model: {!r}".format(model))

    # Assign by position: `df_agg` has a fresh RangeIndex while `df` may not,
    # so index-aligned assignment could misplace or drop uids.
    df_agg['uid'] = df['uid'].values
    df_agg.to_pickle(cache_path)

    return df_agg

In [None]:
from tqdm import tqdm
# Embedding features built from the `active` frame.
lsi = get_gensim_feature(now=active, model='lsi', size=32, window=10, prefix='active')
w2v = get_gensim_feature(now=active, model='word2vec', size=64, window=10, prefix='active')
fasttext = get_gensim_feature(now=active, model='fasttext', size=64, window=10, prefix='active')
d2v = get_gensim_feature(now=active, model='doc2vec', size=64, window=10, prefix='active')
lda = get_gensim_feature(now=active, model='lda', size=20, window=10, prefix='active')

# The same five models, built from the `usage` frame.
lsi_1 = get_gensim_feature(now=usage, model='lsi', size=32, window=10, prefix='usage')
w2v_1 = get_gensim_feature(now=usage, model='word2vec', size=64, window=10, prefix='usage')
fasttext_1 = get_gensim_feature(now=usage, model='fasttext', size=64, window=10, prefix='usage')
d2v_1 = get_gensim_feature(now=usage, model='doc2vec', size=64, window=10, prefix='usage')
lda_1 = get_gensim_feature(now=usage, model='lda', size=20, window=10, prefix='usage')

 99%|█████████▉| 4942930/4999341 [50:48<00:25, 2244.35it/s] 