In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.decomposition import TruncatedSVD,SparsePCA
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score

import gc
import time
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Directory holding the pre-computed pickle files, relative to this notebook.
pickle_path = "../pickle"

# Load the two user/app tables in one pass: `active` first, then `usage`.
# NOTE(review): read_pickle can execute arbitrary code on load; only point
# this at files produced by this project.
active, usage = (
    pd.read_pickle(pickle_path + "/" + file_name)
    for file_name in ("user_app_active_flatten.pickle", "user_app_usage.pickle")
)

# Cell output: (rows, columns) of each table.
(active.shape, usage.shape)

((173596669, 2), (840560515, 5))

In [3]:
# Graph Feature
import scipy.sparse
from scipy import linalg
from scipy.special import iv
import scipy.sparse as sp

from sklearn import preprocessing
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import TruncatedSVD

import argparse
import time

class ProNE():
    """Node embedding via sparse matrix factorization plus spectral propagation.

    Usage pattern (the methods are meant to be called in this order)::

        model = ProNE(G, emb_size=32)
        features = model.fit(model.mat, model.mat)
        model.chebyshev_gaussian(model.mat, features, model.step, model.mu, model.theta)
        emb = model.transform()

    Node labels of ``G`` are used directly as row/column indices of the
    adjacency matrix, so every label must be convertible with ``int()`` and
    lie in ``[0, G.number_of_nodes())``.
    """

    def __init__(self, G, emb_size=128, step=10, theta=0.5, mu=0.2, n_iter=5, random_state=2019):
        """Store the hyper-parameters and build the symmetric adjacency matrix.

        Parameters
        ----------
        G : graph object
            Must expose ``to_undirected()``, ``number_of_nodes()`` and
            ``edges()`` (e.g. a networkx graph).
        emb_size : int
            Number of embedding dimensions.
        step, theta, mu :
            Only stored on the instance; callers forward them to
            ``chebyshev_gaussian``.
        n_iter, random_state :
            Forwarded to ``randomized_svd`` in ``get_embedding_rand``.
        """
        self.G = G.to_undirected()
        self.emb_size = emb_size
        self.node_number = self.G.number_of_nodes()
        self.random_state = random_state
        self.step = step
        self.theta = theta
        self.mu = mu
        self.n_iter = n_iter
        # Set by chebyshev_gaussian(); initialised here so transform() can
        # detect the "not trained yet" state instead of raising AttributeError.
        self.embeddings = None

        # Build the adjacency in one vectorised COO step instead of assigning
        # into a lil_matrix element by element.
        edge_arr = np.array(
            [(int(e[0]), int(e[1])) for e in self.G.edges()], dtype=np.int64
        ).reshape(-1, 2)  # reshape keeps a (0, 2) shape when there are no edges
        src, dst = edge_arr[:, 0], edge_arr[:, 1]
        not_loop = src != dst  # self-loops are ignored
        src, dst = src[not_loop], dst[not_loop]
        # Insert each edge in both directions to make the matrix symmetric.
        rows = np.concatenate([src, dst])
        cols = np.concatenate([dst, src])
        mat = scipy.sparse.coo_matrix(
            (np.ones(rows.size), (rows, cols)),
            shape=(self.node_number, self.node_number),
        ).tocsr()
        # Converting to CSR sums duplicate entries; clamp back to a 0/1 matrix.
        mat.data[:] = 1.0
        self.mat = mat
        print(mat.shape)

    def get_embedding_rand(self, matrix):
        """Return an L2-normalised embedding from a randomized truncated SVD.

        ``matrix`` is converted to CSC, factorised with ``randomized_svd``
        into ``self.emb_size`` components, and the left singular vectors are
        scaled by the square root of the singular values before normalising
        each row.
        """
        # Sparse randomized tSVD for fast embedding
        t1 = time.time()
        n_rows = matrix.shape[0]
        smat = scipy.sparse.csc_matrix(matrix)  # convert to sparse CSC format
        # Fraction of stored entries relative to an n_rows x n_rows matrix.
        print('svd sparse', smat.data.shape[0] * 1.0 / n_rows ** 2)
        U, Sigma, VT = randomized_svd(smat, n_components=self.emb_size, n_iter=self.n_iter, random_state=self.random_state)
        U = U * np.sqrt(Sigma)
        U = preprocessing.normalize(U, "l2")
        print('sparsesvd time', time.time() - t1)
        return U

    def get_embedding_dense(self, matrix, emb_size):
        """Return an L2-normalised embedding from a dense SVD of ``matrix``.

        Keeps the first ``emb_size`` left singular vectors, scaled by the
        square root of their singular values. ``matrix`` may be overwritten
        (``overwrite_a=True``).
        """
        # get dense embedding via SVD
        t1 = time.time()
        U, s, Vh = linalg.svd(matrix, full_matrices=False, check_finite=False, overwrite_a=True)
        U = np.array(U)
        U = U[:, :emb_size]
        s = s[:emb_size]
        s = np.sqrt(s)
        U = U * s
        U = preprocessing.normalize(U, "l2")
        print('densesvd time', time.time() - t1)
        return U

    def fit(self, tran, mask):
        """Factorise a log-transformed, negative-sample-adjusted matrix.

        Parameters
        ----------
        tran : scipy sparse matrix
            Matrix that is L1-normalised row-wise to form the positive term.
        mask : scipy sparse matrix
            Sparsity pattern for the negative term; its columns are scaled by
            the normalised column sums of the positive term raised to 0.75.

        Returns
        -------
        numpy.ndarray
            Output of ``get_embedding_rand`` on ``log(positive) - log(negative)``.
        """
        # Network Embedding as Sparse Matrix Factorization
        t1 = time.time()
        l1 = 0.75
        C1 = preprocessing.normalize(tran, "l1")
        neg = np.array(C1.sum(axis=0))[0] ** l1

        neg = neg / neg.sum()

        neg = scipy.sparse.diags(neg, format="csr")
        neg = mask.dot(neg)
        print("neg", time.time() - t1)

        # Non-positive stored values are set to 1 so that their log is 0.
        C1.data[C1.data <= 0] = 1
        neg.data[neg.data <= 0] = 1

        C1.data = np.log(C1.data)
        neg.data = np.log(neg.data)

        C1 -= neg
        F = C1
        features_matrix = self.get_embedding_rand(F)
        return features_matrix

    def chebyshev_gaussian(self, A, a, order=10, mu=0.5, s=0.5):
        """Propagate the embedding ``a`` over ``A`` and store the result.

        Parameters
        ----------
        A : scipy sparse matrix
            Adjacency matrix (``self.node_number`` square); the identity is
            added to it before row-normalising.
        a : numpy.ndarray
            Initial embedding, one row per node.
        order : int
            Number of series terms; ``order == 1`` returns ``a`` unchanged and
            leaves ``self.embeddings`` untouched.
        mu : float
            Shift subtracted from the normalised Laplacian.
        s : float
            Argument of the modified Bessel functions ``iv(i, s)`` used as
            series coefficients.

        Returns
        -------
        numpy.ndarray
            The dense-SVD embedding, also stored in ``self.embeddings``.
        """
        # NE Enhancement via Spectral Propagation
        print('Chebyshev Series -----------------')
        t1 = time.time()

        if order == 1:
            return a

        A = sp.eye(self.node_number) + A
        DA = preprocessing.normalize(A, norm='l1')
        L = sp.eye(self.node_number) - DA

        M = L - mu * sp.eye(self.node_number)

        Lx0 = a
        Lx1 = M.dot(a)
        Lx1 = 0.5 * M.dot(Lx1) - a

        conv = iv(0, s) * Lx0
        conv -= 2 * iv(1, s) * Lx1
        for i in range(2, order):
            Lx2 = M.dot(Lx1)
            Lx2 = (M.dot(Lx2) - 2 * Lx1) - Lx0
            #         Lx2 = 2*L.dot(Lx1) - Lx0
            # Terms are added for even i and subtracted for odd i.
            if i % 2 == 0:
                conv += 2 * iv(i, s) * Lx2
            else:
                conv -= 2 * iv(i, s) * Lx2
            Lx0 = Lx1
            Lx1 = Lx2
            del Lx2
            print('Bessell time', i, time.time() - t1)
        mm = A.dot(a - conv)
        self.embeddings = self.get_embedding_dense(mm, self.emb_size)
        return self.embeddings

    def transform(self):
        """Return the trained embedding as a DataFrame, one row per node.

        The frame has a ``nodes`` column (the row position in the embedding
        matrix) followed by ``ProNE_Emb_0 .. ProNE_Emb_{k-1}``, sorted by
        ``nodes``. The frame is cached in ``self.embeddings``, and repeated
        calls return that same frame. If no embedding has been computed yet,
        a message is printed and ``{}`` is returned.
        """
        emb = getattr(self, "embeddings", None)
        if emb is None:
            print("Embedding is not train")
            return {}
        if isinstance(emb, pd.DataFrame):
            # Already converted by a previous call; converting again would
            # rename the 'nodes' column into an embedding column.
            return emb

        frame = pd.DataFrame(emb)
        frame.columns = ['ProNE_Emb_{}'.format(i) for i in range(len(frame.columns))]
        frame = (
            frame.reset_index()
            .rename(columns={'index': 'nodes'})
            .sort_values(by=['nodes'], ascending=True)
            .reset_index(drop=True)
        )
        self.embeddings = frame
        return frame

In [4]:
from tqdm import tqdm
import networkx as nx
import igraph as ig

def get_graph_embedding(df, prefix, out_dir="../pickle"):
    """Compute per-user graph statistics and ProNE embeddings from a user/app edge table.

    Each row of ``df`` is treated as an edge between its ``uid`` and its
    ``appid``. Users are encoded as node ids ``0 .. n_users-1`` and apps as
    ``n_users .. n_users+n_apps-1``. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``uid`` and ``appid``.
    prefix : str
        Tag inserted into the output file names.
    out_dir : str, optional
        Directory the two pickle files are written to.

    Returns
    -------
    (fea, G_stat) : tuple of pandas.DataFrame
        ``fea`` has a ``uid`` column plus the ``ProNE_Emb_*`` columns;
        ``G_stat`` has an ``index`` column holding the original uid plus one
        column per graph statistic. Both contain user nodes only.

    Side effects
    ------------
    Writes ``Graph_Stat_<prefix>.pickle`` and ``Graph_Bi_<prefix>.pickle``
    into ``out_dir``.
    """
    uid_lbl, appid_lbl = LabelEncoder(), LabelEncoder()
    # Keep the encoded ids in local arrays rather than writing helper columns
    # into the caller's frame.
    uid_codes = uid_lbl.fit_transform(df['uid'])
    n_users = len(uid_lbl.classes_)
    # Shift app ids so they do not collide with user ids.
    app_codes = appid_lbl.fit_transform(df['appid']) + n_users
    n_nodes = n_users + len(appid_lbl.classes_)
    edges = np.column_stack([uid_codes, app_codes])
    del uid_codes, app_codes

    print("Encoder Finished...")

    # NOTE(review): repeated (uid, appid) rows are passed through as repeated
    # edges -- confirm that this weighting is intended.
    G = ig.Graph()
    G.add_vertices(int(n_nodes))
    G.add_edges(edges)
    print("Build Graph Finished...")

    G_stat = pd.DataFrame()
    G_stat['evcent'] = G.evcent()  # eigenvector centrality of each node
    G_stat['shell_index'] = G.shell_index()  # shell index (k-core) of each node
    G_stat['degree'] = G.degree()  # total degree
    G_stat['pagerank'] = G.pagerank()  # pagerank
    print("PR Finished...")
    # The following four are slow to compute but work well; they can be
    # commented out to observe the effect of evcent alone.
    G_stat['closeness'] = G.closeness()  # closeness centrality of each node
    G_stat['betweenness'] = G.betweenness()  # betweenness centrality of each node
    G_stat['constraint'] = G.constraint()
    G_stat['eccentricity'] = G.eccentricity()  # largest shortest-path distance from the node to any other node
    G_stat = G_stat.reset_index()
    # User nodes are exactly the ids below n_users; copy so the assignment
    # below does not write into a slice of the full frame.
    G_stat = G_stat[G_stat['index'] < n_users].copy()
    G_stat['index'] = uid_lbl.inverse_transform(G_stat['index'])

    print("Graph Stat Finished...")
    G_stat.to_pickle("{}/Graph_Stat_{}.pickle".format(out_dir, prefix))

    # Release the igraph graph before building the networkx one.
    del G
    gc.collect()

    G = nx.Graph()
    G.add_edges_from(edges)
    model = ProNE(G, emb_size=32, n_iter=6, step=12)
    features_matrix = model.fit(model.mat, model.mat)
    model.chebyshev_gaussian(model.mat, features_matrix, model.step, model.mu, model.theta)
    emb = model.transform()
    fea = emb[emb['nodes'] < n_users].copy()
    fea['nodes'] = uid_lbl.inverse_transform(fea['nodes'])
    fea = fea.rename(columns={'nodes': 'uid'})
    del G
    gc.collect()
    print("Embedding Finished...")
    fea.to_pickle("{}/Graph_Bi_{}.pickle".format(out_dir, prefix))

    return fea, G_stat

In [None]:
def _with_prefix(frame, prefix):
    """Prefix every feature column of `frame`, keeping `uid` as the unprefixed key."""
    # The statistics frame names its id column 'index'; rename() is a no-op
    # for frames that already use 'uid'.
    return (
        frame.rename(columns={'index': 'uid'})
        .set_index('uid')
        .add_prefix(prefix)
        .reset_index()
    )


# get_graph_embedding returns a (features, statistics) tuple, so unpack it
# first and post-process each frame separately.
fea1, stat1 = get_graph_embedding(active, 'active')
fea1, stat1 = _with_prefix(fea1, "active_"), _with_prefix(stat1, "active_")

fea0, stat0 = get_graph_embedding(usage, 'usage')
fea0, stat0 = _with_prefix(fea0, "usage_"), _with_prefix(stat0, "usage_")

# fea0.to_pickle("../pickle/usage_bi_graph_ProNE.pickle")
# fea1.to_pickle("../pickle/active_bi_graph_ProNE.pickle")
# stat0.to_pickle("../pickle/usage_graph_stat.pickle")
# stat1.to_pickle("../pickle/active_graph_stat.pickle")

Encoder Finished...
PR Finished...
Graph Stat Finished...
