In [1]:
import numpy as np
from numpy import linalg as LA
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
import networkx as nx
import collections

In [2]:
# Load the tab-separated user table. The column names are supplied here
# through `names` rather than being taken from the file.
user_columns = ["User Id", "Gender", "Time", "Age Range","Spammer_Label"]
all_users = pd.read_csv(
    'data/usersdata.csv',
    sep='\t',
    names=user_columns,
)

In [3]:
# `features` is just another name for the user table loaded above.
features = all_users

# One edge-list CSV per relation type, read in the order 1, 2, 4, 5, 7.
# NOTE(review): relation 5 is read from the file without a numeric suffix --
# confirm that this is the intended file.
relations_1, relations_2, relations_4, relations_5, relations_7 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/filtered_relations_1.csv',
        'data/filtered_relations_2.csv',
        'data/filtered_relations_4.csv',
        'data/filtered_relations.csv',
        'data/filtered_relations_7.csv',
    )
)

In [4]:
# Id/label pair per user; the label is read back from `nodes` in a later cell.
nodes = features[['User Id', 'Spammer_Label']]

# Content-based features, one row per user. The age range is integer-divided
# by 10.
nodes_df = pd.DataFrame({
    "node": nodes["User Id"],
    "time": features["Time"],
    "age": features["Age Range"] // 10,
    "gender": features["Gender"],
})

# All-NaN placeholder columns named after the graph metrics that
# create_graph_feature joins in; they are dropped again once the joins are done.
# (k_core and greedy_color are currently disabled.)
for placeholder in (
    "pagerank",
    "in_deg",
    "out_deg",
    "deg",
    "cluster_coeff",
    "in_deg_cent",
    "out_deg_cent",
    "deg_cent",
    "triang_count",
):
    nodes_df[placeholder] = np.nan

In [5]:
def create_graph_feature(features,relations,num_rel,nodes_df):
    """Compute the graph-based node features of one relation subgraph.

    A directed and an undirected graph are built from the edge list, a set of
    per-node metrics is computed on them, and every metric is left-joined onto
    ``nodes_df`` as a new column named ``<metric>_<num_rel>``.

    Parameters
    ----------
        features  : Pandas DataFrame with a 'User Id' column. The 'src'/'dst'
                    values of ``relations`` are looked up against the index of
                    this frame to obtain the user ids used as graph nodes.

        relations : Pandas DataFrame with 'src' and 'dst' columns holding the
                    edges of one relation subgraph.

        num_rel   : relation index; ``str(num_rel)`` becomes the column suffix
                    and part of the exported graph file name.

        nodes_df  : Pandas DataFrame with a 'node' column (user id) plus any
                    features computed so far. It is not modified.

    Returns
    -------
        df        : Pandas DataFrame with the rows and columns of ``nodes_df``
                    followed by, in this order, pagerank, cluster_coeff,
                    in_deg_cent, out_deg_cent, deg_cent, in_deg, out_deg, deg
                    and triang_count, each suffixed with ``_<num_rel>``. Rows
                    whose node is absent from the subgraph hold NaN there.

    Raises
    ------
        ValueError : if ``nodes_df`` already has one of the output columns,
                     e.g. because the function was already run for this
                     ``num_rel``. Raised before any graph work is done.

    Side effects
    ------------
        Writes the directed graph in GEXF format to a file named
        ``relation_<num_rel>`` (no extension) in the working directory.
    """
    suffix = '_' + str(num_rel)
    metric_names = ['pagerank', 'cluster_coeff', 'in_deg_cent', 'out_deg_cent',
                    'deg_cent', 'in_deg', 'out_deg', 'deg', 'triang_count']

    # Fail fast: a clash means this relation was already processed, and the
    # graph computations below are expensive on large inputs.
    clashes = [name + suffix for name in metric_names
               if name + suffix in nodes_df.columns]
    if clashes:
        raise ValueError(
            'nodes_df already contains the output columns ' + ', '.join(clashes))

    # Translate the src/dst index positions into user ids. Only 'User Id' is
    # needed to build the graphs.
    user_ids = features[['User Id']]
    edges = relations[['src', 'dst']]
    edges = edges.join(user_ids, on='dst')                      # adds 'User Id'
    edges = edges.join(user_ids, on='src', rsuffix='_parent')   # adds 'User Id_parent'
    edge_list = edges[['User Id_parent', 'User Id']].rename(
        columns={'User Id_parent': 'src', 'User Id': 'trg'})

    graph_dir = nx.from_pandas_edgelist(edge_list, source='src', target='trg',
                                        create_using=nx.DiGraph())
    graph_undir = nx.from_pandas_edgelist(edge_list, source='src', target='trg')
    nx.write_gexf(graph_dir, 'relation' + suffix)

    # Each metric is a {node: value} mapping. Clustering and triangle counts
    # are taken on the undirected graph, everything else on the directed one.
    metric_values = [
        nx.pagerank(graph_dir, alpha=0.85),
        nx.clustering(graph_undir),
        nx.in_degree_centrality(graph_dir),
        nx.out_degree_centrality(graph_dir),
        nx.degree_centrality(graph_dir),
        dict(graph_dir.in_degree()),
        dict(graph_dir.out_degree()),
        dict(graph_dir.degree()),
        nx.triangles(graph_undir),
    ]

    df = nodes_df
    for name, values in zip(metric_names, metric_values):
        # Name the column explicitly instead of relying on a name clash with a
        # placeholder column to trigger a join suffix.
        metric_df = pd.DataFrame.from_dict(values, orient='index',
                                           columns=[name + suffix])
        df = df.join(metric_df, on='node')

    return df

In [6]:
# Add the graph metrics of every relation subgraph to nodes_df, one relation
# at a time; each pass feeds the previous result back in.
for rel_index, rel_edges in (
    (1, relations_1),
    (2, relations_2),
    (4, relations_4),
    (5, relations_5),
    (7, relations_7),
):
    nodes_df = create_graph_feature(features, rel_edges, rel_index, nodes_df)
    print("DONE")

DONE
DONE
DONE
DONE
DONE


In [10]:
# Remove the all-NaN placeholder metric columns; only the per-relation
# (suffixed) metric columns are kept.
# errors='ignore' keeps this cell re-runnable: executing it a second time,
# when the placeholders are already gone, no longer raises a KeyError.
nodes_df=nodes_df.drop(
    columns=['pagerank','cluster_coeff','in_deg','in_deg_cent','out_deg','out_deg_cent','deg','deg_cent','triang_count'],
    errors='ignore',
)

In [11]:
# Attach the spammer label as the 'label' column, aligned on the row index.
nodes_df = nodes_df.assign(label=nodes['Spammer_Label'])

In [21]:
# Display the (rows, columns) size of nodes_df.
nodes_df.shape

(5607447, 50)

In [20]:
# Keep only the rows that have enough non-NaN cells. `thresh` is the minimum
# number of non-NaN values a row needs, and
# total_columns - num_of_nan always works out to 6 here.
# NOTE(review): 6 presumably stands for the content columns plus the label and
# at least one graph metric -- confirm.
total_columns = nodes_df.shape[1]
num_of_nan = total_columns - 6
nodes_df2 = nodes_df.dropna(thresh=total_columns - num_of_nan)

In [22]:
# Display the (rows, columns) size of the filtered table nodes_df2.
nodes_df2.shape

(783122, 50)

In [23]:
# Write the filtered feature table to CSV without the row index.
nodes_df2.to_csv(
    path_or_buf='datasets_final2.csv',
    index=False,
)