# Single timestep graph - Elliptic data set

In [1]:
# global imports
import os

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import powerlaw
import seaborn as sns
from graphdatascience import GraphDataScience

# local imports
from GraphBuilder import extractAllTimestepsNodes, getTimestepEdgelist
from AnalysisManager import AnalysisManager
from NodeMetrics import NodeMetrics
from GraphMetrics import GraphMetrics

#### Neo4j modules

In [2]:
# Neo4j connection settings. Each value can be overridden through an environment
# variable so that credentials do not have to live in the notebook; the fallbacks
# reproduce the original local setup.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "elliptic")

# NOTE(review): the first two AnalysisManager arguments look like the same
# user/password pair, but its signature is not visible from this notebook, so they
# are left as literals — confirm and route them through the settings above if so.
manager = AnalysisManager("neo4j", "elliptic", "../data", outputs_base_location="../")
connector = GraphDataScience(neo4j_uri, auth=(neo4j_user, neo4j_password))
# Both metric helpers share the connector and the same
# ("elliptic", "node", "TRANSACTION") arguments.
node_metrics = NodeMetrics(connector, "elliptic", "node", "TRANSACTION")
graph_metrics = GraphMetrics(connector, "elliptic", "node", "TRANSACTION")

#### Networkx graph

In [3]:
# Raw Elliptic CSV files: the transaction edge list and the header-less feature table.
edgelist = pd.read_csv("../data/elliptic_txs_edgelist.csv")
transactions_features = pd.read_csv("../data/elliptic_txs_features.csv", header=None)

# Directed NetworkX graph whose edges run from txId1 to txId2.
graphtype = nx.DiGraph()
graph = nx.from_pandas_edgelist(
    edgelist, source="txId1", target="txId2", create_using=graphtype
)

### Functions related to working with Networkx library

Note on shortest paths

The shortest paths are determined using the NetworkX function `shortest_path_length()`, which yields pairs of a source node ID and a dictionary mapping every node reachable from that source to the corresponding shortest-path length. <br>
In this case, the information is stored in a list of heterogeneous entries, each holding the origin node ID, the list of destination node IDs and an array of the path lengths to them.

In [4]:
def getAllShortestPaths(graph):
    """
    Collect the shortest-path lengths that nx.shortest_path_length reports for the graph.

    :param graph: the NetworkX graph handed to nx.shortest_path_length.
    :return: a list with one entry per (source, lengths) pair yielded by NetworkX; each
        entry is [source, list of target node IDs, numpy array of the path lengths].
    """
    return [
        [source, list(lengths.keys()), np.array(list(lengths.values()))]
        for (source, lengths) in nx.shortest_path_length(graph)
    ]

In [5]:
def getMeanOfShortestPaths(shortest_paths):
    """
    Mean of all path lengths stored in the output of getAllShortestPaths.

    :param shortest_paths: a sequence of entries whose third element (index 2) is a
        numpy array of path lengths.
    :return: the sum of all lengths divided by the number of lengths, or NaN when there
        are no lengths at all (same result the median and diameter helpers give for
        empty input, instead of a ZeroDivisionError).
    """
    total_length = 0
    path_count = 0
    for entry in shortest_paths:
        lengths = entry[2]
        total_length += lengths.sum()
        path_count += len(lengths)

    # Nothing to average: report NaN rather than dividing by zero.
    if path_count == 0:
        return float("nan")

    return total_length / path_count

In [6]:
def getMedianOfShortestPaths(shortest_paths):
    """
    Median of all path lengths stored in the output of getAllShortestPaths.

    :param shortest_paths: a sequence of entries whose third element (index 2) is an
        iterable of path lengths.
    :return: the median over every length of every entry, computed by pandas.
    """
    # Flatten the per-source length arrays into one list before handing it to pandas.
    flat_lengths = [length for entry in shortest_paths for length in entry[2]]
    return pd.Series(np.array(flat_lengths)).median()

In [7]:
def getDiameter(shortest_paths):
    """
    Longest path length stored in the output of getAllShortestPaths.

    :param shortest_paths: a sequence of entries whose third element (index 2) is an
        iterable of path lengths.
    :return: the maximum over every length of every entry, computed by pandas.
    """
    # Flatten the per-source length arrays into one list before handing it to pandas.
    flat_lengths = [length for entry in shortest_paths for length in entry[2]]
    return pd.Series(np.array(flat_lengths)).max()

## Large-scale Network Properties

In [8]:
# Column headers of the summary table, in the order the cells below fill them in:
# Type, n (nodes), m (edges), c (mean degree), S (fraction of nodes in the giant
# component), l (mean distance between connected node pairs), alpha (power-law
# exponent), C (mean clustering coefficient).
column_names = ["Type", "n", "m", "c", "S", "l", "alpha", "C"]
# Summary row under construction; the cells below append one value each.
properties = ["Directed"]

Number of nodes

In [9]:
# Graph size as reported by the GraphMetrics helper; appended to the summary row.
n = graph_metrics.getGraphSize()
properties.append(n)
print(n)

203769


Number of edges

In [10]:
# The helper's result is two-dimensional; its first cell is taken as the edge count.
m = graph_metrics.getNumberOfEdges().values[0][0]
properties.append(m)
print(m)

234355


Mean degree

In [11]:
# Mean of the "degree" column of the per-node degree table.
c = node_metrics.getDegreeDistribution()["degree"].mean()
properties.append(c)
print(c)

1.1501013402431184


Fraction of nodes in the giant component (the largest component)

In [12]:
# Share of nodes held by each weakly connected component, reshaped into a frame with
# the columns "componentId" and "fraction_of_nodes".
fractions = pd.DataFrame(graph_metrics.getFractionsWeaklyConnectedComponents())
fractions = fractions.reset_index()
fractions.columns = ["componentId", "fraction_of_nodes"]
# The giant component is the one with the largest share. Taking the maximum directly
# does not depend on where that row sits in the frame (a label lookup with [0] on the
# filtered rows only works when the largest component carries index label 0).
S = fractions["fraction_of_nodes"].max()
properties.append(S)
print(S)

0.03867124047328102


Mean distance between connected node pairs

In [13]:
# Mean over every path length returned for the NetworkX graph.
# NOTE(review): NetworkX appears to report each source's distance to itself (0) as
# well; confirm whether those zero-length entries should count towards this mean.
l = getMeanOfShortestPaths(getAllShortestPaths(graph=graph))
properties.append(l)
print(l)

125.2837481234031


Exponent alpha

In [14]:
# Fit a power law to the node degrees; the fitted exponent is read from `data.alpha`.
degree_distribution = node_metrics.getDegreeDistribution()
x = degree_distribution["degree"].values
# NOTE(review): the stored output shows powerlaw discarding values <= 0 before the
# fit. Degrees are integers, so the package's discrete mode may be the better
# choice — confirm before relying on the exponent.
data = powerlaw.Fit(x)
properties.append(data.alpha)

Calculating best minimal value for power law fit
xmin progress: 98%

Values less than or equal to 0 in data. Throwing out 0 or negative values


In [15]:
# Exponent of the power-law fit stored in `data`.
print(data.alpha)

3.785550920374367


Mean clustering coefficient

In [16]:
# Mean of the per-node "localClusteringCoefficient" column; appended to the summary row.
local_clustering_coefficients = node_metrics.getClusteringCoefficient()
C = local_clustering_coefficients["localClusteringCoefficient"].mean()
properties.append(C)
print(C)

0.01376219072424474


In [17]:
# Build the one-row summary from the individual results rather than from the whole
# `properties` list: several cells append to that list, so re-running any of them
# leaves it with more entries than there are column names.
summary_row = [properties[0], n, m, c, S, l, data.alpha, C]
large_scale_structure_df = pd.DataFrame([summary_row], columns=column_names)
large_scale_structure_df.to_csv("../graph_large_scale_properties.csv")
large_scale_structure_df

Unnamed: 0,Type,n,m,c,S,l,alpha,C
0,Directed,203769,234355,1.150101,0.038671,125.283748,3.785551,0.013762


### Additional properties

Number of strongly connected components

In [18]:
# Component count, assuming the helper returns one entry per strongly connected
# component — TODO confirm against GraphMetrics.
len(graph_metrics.getFractionsStronglyConnectedComponents())

203769

Number of weakly connected components

In [19]:
# Component count, assuming the helper returns one entry per weakly connected
# component — TODO confirm against GraphMetrics.
len(graph_metrics.getFractionsWeaklyConnectedComponents())

49

Graph's diameter

In [20]:
# Longest shortest path in the NetworkX graph. This calls getAllShortestPaths again,
# so the full shortest-path computation is repeated here.
getDiameter(getAllShortestPaths(graph=graph))

1248

## Centrality Metrics Analysis

In [21]:
# Per-node degree table (columns "id" and "degree" in the stored output); this
# binding is the one the community merge further down uses.
degree_distribution = node_metrics.getDegreeDistribution()
degree_distribution.head()

Unnamed: 0,id,degree
0,2984918,472.0
1,89273,288.0
2,102570,122.0
3,3181,112.0
4,7952,99.0


In [22]:
# Per-node PageRank table (columns "id" and "PageRankScore" in the stored output).
pagerank_scores = node_metrics.getPageRankScores()
pagerank_scores.head()

Unnamed: 0,id,PageRankScore
0,225859042,42.279545
1,43388675,41.865629
2,99409352,40.702945
3,179084283,39.748616
4,30276715,37.46463


In [23]:
# Per-node betweenness table (columns "id" and "BetweennessScore" in the stored output).
betweenness_scores = node_metrics.getBetweennessCentrality()
betweenness_scores.head()

Unnamed: 0,id,BetweennessScore
0,245736770,389376.0
1,245736765,389375.0
2,245736776,389375.0
3,245736761,389372.0
4,245736986,389372.0


In [24]:
# Per-node eigenvector-centrality table (columns "id" and "EigenvectorCentrality" in
# the stored output).
eigenvector_scores = node_metrics.getEigenvectorCentrality()
eigenvector_scores.head()

Unnamed: 0,id,EigenvectorCentrality
0,245988039,0.431922
1,245988038,0.426715
2,222730806,0.300389
3,245932267,0.247939
4,245988040,0.245649


In [25]:
# NOTE(review): the stored output of this cell is
# "ValueError: 2 columns passed, passed data had 4 columns", and the cells after it
# carry no execution count. The failing frame construction is not in this notebook,
# so it presumably sits inside AnalysisManager — confirm and fix it there.
manager.getCentralityMetricsDataFrame("degree")

ValueError: 2 columns passed, passed data had 4 columns

In [None]:
# Centrality-metrics frame for "EigenvectorCentrality"; this cell has no stored output.
manager.getCentralityMetricsDataFrame("EigenvectorCentrality")

In [None]:
# Centrality-metrics frame for "PageRankScore"; this cell has no stored output.
manager.getCentralityMetricsDataFrame("PageRankScore")

In [None]:
# Centrality-metrics frame for "BetweennessScore"; this cell has no stored output.
manager.getCentralityMetricsDataFrame("BetweennessScore")

In [None]:
# Whole-graph distribution plot of the "degree" metric, drawn by AnalysisManager.
manager.plotGlobalCentralityMetricsDistribution("degree")

In [None]:
# Whole-graph distribution plot of the "EigenvectorCentrality" metric, drawn by AnalysisManager.
manager.plotGlobalCentralityMetricsDistribution("EigenvectorCentrality")

In [None]:
# Whole-graph distribution plot of the "PageRankScore" metric, drawn by AnalysisManager.
manager.plotGlobalCentralityMetricsDistribution("PageRankScore")

In [None]:
# Whole-graph distribution plot of the "BetweennessScore" metric, drawn by AnalysisManager.
manager.plotGlobalCentralityMetricsDistribution("BetweennessScore")

In [None]:
# Distribution of "degree" for classes 1, 2 and 3, labelled illicit / licit / unknown.
manager.plotClassCentralityMetricsDistribution("degree", [1,2,3], {1: "illicit", 2: "licit", 3: "unknown"}, show=True)

In [None]:
# Distribution of "EigenvectorCentrality" for classes 1, 2 and 3, labelled
# illicit / licit / unknown.
manager.plotClassCentralityMetricsDistribution("EigenvectorCentrality", [1,2,3], {1: "illicit", 2: "licit", 3: "unknown"}, show=True)

In [None]:
# Distribution of "PageRankScore" for classes 1, 2 and 3, labelled
# illicit / licit / unknown.
manager.plotClassCentralityMetricsDistribution("PageRankScore", [1,2,3], {1: "illicit", 2: "licit", 3: "unknown"}, show=True)

In [None]:
# Distribution of "BetweennessScore" for classes 1, 2 and 3, labelled
# illicit / licit / unknown.
manager.plotClassCentralityMetricsDistribution("BetweennessScore", [1,2,3], {1: "illicit", 2: "licit", 3: "unknown"}, show=True)

## Louvain Communities Analysis

In [None]:
def getClassesDistributionInAllCommunities(analysis_manager, save=False, output_directory=None):
    """
    Count how often each transaction class occurs in each community.

    :param analysis_manager: object exposing getNodeClassesInCommunities(), which must
        return a dataframe with the columns "communityId" and "class".
    :param save: when truthy (and output_directory is given), also write the table to
        "classes_distribution_in_louvain_communities.txt" in output_directory.
    :param output_directory: directory for the text file; nothing is written when None.
    :return: a dataframe with the columns "communityId", "class" and "frequency".
    """
    communities_distribution = (
        analysis_manager.getNodeClassesInCommunities()
        .groupby("communityId")["class"]
        .value_counts()
    )
    # Name the counts explicitly so the result does not depend on the name pandas
    # gives to a value_counts() Series (it differs between pandas versions).
    content = communities_distribution.rename("frequency").reset_index()

    # Logical `and`, not bitwise `&`: a truthy non-bool flag such as 2 must still save.
    if save and output_directory is not None:
        try:
            with open(f"{output_directory}/classes_distribution_in_louvain_communities.txt", 'w') as fh:
                fh.write(f"{content.to_string(header=True, index=True)}\n")

        except Exception as e:
            # Saving is best-effort: report the failure and still return the table.
            print("Saving the contents failed. Error message: %s" % e)

    return content

def getSuspiciousCommunities(classes_distribution_in_communities):
    """
    Select the communities that contain at least one transaction of class 1.

    :param classes_distribution_in_communities: a dataframe with (at least) the columns
        "communityId" and "class", one row per community/class combination.
    :return: a single-column dataframe ("communityId") holding the rows whose class is 1;
        the rows keep their original index labels.
    """
    has_illicit_class = classes_distribution_in_communities["class"] == 1
    return classes_distribution_in_communities.loc[has_illicit_class, ["communityId"]]

def getClassesDistributionInSuspiciousCommunities(suspicious_communities, classes_distribution_in_communities):
    """
    Restrict the class distribution to the suspicious communities.

    :param suspicious_communities: a dataframe with a "communityId" column.
    :param classes_distribution_in_communities: a dataframe with a "communityId" column.
    :return: the two frames joined on "communityId" with pandas' default merge.
    """
    return suspicious_communities.merge(classes_distribution_in_communities, on="communityId")

In [None]:
def getSuspiciousCommunitiesFlow(analysis_manager, suspicious_communities):
    """
    Fetch the outgoing transactions of every node that belongs to a suspicious community.

    :param analysis_manager: object exposing getNodeClassesInCommunities(); its result is
        merged on "communityId" and must carry an "id" column.
    :param suspicious_communities: a dataframe with a "communityId" column.
    :return: the result of the Cypher query (columns txId1, txId1_class, weight, txId2,
        txId2_class), or None when the query could not be run.
    """
    transactions_flow = None

    classes_assigned = analysis_manager.getNodeClassesInCommunities()
    node_ids = pd.merge(classes_assigned, suspicious_communities, on="communityId")["id"]

    # The node IDs travel as a query parameter instead of being pasted into the query
    # text, so the query does not depend on how Python prints a list of IDs.
    query = (
        "MATCH (n:node)-[r:TRANSACTION]->(m:node) "
        "WHERE n.id IN $node_ids "
        "RETURN n.id AS txId1, n.class AS txId1_class, r.cost AS weight, "
        "m.id AS txId2, m.class AS txId2_class"
    )

    gds = None
    try:
        # Connection settings can be overridden through the environment; the fallbacks
        # are the values of the original local setup.
        gds = GraphDataScience(
            os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
            auth=(
                os.environ.get("NEO4J_USER", "neo4j"),
                os.environ.get("NEO4J_PASSWORD", "elliptic"),
            ),
        )
        # tolist() converts the IDs to plain Python values before they reach the driver.
        transactions_flow = gds.run_cypher(query, params={"node_ids": node_ids.tolist()})
    except Exception as e:
        print("Error occurred. Check if the database is online. Error message: %s" % e)
    finally:
        # Close the connection even when the query itself failed.
        if gds is not None:
            gds.close()

    return transactions_flow

### Degree distribution in communities

In [None]:
# Louvain community assignment as produced by AnalysisManager.
# The "id" column is cast to int64 — presumably so that it has the same dtype as the
# "id" columns it is merged with in the following cells; confirm.
communities = manager.detectLouvainCommunities()
communities["id"] = communities["id"].astype("int64")

In [None]:
# Class of every node as produced by NodeMetrics.
# The "id" column is cast to int64 — presumably so that it has the same dtype as the
# "id" columns it is merged with in the following cell; confirm.
nodes_class = node_metrics.getNodesClasses()
nodes_class["id"] = nodes_class["id"].astype("int64")

In [None]:
# Attach each node's degree, then its class, to its community by joining on "id".
degree_distribution_in_communities = communities.merge(degree_distribution, on="id")
degree_class_in_communities = degree_distribution_in_communities.merge(nodes_class, on="id")
# Display only the rows whose class is 1.
is_class_one = degree_class_in_communities["class"] == 1
degree_class_in_communities[is_class_one]

### Transactions of each class - frequencies in communities

In [None]:
# Class frequencies per community (also written to a text file under "../"), the
# communities that contain class-1 transactions, and the class frequencies within
# just those communities.
communities_classes_distribution = getClassesDistributionInAllCommunities(manager, save=True, output_directory="../")
suspicious_communities = getSuspiciousCommunities(communities_classes_distribution)
getClassesDistributionInSuspiciousCommunities(suspicious_communities, communities_classes_distribution)

### Studying the transactions flow

In [None]:
# Outgoing transactions of all nodes in the suspicious communities. The helper returns
# None when the database query fails, so the cells below need the database online.
transaction_flow_in_suspicious_communities = getSuspiciousCommunitiesFlow(manager,
                                                                          suspicious_communities=suspicious_communities)

As seen below, because each transaction is directed from txId1 to txId2, comparing the class counts of the two sides shows how the classes change along the flow: illicit transactions (1) appear to change their status, as their count drops among the txId2 classes; the number of licit transactions (2) increases, and the number of unknown-class transactions (3) decreases as well.

In [None]:
def getTransactionsFlowDf(value_counts_transactions):
    """
    Turn a value-counts result into a two-column dataframe.

    :param value_counts_transactions: class counts indexed by class, e.g. the result of
        Series.value_counts().
    :return: a dataframe with the columns "transaction_class" and "frequency".
    """
    return (
        pd.DataFrame(value_counts_transactions)
        .reset_index()
        .set_axis(["transaction_class", "frequency"], axis=1)
    )

In [None]:
# Class counts on the source side (txId1) of the fetched transactions.
flow_from = getTransactionsFlowDf(transaction_flow_in_suspicious_communities.txId1_class.value_counts())
flow_from

In [None]:
# Class counts on the destination side (txId2) of the fetched transactions.
flow_to = getTransactionsFlowDf(transaction_flow_in_suspicious_communities.txId2_class.value_counts())
flow_to

In [None]:
def plotTransactionsFlowClasses(value_counts_transactions_flow1, value_counts_transactions_flow2):
    """
    Draw two bar charts of class frequencies next to each other and show the figure.

    :param value_counts_transactions_flow1: dataframe with the columns "transaction_class"
        and "frequency"; drawn on the left under the origin title.
    :param value_counts_transactions_flow2: dataframe with the same columns; drawn on the
        right under the destination title.
    """
    fig, axes = plt.subplots(ncols=2, figsize=(8, 6))
    panels = (
        (value_counts_transactions_flow1, "Transaction origin's entity"),
        (value_counts_transactions_flow2, "Transaction destination's entity"),
    )
    # Both panels share the same axis labels and differ only in data and title.
    for axis, (flow, title) in zip(axes, panels):
        sns.barplot(data=flow, x="transaction_class", y="frequency", ax=axis)
        axis.set_xlabel("Transaction class", fontsize=12)
        axis.set_ylabel("Frequency", fontsize=12)
        axis.set_title(title, fontsize=14)
    plt.tight_layout()
    plt.show()

In [None]:
# Source-side class counts on the left, destination-side class counts on the right.
plotTransactionsFlowClasses(flow_from, flow_to)

In [None]:
# Close the Neo4j connection opened in the setup cell at the top of the notebook.
connector.close()

## Notes

The graph's suspicious communities were visualized using Neo4j Browser. The query was generated with additional code that formats a list of node IDs into a Cypher query string. It is as follows:

In [None]:
print("""
MATCH (n:node)-[r:TRANSACTION]-(m) WHERE n.id IN ['232438397', '230432611', '230966618', '230449513', '230449518', '232047899', '3877856', '230437700', '3851887', '3877118', '230645825', '230649124', '3912536', '230438845', '230433274', '230438854', '230432970', '230440348', '230433507', '3880017', '232046508', '232061267', '3880228', '230814630', '8427177', '92491280', '230433276', '232047924', '230969378', '232042915', '230433803', '232052547', '231999952', '5686988', '230418063', '14090056', '3875041', '230432972', '232043907', '230456717', '232000564', '232000563', '3878606', '196535269', '71367715', '230556037', '231028826', '16754007', '219821008', '232039868', '230645829', '230649107', '3878718', '3881930', '3876512', '230647320', '22179034', '71377346', '230452718', '230969249', '230965656', '230643073', '3880822', '3874438', '3875725', '231992251', '231036638', '3875136', '3878694', '205337217', '232012623', '3820257', '230439793', '3903408', '2773281', '232438575', '3882627', '232051672', '230451747', '1234524', '230645826', '231028597', '230645772', '35025113', '230432810', '71373717', '3877156', '71369010', '230645824', '231004407', '231043940', '230420813', '71372878', '230434027', '205336060', '230645823', '231035659', '230456719', '214640616', '27429546', '230619786', '230645830', '231029351', '3408003', '230432792', '230594531', '231035673', '231992576', '230451744', '232012314', '3404992', '231029330', '232042947', '230471948', '230455503', '230436131', '27553029', '230454327', '2717498', '27489574', '230451730', '230389796', '61429103', '10983754', '17387772', '230453424', '3876550', '230453435', '3874463', '230451734', '51033972', '230411688', '226892542', '88367753', '55649387', '230451742', '232047017', '3205536', '232658952', '232673081', '62195631', '231990423', '12971085', '230397187', '230550396', '14847524', '3880150', '4959428', '10437353', '232377112', '86842626', '232377111', '24141114', '2876295', '230418806', '230428367', '230456623', 
'232345690', '230459342', '230332417', '230456624', '230531648', '230456618', '230531645', '230456625', '122282643', '10039990', '230352707', '230456622', '219822791', '219856741', '232345697', '89913568', '232345694', '17796937', '230390966', '232345692', '230683551', '52603109', '230429077', '9846194', '80855903', '233775069', '13341989', '232431892', '232431896', '232956033', '28032183', '230645812', '232956030', '234442906', '232339923', '232956032', '230428191', '232956034', '230428187', '55349030', '24155910', '230565413', '230417093', '211053597', '232793876', '232956040', '232906328', '234442914', '230625240', '232629023', '231573972', '232875618', '232639918', '232679753', '3324508', '184703182', '230713915', '49929726', '232903605', '230552340', '230452813', '232000574', '232000573', '232009674', '232009676', '87603321', '15192868', '232680483', '230593221', '230593717', '2920818', '231990430', '28514956', '232036152', '230581464', '230449494', '230449500', '230336180', '231990435', '230454323', '18907280', '230612860', '195312780', '204236566', '230454338', '230454693', '230454028', '230454330', '88541341', '219841229', '232906976', '230454697', '230454703', '232040303', '10414141', '16753577', '3904474', '3317903', '232014513', '232014511', '232947878', '232947876', '230590935', '121536582', '29115494', '3762140', '232404334', '233599196', '233419547', '16742787', '3395080'] RETURN n, m
""")