# Exploratory Data Analysis

In [None]:
import math, time, random, datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno
import seaborn as sns

plt.style.use('seaborn-whitegrid')
%matplotlib inline

In [None]:
# Download the vertex and edge tables of the case graphs from GitHub.
# low_memory=False makes pandas read each file in one pass instead of in
# chunks, so column dtypes are inferred from the whole column.

nodes_url = "https://raw.githubusercontent.com/AlbertoParravicini/high-performance-graph-analytics-2020/main/track-ml/data/polimi.case.graphs.vertices.csv"
edges_url = "https://raw.githubusercontent.com/AlbertoParravicini/high-performance-graph-analytics-2020/main/track-ml/data/polimi.case.graphs.edges.csv"
nodes = pd.read_csv(nodes_url, low_memory=False)
edges = pd.read_csv(edges_url, low_memory=False)

## Nodes

In [None]:
nodes.info()

In [None]:
nodes.describe()

In [None]:
nodes.head(5)

In [None]:
missingno.matrix(nodes, figsize = (14, 10))

In [None]:
nodes.isnull().sum()

In [None]:
nodes.shape

In [None]:
nodes.dtypes

## Edges

In [None]:
edges.info()

In [None]:
edges.describe()

In [None]:
edges.head(5)

In [None]:
missingno.matrix(edges, figsize = (14, 10))

In [None]:
edges.isnull().sum()

In [None]:
edges.shape

In [None]:
edges.dtypes

## More in-depth Analysis

In [None]:
# Number of distinct core case IDs (missing IDs are not counted)
print("Core Cases: ", nodes['CoreCaseGraphID'].nunique())

In [None]:
# Number of nodes whose testingFlag is set (i.e. not missing)
print("Testing nodes: ", nodes['testingFlag'].count())

In [None]:
# Mean size of a core case: count the nodes per CoreCaseGraphID
# (nodes without a core case ID are ignored), then average the counts.

nodes['CoreCaseGraphID'].value_counts().mean()

In [None]:
# Mean number of nodes sharing one address: count the nodes per Address
# (nodes without an address are ignored), then average the counts.

nodes['Address'].value_counts().mean()

In [None]:
# Number of nodes carrying each Name (nodes without a name are left out)
nodes.groupby('Name').size()

In [None]:
# Count the nodes for which every descriptive attribute below is missing,
# i.e. nodes that carry nothing but an ID and a label.
# (Core/extended case columns are deliberately not part of the check.)

attribute_columns = [
    'Revenue Size Flag',
    'Account ID String',
    'Address',
    'Person or Organisation',
    'Name',
    'Income Size Flag',
]
has_no_attributes = nodes[attribute_columns].isna().all(axis=1)
len(nodes[has_no_attributes])

In [None]:
# Node count for every (Address, CoreCaseGraphID) pair. groupby drops rows
# where either key is missing, so only nodes that have an address and belong
# to a core case show up.

address_case_keys = ['Address', 'CoreCaseGraphID']
nodes.groupby(address_case_keys).size()

In [None]:
# Node count for every (Account ID String, CoreCaseGraphID) pair. groupby
# drops rows where either key is missing, so only nodes that have an account
# ID string and belong to a core case show up.

account_case_keys = ['Account ID String', 'CoreCaseGraphID']
nodes.groupby(account_case_keys).size()

In [None]:
# Names of the nodes that belong to a core case.
# `no_name` keeps the number of core-case nodes without a Name;
# `mask` keeps the (Name, CoreCaseGraphID) pairs of those that have one.

in_core_case = nodes['CoreCaseGraphID'].notna()
has_name = nodes['Name'].notna()

no_name = (in_core_case & ~has_name).sum()
mask = nodes.loc[in_core_case & has_name, ['Name', 'CoreCaseGraphID']]

# Show the last 20 named core-case nodes
mask.tail(20)

In [None]:
# Number of nodes that are part of a core case and don't have a Name.
# `no_name` is computed in the cell that builds `mask`, so that cell has to
# be run before this one.

no_name

In [None]:
# Number of extended cases for each node type

accounts = nodes.loc[nodes['Label'] == 'Account']
accounts['ExtendedCaseGraphID'].notnull().sum()

In [None]:
accounts = nodes.loc[nodes['Label'] == 'Customer']
accounts['ExtendedCaseGraphID'].notnull().sum()

In [None]:
accounts = nodes.loc[nodes['Label'] == 'Address']
accounts['ExtendedCaseGraphID'].notnull().sum()

In [None]:
accounts = nodes.loc[nodes['Label'] == 'Derived Entity']
accounts['ExtendedCaseGraphID'].notnull().sum()

In [None]:
accounts = nodes.loc[nodes['Label'] == 'External Entity']
accounts['ExtendedCaseGraphID'].notnull().sum()

In [None]:
# So the most fraudulent nodes are the "Account" and "Customer" nodes by far!
# Ranked:

# 1: Customer
# 2: Account
# 3: External Entity
# 4: Derived Entity
# 5: Address

# With just Customer, Account and Derived Entity I have 80% of all the cases.