In [1]:
import time
import numpy as np
import pandas as pd
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.metrics import mean_squared_error, accuracy_score, plot_confusion_matrix, f1_score
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix

import stellargraph as sg
from stellargraph import datasets
from stellargraph.mapper import (
    CorruptedGenerator,
    FullBatchNodeGenerator,
    GraphSAGENodeGenerator,
    HinSAGENodeGenerator,
    Node2VecNodeGenerator,
    ClusterNodeGenerator,
)
from stellargraph.layer import GCN, DeepGraphInfomax, GraphSAGE, GAT, APPNP, HinSAGE, Dense

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import Model, optimizers, losses, metrics

from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN, BorderlineSMOTE, KMeansSMOTE, SMOTENC, SVMSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.metrics import classification_report_imbalanced

from collections import Counter, defaultdict
import multiprocessing
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('seaborn-whitegrid')
%matplotlib inline

In [2]:
import sys
sys.path.append('../')
import utils

In [3]:
# Load everything the notebook needs through the project helper in ../utils.
# NOTE: per the logged output below, preprocessing took ~420 s on the recorded run,
# so re-running this cell is expensive.
# NOTE(review): presumably v_data / e_data are the full vertex / edge tables and
# v_sets / e_sets are per-type dictionaries of them — confirm against utils.load_for_jupyter.
v_data, e_data, v_sets, e_sets, core_targets, ext_targets, core_testing = utils.load_for_jupyter()

Dataset already downloaded. Loading it from file system
LOADING DATA: 1.11 s
PREPROCESSING DATA: 420.04 s


In [4]:
# Keep only the 'is similar' edges whose weight is exactly 1.
st_sed = e_sets['is similar'].loc[lambda edges: edges['weight'] == 1]

In [28]:
st_sed.shape

(36672, 3)

In [6]:
# Rows of v_data whose node id appears as the source (resp. target) endpoint
# of one of the selected edges in st_sed.
src, tgt = (
    v_data.loc[v_data.index.isin(st_sed[endpoint].to_numpy())]
    for endpoint in ('source', 'target')
)

# Stack both selections; a node that is both a source and a target appears twice.
nodes = pd.concat([src, tgt], axis=0)

In [7]:
st_sed

Unnamed: 0_level_0,source,target,weight
edge_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
600111766073,100109641,3001157349,1.0
600111766438,100109660,3001151288,1.0
600111762366,100109664,3001153481,1.0
600111767992,100109673,3001175846,1.0
600111763004,100109676,3001180125,1.0
...,...,...,...
650111802554,20030017540,3001170561,1.0
650111779829,20030017541,3001165543,1.0
650111797101,20030017543,3001179268,1.0
650111816653,20030017543,3001140534,1.0


In [8]:
# first source
v_data.loc[v_data.index == 100109641]

Unnamed: 0_level_0,Label,Revenue Size Flag,Account ID String,Address,Person or Organisation,Name,Income Size Flag,CoreCaseGraphID,ExtendedCaseGraphID,testingFlag
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100109641,Customer,,,,Person,VALEA GOODWILL FOUNDATION,high,2381.0,2381.0,0.0


In [9]:
# first target
v_data.loc[v_data.index == 3001157349]

Unnamed: 0_level_0,Label,Revenue Size Flag,Account ID String,Address,Person or Organisation,Name,Income Size Flag,CoreCaseGraphID,ExtendedCaseGraphID,testingFlag
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3001157349,External Entity,,,,Person,Hqhlm01YYc,,,2381.0,0.0


In [10]:
# second source
v_data.loc[v_data.index == 100109660]

Unnamed: 0_level_0,Label,Revenue Size Flag,Account ID String,Address,Person or Organisation,Name,Income Size Flag,CoreCaseGraphID,ExtendedCaseGraphID,testingFlag
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100109660,Customer,,,,Organisation,CHU MICHAEL H F,medium,,2032.0,0.0


In [11]:
# second target
v_data.loc[v_data.index == 3001151288]

Unnamed: 0_level_0,Label,Revenue Size Flag,Account ID String,Address,Person or Organisation,Name,Income Size Flag,CoreCaseGraphID,ExtendedCaseGraphID,testingFlag
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3001151288,External Entity,,,,Organisation,dxa669qsKE,,,,


In [12]:
# Every node whose ExtendedCaseGraphID is 2032, i.e. the extended case of the
# second source node (100109660) inspected above.
v_data.loc[v_data.ExtendedCaseGraphID == 2032]

Unnamed: 0_level_0,Label,Revenue Size Flag,Account ID String,Address,Person or Organisation,Name,Income Size Flag,CoreCaseGraphID,ExtendedCaseGraphID,testingFlag
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
15020048483,Account,mid_high,Yhi46S2waH,,,,,,2032.0,0.0
15020075907,Account,mid_high,mg9AWpnb6L,,,,,,2032.0,0.0
250113288,Address,,,gvN2MVSQnv,,,,,2032.0,0.0
100109660,Customer,,,,Organisation,CHU MICHAEL H F,medium,,2032.0,0.0
1001013097,Customer,,,,Organisation,CHE HONG,high,,2032.0,0.0
100102312,Customer,,,,Person,PECKORY INC.,medium,,2032.0,0.0
1001051857,Customer,,,,Organisation,NPV6wSFz5w,medium,,2032.0,0.0
1001035910,Customer,,,,Person,56zZjSZLUp,low,,2032.0,0.0
1001049076,Customer,,,,Organisation,3cx1PmLu44,low,,2032.0,0.0
3001130578,External Entity,,,,Organisation,HMDwsx6ska,,,2032.0,0.0


In [13]:
# Rows of core_targets whose CaseID is 2032, for comparison with the extended-case members.
core_targets.loc[core_targets.CaseID == 2032]

Unnamed: 0_level_0,CaseID
NodeID,Unnamed: 1_level_1
1001029510,2032
15020035193,2032
15020035195,2032
15020039077,2032


In [14]:
# Let's look at the first hop of the "is similar" connections

# I would like to know how many nodes with "is similar" connections extend their CaseID,
# i.e. how many of the nodes appearing as source and target have a CoreCase and an ExtendedCase ID.

In [15]:
# Copy of the vertex table with every missing value replaced by 0, so that
# "node has no case ID" can be tested with a plain `!= 0` comparison below.
# Note that fillna(0) applies to all columns, not only the case-ID ones.
v_datan = v_data.fillna(0)

In [16]:
# Count, over every 'is similar' edge, how many source / target endpoints carry a
# core case ID and how many carry an extended case ID (0 in v_datan means "none").
# `sources` / `targets` map the edge's position in the filtered edge list to its
# edge id, for edges whose source / target has a core case ID.

# Filter the edges once instead of inside the loop header.
similar_edges = e_data.loc[e_data.Label == 'is similar']

# Build node_id -> case-ID lookups once. The original scanned the whole vertex
# index four times per edge (O(edges * nodes)); a dict lookup is O(1).
# Duplicated node ids keep their first row, matching the original `.values[0]`.
first_rows = v_datan.loc[~v_datan.index.duplicated(keep='first')]
core_case_by_node = first_rows.CoreCaseGraphID.to_dict()
ext_case_by_node = first_rows.ExtendedCaseGraphID.to_dict()

sources_with_core_case = 0
sources_with_ext_case = 0
targets_with_core_case = 0
targets_with_ext_case = 0
sources = dict()
targets = dict()

# NOTE: an endpoint id that is absent from v_datan raises KeyError here
# (the original failed with IndexError in the same situation).
for position, (ind, from_id, to_id) in enumerate(
    zip(similar_edges.index, similar_edges.from_id, similar_edges.to_id)
):
    if core_case_by_node[from_id] != 0:
        sources_with_core_case += 1
        sources[position] = ind
    if core_case_by_node[to_id] != 0:
        targets_with_core_case += 1
        targets[position] = ind
    if ext_case_by_node[from_id] != 0:
        sources_with_ext_case += 1
    if ext_case_by_node[to_id] != 0:
        targets_with_ext_case += 1
        

In [17]:
sources_with_core_case

7343

In [18]:
targets_with_core_case

2005

In [19]:
sources_with_ext_case

21015

In [20]:
targets_with_ext_case

12929

In [21]:
# It seems that in many connections where the source has a case ID, the target has the same case ID.
# How often (with what accuracy) does this happen?

In [27]:
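# try with the strong similarities
# NOTE: the function below iterates over every 'is similar' edge in e_data; it does not filter on weight.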

def print_similar():
    """Print the case IDs at both ends of each 'is similar' edge whose source has a case.

    Reads the notebook globals ``e_data`` (edge table with ``Label``, ``from_id``
    and ``to_id`` columns) and ``v_datan`` (vertex table indexed by node id, with
    ``CoreCaseGraphID`` and ``ExtendedCaseGraphID`` columns where 0 means "none").

    For every edge labelled 'is similar' whose source node has a non-zero core or
    extended case ID, two lines are printed: the edge with its endpoints, then the
    four case IDs (source core, source extended, target core, target extended).

    Returns:
        int: always 1.

    Raises:
        KeyError: if an endpoint id is not present in ``v_datan``
            (the previous implementation raised IndexError in that case).
    """
    similar_edges = e_data.loc[e_data.Label == 'is similar']

    # Build node_id -> case-ID lookups once; the previous version scanned the whole
    # vertex index up to four times per edge. Duplicated node ids keep their first
    # row, matching the former `.values[0]`.
    first_rows = v_datan.loc[~v_datan.index.duplicated(keep='first')]
    core_case_by_node = first_rows.CoreCaseGraphID.to_dict()
    ext_case_by_node = first_rows.ExtendedCaseGraphID.to_dict()

    for ind, from_id, to_id in zip(
        similar_edges.index, similar_edges.from_id, similar_edges.to_id
    ):
        x = core_case_by_node[from_id]
        x2 = ext_case_by_node[from_id]
        if x != 0 or x2 != 0:
            # Target lookups are only needed for edges that are actually printed.
            y = core_case_by_node[to_id]
            y2 = ext_case_by_node[to_id]
            print(f"Edge {ind} has source node {from_id} and target node {to_id}.")
            print(f"\t sourceCoreID:{x} - sourceExtID:{x2} - targetCoreID:{y} - targetExtID:{y2}")
    return 1

#print_similar()

In [23]:
# Build a directed StellarGraph from the node sets and edge sets loaded above.
# NOTE(review): assumes v_sets / e_sets are in the form StellarDiGraph accepts
# (e.g. dicts of DataFrames keyed by type) — confirm against utils.load_for_jupyter.
G = sg.StellarDiGraph(v_sets, e_sets)

In [24]:
# Adjacency matrix of G converted to SciPy's list-of-lists sparse format.
lil_adj = G.to_adjacency_matrix().tolil()
# One tuple per matrix row, holding the column indices stored in that row.
adjlist = list(map(tuple, lil_adj.rows))

In [26]:
lil_adj

<319376x319376 sparse matrix of type '<class 'numpy.float64'>'
	with 921858 stored elements in List of Lists format>

## Conclusion: it's possible that the target node has the same case ID, but it's not always the case.

## No simple heuristics can be used to determine this