In [1]:
import time
import numpy as np
import pandas as pd
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.metrics import mean_squared_error, accuracy_score, plot_confusion_matrix, f1_score
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix

import stellargraph as sg
from stellargraph import datasets
from stellargraph.mapper import (
    CorruptedGenerator,
    FullBatchNodeGenerator,
    GraphSAGENodeGenerator,
    HinSAGENodeGenerator,
    Node2VecNodeGenerator,
    ClusterNodeGenerator,
)
from stellargraph.layer import GCN, DeepGraphInfomax, GraphSAGE, GAT, APPNP, HinSAGE, Dense

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import Model, optimizers, losses, metrics

from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN, BorderlineSMOTE, KMeansSMOTE, SMOTENC, SVMSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.metrics import classification_report_imbalanced

from collections import Counter, defaultdict
import multiprocessing
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('seaborn-whitegrid')
%matplotlib inline

In [2]:
import sys

# Put the parent directory on the import path so that `utils` can be imported
# (presumably it lives one level up -- confirm against the repo layout).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate '../' entry each time.
if '../' not in sys.path:
    sys.path.append('../')
import utils

In [3]:
# Load the dataset through the project helper, which returns seven objects.
# From the outputs recorded further down in this notebook:
#   - v_data is a node table indexed by node_id (Label, Name, case IDs, ...);
#   - e_sets is keyed by edge type (e.g. 'is similar'), each value being an
#     edge table with source / target / weight columns;
#   - core_targets is indexed by NodeID and carries a CaseID column.
# NOTE(review): e_data, v_sets, ext_targets and core_testing are not inspected
# in the visible cells -- check utils.load_for_jupyter for their structure.
v_data, e_data, v_sets, e_sets, core_targets, ext_targets, core_testing = utils.load_for_jupyter()

Dataset already downloaded. Loading it from file system
LOADING DATA: 1.10 s
PREPROCESSING: 1.11 s


In [11]:
# Keep only the 'is similar' edges whose weight is exactly 1.
st_sed = e_sets['is similar'].loc[lambda edges: edges.weight == 1]

In [25]:
# Number of weight-1 'is similar' edges (rows) and edge columns.
st_sed.shape

(36672, 3)

In [27]:
# Node records for the endpoints of the weight-1 'is similar' edges.
src = v_data[v_data.index.isin(st_sed.source.values)]
tgt = v_data[v_data.index.isin(st_sed.target.values)]

# A node that is both a source and a target would appear twice after a plain
# concat. Keep every src row and append only the tgt rows whose node id is
# not already present, which preserves the original src-then-tgt ordering.
nodes = pd.concat([src, tgt[~tgt.index.isin(src.index)]])

In [34]:
# Display the filtered edge table (pandas truncates the middle rows).
st_sed

Unnamed: 0_level_0,source,target,weight
edge_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
600111766073,100109641,3001157349,1.0
600111766438,100109660,3001151288,1.0
600111762366,100109664,3001153481,1.0
600111767992,100109673,3001175846,1.0
600111763004,100109676,3001180125,1.0
...,...,...,...
650111802554,20030017540,3001170561,1.0
650111779829,20030017541,3001165543,1.0
650111797101,20030017543,3001179268,1.0
650111816653,20030017543,3001140534,1.0


In [35]:
# Node record for 100109641, the source of the first edge listed in st_sed.
v_data[v_data.index == 100109641]

Unnamed: 0_level_0,Label,Revenue Size Flag,Account ID String,Address,Person or Organisation,Name,Income Size Flag,CoreCaseGraphID,ExtendedCaseGraphID,testingFlag
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100109641,Customer,,,,Person,VALEA GOODWILL FOUNDATION,high,2381.0,2381.0,0.0


In [36]:
# Node record for 100109660, the source of the second edge listed in st_sed.
v_data[v_data.index == 100109660]

Unnamed: 0_level_0,Label,Revenue Size Flag,Account ID String,Address,Person or Organisation,Name,Income Size Flag,CoreCaseGraphID,ExtendedCaseGraphID,testingFlag
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100109660,Customer,,,,Organisation,CHU MICHAEL H F,medium,,2032.0,0.0


In [37]:
# Every node assigned to extended case graph 2032.
v_data.loc[v_data['ExtendedCaseGraphID'].eq(2032)]

Unnamed: 0_level_0,Label,Revenue Size Flag,Account ID String,Address,Person or Organisation,Name,Income Size Flag,CoreCaseGraphID,ExtendedCaseGraphID,testingFlag
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
15020048483,Account,mid_high,Yhi46S2waH,,,,,,2032.0,0.0
15020075907,Account,mid_high,mg9AWpnb6L,,,,,,2032.0,0.0
250113288,Address,,,gvN2MVSQnv,,,,,2032.0,0.0
100109660,Customer,,,,Organisation,CHU MICHAEL H F,medium,,2032.0,0.0
1001013097,Customer,,,,Organisation,CHE HONG,high,,2032.0,0.0
100102312,Customer,,,,Person,PECKORY INC.,medium,,2032.0,0.0
1001051857,Customer,,,,Organisation,NPV6wSFz5w,medium,,2032.0,0.0
1001035910,Customer,,,,Person,56zZjSZLUp,low,,2032.0,0.0
1001049076,Customer,,,,Organisation,3cx1PmLu44,low,,2032.0,0.0
3001130578,External Entity,,,,Organisation,HMDwsx6ska,,,2032.0,0.0


In [39]:
# Core target nodes whose CaseID is 2032.
core_targets.loc[core_targets['CaseID'].eq(2032)]

Unnamed: 0_level_0,CaseID
NodeID,Unnamed: 1_level_1
1001029510,2032
15020035193,2032
15020035195,2032
15020039077,2032
