In [1]:
import time
import json

import pandas as pd
import numpy as np
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.metrics import mean_squared_error, accuracy_score, plot_confusion_matrix, f1_score
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix

import stellargraph as sg
from stellargraph import datasets
from stellargraph.mapper import (
    CorruptedGenerator,
    FullBatchNodeGenerator,
    GraphSAGENodeGenerator,
    HinSAGENodeGenerator,
    Node2VecNodeGenerator,
    ClusterNodeGenerator,
)
from stellargraph.layer import GCN, DeepGraphInfomax, GraphSAGE, GAT, APPNP, HinSAGE, Dense

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import Model, optimizers, losses, metrics

from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN, BorderlineSMOTE, KMeansSMOTE, SMOTENC, SVMSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.metrics import classification_report_imbalanced

from collections import Counter, defaultdict
import multiprocessing
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns

# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6 and the old
# names no longer resolve on recent releases. Use whichever name this installation
# provides, falling back to the default style if neither is present.
plt.style.use(next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid') if name in plt.style.available),
    'default',
))
%matplotlib inline

In [2]:
import sys
sys.path.append('../')
import utils

In [3]:
# Load the raw vertex/edge tables and the target frames through the project helper.
# NOTE(review): the return types are not visible here; later cells treat v_sets and e_sets
# as single DataFrames with a 'Label' column (e_sets also has 'from_id' / 'to_id').
v_sets, e_sets, core_targets, ext_targets, core_testing = utils.load_for_jupyter_raw()

Dataset already downloaded. Loading it from file system
LOADING DATA: 0.96 s


# 0. Replace CoreCaseID and ExtCaseID with CSV data

In [4]:
# TODO: replace CoreCaseID / ExtCaseID with the CSV data (not implemented yet).
# For now the "samples" are plain aliases of the full vertex and edge tables.
v_sample = v_sets
e_sample = e_sets

# 0.1 Add Extra Features: Node Degree (see Node Degree feature notebook)

In [5]:
# Out-degree: number of (non-null) targets recorded for each source node.
source_data = (
    e_sets.groupby('from_id')['to_id']
    .count()
    .rename('source_degree')
    .rename_axis('node_id')
    .to_frame()
)

# In-degree: number of (non-null) sources recorded for each target node.
target_data = (
    e_sets.groupby('to_id')['from_id']
    .count()
    .rename('target_degree')
    .rename_axis('node_id')
    .to_frame()
)

# Left-join both degree tables onto the node table using the node index.
v_sample = v_sample.merge(source_data, how='left', left_index=True, right_index=True)
v_sample = v_sample.merge(target_data, how='left', left_index=True, right_index=True)

# Nodes that never appear as a source (or target) have degree 0 rather than NaN.
for degree_col in ('source_degree', 'target_degree'):
    v_sample[degree_col] = v_sample[degree_col].fillna(0)

# 0.2 Preprocess data

In [6]:
## Preprocessing pipeline

# Nodes without a case id get 0.
for case_col in ('CoreCaseGraphID', 'ExtendedCaseGraphID'):
    v_sample[case_col] = v_sample[case_col].fillna(0)

# Split the node table into one frame per node type, dropping 'Label' and every column
# that is entirely null for that type. A missing testingFlag becomes -1.
v_sets = defaultdict()
for v_type in pd.Categorical(v_sample['Label']).categories:
    nodes = v_sample[v_sample['Label'] == v_type]
    all_null_cols = list(nodes.columns[nodes.isnull().all()])
    nodes = nodes.drop(columns=['Label'] + all_null_cols)
    nodes['testingFlag'] = nodes['testingFlag'].fillna(-1)
    v_sets[v_type] = nodes

# Same split for the edges; endpoints are renamed to 'source' / 'target'.
e_sets = defaultdict()
for e_type in pd.Categorical(e_sample['Label']).categories:
    edges = e_sample[e_sample['Label'] == e_type]
    all_null_cols = list(edges.columns[edges.isnull().all()])
    edges = edges.drop(columns=['Label'] + all_null_cols)
    e_sets[e_type] = edges.rename(columns={'from_id': 'source', 'to_id': 'target'})

#? 3: Logical conversion of categorical features

# Ordered node flags are mapped onto numeric scores in (0, 1].
node_flag_scales = {
    'Revenue Size Flag': {'low': 0.1, 'mid_low': 0.3, 'medium': 0.6, 'mid_high': 0.8, 'high': 1},
    'Income Size Flag': {'low': 0.1, 'medium': 0.5, 'high': 1},
}
for flag, conversion in node_flag_scales.items():
    for i in v_sets:
        if flag in v_sets[i].columns:
            v_sets[i][flag] = v_sets[i][flag].map(conversion)

# Ordered edge flags are mapped the same way and then renamed to 'weight'.
edge_weight_scales = {
    'Similarity Strength': {'weak': 0.1, 'medium': 0.5, 'strong': 1},
    'Amount Flag': {'small': 0.1, 'medium': 0.5, 'large': 1},
}
for flag, conversion in edge_weight_scales.items():
    for i in e_sets:
        if flag in e_sets[i].columns:
            e_sets[i][flag] = e_sets[i][flag].map(conversion)
            e_sets[i] = e_sets[i].rename(columns={flag: 'weight'})

#? 4: One-hot encoding for categorical features

for i in v_sets:
    if 'Person or Organisation' in v_sets[i].columns:
        v_sets[i] = pd.get_dummies(v_sets[i], columns=['Person or Organisation'])

#? 5: String features

# Attempt 1: remove them
for i in v_sets:
    text_cols = [col for col in ('Account ID String', 'Address', 'Name') if col in v_sets[i].columns]
    v_sets[i] = v_sets[i].drop(columns=text_cols)

In [7]:
# Inspect the preprocessed Account node table.
v_sets['Account']

Unnamed: 0_level_0,Revenue Size Flag,CoreCaseGraphID,ExtendedCaseGraphID,testingFlag,source_degree,target_degree
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1502000,0.8,0.0,0.0,-1.0,2.0,7.0
1502001,0.1,0.0,0.0,-1.0,3.0,5.0
1502002,0.1,2492.0,0.0,0.0,5.0,6.0
1502003,0.8,0.0,0.0,-1.0,5.0,11.0
1502004,0.1,0.0,0.0,-1.0,3.0,3.0
...,...,...,...,...,...,...
15020149208,0.1,0.0,0.0,-1.0,0.0,1.0
15020149211,0.8,0.0,0.0,-1.0,0.0,1.0
15020151147,0.3,0.0,0.0,-1.0,0.0,1.0
15020151148,0.8,0.0,0.0,-1.0,0.0,1.0


In [8]:
# Order every node table by testingFlag, report how many rows carry each flag value,
# then remove the flag column from the table.
for node_type, frame in list(v_sets.items()):
    ordered = frame.sort_values('testingFlag')
    print(node_type,":")
    print(ordered.testingFlag.value_counts())
    v_sets[node_type] = ordered.drop(columns='testingFlag')

Account :
-1.0    126863
 0.0     13769
 1.0      1244
Name: testingFlag, dtype: int64
Address :
-1.0    28432
 0.0     1568
Name: testingFlag, dtype: int64
Customer :
-1.0    42127
 0.0    13650
 1.0      449
Name: testingFlag, dtype: int64
Derived Entity :
-1.0    27286
 0.0     3925
 1.0       63
Name: testingFlag, dtype: int64
External Entity :
-1.0    55207
 0.0     4757
 1.0       36
Name: testingFlag, dtype: int64


In [9]:
# Remove ExtendedCaseGraphID from every node table; later cells read it back from
# v_sample to use as the classification label.
for node_type in list(v_sets):
    v_sets[node_type] = v_sets[node_type].drop(columns='ExtendedCaseGraphID')

v_sets['Account']

Unnamed: 0_level_0,Revenue Size Flag,CoreCaseGraphID,source_degree,target_degree
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1502000,0.8,0.0,2.0,7.0
15020057228,0.6,0.0,6.0,8.0
15020057227,0.6,0.0,2.0,8.0
15020057226,0.8,0.0,2.0,5.0
15020057225,0.6,0.0,6.0,7.0
...,...,...,...,...
15020070563,0.3,427.0,2.0,4.0
1502002233,0.1,3549.0,4.0,7.0
15020052758,0.3,3573.0,4.0,2.0
15020135827,0.1,3786.0,0.0,1.0


# 1. Train HinSAGE on all the nodes

Note: Embedding of the Accounts only for this stage. 
It's pretty easy to repeat the process for the other node categories and concatenate the results. For now I am only trying it with the Accounts.

In [10]:
# Hyperparameters for the HinSAGE / Deep Graph Infomax run.
# batch_size and num_samples are passed to HinSAGENodeGenerator.
batch_size = 500
# Maximum number of training epochs (training may stop earlier via EarlyStopping).
epochs = 100
# NOTE(review): presumably neighbours sampled per hop, one entry per HinSAGE layer — confirm in the StellarGraph docs.
num_samples = [8, 4]
# Dropout rate and per-layer output sizes passed to the HinSAGE model.
dropout = 0.7
hinsage_layer_sizes = [32, 32]

In [11]:
# Directed heterogeneous graph built from the per-type node and edge tables.
G = sg.StellarDiGraph(nodes=v_sets, edges=e_sets)

In [12]:
# Node generator for HinSAGE, with Account as the head node type.
generator = HinSAGENodeGenerator(G, batch_size, num_samples, head_node_type="Account")

In [13]:
# HinSAGE base model that Deep Graph Infomax will train.
# NOTE(review): the last layer uses a softmax activation followed by L2 normalisation;
# this may contribute to the near one-hot embeddings discussed later in the notebook —
# worth comparing against a 'relu'/'linear' output layer.
hinsage = HinSAGE(
    generator=generator,
    layer_sizes=hinsage_layer_sizes,
    activations=['relu', 'softmax'],
    dropout=dropout,
    normalize="l2",
    bias=True,
)

In [14]:
def run_deep_graph_infomax(base_model, generator, epochs, graph=None,
                           node_type="Account", learning_rate=1e-3, patience=15):
    """Train ``base_model`` with Deep Graph Infomax and return its embedding tensors.

    Parameters
    ----------
    base_model
        Node-embedding model to train (a HinSAGE instance in this notebook).
    generator
        Node generator feeding ``base_model``; it is wrapped in a ``CorruptedGenerator``.
    epochs : int
        Maximum number of training epochs.
    graph : optional
        Graph whose nodes are used for training. Defaults to the notebook-level ``G``,
        which is what the function used before this parameter existed.
    node_type : str
        Node type whose nodes are fed to the generator (default ``"Account"``).
    learning_rate : float
        Learning rate of the Adam optimizer.
    patience : int
        Number of epochs without improvement in the training loss before stopping.

    Returns
    -------
    tuple
        ``(x_emb_in, x_emb_out, model)``: the input and output tensors of ``base_model``
        (for building an embedding model) and the trained Deep Graph Infomax Keras model.
    """
    if graph is None:
        graph = G  # fall back to the notebook-level graph

    t0 = time.time()
    corrupted_generator = CorruptedGenerator(generator)
    gen = corrupted_generator.flow(graph.nodes(node_type=node_type))
    infomax = DeepGraphInfomax(base_model, corrupted_generator)

    x_in, x_out = infomax.in_out_tensors()

    # Train DGI. `learning_rate` replaces the deprecated `lr` keyword of Adam.
    model = Model(inputs=x_in, outputs=x_out)
    model.compile(
        loss=tf.nn.sigmoid_cross_entropy_with_logits,
        optimizer=Adam(learning_rate=learning_rate),
    )
    es = EarlyStopping(monitor="loss", min_delta=0, patience=patience)
    history = model.fit(gen, epochs=epochs, verbose=1, callbacks=[es])
    sg.utils.plot_history(history)

    x_emb_in, x_emb_out = base_model.in_out_tensors()
    # Generators reporting two batch dimensions get the leading axis of the output squeezed out.
    if generator.num_batch_dims() == 2:
        x_emb_out = tf.squeeze(x_emb_out, axis=0)

    t1 = time.time()
    print(f'Time required: {t1-t0:.2f} s ({(t1-t0)/60:.1f} min)')

    return x_emb_in, x_emb_out, model

In [None]:
# Train the HinSAGE model with Deep Graph Infomax and keep the embedding tensors.
x_emb_in, x_emb_out, model = run_deep_graph_infomax(
    base_model=hinsage,
    generator=generator,
    epochs=epochs,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
 40/284 [===>..........................] - ETA: 1:53 - loss:

# 2. Use the model to predict the embedding of the training and cross validation set

In [None]:
# Take the training + cross-validation Account nodes, i.e. those with testingFlag == 0.
# testingFlag has already been dropped from v_sets, so it is read back from v_sample
# (aligned on node id) instead of relying on hard-coded row positions. The row order
# of v_sets['Account'] is preserved.
account_nodes = v_sets['Account']
account_flags = v_sample.loc[account_nodes.index, 'testingFlag']
assert len(account_flags) == len(account_nodes), "node ids are expected to be unique"

train_cv_set = account_nodes[account_flags.eq(0).to_numpy()]
train_cv_set

In [None]:
# Wrap the trained embedding tensors in a Keras model and embed the train + cv nodes.
emb_model = Model(inputs=x_emb_in, outputs=x_emb_out)
train_cv_flow = generator.flow(train_cv_set.index.values)
train_cv_embs = emb_model.predict(train_cv_flow)

In [None]:
# Spot-check one embedding vector (row 52).
train_cv_embs[52]

# 3. TSNE on train + cv set

In [None]:
# Project the embeddings to 2-D for plotting. t-SNE is stochastic, so the seed is fixed
# to make the figures below reproducible across runs.
train_cv_embs_2d = pd.DataFrame(
    TSNE(n_components=2, random_state=42).fit_transform(train_cv_embs),
    index=train_cv_set.index.values
)

In [None]:
# Colour values derived from ExtendedCaseGraphID

# node ids of the training + cv set, in embedding order
node_ids = train_cv_set.index.values.tolist()

# ExtendedCaseGraphID of those nodes
ext_targets_2 = v_sample.loc[list(map(int, node_ids)), 'ExtendedCaseGraphID']

# Each distinct non-null id gets a colour value of 10 * its rank (ranks start at 10);
# nulls are drawn with colour value 0.
label_map = {}
for position, label in enumerate(np.unique(ext_targets_2), start=10):
    if pd.notna(label):
        label_map[label] = position * 10
node_colours = [0 if pd.isna(target) else label_map[target] for target in ext_targets_2]

In [None]:
alpha = 0.7

# Scatter of the 2-D t-SNE projection using the current node_colours.
fig, ax = plt.subplots(figsize=(15, 15))
ax.scatter(x=train_cv_embs_2d[0], y=train_cv_embs_2d[1], c=node_colours, cmap="jet", alpha=alpha)
ax.set_aspect("equal")
ax.set_title("TSNE visualization of HinSAGE embeddings with Deep Graph Infomax - coloring on ExtendedCaseGraphID")
plt.show()

In [None]:
# Colour values derived from the node source degree

# source degree of the training + cv nodes
ext_targets_3 = v_sample.loc[list(map(int, node_ids)), 'source_degree']

# Each distinct non-null degree gets a colour value of 100 * its rank (ranks start at 10);
# nulls are drawn with colour value 0.
label_map = {}
for position, label in enumerate(np.unique(ext_targets_3), start=10):
    if pd.notna(label):
        label_map[label] = position * 100
node_colours = [0 if pd.isna(target) else label_map[target] for target in ext_targets_3]

In [None]:
alpha = 0.7

# Scatter of the 2-D t-SNE projection using the current node_colours.
fig, ax = plt.subplots(figsize=(15, 15))
ax.scatter(x=train_cv_embs_2d[0], y=train_cv_embs_2d[1], c=node_colours, cmap="jet", alpha=alpha)
ax.set_aspect("equal")
ax.set_title("TSNE visualization of HinSAGE embeddings with Deep Graph Infomax - coloring based on node source degree")
plt.show()

In [None]:
# Colour values derived from the Account CoreCaseGraphID

# core case id of the training + cv nodes
ext_targets_5 = v_sample.loc[list(map(int, node_ids)), 'CoreCaseGraphID']

# Each distinct non-null id gets a colour value of 100 * its rank (ranks start at 10);
# nulls are drawn with colour value 0.
label_map = {}
for position, label in enumerate(np.unique(ext_targets_5), start=10):
    if pd.notna(label):
        label_map[label] = position * 100
node_colours = [0 if pd.isna(target) else label_map[target] for target in ext_targets_5]

In [None]:
alpha = 0.7

# Scatter of the 2-D t-SNE projection using the current node_colours.
fig, ax = plt.subplots(figsize=(15, 15))
ax.scatter(x=train_cv_embs_2d[0], y=train_cv_embs_2d[1], c=node_colours, cmap="jet", alpha=alpha)
ax.set_aspect("equal")
ax.set_title("TSNE visualization of HinSAGE embeddings with Deep Graph Infomax - coloring based on CoreCaseGraphID")
plt.show()

# 4. Create training and cross validation sets

Note: I am not using fancy splitting methods since I want to keep track of the order of the nodes. 

In [None]:
# Simple ordered split: the first n_train rows are the training set, the rest the CV set.
# A single split point keeps the two sets disjoint and exhaustive even if the number of
# embedded nodes changes (with 13769 rows this is the original 10000 / 3769 split).
n_train = 10000
n_embs = train_cv_embs.shape[0]
assert len(ext_targets_2) == n_embs, "labels and embeddings must be aligned row by row"
assert 0 < n_train < n_embs, "split point must leave rows on both sides"

train_set = train_cv_embs[:n_train]
train_labels = ext_targets_2.values[:n_train]

cv_set = train_cv_embs[n_train:]
cv_labels = ext_targets_2.values[n_train:]

The train_set is nothing more than the embedding of the account nodes of the training set.

This means that row `k` of `train_set` corresponds to `train_cv_set.iloc[k]`, and row `k` of `cv_set`
corresponds to `train_cv_set.iloc[10000 + k]`, so the node ID behind any embedding can be looked up by position.

In [None]:
# Row 0 of train_cv_set: the node whose embedding should be train_set[0].
train_cv_set.iloc[0]

In [None]:
# Its embedding vector (first row of the training set).
train_set[0]

In [None]:
# Label (ExtendedCaseGraphID) stored for the first training row.
train_labels[0]

In [None]:
# Same check for the CV set: row 10000 of train_cv_set should be the node behind cv_set[0]
# (cv_set is the last 3769 rows, which start at offset 10000 when there are 13769 rows).
train_cv_set.iloc[10000]

In [None]:
# Embedding vector of the first CV row.
cv_set[0]

In [None]:
# Label (ExtendedCaseGraphID) stored for the first CV row.
cv_labels[0]

If this is correct, the extended ID of node 15020030225 must be 135.

And the extended ID of node 15020041132 must be 3449.

In [None]:
# Full record of node 15020030225, to compare its ExtendedCaseGraphID with the label above.
v_sample.loc[15020030225]

In [None]:
# Full record of node 15020041132, to compare its ExtendedCaseGraphID with the label above.
v_sample.loc[15020041132]

Confirmed. The labels are correct.

# 5. Train a classifier to predict ExtendedCaseGraphID

In [None]:
# Candidate classifiers, keyed by a short name used in the score tables.
# Currently disabled candidates: 'mlp1' MLPClassifier(),
# 'mlp2' MLPClassifier(hidden_layer_sizes=[100, 100]), 'gbc' GradientBoostingClassifier().
classifiers = dict(
    mnb=MultinomialNB(),
    gnb=GaussianNB(),
    svm1=SVC(kernel='linear'),
    svm2=SVC(kernel='rbf'),
    svm3=SVC(kernel='sigmoid'),
    ada=AdaBoostClassifier(),
    dtc=DecisionTreeClassifier(),
    rfc=RandomForestClassifier(),
    lr=LogisticRegression(multi_class="auto", solver="lbfgs", max_iter=200),
)

In [None]:
# Fit every candidate on the training embeddings and score it on the CV set.
f1_scores = {}
accs = {}

for clf_name, clf in classifiers.items():
    t0 = time.time()
    clf.fit(train_set, train_labels)
    cv_pred = clf.predict(cv_set)

    clf_f1 = f1_score(cv_labels, cv_pred, average='weighted')
    clf_acc = np.mean(cv_pred == cv_labels)
    f1_scores[clf_name], accs[clf_name] = clf_f1, clf_acc

    t1 = time.time()
    print(f"Classifier: {clf_name} - f1: {clf_f1:.3f} - acc: {clf_acc:.3f} - time: {t1-t0:.0f} s")

# First optimizations

The classes are extremely imbalanced. An easy technique could be to add class weight directly to the model

In [None]:
# It looks like a gradient boosting classifier works well

In [None]:
from sklearn.utils.class_weight import compute_sample_weight

#cw = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
#sample_weights = compute_sample_weight('balanced', y=train_labels)


In [None]:
# Down-weight the dominant class (label 0) so it does not swamp the other classes.
# The previous version set index 0, which changed the weight of the first sample only.
# NOTE(review): confirm that down-weighting class 0 is the intent of the 0.09 weight.
# The length follows the training labels instead of a hard-coded 10000.
sample_weights = np.ones(len(train_labels))
sample_weights[train_labels == 0] = 0.09

In [None]:
# Weighted decision tree (an lbfgs LogisticRegression was tried here before).
classifier = DecisionTreeClassifier()

classifier.fit(train_set, train_labels, sample_weight=sample_weights)

In [None]:
cv_pred = classifier.predict(cv_set)
# Weighted F1, as in the classifier comparison earlier in the notebook. Micro-averaged F1
# equals accuracy for single-label multiclass problems, so it added no information.
f1_avg = f1_score(cv_labels, cv_pred, average='weighted')
acc = (cv_pred == cv_labels).mean()

print(f"f1: {f1_avg:.3f} - acc: {acc:.3f}")

In [None]:
# Cross-tabulate true vs. predicted labels, with row/column totals.
# NOTE: this name shadows sklearn.metrics.confusion_matrix imported at the top of the
# notebook; later cells rely on the DataFrame stored under this name.
confusion_matrix = pd.crosstab(
    index=cv_labels,
    columns=cv_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

# Understanding why it performs so badly

Clearly, there are many problems.

One of the main ones is the distribution of the labels. The data is severely imbalanced. It makes no sense to train a logistic regression classifier on data where one class is 120 times the size of the second-biggest class.

Ways to overcome data imbalance for classification:

- Over- and under-sampling techniques, like SMOTE and ADASYN
- XGBoost with tuned parameters for imbalanced classification (aka weight classes differently)
- Ensembles: basically parallelizing classifiers that each can manage a small imbalance instead of having a strong imbalance on one classifier.

Another problem is the quality of the embeddings: they are just bad. Some embedded vectors have really obvious forms like all 0's and only one dimension with non-zero value.

And lastly: feature quality. I can't blame the embedding model for producing very bad embeddings when the graph it uses has basically one feature per node type. For instance, Account nodes only have the Revenue Size Flag and the CoreCaseGraphID. That is not enough to produce good results.

Let's take a look at the labels and predictions:

In [None]:
# Frequency of each true label (ExtendedCaseGraphID) in the CV set.
pd.DataFrame(cv_labels).value_counts()

In [None]:
# Frequency of each predicted label on the CV set.
pd.DataFrame(cv_pred).value_counts()

In [None]:
# Display the true-vs-predicted cross-tabulation built earlier with pd.crosstab.
confusion_matrix

In [None]:
# Sparse confusion matrix visualization: plt.spy draws a marker for every cell whose
# absolute value exceeds `precision`, showing where the non-zero counts fall.

sns.set(rc={'figure.figsize':(6, 10)})
plt.spy(confusion_matrix, precision = 0.1, markersize = 2)