# Trying to implement MAGNN on financial data database

paper repo: https://github.com/cynricfu/MAGNN/

specifc notebook: https://github.com/cynricfu/MAGNN/blob/master/preprocess_IMDB.ipynb

In [48]:
import os
import sys
import pathlib

import pandas as pd
import numpy as np
import networkx as nx

import scipy
import scipy.sparse
import scipy.io

from collections import defaultdict, Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [293]:
save_prefix = '../data/preprocessed/MAGNN_test/'
num_node_types = 5

In [42]:
# load raw data
sys.path.append('../data')
from dataset_pointers import graph_edges, graph_nodes

edges = pd.read_csv(graph_edges, low_memory=False, index_col='edge_id')
nodes = pd.read_csv(graph_nodes, low_memory=False, index_col='node_id')

In [44]:
# take a small sample of the dataset
# here I'm taking 10000 edges and the nodes associated with them

edges_sample = edges.sample(n=10000)
nodes_list = list(pd.Categorical(list(edges_sample.to_id)+list(edges_sample.from_id)).categories)
nodes_sample = nodes[nodes.index.isin(nodes_list)]

In [45]:
# set missing core/extended case to 0

nodes_sample.CoreCaseGraphID = nodes_sample.CoreCaseGraphID.fillna(0)
nodes_sample.ExtendedCaseGraphID = nodes_sample.ExtendedCaseGraphID.fillna(0) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [209]:
# Create separate dataframes for each node and edge labels

v_sets = defaultdict()
for v_type in list(pd.Categorical(nodes_sample.Label).categories):
    v_sets[v_type] = nodes_sample[nodes_sample.Label == v_type]
    v_sets[v_type] = v_sets[v_type].drop(['Label', 'testingFlag']+list(v_sets[v_type].columns[v_sets[v_type].isnull().all()]), axis=1)
    
e_sets = defaultdict()
for e_type in list(pd.Categorical(edges_sample.Label).categories):
    e_sets[e_type] = edges_sample[edges_sample.Label == e_type]
    e_sets[e_type] = e_sets[e_type].drop(['Label']+list(e_sets[e_type].columns[e_sets[e_type].isnull().all()]), axis=1)
    e_sets[e_type] = e_sets[e_type].rename(columns={'from_id':'source', 'to_id':'target'})


In [210]:
# convert string data in numerical data where possible

#1. "logical" conversion
#Revenue Size Flag: low, mid_low, medium, mid_high, high -> 1,2,3,4,5
conversion = {'low':1, 'mid_low':2, 'medium':3, 'mid_high':4, 'high':5}
for i in v_sets:
    if 'Revenue Size Flag' in list(v_sets[i].columns):
        v_sets[i]['Revenue Size Flag']=v_sets[i]['Revenue Size Flag'].map(conversion)
        
#Income Size Flag: low, medium, high -> 1,2,3
conversion = {'low':1, 'medium':2, 'high':3}
for i in v_sets:
    if 'Income Size Flag' in list(v_sets[i].columns):
        v_sets[i]['Income Size Flag']=v_sets[i]['Income Size Flag'].map(conversion)
        
#Similarity Strength: weak, medium, strong -> 1,2,3
conversion = {'weak':1, 'medium':2, 'strong':3}
for i in e_sets:
    if 'Similarity Strength' in list(e_sets[i].columns):
        e_sets[i]['Similarity Strength']=e_sets[i]['Similarity Strength'].map(conversion)

#Amount Flag: small, medium, large -> 10,100,1000 (just to change the logic, the final choice is up to you) -> treated as weights
conversion = {'small':10, 'medium':100, 'large':1000}
for i in e_sets:
    if 'Amount Flag' in list(e_sets[i].columns):
        e_sets[i]['Amount Flag']=e_sets[i]['Amount Flag'].map(conversion)
        e_sets[i] = e_sets[i].rename(columns={'Amount Flag':'weight'})

#2. one-hot encoding
#Person or Organisation: create 2 bool columns, one for Person, one for Organisation (could have just created a single boolean column: 0->Person, 1->Organization)
for i in v_sets:
    if 'Person or Organisation' in list(v_sets[i].columns):
        v_sets[i] = pd.get_dummies(v_sets[i], columns=['Person or Organisation'])

#3. more complex transformations (i.e. from strings to numbers) -> the limit is your imagination!
#Vertices: Account ID String, Address, Name
#Fast solution: dropping non numerical attributes, but we're loosing lot of information
for i in v_sets:
    if 'Account ID String' in list(v_sets[i].columns):
        v_sets[i] = v_sets[i].drop('Account ID String', axis=1)
    if 'Address' in list(v_sets[i].columns):
        v_sets[i] = v_sets[i].drop('Address', axis=1)
    if 'Name' in list(v_sets[i].columns):
        v_sets[i] = v_sets[i].drop('Name', axis=1)

In [268]:
# Quick Meta-path exploration
# Note: I'm taking these from a sample of nodes but it's big enough to give me the most traversed meta paths
# Note: I'm taking only first-order metapaths but it might be interesting to use more deep ones

In [273]:
# source of nodes 'money transfer'
mt = e_sets['money transfer'].source.values
nodes.loc[mt].Label.value_counts()

Account           4328
Derived Entity    1125
Name: Label, dtype: int64

In [274]:
# source of nodes 'has account'
mt = e_sets['has account'].source.values
nodes.loc[mt].Label.value_counts()

Customer    2910
Name: Label, dtype: int64

In [275]:
# source of nodes 'has address'
mt = e_sets['has address'].source.values
nodes.loc[mt].Label.value_counts()

Customer    435
Name: Label, dtype: int64

In [276]:
# source of nodes 'is similar'
mt = e_sets['is similar'].source.values
nodes.loc[mt].Label.value_counts()

Derived Entity    1081
Customer           121
Name: Label, dtype: int64

In [269]:
# destination of nodes 'money transfer'
mt = e_sets['money transfer'].target.values
nodes.loc[mt].Label.value_counts()

Account           4345
Derived Entity    1108
Name: Label, dtype: int64

In [270]:
# destination of nodes 'has account'
mt = e_sets['has account'].target.values
nodes.loc[mt].Label.value_counts()

Account    2910
Name: Label, dtype: int64

In [271]:
# destination of nodes 'has address'
mt = e_sets['has address'].target.values
nodes.loc[mt].Label.value_counts()

Address    435
Name: Label, dtype: int64

In [272]:
# destination of nodes 'is similar'
mt = e_sets['is similar'].target.values
nodes.loc[mt].Label.value_counts()

External Entity    1202
Name: Label, dtype: int64

In [277]:
# Metapaths:

# Account -(money transfer)-> Account
# Account -(money transfer)-> Derived Entity
# Derived Entity -(money transfer)-> Account
# Derived Entity -(money transfer)-> Derived Entity

# Customer -(has account)-> Account

# Customer -(has address)-> Address

# Customer -(is simialr)-> External Entity
# Derived Entity -(is simialr)-> External Entity

In [278]:
# Metapaths by source node:

# Account -(money transfer)-> Account
# Account -(money transfer)-> Derived Entity
# Customer -(has account)-> Account
# Customer -(has address)-> Address
# Customer -(is simialr)-> External Entity
# Derived Entity -(money transfer)-> Account
# Derived Entity -(money transfer)-> Derived Entity
# Derived Entity -(is simialr)-> External Entity

# Other probable metapaths?
# Account -(money transfer)-> Account -(money transfer)-> Account
# Account -(money transfer)-> Derived Entity -(money transfer)-> Account

# with numerical categories:
# 0 = account, 1 = customer, 2 = derived entity, 3 = external entity, 4 = address

# 0 -(money transfer)-> 0
# 0 -(money transfer)-> 2
# 1 -(has account)-> 0
# 1 -(has address)-> 4
# 1 -(is simialr)-> 3
# 2 -(money transfer)-> 0
# 2 -(money transfer)-> 2
# 2 -(is simialr)-> 3

# Final metapaths:

# 0 0
# 0 2
# 0 0 0
# 0 2 0

# 1 0
# 1 4
# 1 3

# 2 0
# 2 2
# 2 3


In [237]:
e_sets['has account']

Unnamed: 0_level_0,source,target
edge_id,Unnamed: 1_level_1,Unnamed: 2_level_1
500001112232,1001041973,15020061082
5000012903,10010725,1502012903
50000111812,100102931,15020111812
500111616882,1001044580,15020067178
50000172224,1001025590,15020033281
...,...,...
500111676788,1001021992,15020041981
50000186044,1001029368,15020034895
500111636214,1001014428,15020026815
5000013753,10010938,1502013754


In [212]:
e_sets['has address']

Unnamed: 0_level_0,source,target
edge_id,Unnamed: 1_level_1,Unnamed: 2_level_1
550111737091,1001035991,250119394
550111722402,1001010259,250119474
550111738390,1001032300,2501112204
550111738461,1001044396,2501126329
550111753798,1001029281,2501124442
...,...,...
550111743542,1001031460,250111190
550111739831,100103940,2501115763
550111744874,1001037211,2501110725
550111750815,100104940,25011962


In [213]:
e_sets['is similar']

Unnamed: 0_level_0,source,target,Similarity Strength
edge_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
650111846035,20030021454,3001142695,3
650111796671,20030021739,3001132338,2
650111820626,20030014321,3001132654,2
650111829863,2003002346,3001135010,1
650111780496,2003008088,3001181444,1
...,...,...,...
650111858657,20030021779,3001150001,1
650111801549,2003003055,3001174987,2
650111823824,20030060,3001139913,2
650111849652,20030021484,3001160847,1


In [214]:
e_sets['money transfer']

Unnamed: 0_level_0,source,target,weight
edge_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
450111511478,20030021674,1502005275,1000
350111287294,15020046291,15020026316,1000
400111398836,150200353,20030012280,10
450111532716,2003006369,15020026774,1000
350111137688,15020060969,1502002892,1000
...,...,...,...
350111229027,15020080656,15020045585,100
350111333343,15020046368,15020086815,10
350111334655,15020055869,15020086257,1000
450111561183,20030016606,15020080716,10


In [215]:
v_sets['Account']

Unnamed: 0_level_0,Revenue Size Flag,CoreCaseGraphID,ExtendedCaseGraphID
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1502000,4,0.0,0.0
1502007,2,0.0,0.0
15020011,4,0.0,0.0
15020013,2,1566.0,0.0
15020014,1,0.0,0.0
...,...,...,...
15020148639,3,0.0,0.0
15020148652,2,0.0,0.0
15020148755,4,0.0,0.0
15020148805,5,0.0,0.0


In [216]:
v_sets['Customer']

Unnamed: 0_level_0,Income Size Flag,CoreCaseGraphID,ExtendedCaseGraphID,Person or Organisation_Organisation,Person or Organisation_Person
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100109666,2,0.0,0.0,1,0
100109678,1,0.0,1025.0,1,0
100109681,1,0.0,0.0,0,1
100109707,2,0.0,0.0,0,1
100109723,2,0.0,0.0,0,1
...,...,...,...,...,...
1001028629,3,0.0,1364.0,1,0
1001028653,2,764.0,0.0,0,1
1001028662,2,0.0,0.0,0,1
1001028719,3,0.0,0.0,0,1


In [217]:
v_sets['Derived Entity']

Unnamed: 0_level_0,CoreCaseGraphID,ExtendedCaseGraphID,Person or Organisation_Organisation,Person or Organisation_Person
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
200300142,0.0,0.0,1,0
200300163,0.0,0.0,1,0
200300167,3635.0,3635.0,0,1
200300170,0.0,0.0,1,0
200300196,0.0,1761.0,1,0
...,...,...,...,...
20030017484,0.0,0.0,0,1
20030017489,0.0,0.0,0,1
20030017493,0.0,0.0,0,1
20030017541,0.0,117.0,1,0


In [218]:
v_sets['External Entity']

Unnamed: 0_level_0,CoreCaseGraphID,ExtendedCaseGraphID,Person or Organisation_Organisation,Person or Organisation_Person
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3001130065,1940.0,0.0,1,0
3001130096,0.0,0.0,0,1
3001130099,0.0,1668.0,0,1
3001130144,0.0,2896.0,1,0
3001130158,0.0,0.0,0,1
...,...,...,...,...
3001177251,0.0,0.0,0,1
3001177467,0.0,0.0,0,1
3001177468,0.0,0.0,1,0
3001177515,0.0,0.0,1,0


In [219]:
v_sets['Address']

Unnamed: 0_level_0,CoreCaseGraphID,ExtendedCaseGraphID
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1
250117255,0.0,0.0
250117536,0.0,0.0
250117599,0.0,0.0
250117724,0.0,0.0
250117968,0.0,0.0
...,...,...
250118471,0.0,0.0
250118477,0.0,0.0
250118560,0.0,0.0
250118625,0.0,0.0


In [220]:
# build the adjacency matrix for the graph consisting of 
# Accounts, Customers, Derived entities, Extenral entities and Addresses
# 0 = account, 1 = customer, 2 = derived entity, 3 = external entity, 4 = address

dim = 0
for set in v_sets:
    dim += len(v_sets[set])

type_mask = np.zeros((dim), dtype=int)
it = 0
for i in range(nodes_sample.Label.value_counts().count()-1):
    it += nodes_sample.Label.value_counts()[i]
    type_mask[it:it+nodes_sample.Label.value_counts()[i+1]] = i+1

In [226]:
# ????

v_sets['Account'].iterrows()

<generator object DataFrame.iterrows at 0x14026bc10>

In [295]:
adjM = np.zeros((dim, dim), dtype=int)

''' TODO
for movie_idx, row in v_sets['Account'].iterrows():
    if row['director_name'] in directors:
        director_idx = directors.index(row['director_name'])
        adjM[movie_idx, len(movies) + director_idx] = 1
        adjM[len(movies) + director_idx, movie_idx] = 1
    if row['actor_1_name'] in actors:
        actor_idx = actors.index(row['actor_1_name'])
        adjM[movie_idx, len(movies) + len(directors) + actor_idx] = 1
        adjM[len(movies) + len(directors) + actor_idx, movie_idx] = 1
    if row['actor_2_name'] in actors:
        actor_idx = actors.index(row['actor_2_name'])
        adjM[movie_idx, len(movies) + len(directors) + actor_idx] = 1
        adjM[len(movies) + len(directors) + actor_idx, movie_idx] = 1
    if row['actor_3_name'] in actors:
        actor_idx = actors.index(row['actor_3_name'])
        adjM[movie_idx, len(movies) + len(directors) + actor_idx] = 1
        adjM[len(movies) + len(directors) + actor_idx, movie_idx] = 1
'''

adjM

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [286]:
# Meta paths: using category numbers previously set:

# 0 0
# 0 2
# 0 0 0
# 0 2 0

# 1 0
# 1 4
# 1 3

# 2 0
# 2 2
# 2 3

expected_metapaths = [
    [(0, 0), (0, 2), (0, 0, 0), (0, 2, 0)],
    [(1, 0), (1, 4), (1, 3)],
    [(2, 0), (2, 2), (2, 3)],
    [],
    [],
]

In [281]:
def get_metapath_neighbor_pairs(M, type_mask, expected_metapaths):
    """
    :param M: the raw adjacency matrix
    :param type_mask: an array of types of all node
    :param expected_metapaths: a list of expected metapaths
    :return: a list of python dictionaries, consisting of metapath-based neighbor pairs and intermediate paths
    """
    outs = []
    for metapath in expected_metapaths:
        # consider only the edges relevant to the expected metapath
        mask = np.zeros(M.shape, dtype=bool)
        for i in range((len(metapath) - 1) // 2):
            temp = np.zeros(M.shape, dtype=bool)
            temp[np.ix_(type_mask == metapath[i], type_mask == metapath[i + 1])] = True
            temp[np.ix_(type_mask == metapath[i + 1], type_mask == metapath[i])] = True
            mask = np.logical_or(mask, temp)
        partial_g_nx = nx.from_numpy_matrix((M * mask).astype(int))

        # only need to consider the former half of the metapath
        # e.g., we only need to consider 0-1-2 for the metapath 0-1-2-1-0
        metapath_to_target = {}
        for source in (type_mask == metapath[0]).nonzero()[0]:
            for target in (type_mask == metapath[(len(metapath) - 1) // 2]).nonzero()[0]:
                # check if there is a possible valid path from source to target node
                has_path = False
                single_source_paths = nx.single_source_shortest_path(
                    partial_g_nx, source, cutoff=(len(metapath) + 1) // 2 - 1)
                if target in single_source_paths:
                    has_path = True

                #if nx.has_path(partial_g_nx, source, target):
                if has_path:
                    shortests = [p for p in nx.all_shortest_paths(partial_g_nx, source, target) if
                                 len(p) == (len(metapath) + 1) // 2]
                    if len(shortests) > 0:
                        metapath_to_target[target] = metapath_to_target.get(target, []) + shortests
        metapath_neighbor_paris = {}
        for key, value in metapath_to_target.items():
            for p1 in value:
                for p2 in value:
                    metapath_neighbor_paris[(p1[0], p2[0])] = metapath_neighbor_paris.get((p1[0], p2[0]), []) + [
                        p1 + p2[-2::-1]]
        outs.append(metapath_neighbor_paris)
    return outs

In [287]:
def get_networkx_graph(neighbor_pairs, type_mask, ctr_ntype):
    indices = np.where(type_mask == ctr_ntype)[0]
    idx_mapping = {}
    for i, idx in enumerate(indices):
        idx_mapping[idx] = i
    G_list = []
    for metapaths in neighbor_pairs:
        edge_count = 0
        sorted_metapaths = sorted(metapaths.items())
        G = nx.MultiDiGraph()
        G.add_nodes_from(range(len(indices)))
        for (src, dst), paths in sorted_metapaths:
            for _ in range(len(paths)):
                G.add_edge(idx_mapping[src], idx_mapping[dst])
                edge_count += 1
        G_list.append(G)
    return G_list

In [294]:
for i in range(num_node_types):
    pathlib.Path(save_prefix + '{}'.format(i)).mkdir(parents=True, exist_ok=True)

# too computationally slow
#for i in range(num_node_types):
    #neighbor_pairs = get_metapath_neighbor_pairs(adjM, type_mask, expected_metapaths[i])
    #G_list = get_networkx_graph(neighbor_pairs, type_mask, i)



In [None]:
# COMMPONENTS

# 1. adjacency matrix of all node types
# 2. meta-path based networks (adjacency lists)
# 3. all nodes features
# 4. all nodes type labels
# 5. movie genre labels
# 6. movie train/validation/test split

In [297]:
# STOP:

