# IBM AML Dataset (HI-Small transactions)

In [1]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
from collections import defaultdict
from datetime import datetime
import re

In [2]:
# Location of the IBM AML "HI-Small" transactions file, relative to the notebook's working directory
TRANSACTIONS_CSV = "./modularity_aware_gae/data/IBM_AML/HI-Small_Trans.csv"
# Number of leading rows to read (None makes pandas read the whole file)
N_ROWS = 100

# Load only the first N_ROWS rows of the CSV file into a pandas dataframe
df = pd.read_csv(TRANSACTIONS_CSV, nrows=N_ROWS)

# Check for null values: number of missing entries in each column
null_values = df.isnull().sum()
print(null_values)

Timestamp             0
From Bank             0
Account               0
To Bank               0
Account.1             0
Amount Received       0
Receiving Currency    0
Amount Paid           0
Payment Currency      0
Payment Format        0
Is Laundering         0
dtype: int64


In [3]:
# Create a reproducible mapping from account IDs to integers.
# Series.unique() returns values in order of first appearance, so the same input rows
# always give the same node indices. Iterating a Python set instead is not guaranteed to
# be stable across interpreter sessions (string hashing is randomized), which would change
# the edge list written below from one run to the next.
all_accounts = pd.concat([df['Account'], df['Account.1']], ignore_index=True).unique().tolist()
account_to_int = {account: idx for idx, account in enumerate(all_accounts)}
int_to_account = {idx: account for account, idx in account_to_int.items()}

# Work on a copy so `df` keeps the original account IDs
df2 = df.copy()
df2['Account'] = df2['Account'].map(account_to_int)
df2['Account.1'] = df2['Account.1'].map(account_to_int)

# Create a new undirected graph with integer node labels.
# nx.Graph stores at most one edge per pair of nodes, so repeated transactions between
# the same two accounts collapse into a single edge.
G2 = nx.from_pandas_edgelist(df2, source='Account', target='Account.1', edge_attr='Amount Paid', create_using=nx.Graph())

# Save the graph G2 to a file without edge attributes (Amount Paid).
# NOTE(review): the label file written later in this notebook is ordered by these integer
# IDs; confirm that train.py orders the nodes it reads from this edge list the same way.
nx.write_edgelist(G2, "./modularity_aware_gae/data/IBM_AML/IBM.edgelist", data=False)

In [4]:
# Build the label set from the IBM transactions.
# An account is labelled True when at least one transaction it takes part in
# (as 'Account' or as 'Account.1') has Is Laundering == 1, and False otherwise.
# Position i of the list belongs to the account whose index in account_to_int is i.
# The list is then written to a file, one label per line.

# Select the laundering transactions once and reuse the selection for both columns
laundering_rows = df2[df2['Is Laundering'] == 1]

# Integer IDs of every account appearing on either side of a laundering transaction
laundering_accounts = set(laundering_rows['Account']) | set(laundering_rows['Account.1'])

# One boolean per account index, in index order
labels = [idx in laundering_accounts for idx in range(len(account_to_int))]

# Save the labels to a file ("True" / "False", one per line)
with open("./modularity_aware_gae/data/IBM_AML/IBM-labels.csv", 'w') as f:
    f.writelines(f"{flag}\n" for flag in labels)



In [6]:
# Run the training script on the edge list and label file written by the earlier cells.
# The script is launched from inside the package directory, so --dataset_path and
# --labelset_path are given relative to that directory (../data/IBM_AML/...).
%pwd
# Enter the directory from which train.py is run
%cd modularity_aware_gae/modularity_aware_gae
# Train one gcn_vae model for 200 iterations without node features and evaluate it on task_2
!python train.py --dataset_path=../data/IBM_AML/IBM.edgelist --labelset_path=../data/IBM_AML/IBM-labels.csv --features=False --task=task_2 --model=gcn_vae --iterations=200 --learning_rate=0.01 --hidden=32 --dimension=16 --beta=0.75 --lamb=0.5 --gamma=2 --s_reg=10 --fastgae=False --nb_run=1
# Go back up the two levels entered above, so the relative ./modularity_aware_gae/... paths
# used by the earlier cells resolve again.
# NOTE(review): if this cell is interrupted before this line runs, the kernel stays in the
# package directory and those relative paths will no longer resolve.
%cd ../..

c:\Users\Moher\Personal\PhD\Projects\AML GNN\modularity_aware_gae\modularity_aware_gae

 
 
 
[MODULARITY-AWARE GRAPH AUTOENCODERS]
 
 
 

EXPERIMENTAL SETTING 

- Graph dataset: None
- Mode name: gcn_vae
- Number of models to train: 1
- Number of training iterations for each model: 200
- Learning rate: 0.01
- Dropout rate: 0.0
- Use of node features in the input layer: False
- Dimension of the GCN hidden layer: 32
- Dimension of the output layer: 16
- lambda: 0.5
- beta: 0.75
- gamma: 2.0
- s: 10
- FastGAE: no 

Final embedding vectors will be evaluated on:
- Task 2, i.e., joint community detection and link prediction

 
 
 

Using custom dataset from: ../data/IBM_AML/IBM.edgelist
Using custom labels from: ../data/IBM_AML/IBM-labels.csv
LOADING DATA

Loading custom dataset
- Number of nodes: 116
- Number of communities: 1
- Use of node features: False
Done! 
 
 
 

EXPERIMENTS ON MODEL 1 / 1 

STEP 1/3 - PREPROCESSING STEPS 

Masking some edges from the training graph, for link predic

2025-03-05 22:33:31.395583: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'cudart64_100.dll'; dlerror: cudart64_100.dll not found
2025-03-05 22:33:31.395857: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2025-03-05 22:33:33.694183: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library nvcuda.dll
2025-03-05 22:33:33.720031: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties: 
name: NVIDIA GeForce RTX 4080 major: 8 minor: 9 memoryClockRate(GHz): 2.505
pciBusID: 0000:01:00.0
2025-03-05 22:33:33.721169: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'cudart64_100.dll'; dlerror: cudart64_100.dll not found
2025-03-05 22:33:33.722109: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'cubl