In [2]:
!pip install torch lightning numpy kaggle wandb
!pip install polars  -U
!pip install -U torch-geometric
#!pip install git+https://github.com/rusty1s/pytorch_geometric.git

Collecting lightning
  Downloading lightning-2.1.3-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting wandb
  Downloading wandb-0.16.2-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities<2.0,>=0.8.0 (from lightning)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.3.0.post0-py3-none-any.whl (840 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.2/840.2 kB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.1.3-py3-none-any.whl (777 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m777.7/777.7 kB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
Collecting GitP

In [3]:
from google.colab import files

# Carica il file kaggle.json
files.upload()


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"alexxxyy47","key":"1f2860ea591d24fc6810f02a5403dd91"}'}

In [4]:
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [5]:
!kaggle datasets download -d ealaxi/paysim1
!unzip paysim1.zip
!rm paysim1.zip
!mkdir models

Downloading paysim1.zip to /content
100% 178M/178M [00:02<00:00, 99.7MB/s]
100% 178M/178M [00:02<00:00, 72.6MB/s]
Archive:  paysim1.zip
  inflating: PS_20174392719_1491204439457_log.csv  


In [6]:
import pandas as pd, sys, plotly.graph_objects as go, plotly.express as px, numpy as np, torch, random as rnd, torch.nn as nn, lightning as l, wandb as wndb
from torch.utils.data import Dataset, DataLoader
from sklearn.utils import shuffle
from torch_geometric import seed_everything
import polars as pl
from torch_geometric.data import Data
import pdb
from torch_geometric.nn import GCNConv
import torchmetrics
from torch.nn import Linear, ReLU
from torch_geometric.nn import Sequential  as GSequential, GCNConv, GATConv
from torchmetrics.classification import BinaryAccuracy, BinaryF1Score, BinaryPrecision, BinaryRecall

In [None]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [7]:
# PARAMETERS

DEVICE = "cuda"
SEED = 42

rnd.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
#torch.backends.cudnn.deterministic = False
# torch.backends.cudnn.deterministic = True
seed_everything(SEED)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
ACCELERATOR =  "gpu" if torch.cuda.is_available() else "cpu"
POS_SIZE = 150
NEG_SIZE = 1200



In [8]:
# UTILS FUNCTIONS

def load_dataframe( dataset_file : str):
    return pl.read_csv(dataset_file)


def find_null_or_empty_records( dataframe: pd.DataFrame):
    n = len(dataframe)
    for index, row in dataframe.iterrows():
        print_progress_bar(index/n)
        # Controlla se ci sono valori nulli o vuoti nel record
        if row.isnull().any() or any(map(lambda x: x == '', row)):
            # Stampa il record
            print(f"Record con valori nulli o vuoti:\n{row}\n")

def print_progress_bar(percentuale, lunghezza_barra=20):
    blocchi_compilati = int(lunghezza_barra * percentuale)
    barra = "[" + "=" * (blocchi_compilati - 1) + ">" + " " * (lunghezza_barra - blocchi_compilati) + "]"
    sys.stdout.write(f"\r{barra} {percentuale * 100:.2f}% completo")
    sys.stdout.flush()


def compute_kind_inconsistence(dataframe):
    return {"inconsistent orig balance": len(dataframe.query('abs(oldbalanceOrg - newbalanceOrig) != amount'))/len(dataframe),
            "inconsistent dest balance": len(dataframe.query('abs(oldbalanceDest - newbalanceDest) != amount'))/len(dataframe),
            "zero cash transaction": len(dataframe.query('amount == 0 '))/len(dataframe),
            "self-transaction": len(dataframe.query('nameOrig == nameDest'))/len(dataframe)
            }

def plot_histogram(to_plot):


    # Converti il dizionario in un array di valori
    values = list(to_plot.values())

    # Crea un istogramma
    fig = go.Figure(data=[go.Bar(x=list(to_plot.keys()), y=values)])

    # Mostra l'istogramma
    fig.show()



def plot_categories(dataframe):
    # Calcola la frequenza di ogni categoria nella colonna 'type'
    counts = dataframe['type'].value_counts().reset_index()

    # Rinomina le colonne
    counts.columns = ['type', 'count']

    counts['count'] = counts['count'] / counts['count'].sum()

    # Crea l'istogramma con Plotly Express
    fig = px.bar(counts, x='type', y='count', title='Istogramma delle categorie nella colonna "type"')

    # Mostra il plot
    fig.show()

def create_name_dict(df):
  df1 = df.select(pl.col("nameOrig").alias('name'))
  df2 = df.select(pl.col("nameDest").alias('name'))
  df = pl.concat([df1,df2])
  df = df.unique()
  names = list(df['name'])
  return dict(zip(names,list(range(len(names)))))


def divide_dataset(dataset_file,train_prc,val_prc):
  #breakpoint()
  dataframe = load_dataframe(dataset_file)
  transaction_types = {
      "CASH_IN": 0,
      "CASH_OUT": 1,
      "DEBIT": 2,
      "PAYMENT": 3,
      "TRANSFER": 4
  }

  dataframe = dataframe.with_columns(pl.col("type").replace(transaction_types).cast(pl.Int64).alias("type"),
                                     (pl.col('step')%24).alias('step'))

  id_df  = pl.DataFrame({'id': list(range(len(dataframe)))})

  dataframe = pl.concat([dataframe, id_df], how="horizontal")

  d_neg = dataframe.filter((pl.col('amount') != 0) & (pl.col('isFraud') == 0))
  neg_data_train = d_neg.sample(int(len(d_neg)*train_prc))
  d_neg = d_neg.filter(~pl.col('id').is_in(neg_data_train.select(pl.col('id'))))

  d_pos = dataframe.filter((pl.col('amount') != 0) & (pl.col('isFraud') == 1))
  pos_data_train = d_pos.sample(int(len(d_pos)*train_prc))
  d_pos = d_pos.filter(~pl.col('id').is_in(pos_data_train.select(pl.col('id'))))


  neg_data_val = d_neg.sample(int(len(d_neg)*val_prc))
  d_neg = d_neg.filter(~pl.col('id').is_in(neg_data_val.select(pl.col('id'))))

  pos_data_val = d_pos.sample(int(len(d_pos)*val_prc))
  d_pos = d_pos.filter(~pl.col('id').is_in(pos_data_val.select(pl.col('id'))))

  neg_data_train = neg_data_train.select(pl.exclude('id'))
  pos_data_train = pos_data_train.select(pl.exclude('id'))

  neg_data_val = neg_data_val.select(pl.exclude('id'))
  pos_data_val = pos_data_val.select(pl.exclude('id'))

  d_neg = d_neg.select(pl.exclude('id'))
  d_pos = d_pos.select(pl.exclude('id'))

  return (neg_data_train, pos_data_train), (neg_data_val, pos_data_val), (d_neg,d_pos )

def list_to_dataframe(data):
  rows = []
  for row in data:
    el = {
        'step': row[0],
        'type': int(row[1]),
        'amount': row[2] ,
        'nameOrig': row[3],
        'oldbalanceOrg': row[4],
        'newbalanceOrig': row[5],
        'nameDest': row[6],
        'oldbalanceDest': row[7],
        'newbalanceDest': row[8],
        'isFraud': row[9],
        'isFlaggedFraud': row[10]
     }
    rows.append(el)
  return pl.DataFrame(rows)



def collate( data ):
  if type(data) is list:
  #breakpoint()
    data = list_to_dataframe(data)


  name_d = create_name_dict(data)
  x = torch.tensor([[1] if y.startswith("M") else [0] for y in name_d.keys()], dtype=torch.float).to(DEVICE)
  data = data.with_columns(pl.col('nameOrig').replace(name_d).cast(pl.Int64).alias('nameOrig'), pl.col('nameDest').replace(name_d).cast(pl.Int64).alias('nameDest'))
  edges = data.select(pl.col('nameOrig','nameDest'))
  edge_index = torch.tensor(edges.to_numpy(), dtype=torch.int64).t().contiguous().to(DEVICE)
  y = torch.tensor(data.select(pl.col('isFraud')).to_numpy(), dtype=torch.float).to(DEVICE)
  #edge_attr =  torch.tensor(data.select(pl.col('amount')).to_numpy(), dtype=torch.float).to(self.device)
  edge_attr =  torch.tensor(data.select(pl.col('step','type','amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest')).to_numpy(), dtype=torch.float).to(DEVICE)
  data_graph = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
  return data, data_graph




In [None]:
dataframe = load_dataframe("PS_20174392719_1491204439457_log.csv")
dataframe = dataframe.cast({"isFraud": pl.Int8})

In [None]:
d = create_name_dict(dataframe)

In [None]:
len(d.keys())

9073900

In [None]:
divide_dataset("PS_20174392719_1491204439457_log.csv",0.7,0.1)

((shape: (4_448_084, 11)
  ┌──────┬──────┬───────────┬─────────────┬───┬──────────────┬──────────────┬─────────┬──────────────┐
  │ step ┆ type ┆ amount    ┆ nameOrig    ┆ … ┆ oldbalanceDe ┆ newbalanceDe ┆ isFraud ┆ isFlaggedFra │
  │ ---  ┆ ---  ┆ ---       ┆ ---         ┆   ┆ st           ┆ st           ┆ ---     ┆ ud           │
  │ i64  ┆ str  ┆ f64       ┆ str         ┆   ┆ ---          ┆ ---          ┆ i64     ┆ ---          │
  │      ┆      ┆           ┆             ┆   ┆ f64          ┆ f64          ┆         ┆ i64          │
  ╞══════╪══════╪═══════════╪═════════════╪═══╪══════════════╪══════════════╪═════════╪══════════════╡
  │ 16   ┆ 4    ┆ 2.7249e6  ┆ C111177078  ┆ … ┆ 0.0          ┆ 2.8515e6     ┆ 0       ┆ 0            │
  │ 15   ┆ 0    ┆ 122161.91 ┆ C348307229  ┆ … ┆ 2018844.6    ┆ 1.1433e6     ┆ 0       ┆ 0            │
  │ 16   ┆ 4    ┆ 115519.78 ┆ C106220047  ┆ … ┆ 0.0          ┆ 126625.6     ┆ 0       ┆ 0            │
  │ 20   ┆ 1    ┆ 268483.68 ┆ C1882402481 ┆ … ┆ 

In [None]:
dataframe.columns


['step',
 'type',
 'amount',
 'nameOrig',
 'oldbalanceOrg',
 'newbalanceOrig',
 'nameDest',
 'oldbalanceDest',
 'newbalanceDest',
 'isFraud',
 'isFlaggedFraud']

In [None]:
len(dataframe.filter(pl.col('amount') == 0))

16

In [None]:
#| (pl.col('nameDest').str.starts_with('M'))   (abs(pl.col('oldbalanceOrg') - pl.col('newbalanceOrig') )) == abs( pl.col('oldbalanceDest') - pl.col('newbalanceDest'))) |
print(len(dataframe.filter( (pl.col('nameDest').str.starts_with('M'))  )))
print(len(dataframe.filter( (pl.col('nameOrig').str.starts_with('M'))  )))
print(len(dataframe.filter( (pl.col('isFraud') == 1)  )))

2151495
0
8213


In [None]:
print(len(dataframe.filter( (pl.col('nameDest').str.starts_with('M'))  |   (abs(pl.col('oldbalanceOrg') - pl.col('newbalanceOrig') ) == abs( pl.col('oldbalanceDest') - pl.col('newbalanceDest')) )           )       ))

2393661


In [None]:
print(len(dataframe.filter( (pl.col('isFraud') == 1) & (~pl.col('nameDest').str.starts_with('M'))  &   (abs(pl.col('oldbalanceOrg') - pl.col('newbalanceOrig') ) != abs( pl.col('oldbalanceDest') - pl.col('newbalanceDest')) )           )       ))

6036


In [None]:
df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3] })

In [None]:
print(df.sample(1))
print(df.sample(1))

shape: (1, 2)
┌─────┬─────┐
│ a   ┆ b   │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 2   ┆ 2   │
└─────┴─────┘
shape: (1, 2)
┌─────┬─────┐
│ a   ┆ b   │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 2   ┆ 2   │
└─────┴─────┘


In [9]:
class FraudDetectionDataset(Dataset):

    def __init__(self,neg_data, pos_data,device):
      # mean = neg_data['oldbalanceOrg'].mean()
      # self.neg_data = neg_data.with_columns( (pl.col('oldbalanceOrg')/mean).alias('oldbalanceOrg'))
      # mean = neg_data['newbalanceOrig'].mean()
      # self.neg_data = self.neg_data.with_columns((pl.col('newbalanceOrig')/mean).alias('newbalanceOrig'))
      # mean = neg_data['oldbalanceDest'].mean()
      # self.neg_data =  self.neg_data.with_columns((pl.col('oldbalanceDest')/mean).alias('oldbalanceDest'))
      # mean = neg_data['newbalanceDest'].mean()
      # self.neg_data =  self.neg_data.with_columns((pl.col('newbalanceDest')/mean).alias('newbalanceDest'))
      # mean = pos_data['oldbalanceOrg'].mean()
      # self.pos_data = pos_data.with_columns( (pl.col('oldbalanceOrg')/mean).alias('oldbalanceOrg'))
      # mean = pos_data['newbalanceOrig'].mean()
      # self.pos_data = pos_data.with_columns((pl.col('newbalanceOrig')/mean).alias('newbalanceOrig'))
      # mean = pos_data['oldbalanceDest'].mean()
      # self.pos_data = pos_data.with_columns((pl.col('oldbalanceDest')/mean).alias('oldbalanceDest'))
      # mean = pos_data['newbalanceDest'].mean()
      # self.pos_data = pos_data.with_columns((pl.col('newbalanceDest')/mean).alias('newbalanceDest'))
      self.neg_data = neg_data
      self.pos_data = pos_data
      self.device = device



    def collate(self, data ):
      #breakpoint()
      data = list_to_dataframe(data)
      pos = self.pos_data.sample(self.pos_num)
      data = pl.concat([pos, data])
      name_d = create_name_dict(data)
      x = torch.tensor([[1] if y.startswith("M") else [0] for y in name_d.keys()], dtype=torch.float).to(self.device)
      data = data.with_columns(pl.col('nameOrig').replace(name_d).cast(pl.Int64).alias('nameOrig'), pl.col('nameDest').replace(name_d).cast(pl.Int64).alias('nameDest'))
      edges = data.select(pl.col('nameOrig','nameDest'))
      edge_index = torch.tensor(edges.to_numpy(), dtype=torch.int64).t().contiguous().to(self.device)
      y = torch.tensor(data.select(pl.col('isFraud')).to_numpy(), dtype=torch.float).to(self.device)
      #edge_attr =  torch.tensor(data.select(pl.col('amount')).to_numpy(), dtype=torch.float).to(self.device)
      edge_attr =  torch.tensor(data.select(pl.col('step','type','amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest')).to_numpy(), dtype=torch.float).to(self.device)
      data_graph = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
      return data, data_graph




    def __getitem__(self, index):
      return self.neg_data.row(index)

    def __len__(self):
      return len(self.neg_data)

    def get_dataloader(self, batch_size, pos_num):
      self.pos_num = pos_num
      return DataLoader(self, batch_size=batch_size, shuffle=True, collate_fn = self.collate)





In [10]:
class FraudDetectionDatasetUndersampling(Dataset):

    def __init__(self,neg_data, pos_data,device, neg_perc):
      neg_data = neg_data.sample(int(len(neg_data)*neg_perc))

      self.data = pl.concat([pos_data, neg_data])
      # mean = self.data['oldbalanceOrg'].mean()
      # self.data = self.data.with_columns( (pl.col('oldbalanceOrg')/mean).alias('oldbalanceOrg'))
      # mean = self.data['newbalanceOrig'].mean()
      # self.data = self.data.with_columns((pl.col('newbalanceOrig')/mean).alias('newbalanceOrig'))
      # mean = self.data['oldbalanceDest'].mean()
      # self.data = self.data.with_columns((pl.col('oldbalanceDest')/mean).alias('oldbalanceDest'))
      # mean = self.data['newbalanceDest'].mean()
      # self.data = self.data.with_columns((pl.col('newbalanceDest')/mean).alias('newbalanceDest') )
      self.device = device



    def collate(self, data ):
      if type(data) is list:
      #breakpoint()
        data = list_to_dataframe(data)


      name_d = create_name_dict(data)
      x = torch.tensor([[1] if y.startswith("M") else [0] for y in name_d.keys()], dtype=torch.float).to(self.device)
      data = data.with_columns(pl.col('nameOrig').replace(name_d).cast(pl.Int64).alias('nameOrig'), pl.col('nameDest').replace(name_d).cast(pl.Int64).alias('nameDest'))
      edges = data.select(pl.col('nameOrig','nameDest'))
      edge_index = torch.tensor(edges.to_numpy(), dtype=torch.int64).t().contiguous().to(self.device)
      y = torch.tensor(data.select(pl.col('isFraud')).to_numpy(), dtype=torch.float).to(self.device)
      #edge_attr =  torch.tensor(data.select(pl.col('amount')).to_numpy(), dtype=torch.float).to(self.device)
      edge_attr =  torch.tensor(data.select(pl.col('step','type','amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest')).to_numpy(), dtype=torch.float).to(self.device)
      data_graph = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
      return data, data_graph




    def __getitem__(self, index):
      return self.data.row(index)

    def __len__(self):
      return len(self.data)

    def get_dataloader(self, batch_size, pos_num):
      self.pos_num = pos_num
      return DataLoader(self, batch_size=batch_size, shuffle=True, collate_fn = self.collate)





In [11]:
def train(model, epochs, train_dataloader, val_dataloader, loss, optimizer, f1,model_name, scheduler=None):
  best_f1 = 0

  for epoch in range(epochs):
    # Addestramento
    model.train()
    train_loss_epoch = []
    i = 1
    for batch_inputs in train_dataloader:

        print_progress_bar(i/len(train_dataloader))
        i+=1
        outputs = model(batch_inputs)
        train_loss = loss(outputs, batch_inputs[1].y)
        train_loss_epoch.append(train_loss)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        if not scheduler is None:
          scheduler.step()

    val_loss, f1_score, acc, rec, prec = validate(model, val_dataloader,loss,f1)
    if f1_score > best_f1:
      best_f1 = f1_score

      torch.save(model.state_dict(), "models/"+ model_name + "_f1=" + str(float(best_f1)) + ".pth")
    wndb.log({"Training Loss": sum(train_loss_epoch)/len(train_loss_epoch), "f1": f1_score, "val loss": val_loss, "acc": acc, "rec": rec,"prec": prec})
    print(f'Epoch [{epoch+1}/{epochs}], Training Loss: {sum(train_loss_epoch)/len(train_loss_epoch)}, Validation Loss: {val_loss}, f1 score = {f1_score}')




def validate(model, dataloader, loss, f1):
  accuracy = BinaryAccuracy().to(DEVICE)
  precision = BinaryPrecision().to(DEVICE)
  recall = BinaryRecall().to(DEVICE)
  model.eval()
  with torch.no_grad():
    val_loss_out = []
    f1_out = []
    rec_out = []
    acc_out = []
    prec_out = []
    i=1
    for batch_inputs in dataloader:
      print_progress_bar(i/len(dataloader))
      i+=1
      val_outputs = model(batch_inputs)
      val_loss = loss(val_outputs, batch_inputs[1].y)
      val_f1 = f1(val_outputs, batch_inputs[1].y)
      val_acc = accuracy(val_outputs, batch_inputs[1].y)
      val_rec = recall(val_outputs, batch_inputs[1].y)
      val_prec = precision(val_outputs, batch_inputs[1].y)
      rec_out.append(val_rec)
      prec_out.append(val_prec)
      acc_out.append(val_acc)
      f1_out.append(val_f1)
      val_loss_out.append(val_loss)
  return sum(val_loss_out)/len(val_loss_out), sum(f1_out)/len(f1_out),sum(acc_out)/len(acc_out),sum(rec_out)/len(rec_out),sum(prec_out)/len(prec_out)


In [None]:
class ModuleCallback(l.Callback):

  def on_train_epoch_end(self, trainer, pl_module):

      epoch_mean = float(torch.stack(pl_module.train_loss).mean())
      print("training_epoch_mean loss = ", epoch_mean)
      wndb.log({"train_loss": epoch_mean})
      # free up the memory
      pl_module.train_loss.clear()

  def on_validation_epoch_end(self,trainer, pl_module):

    mean_loss = float(torch.stack(pl_module.val_loss).mean())
    mean_f1 = float(torch.stack(pl_module.f1_score).mean())
    mean_acc = float(torch.stack(pl_module.acc).mean())
    mean_prec = float(torch.stack(pl_module.prec).mean())
    mean_rec = float(torch.stack(pl_module.rec).mean())

    print("val_loss = ", mean_loss)
    print("f1 = ", mean_f1)
    print("acc = ", mean_acc)
    print("prec = ", mean_prec)
    print("rec = ", mean_rec)
    wndb.log({"val_loss": mean_loss, "f1": mean_f1, "acc": mean_acc, "prec": mean_prec, "rec": mean_rec })



In [None]:
class GraphNN(nn.Module):

  def __init__(self,in_size, out_size, h_size, deep,activation,device):
    super(GraphNN, self).__init__()
    self.activation = activation
    if deep == 1:
      self.layers = [GCNConv(in_size,out_size).to(device)]
    else:
      self.layers = [GCNConv(in_size,h_size).to(device)]
      for _ in range(deep-2):
        self.layers.append(GCNConv(h_size,h_size).to(device))
      self.layers.append(GCNConv(h_size,out_size).to(device))


  def forward(self,data):
    edge_index = data.edge_index
    edge_attr = data.edge_attr
    x = data.x
    for layer in self.layers[:-1]:
      x = self.activation(layer(x, edge_index, edge_attr))

    return self.layers[-1](x, edge_index, edge_attr)









In [None]:
class LinearNN(nn.Module):
  def __init__(self,in_size, out_size, h_size, deep,activation):
    super(LinearNN, self).__init__()
    if deep == 1:
      layers = [nn.Linear(in_size,out_size), activation]
    else:
      layers = [nn.Linear(in_size,h_size), activation]
      for _ in range(deep-2):
        layers.append(nn.Linear(h_size,h_size))
        layers.append(activation)
      layers.append(nn.Linear(h_size,out_size))
    self.linear = nn.Sequential(*layers)

  def forward(self,data):
    return self.linear(data)

In [None]:
class FraudDetectionModuleFixed(l.LightningModule):

  def __init__(self,g_in_size,g_h_size,g_out_size,in_size,h_size,out_size, lr, wd, device):
    super(FraudDetectionModule, self).__init__()
    self.device_used = device
    self.gnn1 = GCNConv(g_in_size,g_h_size).to(device)
    self.gnn2 = GCNConv(g_h_size,g_out_size).to(device)
    self.classifier = nn.Sequential(nn.Linear(in_size,h_size),nn.ReLU(),nn.Linear(h_size,out_size))
    self.sigmoid = nn.Sigmoid()
    self.relu = nn.ReLU()
    self.loss = nn.BCEWithLogitsLoss()
    self.accuracy = BinaryAccuracy()
    self.precision = BinaryPrecision()
    self.recall = BinaryRecall()
    self.f1 = BinaryF1Score()
    self.lr = lr
    self.wd = wd
    self.acc = []
    self.prec = []
    self.rec = []
    self.f1_score = []
    self.train_loss = []
    self.val_loss = []



  def forward(self,data):
    #breakpoint()

    edge_index = data[1].edge_index
    edge_attr = data[1].edge_attr
    x = data[1].x
    train_edges = data[0]


    train_features = torch.tensor(train_edges.select(pl.col('step','type','amount')).to_numpy(), dtype=torch.float ).to(self.device_used)

    x = self.relu(self.gnn1(x, edge_index, edge_attr))
    x = self.relu(self.gnn2(x, edge_index, edge_attr))

    from_nodes = torch.nan_to_num(x.squeeze()[edge_index[0,:].squeeze()])
    dest_nodes = torch.nan_to_num(x.squeeze()[edge_index[1,:].squeeze()])

    to_classify = torch.cat((from_nodes,dest_nodes,train_features), dim=1)

    out = self.classifier(to_classify)
    return self.sigmoid(out)

  def training_step(self, batch, batch_idx):

    z = self.forward(batch)
    y = batch[1].y

    loss = self.loss(z,y)
    self.train_loss.append(loss)
    self.log("train_loss", loss, prog_bar=True)
    return loss


  def validation_step(self, batch, batch_idx):
    with torch.no_grad():
      #breakpoint()

      #print("validation")

      z = self.forward(batch)

      #breakpoint()
      val_loss = self.loss(z,batch[1].y)
      acc = self.accuracy(z, batch[1].y)
      prec = self.precision(z, batch[1].y)
      rec = self.recall(z, batch[1].y)
      f1 = self.f1(z,batch[1].y)

      self.acc.append(acc)
      self.prec.append(prec)
      self.rec.append(rec)
      self.f1_score.append(f1)
      self.val_loss.append(val_loss)

      #wndb.log({"val_loss": val_loss,"f1-score":f1})
      self.log_dict({"val_loss": val_loss,"f1-score":f1}, prog_bar=True)


  def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.wd)
    return optimizer


In [None]:
class FraudDetectionModule(l.LightningModule):

  def __init__(self,gnn,linear, lr, wd, device):
    super(FraudDetectionModule, self).__init__()
    self.device_used = device
    self.gnn = gnn
    self.classifier = linear
    self.sigmoid = nn.Sigmoid()
    self.relu = nn.ReLU()
    self.loss = nn.BCEWithLogitsLoss()
    self.accuracy = BinaryAccuracy()
    self.precision = BinaryPrecision()
    self.recall = BinaryRecall()
    self.f1 = BinaryF1Score()
    self.lr = lr
    self.wd = wd
    self.acc = []
    self.prec = []
    self.rec = []
    self.f1_score = []
    self.train_loss = []
    self.val_loss = []



  def forward(self,data):
    #breakpoint()
    edge_index = data[1].edge_index

    train_edges = data[0]


    train_features = torch.tensor(train_edges.select(pl.col('step','type','amount')).to_numpy(), dtype=torch.float ).to(self.device_used)

    x = self.relu(self.gnn(data[1]))

    from_nodes = torch.nan_to_num(x.squeeze()[edge_index[0,:].squeeze()])
    dest_nodes = torch.nan_to_num(x.squeeze()[edge_index[1,:].squeeze()])

    to_classify = torch.cat((from_nodes,dest_nodes,train_features), dim=1)

    out = self.classifier(to_classify)
    return self.sigmoid(out)

  def training_step(self, batch, batch_idx):

    z = self.forward(batch)
    y = batch[1].y

    loss = self.loss(z,y)
    self.train_loss.append(loss)
    self.log("train_loss", loss, prog_bar=True)
    return loss


  def validation_step(self, batch, batch_idx):
    with torch.no_grad():
      #breakpoint()

      #print("validation")

      z = self.forward(batch)

      #breakpoint()
      val_loss = self.loss(z,batch[1].y)
      acc = self.accuracy(z, batch[1].y)
      prec = self.precision(z, batch[1].y)
      rec = self.recall(z, batch[1].y)
      f1 = self.f1(z,batch[1].y)

      self.acc.append(acc)
      self.prec.append(prec)
      self.rec.append(rec)
      self.f1_score.append(f1)
      self.val_loss.append(val_loss)

      #wndb.log({"val_loss": val_loss,"f1-score":f1})
      self.log_dict({"val_loss": val_loss,"f1-score":f1}, prog_bar=True)


  def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.wd)
    return optimizer


In [12]:
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
ACCELERATOR =  "gpu" if torch.cuda.is_available() else "cpu"
POS_SIZE = 200
NEG_SIZE = 1000

IN_GNN = 1
H_GNN = 64
OUT_GNN = 10
DEEP_GNN = 2
ACTIVATION_GNN = nn.ReLU()
IN_NN = 23
OUT_NN = 1
H_NN = 32
DEEP_NN = 2
ACTIVATION_NN = nn.ReLU()
LR = 1e-3
WD = 1e-5

In [None]:
class FraudDetectionModuleOldBasic(nn.Module):

  def __init__(self,gnn_in_size, gnn_out_size, linear_in_size, linear_out_size, device):
    super(FraudDetectionModuleOld, self).__init__()
    self.gnn = GCNConv(gnn_in_size,gnn_out_size)
    self.classifier = nn.Linear(linear_in_size, linear_out_size)
    self.sigmoid = nn.Sigmoid()
    self.relu = nn.ReLU()
    self.device = device

  def forward(self,data):
    #breakpoint()
    edge_index = data[1].edge_index
    edge_attr = data[1].edge_attr
    x = data[1].x
    train_edges = data[0]


    train_features = torch.tensor(train_edges.select(pl.col('step','type','amount')).to_numpy(), dtype=torch.float ).to(self.device)

    x = self.relu(self.gnn(x, edge_index, edge_attr))

    from_nodes = torch.nan_to_num(x.squeeze()[edge_index[0,:].squeeze()])
    dest_nodes = torch.nan_to_num(x.squeeze()[edge_index[1,:].squeeze()])

    to_classify = torch.cat((from_nodes,dest_nodes,train_features), dim=1)

    out = self.classifier(to_classify)
    return out










In [None]:
class FraudDetectionModuleOld(nn.Module):

  def __init__(self, device, dropout):
    super(FraudDetectionModuleOld, self).__init__()
    self.dropout = nn.Dropout(dropout)
    self.gnn = GCNConv(1,128)
    #self.gnn1 = GCNConv(128,128)
    self.gnn2 = GCNConv(128,64)
    self.linear =  nn.Linear(135, 128)
    self.linear2 = nn.Linear(128, 128)
    self.classifier = nn.Linear(128, 1)
    #self.sigmoid = nn.Sigmoid()
    self.relu = nn.ReLU()
    self.device = device


  def forward(self,data):
    #breakpoint()
    edge_index = data[1].edge_index
    edge_attr = data[1].edge_attr
    x = data[1].x
    train_edges = data[0]



    train_features = torch.tensor(train_edges.select(pl.col('step','type','amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest')).to_numpy(), dtype=torch.float ).to(self.device)

    x = self.relu(self.gnn(x, edge_index, edge_attr))
    x = self.dropout(x)
    # x = self.relu(self.gnn1(x, edge_index, edge_attr))
    # x = self.dropout(x)
    x = self.relu(self.gnn2(x, edge_index, edge_attr))
    x = self.dropout(x)

    from_nodes = torch.nan_to_num(x.squeeze()[edge_index[0,:].squeeze()])
    dest_nodes = torch.nan_to_num(x.squeeze()[edge_index[1,:].squeeze()])

    to_classify = torch.cat((from_nodes,dest_nodes,train_features), dim=1)
    to_classify = self.relu(self.linear(to_classify))
    to_classify = self.dropout(to_classify)
    to_classify = self.relu(self.linear2(to_classify))
    to_classify = self.dropout(to_classify)

    out = self.classifier(to_classify)
    return out


In [13]:
class FraudDetectionModuleConv(nn.Module):

  def __init__(self, device, dropout):
    super(FraudDetectionModuleConv, self).__init__()
    self.dropout = nn.Dropout(dropout)
    self.gnn = GATConv(1,128,edge_dim=7)
    self.gnn1 = GATConv(128,128,edge_dim=7)
    self.gnn2 = GATConv(128,64,edge_dim=7)#,heads=2)
    self.linear =  nn.Linear(128 + 7, 128)
    self.linear2 = nn.Linear(128, 128)
    self.classifier = nn.Linear(128, 1)
    #self.sigmoid = nn.Sigmoid()
    self.relu = nn.ReLU()
    self.device = device


  def forward(self,x, edge_index,edge_weight ):
    #data = collate(pl.from_pandas(data))

    #breakpoint()
    # edge_index = data[1].edge_index
    # edge_attr = data[1].edge_attr
    # x = data[1].x
    # train_edges = data[0]



    # train_features = torch.tensor(train_edges.select(pl.col('step','type','amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest')).to_numpy(), dtype=torch.float ).to(self.device)

    edge_attr = edge_weight
    train_features = edge_weight


    x = self.relu(self.gnn(x, edge_index, edge_attr))
    x = self.dropout(x)
    x = self.relu(self.gnn1(x, edge_index, edge_attr))
    x = self.dropout(x)
    x = self.relu(self.gnn2(x, edge_index, edge_attr))
    x = self.dropout(x)

    from_nodes = torch.nan_to_num(x.squeeze()[edge_index[0,:].squeeze()])
    dest_nodes = torch.nan_to_num(x.squeeze()[edge_index[1,:].squeeze()])

    to_classify = torch.cat((from_nodes,dest_nodes,train_features), dim=1)
    to_classify = self.relu(self.linear(to_classify))
    to_classify = self.dropout(to_classify)
    to_classify = self.relu(self.linear2(to_classify))
    to_classify = self.dropout(to_classify)

    out = self.classifier(to_classify)
    return out


In [14]:
train_set, validation_set, test_set = divide_dataset("PS_20174392719_1491204439457_log.csv",0.7,0.1)

train_dataset =  FraudDetectionDataset(train_set[0], train_set[1], DEVICE)
validation_dataset =  FraudDetectionDatasetUndersampling(validation_set[0], validation_set[1], DEVICE,1)
test_dataset =  FraudDetectionDatasetUndersampling(test_set[0], test_set[1], DEVICE,1)

train_loader = train_dataset.get_dataloader(750,274)
validation_loader = validation_dataset.get_dataloader(1024,100)
test_loader = test_dataset.get_dataloader(1024,100)

In [None]:
gnn = GraphNN(IN_GNN, OUT_GNN, H_GNN, DEEP_GNN, ACTIVATION_GNN, DEVICE)
linear = LinearNN(IN_NN, OUT_NN, H_NN, DEEP_NN, ACTIVATION_NN)
gnn.to(DEVICE)
linear.to(DEVICE)
model = FraudDetectionModule(gnn,linear,LR,WD, DEVICE)
model.to(DEVICE)

modelFixed = FraudDetectionModuleFixed(IN_GNN,H_GNN,OUT_GNN,IN_NN,H_NN,OUT_NN,LR,WD,DEVICE)


FraudDetectionModule(
  (gnn): GraphNN(
    (activation): ReLU()
  )
  (classifier): LinearNN(
    (linear): Sequential(
      (0): Linear(in_features=23, out_features=32, bias=True)
      (1): ReLU()
      (2): Linear(in_features=32, out_features=1, bias=True)
    )
  )
  (sigmoid): Sigmoid()
  (relu): ReLU()
  (loss): BCEWithLogitsLoss()
  (accuracy): BinaryAccuracy()
  (precision): BinaryPrecision()
  (recall): BinaryRecall()
  (f1): BinaryF1Score()
)

In [None]:
DEVICE

trainer = l.Trainer(deterministic=True, max_epochs=40, accelerator=ACCELERATOR, callbacks=[ModuleCallback()])


INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
#torch.use_deterministic_algorithms(False)


wndb.init(
    project="datamining-hw4",

    # track hyperparameters and run metadata
    config={
    "learning_rate": LR ,
    "weight decay": WD
    })


trainer.fit(model, train_loader, validation_loader)

wndb.finish()

In [15]:
model = FraudDetectionModuleConv( DEVICE, 0)
model.to(DEVICE)

FraudDetectionModuleConv(
  (dropout): Dropout(p=0, inplace=False)
  (gnn): GATConv(1, 128, heads=1)
  (gnn1): GATConv(128, 128, heads=1)
  (gnn2): GATConv(128, 64, heads=1)
  (linear): Linear(in_features=135, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=128, bias=True)
  (classifier): Linear(in_features=128, out_features=1, bias=True)
  (relu): ReLU()
)

In [16]:
model.load_state_dict(torch.load('GAT3_model_f1=0.6326690316200256.pth', map_location=torch.device('cpu')))

<All keys matched successfully>

In [16]:
val_loss, f1_score, acc, rec, prec = validate(model,test_loader,nn.BCEWithLogitsLoss(),torchmetrics.classification.BinaryF1Score().to(DEVICE))

# (tensor(0.0055, device='cuda:0'),
#  tensor(0.6112, device='cuda:0'),
#  tensor(0.9992, device='cuda:0'),
#  tensor(0.7349, device='cuda:0'),
#  tensor(0.5518, device='cuda:0'))



(tensor(0.0055, device='cuda:0'),
 tensor(0.6112, device='cuda:0'),
 tensor(0.9992, device='cuda:0'),
 tensor(0.7349, device='cuda:0'),
 tensor(0.5518, device='cuda:0'))



In [20]:


from torch_geometric.explain import GNNExplainer, AttentionExplainer
from torch_geometric.explain import Explainer, ModelConfig


In [23]:
my_input = next(iter(train_loader))

x = my_input[1].x
edge_index = my_input[1].edge_index
edge_weight = my_input[1].edge_attr
y = torch.tensor(my_input[0].select(pl.col('isFraud')).to_numpy(), dtype=torch.float)

explainer_att = AttentionExplainer()
config = ModelConfig("binary_classification","edge", "raw")

edge_mask = torch.zeros(x.shape[0], dtype=torch.bool)
edge_mask[:10] = True


explainer = GNNExplainer(epoch=200)#model,explainer_att,"model",config)

explaination = explainer(model, x,  edge_index)


TypeError: GNNExplainer.forward() missing 1 required keyword-only argument: 'target'

In [51]:
x.shape[0]

2048

In [17]:
!pip install captum

Collecting captum
  Downloading captum-0.7.0-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: captum
Successfully installed captum-0.7.0


In [21]:
from torch_geometric.explain.algorithm import AttentionExplainer, GNNExplainer
from torch_geometric.explain import Explainer, ModelConfig



config = ModelConfig("binary_classification","edge", "raw")

alg = AttentionExplainer()

explainer = Explainer(model,alg,"model",model_config=config, edge_mask_type=MaskType("attributes"))

ValueError: 'edge_mask_type' needs be None or of type 'object' (got 'attributes')

In [18]:
from captum.attr import Saliency, IntegratedGradients

#my_input = next(iter(train_loader))
#test_dataset.collate(test_dataset.data)
# x = input[1].x
# edge_index = input[1].edge_index
# edge_attr = input[1].edge_attr

# target = input[1].y

my_input = collate(validation_dataset.data)#next(iter(test_loader))


y = torch.tensor(my_input[0].select(pl.col('isFraud')).to_numpy(), dtype=torch.float)


def model_forward(edge_mask, data):
    x = data[1].x
    edge_index = data[1].edge_index
    edge_weight = data[1].edge_attr
    #batch = torch.zeros(data.x.shape[0], dtype=int).to(DEVICE)
    out = model(x, edge_index, edge_weight)#, batch, edge_mask)
    return out


def explain(method, data, target=0):
    input_mask = torch.ones(data[1].edge_index.shape[1]).requires_grad_(True).to(DEVICE)
    if method == 'ig':
        ig = IntegratedGradients(model_forward)
        mask = ig.attribute(input_mask, target=target,
                            additional_forward_args=(data,),
                            internal_batch_size=data[1].edge_index.shape[1])
    elif method == 'saliency':
        saliency = Saliency(model_forward)
        mask = saliency.attribute(input_mask, target=target,
                                  additional_forward_args=(data,))
    else:
        raise Exception('Unknown explanation method')

    edge_mask = np.abs(mask.cpu().detach().numpy())
    if edge_mask.max() > 0:  # avoid division by zero
        edge_mask = edge_mask / edge_mask.max()
    return edge_mask




In [19]:
print(explain('ig',my_input))

RuntimeError: One of the differentiated Tensors appears to not have been used in the graph. Set allow_unused=True if this is the desired behavior.

In [None]:
import torch
import torch_geometric.nn as pygnn
import torch_geometric.data as pygdata
import GraphSVX



# Inizializza il modello, l'input e il target

input = ...  # Dati di input
target = ...  # Target

# Crea il grafico PyTorch Geometric
edge_index, _ = torch.load(input.edge_index)
edge_weight = torch.ones(edge_index.shape[1])
data = pygdata.Data(x=input.x, edge_index=edge_index, edge_attr=edge_weight)

# Crea il grafico GraphSVX
g = graphsvx.Graph(data)

# Crea l'attributore di importanza basato su gradienti
explainer = graphsvx.Explainer(model, g)

# Esegui l'attributo
attributions, delta = explainer.attribute(inputs=data, target=target)

# Interpretazione degli output
attributions.shape  # (num_edges,)
delta  # Convergence delta

# attributions contiene gli attributi calcolati utilizzando l'attributore basato su gradienti
# di GraphSVX. Il delta è un indicatore della convergenza dell'algoritmo di attributo.
# Il valore di delta dovrebbe diminuire con l'aumentare del numero di campioni n_samples.




In [24]:
!pip install -U lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/275.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283835 sha256=58e421cb2ae524beefa3ee73ec21ac0c676d4e025fa333c0b6f30821794d8e80
  Stored in directory: /root/.cache/pip/wheels/fd/a2/af/9ac0a1a85a27f314a06b39e1f492bee1547d52549a4606ed89
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


torch.Tensor

In [28]:
import lime
import lime.lime_tabular

my_input = collate(test_dataset.data)

def wrapped_model(edge_weight):
    """
    Takes in input a numpy array and outputs numpy array with the prediction.
    Necessary since both shap and LIME use numpy arrays to pass parameters.
    """
    model.eval()
    x = my_input[1].x
    edge_index = my_input[1].edge_index

    return model(x, edge_index, edge_weight)

edge_weight = my_input[1].edge_attr
y = torch.tensor(my_input[0].select(pl.col('isFraud')).to_numpy(), dtype=torch.float)

lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    edge_weight, mode="binary_classification",
    class_names=["Median house price"],
    training_labels=y,
    feature_names=['step','type','amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest'],
    verbose=True)



TypeError: min() received an invalid combination of arguments - got (out=NoneType, axis=NoneType, ), but expected one of:
 * ()
 * (Tensor other)
 * (int dim, bool keepdim)
      didn't match because some of the keywords were incorrect: out, axis
 * (name dim, bool keepdim)
      didn't match because some of the keywords were incorrect: out, axis


In [None]:
exp = lime_explainer.explain_instance(x_test[i], wrapped_model)

In [None]:


my_input = next(iter(train_loader))

x = my_input[1].x
edge_index = my_input[1].edge_index
edge_weight = my_input[1].edge_attr
y = torch.tensor(my_input[0].select(pl.col('isFraud')).to_numpy(), dtype=torch.float)



explainer = lime.lime_tabular.LimeTabularExplainer(edge_weight, feature_names=['step','type','amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest'])


exp = explainer.explain_instance(test[i], rf.predict_proba, num_features=2, top_labels=1)





In [33]:
my_input = test_dataset.collate(test_dataset.data)

In [29]:
!pip install shap

Collecting shap
  Downloading shap-0.44.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (533 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m533.5/533.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.7 (from shap)
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.44.0 slicer-0.0.7


In [37]:
import shap

my_input = collate(test_dataset.data)#next(iter(test_loader))

x = my_input[1].x
edge_index = my_input[1].edge_index
edge_weight = my_input[1].edge_attr
y = torch.tensor(my_input[0].select(pl.col('isFraud')).to_numpy(), dtype=torch.float)

def wrapped_model(edge_weight):
    """
    Takes in input a numpy array and outputs numpy array with the prediction.
    Necessary since both shap and LIME use numpy arrays to pass parameters.
    """
    model.eval()
    x = my_input[1].x
    edge_index = my_input[1].edge_index

    return model(x, edge_index, edge_weight)


explainer = shap.Explainer(wrapped_model, edge_weight, feature_names=['step','type','amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest'])
shap_values = explainer.shap_values(edge_weight)

TypeError: 'Tensor' object is not callable

In [32]:
type(explainer)

shap.explainers._permutation.PermutationExplainer

In [38]:

import lime
import lime.lime_tabular

def wrapped_model(x):
    """
    Takes in input a numpy array and outputs numpy array with the prediction.
    Necessary since both shap and LIME use numpy arrays to pass parameters.
    """
    model.eval()
    data = test_dataset.collate(test_dataset.data)
    return model(data)

x_test = torch.tensor(test_dataset.data.select(pl.col('step','type','amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest')).to_numpy(), dtype=torch.float)       #test_dataset.collate(test_dataset.data)
y_test = torch.tensor(test_dataset.data.select(pl.col('isFraud')).to_numpy(), dtype=torch.float)




lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    x_test, mode="binary_classification",
    class_names=["Median house price"],
    training_labels=y_test,
    feature_names=['step','type','amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest'],
    verbose=True)



TypeError: min() received an invalid combination of arguments - got (out=NoneType, axis=NoneType, ), but expected one of:
 * ()
 * (Tensor other)
 * (int dim, bool keepdim)
      didn't match because some of the keywords were incorrect: out, axis
 * (name dim, bool keepdim)
      didn't match because some of the keywords were incorrect: out, axis


In [12]:
!pip install rdkit




In [13]:
!pip install git+https://github.com/c-feldmann/rdkit_heatmaps
!pip install git+https://github.com/AndMastro/edgeshaper

Collecting git+https://github.com/c-feldmann/rdkit_heatmaps
  Cloning https://github.com/c-feldmann/rdkit_heatmaps to /tmp/pip-req-build-zcy4di5g
  Running command git clone --filter=blob:none --quiet https://github.com/c-feldmann/rdkit_heatmaps /tmp/pip-req-build-zcy4di5g
  Resolved https://github.com/c-feldmann/rdkit_heatmaps to commit 3ed507ab837caaa1c10d2ae5fdf31d1cd135a777
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting git+https://github.com/AndMastro/edgeshaper
  Cloning https://github.com/AndMastro/edgeshaper to /tmp/pip-req-build-i7e_a3kd
  Running command git clone --filter=blob:none --quiet https://github.com/AndMastro/edgeshaper /tmp/pip-req-build-i7e_a3kd
  Resolved https://github.com/AndMastro/edgeshaper to commit 59a4d25295a53a8996d68d5f0c6f57a93be2f817
[31mERROR: git+https://github.com/AndMastro/edgeshaper does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m

In [None]:
# from edgeshaper import edgeshaper
import edgeshaper



input = test_dataset.collate(test_dataset.data)
x = input[1].x
edge_index = input[1].edge_index
edge_attr = input[1].edge_attr

target = input[1].y

explainer = edgeshaper.Edgeshaper(model,x,edge_index, edge_weight = edge_attr )

attributions, delta = explainer.explain()


# for data_tmp in train_loader:
#   data = data_tmp
#   break

# edge_index = data[1].edge_index
# x = data[1].x
# device = "cuda" or "cpu"
# target_class = data[1].y.to(dtype=torch.long) #class label for which to perform explanations

# edges_explanations = edgeshaper(model, x, edge_index, M = 100, target_class = target_class, device = "cpu", edge_weight = data[1].edge_attr)

  0%|          | 0/1717905 [00:00<?, ?it/s]

In [None]:
wndb.init(
    project="datamining-hw4",

    # track hyperparameters and run metadata
    config={
    "learning_rate": LR ,
    "weight decay": WD
    })
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=WD)
train(model,27,train_loader,validation_loader,nn.BCEWithLogitsLoss(), optimizer, torchmetrics.classification.BinaryF1Score().to(DEVICE),"GAT3_model",torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[4], gamma=0.1) )
wndb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Training Loss,█▄▁
acc,█▄▁
f1,▂█▁
prec,▄█▁
rec,▁█▄
val loss,▁▄█

0,1
Training Loss,0.01257
acc,0.99905
f1,0.57785
prec,0.50777
rec,0.73797
val loss,0.00555




first_model =       FraudDetectionModuleOld(
        (dropout): Dropout(p=0.2, inplace=False)
        (gnn): GCNConv(1, 128)
        (gnn1): GCNConv(128, 128)
        (gnn2): GCNConv(128, 64)
        (linear): Linear(in_features=135, out_features=128, bias=True)
        (linear2): Linear(in_features=128, out_features=128, bias=True)
        (classifier): Linear(in_features=128, out_features=1, bias=True)
        (sigmoid): Sigmoid()
        (relu): ReLU()
      ), fake validation
      