In [None]:
!pip install torch lightning numpy kaggle wandb torch-geometric

[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
from google.colab import files

# Upload the kaggle.json file.
# The result is bound to a name on purpose: leaving files.upload() as the last
# expression of the cell would display its return value, i.e. the uploaded
# file contents (the Kaggle API key), in the notebook output.
_uploaded_files = files.upload()


KeyboardInterrupt: ignored

In [None]:
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
!kaggle datasets download -d ealaxi/paysim1
!unzip paysim1.zip

In [None]:
import pandas as pd, sys, plotly.graph_objects as go, plotly.express as px, numpy as np, torch, random as rnd, torch.nn as nn, lightning as L, wandb as wndb, pdb
from torch.utils.data import Dataset, DataLoader
from sklearn.utils import shuffle
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader as GraphDataLoader
from torch_geometric.utils import to_networkx
from matplotlib import pyplot as plt
import networkx as nx
from torch_geometric.nn import GCNConv
import torch_geometric.transforms as T
from torch import optim
from torchmetrics.classification import BinaryF1Score


In [None]:
# PARAMETERS

# Fall back to the CPU so the notebook still runs on machines without a GPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42

# Seed every random number generator used in the notebook.
rnd.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

DEPTH = 3      # passed as `depth` to FraudDetectionModule
LR = 1e-3      # learning rate
WD = 1e-5      # weight decay
H_SIZE = 512   # hidden layer size

# SECURITY: a hard-coded, token-like string literal was removed from this cell.
# NOTE(review): if it was a real credential (it sat right above wndb.login()),
# revoke it and supply the key through the WANDB_API_KEY environment variable
# or an interactive wndb.login() instead of committing it to the notebook.

#wndb.login()



In [None]:
# UTILS FUNCTIONS

def load_dataframe(dataset_file: str):
    """Read the CSV file at ``dataset_file`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(dataset_file)
    return frame


def find_null_or_empty_records(dataframe: pd.DataFrame):
    """Print every row of ``dataframe`` that holds a null value or an empty string.

    A progress bar is refreshed through ``print_progress_bar`` once per row.

    Args:
        dataframe: the table to scan row by row.
    """
    n = len(dataframe)
    # Track progress by row position, not by index label: labels are not
    # guaranteed to be 0..n-1 (e.g. after rows were filtered out) and may not
    # even be numeric, so label / n is not a valid fraction.
    for position, (_, row) in enumerate(dataframe.iterrows()):
        print_progress_bar(position / n)
        # Check whether the record has null or empty values
        if row.isnull().any() or any(value == '' for value in row):
            # Print the record
            print(f"Record con valori nulli o vuoti:\n{row}\n")

def print_progress_bar(percentuale, lunghezza_barra=20):
    """Redraw a one-line text progress bar on standard output.

    Args:
        percentuale: progress as a fraction (0.5 means 50%).
        lunghezza_barra: number of characters between the two brackets.

    The bar always has exactly ``lunghezza_barra`` characters between the
    brackets; the printed percentage is the unclamped input value.
    """
    # Clamp the number of filled cells so that fractions outside [0, 1] cannot
    # make the bar wider than requested.
    blocchi_compilati = max(0, min(lunghezza_barra, int(lunghezza_barra * percentuale)))
    # The ">" head occupies the last filled cell. With zero filled cells there
    # is no head (previously the head was drawn anyway, making the bar one
    # character too wide at 0%).
    riempimento = ("=" * (blocchi_compilati - 1) + ">") if blocchi_compilati else ""
    barra = "[" + riempimento.ljust(lunghezza_barra) + "]"
    # "\r" moves back to the start of the line so the bar is overwritten in place.
    sys.stdout.write(f"\r{barra} {percentuale * 100:.2f}% completo")
    sys.stdout.flush()


def compute_kind_inconsistence(dataframe):
    """Return, for each kind of inconsistency, the fraction of rows showing it.

    Args:
        dataframe: transactions with the columns ``amount``, ``oldbalanceOrg``,
            ``newbalanceOrig``, ``oldbalanceDest``, ``newbalanceDest``,
            ``nameOrig`` and ``nameDest``.

    Returns:
        dict mapping the four labels below to a fraction in [0, 1]. An empty
        ``dataframe`` yields 0.0 for every label instead of dividing by zero.
    """
    labels = ("inconsistent orig balance",
              "inconsistent dest balance",
              "zero cash transaction",
              "self-transaction")
    total = len(dataframe)
    if total == 0:
        return {label: 0.0 for label in labels}

    amount = dataframe["amount"]
    # One boolean mask per label, in the same order as `labels`.
    masks = (
        # origin balance change differs from the transferred amount
        (dataframe["oldbalanceOrg"] - dataframe["newbalanceOrig"]).abs() != amount,
        # destination balance change differs from the transferred amount
        (dataframe["oldbalanceDest"] - dataframe["newbalanceDest"]).abs() != amount,
        amount == 0,
        dataframe["nameOrig"] == dataframe["nameDest"],
    )
    return {label: int(mask.sum()) / total for label, mask in zip(labels, masks)}

def plot_histogram(to_plot):
    """Show a Plotly bar chart of the ``to_plot`` mapping: keys on x, values on y."""
    labels = list(to_plot.keys())
    heights = list(to_plot.values())

    # Build the single bar trace and display the figure right away.
    bars = go.Bar(x=labels, y=heights)
    go.Figure(data=[bars]).show()



def plot_categories(dataframe):
    """Show a bar chart with the relative frequency of each value of the 'type' column."""
    # Absolute frequency of every category, then scaled so the shares sum to 1.
    frequencies = dataframe['type'].value_counts()
    shares = frequencies / frequencies.sum()

    # Two-column table ('type', 'count') as expected by Plotly Express.
    counts = shares.rename_axis('type').reset_index(name='count')

    fig = px.bar(counts, x='type', y='count', title='Istogramma delle categorie nella colonna "type"')
    fig.show()


def build_names_conversion_index(df):
    """Map every distinct account name to a unique integer id.

    Args:
        df: frame with the columns ``nameOrig`` and ``nameDest``.

    Returns:
        dict ``name -> id``. Ids are 0..k-1, assigned in order of first
        appearance in ``nameOrig`` followed by ``nameDest``.
    """
    all_names = pd.concat([df['nameOrig'], df['nameDest']])

    # pd.factorize returns (codes, uniques), in that order: `codes` has one
    # entry per input row, `uniques` one entry per distinct name, and the
    # position of a name in `uniques` is its id. Zipping `uniques` with
    # `codes` (as done previously) pairs the i-th distinct name with the code
    # of the i-th *row*, which gives different names the same id.
    _, unique_names = pd.factorize(all_names)

    return {name: idx for idx, name in enumerate(unique_names)}




In [None]:
class FraudDetectionDataset(Dataset):
    """Transaction log exposed as a dataset holding one graph.

    Accounts are nodes and every transaction is a directed edge from
    ``nameOrig`` to ``nameDest``. Intended call order: construct, optionally
    ``remove_inconsistent()``, then ``convert_type()`` and ``build_graph()``.
    """

    # Transaction columns copied, in this order, into the edge feature matrix.
    EDGE_FEATURE_COLUMNS = ['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
                            'oldbalanceDest', 'newbalanceDest', 'isFlaggedFraud']

    def __init__(self, dataset_file: str):
        """Load ``dataset_file`` (CSV) and cast the two fraud flags to bool."""
        dataframe = load_dataframe(dataset_file)
        dataframe["isFraud"] = dataframe["isFraud"].astype(bool)
        dataframe["isFlaggedFraud"] = dataframe["isFlaggedFraud"].astype(bool)
        self.raw_data = dataframe
        self.transaction_types = {
            "CASH_IN": 0,
            "CASH_OUT": 1,
            "DEBIT": 2,
            "PAYMENT": 3,
            "TRANSFER": 4
        }
        # Filled by build_graph(); empty until then.
        self.data = []

    def analize_data(self, find_empty_records=True):
        """Print head, info and describe of the raw table; optionally list null/empty rows."""
        print("----HEAD----")
        print(self.raw_data.head())
        print("----INFO----")
        # DataFrame.info() prints by itself and returns None, so wrapping it in
        # print() added a stray "None" line.
        self.raw_data.info()
        print("----DESCRIBE----")
        print(self.raw_data.describe())
        if find_empty_records:
            find_null_or_empty_records(self.raw_data)

    def extract_inconsistent_transactions(self):
        """Return the rows whose balances do not match the amount, with zero amount, or sent to oneself."""
        condition = ("abs(oldbalanceOrg - newbalanceOrig) != amount"
                     " | abs(oldbalanceDest - newbalanceDest) != amount"
                     " | amount == 0"
                     " | nameOrig == nameDest")
        return self.raw_data.query(condition)

    def remove_inconsistent(self):
        """Keep only rows whose origin and destination balance changes match and whose amount is non-zero.

        NOTE(review): this criterion is not the complement of
        extract_inconsistent_transactions() (it never compares the balance
        change with ``amount``) -- confirm which definition is intended.
        """
        self.raw_data = self.raw_data.query(
            "abs(oldbalanceOrg - newbalanceOrig) == abs(oldbalanceDest - newbalanceDest) & amount != 0")

    def convert_type(self):
        """Encode ``type`` through ``transaction_types`` and cast the fraud flags to int.

        Safe to call more than once: an already numeric ``type`` column is
        left as is. A new frame is assigned to ``raw_data`` instead of writing
        columns into the existing one, which may be a filtered result.

        Raises:
            KeyError: if ``type`` holds a value missing from ``transaction_types``.
        """
        frame = self.raw_data
        type_column = frame['type']
        if pd.api.types.is_numeric_dtype(type_column):
            encoded_type = type_column
        else:
            unknown = set(type_column.unique()) - set(self.transaction_types)
            if unknown:
                raise KeyError(f"unknown transaction type(s): {sorted(map(str, unknown))}")
            encoded_type = type_column.map(self.transaction_types).astype(int)
        self.raw_data = frame.assign(
            type=encoded_type,
            isFlaggedFraud=frame['isFlaggedFraud'].astype(int),
            isFraud=frame['isFraud'].astype(int),
        )

    @classmethod
    def _graph_tensors(cls, frame):
        """Build the graph tensors from a transaction frame whose ``type`` is already numeric.

        Returns:
            ``(x, edge_index, edge_attr, y, edge_y)`` where
            * ``x`` is float ``[num_nodes, 1]``: 1.0 for names starting with 'M', else 0.0;
            * ``edge_index`` is long ``[2, num_edges]`` (origin ids, destination ids);
            * ``edge_attr`` is float ``[num_edges, 8]`` (``EDGE_FEATURE_COLUMNS``);
            * ``y`` is long ``[num_nodes]``: 1 for accounts touching a fraudulent transaction;
            * ``edge_y`` is long ``[num_edges]``: the ``isFraud`` flag of each transaction.

        Raises:
            ValueError: if an account name is missing.
        """
        n_edges = len(frame)
        # Factorize origins and destinations together so both share one id
        # space: the first n_edges codes are origins, the rest destinations.
        all_names = pd.concat([frame['nameOrig'], frame['nameDest']], ignore_index=True)
        codes, unique_names = pd.factorize(all_names)
        if (codes < 0).any():
            raise ValueError("nameOrig/nameDest contain missing values")
        origin_ids, dest_ids = codes[:n_edges], codes[n_edges:]

        starts_with_m = pd.Index(unique_names).astype(str).str.startswith('M')
        # Float features with an explicit feature dimension, as expected by
        # graph convolution layers.
        x = torch.from_numpy(np.asarray(starts_with_m, dtype=np.float32)).unsqueeze(-1)
        edge_index = torch.as_tensor(np.stack([origin_ids, dest_ids]), dtype=torch.long)
        edge_attr = torch.as_tensor(frame[list(cls.EDGE_FEATURE_COLUMNS)].to_numpy(dtype=np.float32))
        # to_numpy() ignores the index labels, so filtered frames work as well.
        edge_y = torch.as_tensor(frame['isFraud'].to_numpy(dtype=np.int64))

        # The split and the masks built in build_graph() are per node, so the
        # training target must be per node too.
        # NOTE(review): node labels are derived as "endpoint of at least one
        # fraudulent transaction" -- confirm this is the intended target.
        y = torch.zeros(len(unique_names), dtype=torch.long)
        fraud_edges = edge_y.bool()
        y[edge_index[0, fraud_edges]] = 1
        y[edge_index[1, fraud_edges]] = 1
        return x, edge_index, edge_attr, y, edge_y

    def build_graph(self, val_prc, test_prc):
        """Build the graph and split its nodes into train/validation/test masks.

        Call convert_type() first: the ``type`` column must be numeric.
        ``raw_data`` is set to None afterwards, releasing the dataframe.

        Args:
            val_prc: passed as ``num_val`` to ``RandomNodeSplit``.
            test_prc: passed as ``num_test`` to ``RandomNodeSplit``.
        """
        x, edge_index, edge_attr, y, edge_y = self._graph_tensors(self.raw_data)
        self.raw_data = None

        data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
        data.y = y
        # Per-transaction labels are kept next to the per-node ones.
        data.edge_y = edge_y
        self.split = T.RandomNodeSplit(num_val=val_prc, num_test=test_prc)
        self.data = [self.split(data)]

    @property
    def num_node_features(self):
        """Number of features per node of the built graph.

        Raises:
            RuntimeError: if build_graph() has not been called yet.
        """
        if not self.data:
            raise RuntimeError("call build_graph() before reading num_node_features")
        return self.data[0].num_node_features

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        # One graph once build_graph() has run, none before.
        return len(self.data)






In [None]:
dataset = FraudDetectionDataset("PS_20174392719_1491204439457_log.csv")
# Later cells read `inconsistent_data`; with this line commented out the name
# was undefined on a fresh kernel.
inconsistent_data = dataset.extract_inconsistent_transactions()

In [None]:
# Share of each kind of inconsistency among the inconsistent transactions, shown as bars.
d = compute_kind_inconsistence(inconsistent_data)
plot_histogram(d)
# Every account name that appears as origin or as destination.
names_distinct = set(dataset.raw_data["nameOrig"]) | set(dataset.raw_data["nameDest"])

In [None]:
# List the available columns, then encode the columns and build the graph
# using the given validation and test arguments.
print(dataset.raw_data.columns)
dataset.convert_type()
dataset.build_graph(val_prc=0.1, test_prc=0.2)




Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')


In [None]:
def convert_to_networkx(graph):
    """Convert ``graph`` with ``to_networkx``, carrying over the ``x`` node attribute."""
    return to_networkx(graph, node_attrs=['x'])

def plot_graph(g):
    """Draw ``g`` with a spring layout on a 9x7 figure and show it."""
    figure_size = (9, 7)
    drawing_options = {"node_size": 30, "arrows": False}

    plt.figure(figsize=figure_size)
    nx.draw_spring(g, **drawing_options)
    plt.show()



# Laying out the whole graph did not finish (the recorded runs of this cell
# were interrupted and produced an empty figure), so only a bounded sample of
# the graph is drawn.
PLOT_MAX_EDGES = 300

g = convert_to_networkx(dataset.data[0])
# zip() stops after PLOT_MAX_EDGES items, so the full edge list is never materialised.
sampled_edges = [edge for _, edge in zip(range(PLOT_MAX_EDGES), g.edges())]
if sampled_edges:
    # Subgraph made of the sampled edges and the nodes they touch.
    g_sample = g.edge_subgraph(sampled_edges)
else:
    # No edges available: fall back to a sample of nodes.
    g_sample = g.subgraph([node for _, node in zip(range(PLOT_MAX_EDGES), g.nodes())])
plot_graph(g_sample)
del g, g_sample


KeyboardInterrupt: ignored

KeyboardInterrupt: ignored

<Figure size 900x700 with 0 Axes>

In [None]:
class FraudDetectionModule(L.LightningModule):
  """Node classifier: a stack of GCNConv layers followed by a linear layer with 2 outputs.

  Args:
    input_size: number of input features per node.
    hidden_layer_size: width of every GCNConv layer.
    depth: total number of layers; ``depth - 1`` GCNConv layers (at least one)
      plus the final linear classifier.
    lr: learning rate of the Adam optimizer.
    wd: weight decay of the Adam optimizer.
  """

  def __init__(self,input_size, hidden_layer_size, depth, lr=1e-3, wd=5e-4):
    super().__init__()
    convolutions = [GCNConv(input_size, hidden_layer_size)]
    convolutions += [GCNConv(hidden_layer_size, hidden_layer_size) for _ in range(depth-2)]
    # nn.ModuleList instead of a plain list: only registered sub-modules are
    # returned by self.parameters() (hence optimized), moved to the training
    # device and stored in checkpoints.
    self.layers = nn.ModuleList(convolutions)
    self.classifier = nn.Linear(hidden_layer_size, 2)
    self.relu = nn.ReLU()
    self.lr = lr
    self.wd = wd
    self.validation_step_outputs = []
    self.f1 = BinaryF1Score()
    # CrossEntropyLoss expects raw scores (logits), so forward() applies no
    # sigmoid on top of the classifier.
    self.loss = nn.CrossEntropyLoss()

  def forward(self,data):
    """Return one row of 2 class logits per node of ``data``."""
    edge_index = data.edge_index
    edge_attr = data.edge_attr
    # The layers need floating point features with a feature dimension.
    x = data.x.float()
    if x.dim() == 1:
      x = x.unsqueeze(-1)
    # The third argument of GCNConv is a scalar weight per edge; a matrix of
    # edge features cannot be used there, so it is only forwarded when 1-D.
    edge_weight = edge_attr if edge_attr is not None and edge_attr.dim() == 1 else None
    for cnn in self.layers:
      x = self.relu(cnn(x, edge_index, edge_weight))
    return self.classifier(x)

  def training_step(self, batch, batch_idx):
    """Cross-entropy on the nodes selected by ``batch.train_mask``."""
    # `batch` is a graph object, not an (x, y) pair.
    z = self(batch)
    loss = self.loss(z[batch.train_mask],batch.y[batch.train_mask])
    # Logging to TensorBoard (if installed) by default
    self.log("train_loss", loss, prog_bar=True, batch_size=getattr(batch, "num_graphs", 1))
    return loss

  def validation_step(self, batch, batch_idx):
    """Loss and F1 score on the nodes selected by ``batch.val_mask``."""
    z = self(batch)
    val_logits = z[batch.val_mask]
    val_targets = batch.y[batch.val_mask]
    val_loss = self.loss(val_logits, val_targets)
    # The metric compares one predicted class per node with the targets, so
    # the two logits are reduced with argmax first.
    f1 = self.f1(val_logits.argmax(dim=1), val_targets)
    # Only talk to W&B when a run is active.
    if wndb.run is not None:
      wndb.log({"val_loss": val_loss.item(),"f1-score": f1.item()})
    self.log_dict({"val_loss": val_loss,"f1-score":f1}, prog_bar=True,
                  batch_size=getattr(batch, "num_graphs", 1))

  def configure_optimizers(self):
    # Adam's keyword is `weight_decay`; `wd` is rejected with a TypeError.
    optimizer = optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.wd)
    return optimizer




In [None]:



wndb.init(
    project="datamining-hw4",

    # track hyperparameters and run metadata
    config={
    "learning_rate": LR ,
    "weight decay": WD ,
    "layers": DEPTH ,
    "hidden layers size": H_SIZE
    })

# The torch DataLoader cannot collate torch_geometric Data objects; the
# torch_geometric loader batches graphs and keeps their attributes
# (masks, edge_index, ...) accessible on the batch.
loader = GraphDataLoader(dataset, batch_size=1, shuffle=False)

# Read the number of node features from the graph itself.
model = FraudDetectionModule(dataset[0].num_node_features,H_SIZE,DEPTH,LR,WD)

# There is a single batch (the whole graph) per epoch, so log at every step.
trainer = L.Trainer(log_every_n_steps=1)



In [None]:
# The same loader serves training and validation: the graph carries separate
# train/val masks that the module applies in each step.
try:
    trainer.fit(model, loader, loader)
finally:
    # Close the W&B run even when training fails or is interrupted.
    wndb.finish()

In [None]:
# This analysis needs the transaction table as loaded from disk. `dataset`
# cannot be used at this point of the notebook: convert_type() re-encodes its
# columns and build_graph() sets dataset.raw_data to None. The table is
# therefore reloaded and everything the cell needs is computed locally.
eda_dataset = FraudDetectionDataset("PS_20174392719_1491204439457_log.csv")
eda_frame = eda_dataset.raw_data
eda_inconsistent = eda_dataset.extract_inconsistent_transactions()
eda_names_distinct = set(eda_frame["nameOrig"]) | set(eda_frame["nameDest"])

eda_dataset.analize_data(find_empty_records=False)
print("inconsistent fraud")
print(len(eda_inconsistent.query('isFraud == 1'))/len(eda_inconsistent))
print("inconsistent not fraud")
print(len(eda_inconsistent.query('isFraud == 0'))/len(eda_inconsistent))
# Rows with different balance changes on the two sides, relative to the
# number of inconsistent rows.
print("given > gotten")
print(len(eda_frame.query("abs(oldbalanceOrg - newbalanceOrig) > abs(oldbalanceDest - newbalanceDest)"))/len(eda_inconsistent))
print("given < gotten")
print(len(eda_frame.query("abs(oldbalanceOrg - newbalanceOrig) < abs(oldbalanceDest - newbalanceDest)"))/len(eda_inconsistent))
print("distinct origin name")
print(eda_frame["nameOrig"].nunique())
print("distinct names")
print(len(eda_names_distinct))
# Share of the fraudulent transactions that were also flagged.
print("isFlaggedFraud and isFraud")
print(len(eda_frame.query("isFlaggedFraud & isFraud"))/len(eda_frame.query("isFraud")))
plot_categories(eda_frame)
