In [245]:
import networkx as nx
import json
import joblib
import gzip
from matplotlib import pyplot as plt
import os
import torch
from torch_geometric.data import InMemoryDataset
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from scipy import sparse
from torch_geometric.data import Data
from torch_sparse import SparseTensor

In [2]:
%matplotlib inline

# Embedding for nodes

In [111]:
def ngram_embedding(word, n=3, size=2000):
    ngram_indices = {}
    for i in range(len(word) - n + 1):
        ngram = "".join(word[i+j] for j in range(n))
        idx = sum([(ord(c) << 16 * j) % size for j, c in enumerate(reversed(ngram))]) % size
        ngram_indices[idx] = ngram_indices.get(idx, 0) + 1
    return ngram_indices

In [123]:
def ngram_embedding_batch(word_list, n=3, size=2000):
    batch_indices, batch_values = [], []
    for i, word in enumerate(word_list):
        ngram_indices = ngram_embedding(word, n, size)
        batch_indices += [[i, j] for j in ngram_indices.keys()]
        batch_values += list(ngram_indices.values())
    emb = torch.sparse_coo_tensor(np.array(batch_indices).T, batch_values, (len(word_list), size))
    return emb

# Embedding for edges

In [125]:
edges_emb = OneHotEncoder()

In [179]:
edges_types = [
    "CHILD",
    "NEXT",
    "NEXT_USE",
    "LAST_LEXICAL_USE",
    "OCCURRENCE_OF",
    "SUBTOKEN_OF",
    "COMPUTED_FROM",
    "RETURNS_TO"
]

In [180]:
edges_emb.fit([[t] for t in edges_types])

OneHotEncoder()

# Embedding for labels
We've used only 2 error codes for Flake8: `F821`, `F841`

In [167]:
class LabelEncoder(OneHotEncoder):
    def transform(self, x):
        try:
            return super().transform(x)
        except ValueError:
            emb_len = len(self.get_feature_names())
            out = sparse.csr_matrix((1, emb_len), dtype=np.long)
            if not self.sparse:
                return out.toarray()
            else:
                return out
    

In [168]:
label_encoder = LabelEncoder()

In [169]:
label_encoder.fit([["F821"], ["F841"]])

LabelEncoder()

In [174]:
label_encoder.transform(["F821"])

<1x2 sparse matrix of type '<class 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [122]:
train_data = []
filename = "/Users/petrovao/Downloads/py150_files/20_dir/graph-dataset/all-graphs000.jsonl.gz"

with gzip.open(filename, "rb") as file:
    for line in file.readlines():
        train_data.append(json.loads(line))
    

In [140]:
train_data[0].keys()

dict_keys(['nodes', 'edges', 'token-sequence', 'supernodes', 'labels', 'variable_mask', 'filename'])

In [281]:
import torch_geometric.transforms as T

def scipy_csr_to_torch(csr, dtype):
    coo = csr.tocoo()
    values = coo.data
    indices = np.vstack((coo.row, coo.col))

    i = torch.LongTensor(indices)
    v = torch.FloatTensor(values)
    shape = coo.shape

    return torch.sparse_coo_tensor(i, v, torch.Size(shape), dtype=dtype)


class Py150KDataset(InMemoryDataset):
    def __init__(self, root, transform=T.ToSparseTensor(), pre_transform=None):
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load("processed/data.pt")

    @property
    def raw_file_names(self):
        return []

    @property
    def processed_file_names(self):
        return ['data.pt']

    def process(self):
#         Read data into huge `Data` list.

        data_list = []
        for graph in train_data:
            x = ngram_embedding_batch(graph["nodes"], n=3, size=2000)

            edges, edges_types = [], []
            for relationship, connections in graph["edges"].items():
                for parent, children in connections.items():
                    edges += [(int(parent), child) for child in children]
                    edges_types += [relationship for _ in children]

            edge_index = np.array(edges).T
            edge_attr = edges_emb.transform([[edge_type] for edge_type in edges_types])

            y = np.vstack([label_encoder.transform(l).toarray() for l in graph["labels"]])

            data_list.append(
                Data(
                    x=SparseTensor.from_torch_sparse_coo_tensor(x), 
                    edge_index=torch.tensor(edge_index, dtype=torch.long), 
                    edge_attr=SparseTensor.from_scipy(edge_attr), 
                    y=torch.tensor(y, dtype=torch.long),
                    variables_mask = torch.tensor(graph["variable_mask"], dtype=torch.long)
                )
            )


        data, slices = self.collate(data_list)
        torch.save((data, slices), "processed/data.pt")
        pass

In [282]:
data = Py150KDataset(".")

Processing...
Done!


In [290]:
for i in range(len(data)):
    print(data.get(i))

Data(edge_attr=[716, 8, nnz=716], edge_index=[2, 716], variables_mask=[377], x=[377, 2000, nnz=1740], y=[377, 2])
Data(edge_attr=[593, 8, nnz=593], edge_index=[2, 593], variables_mask=[302], x=[302, 2000, nnz=1268], y=[302, 2])
Data(edge_attr=[1242, 8, nnz=1242], edge_index=[2, 1242], variables_mask=[637], x=[637, 2000, nnz=1927], y=[637, 2])
Data(edge_attr=[635, 8, nnz=635], edge_index=[2, 635], variables_mask=[371], x=[371, 2000, nnz=1176], y=[371, 2])
Data(edge_attr=[192, 8, nnz=192], edge_index=[2, 192], variables_mask=[115], x=[115, 2000, nnz=401], y=[115, 2])
Data(edge_attr=[149, 8, nnz=149], edge_index=[2, 149], variables_mask=[86], x=[86, 2000, nnz=365], y=[86, 2])
Data(edge_attr=[702, 8, nnz=702], edge_index=[2, 702], variables_mask=[427], x=[427, 2000, nnz=1390], y=[427, 2])
Data(edge_attr=[385, 8, nnz=385], edge_index=[2, 385], variables_mask=[243], x=[243, 2000, nnz=1218], y=[243, 2])
Data(edge_attr=[807, 8, nnz=807], edge_index=[2, 807], variables_mask=[493], x=[493, 2000,

In [284]:
dir(data)

['__add__',
 '__class__',
 '__class_getitem__',
 '__data_list__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__indices__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_download',
 '_is_protocol',
 '_process',
 'collate',
 'copy',
 'data',
 'download',
 'get',
 'index_select',
 'indices',
 'len',
 'num_classes',
 'num_edge_features',
 'num_features',
 'num_node_features',
 'pre_filter',
 'pre_transform',
 'process',
 'processed_dir',
 'processed_file_names',
 'processed_paths',
 'raw_dir',
 'raw_file_names',
 'raw_paths',
 'root',
 'shuffle',
 'slices',
 'transform']

In [285]:
from torch_geometric.data import DataLoader

In [286]:
data_size = len(data)
train_loader = DataLoader(data[:int(data_size * 0.8)], batch_size=16, shuffle=True)
test_loader = DataLoader(data[int(data_size * 0.8):], batch_size=16, shuffle=True)

In [287]:
for batch in train_loader:
    print(batch)

TypeError: list indices must be integers or slices, not tuple

In [254]:
from typing import List, Union

import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import TransformerConv

class GNNConvolution(nn.Module):

    def __init__(
        self, 
        input_dim: int, 
        edge_dim: int, 
        hidden_dims: Union[int, List[int], None], 
        output_dim: int, 
        dropout: float = 0.25, 
        layer_norm: bool = True
    ):
        super(GNNConvolution, self).__init__()
        if hidden_dims is None:
            hidden_dims = []
        elif not isinstance(hidden_dims, list):
            hidden_dims = [hidden_dims]
        self.num_layers = len(hidden_dims) + 1
        self.dropout = dropout
        self.convs = nn.ModuleList()
        self.layer_norm = layer_norm
        self.lns = nn.ModuleList()

        i_dim = input_dim
        for i in range(len(hidden_dims)):
            self.convs.append(self.build_conv_layer(i_dim, edge_dim, hidden_dims[i]))
            if self.layer_norm:
                self.lns.append(nn.LayerNorm(hidden_dims[i]))
            i_dim = hidden_dims[i]
        self.convs.append(self.build_conv_layer(i_dim, edge_dim, output_dim))
        
        self.bce_loss = nn.BCEWithLogitsLoss(pos_weight=torch.ones([output_dim]))

    def build_conv_layer(self, input_dim, edge_dim, hidden_dim):
        return TransformerConv(input_dim, hidden_dim, edge_dim=edge_dim)

    def forward(self, node_attr, edge_index, edge_attr):
        for i in range(self.num_layers):
            node_attr = self.convs[i](node_attr, edge_index, edge_attr)
            node_attr = F.relu(node_attr)
            node_attr = F.dropout(node_attr, p=self.dropout, training=self.training)
            if self.layer_norm and i < self.num_layers - 1:
                node_attr = self.lns[i](node_attr)
        return node_attr
    
    def loss(self, logits, labels, variable_mask):
        loss_val = self.bce_loss(logits[variable_mask], labels[variable_mask])
        return loss_val

In [255]:
input_dim = 2000
edge_dim = 8
hidden_dims = [128, 64, 32]
output_dim = 2

model = GNNConvolution(input_dim, edge_dim, hidden_dims, output_dim)

In [256]:
def batch_predict(model, batch, threshold=0.9):
    with torch.no_grad():
        node_attr, edge_index, edge_attr = batch.x, batch.edge_index, batch.edge_attr
        logits = model(node_attr, edge_index, edge_attr, batch.batch)
        pred = torch.sigmoid(logits).detach().cpu().numpy() > threshold
        labels = batch.labels.cpu()

    jaccard_score_val = jaccard_score(labels, yara_pred, average="weighted", zero_division=0)
    hamming_score_val = hamming_loss(labels, yara_pred)

    return jaccard_score_val, hamming_score_val


def predict_for_loader(model, test_loader):
    jaccard_score_total, hamming_loss_total = 0, 0
    for data in test_loader:
        data.to(DEVICE)
        jaccard_score_val, hamming_score_val = batch_predict(model, data)

        jaccard_score_total += jaccard_score_val
        hamming_loss_total += hamming_loss_total

    total = len(test_loader.dataset)
    return (metric / total for metric in [jaccard_score_total, hamming_loss_total])

In [257]:
opt = torch.optim.Adam(model.parameters(), lr=0.01)

# train
for epoch in range(200):
    total_loss = 0
    model.train()
    for batch in train_loader:
        #print(batch.train_mask, '----')
        opt.zero_grad()
        embedding, pred = model(batch)
        label = batch.y
        loss = model.loss(pred, label)
        loss.backward()
        opt.step()
        total_loss += loss.item() * batch.num_graphs
    total_loss /= len(train_loader.dataset)

    if epoch % 10 == 0:
        jaccard, hamming = predict_for_loader(model, test_loader)
        print(
            f"Epoch {epoch}. "
            f"Loss: {total_loss:.4f}. "
            f"Jaccard avg: {train_jaccard/batch.num_graphs:.4f}/{test_jaccard_avg:.4f}. "
            f"Hamming avg: {train_hamming/batch.num_graphs:.4f}/{test_hamming_avg:.4f}. "
        )
        writer.add_scalar("predict_for_loader accuracy", test_acc, epoch)

TypeError: list indices must be integers or slices, not tuple