In [1]:
from torch_geometric.data import Data
import torch
from torch_geometric.data import InMemoryDataset, download_url
from tensorflow import keras
import matplotlib.pyplot as plt
from torch.nn import BatchNorm1d
from torch.nn import Sequential as Seq, Linear, ReLU
from torch_geometric.nn import GCNConv
from tqdm.notebook import tqdm_notebook
import numpy as np
from torch_geometric.loader import DataLoader
from torch_geometric.nn import global_mean_pool
import torch.nn.functional as F
from sklearn.metrics import f1_score
from torch_geometric.nn import GINConv, global_add_pool, global_mean_pool
from torch.nn import Sequential, Linear, ReLU


In [2]:
root = '/Users/maxperozek/GNN-research'

In [3]:
class MultiTargetData(Data):
    def __cat_dim__(self, key, value, *args, **kwargs):
        if key == 'y':
            return None
        else:
            return super().__cat_dim__(key, value, *args, **kwargs)

In [4]:
class WICO(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return [f'{root}/GNN-exp-pipeline/data/full_wico.pt']

    @property
    def processed_file_names(self):
        return ['processed_wico.pt']

    def download(self):
        # Download to `self.raw_dir`.
        # download_url(url, self.raw_dir)
        pass
    def process(self):
        # Read data into huge `Data` list.
        data_list = torch.load(self.raw_file_names[0])

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [5]:
wico = WICO('test-wico', pre_filter=lambda data: data.y != 2)

In [6]:
wico.data.y.unique()

tensor([0, 1])

In [7]:
# Batch norm
m = BatchNorm1d(3, affine=False)

In [8]:
wico.data.x = m(wico.data.x)

In [9]:
wico.data.x

tensor([[-0.0997, -0.0968,  0.0868],
        [-0.1299,  0.7427,  0.4153],
        [-0.1432,  1.1625,  2.0575],
        ...,
        [-0.1601, -0.9363, -0.2417],
        [-0.1597, -1.7759, -0.8986],
        [ 0.6019, -2.6154, -1.2270]])

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
y = wico.data.y.numpy()

In [12]:
total = len(y)

In [13]:
real = len([v for v in y if v == 0])

In [14]:
fake = len([v for v in y if v == 1])

In [15]:
eps = 0.005

In [16]:
# weights = [(1-real/total)-eps, (1-fake/total)+eps]
weights = [(1-real/total), (1-fake/total)]

In [17]:
weights

[0.1413864104323953, 0.8586135895676047]

In [18]:
# fake_news_idx = np.where(wico.data.y == 1)[0]

In [19]:
# real_news_idx = np.where(wico.data.y == 0)[0][:412]

In [20]:
# idx = np.concatenate((fake_news_idx, real_news_idx))

In [21]:
# wico = wico[idx]

In [22]:
wico

WICO(2914)

In [23]:
l = len(wico)
y = [data.y.numpy()[0] for data in wico]

In [24]:
train_idx, valid_idx = train_test_split(np.arange(l), test_size=0.2, shuffle=True, stratify=y)

In [25]:
len(train_idx)

2331

In [26]:
len(valid_idx)

583

In [27]:
batch_size = 128
train_loader = DataLoader(wico[train_idx], batch_size=batch_size, shuffle=False)
test_loader = DataLoader(wico[valid_idx], batch_size=batch_size, shuffle=False)



In [28]:
class GIN(torch.nn.Module):

    def __init__(self, dim_features, dim_target, config):
        super(GIN, self).__init__()

        self.config = config
        self.dropout = config['dropout']
        self.embeddings_dim = [config['hidden_units'][0]] + config['hidden_units']
        self.no_layers = len(self.embeddings_dim)
        self.first_h = []
        self.nns = []
        self.convs = []
        self.linears = []

        train_eps = config['train_eps']
        if config['aggregation'] == 'sum':
            self.pooling = global_add_pool
        elif config['aggregation'] == 'mean':
            self.pooling = global_mean_pool

        for layer, out_emb_dim in enumerate(self.embeddings_dim):

            if layer == 0:
                self.first_h = Sequential(Linear(dim_features, out_emb_dim), BatchNorm1d(out_emb_dim), ReLU(),
                                    Linear(out_emb_dim, out_emb_dim), BatchNorm1d(out_emb_dim), ReLU())
                self.linears.append(Linear(out_emb_dim, dim_target))
            else:
                input_emb_dim = self.embeddings_dim[layer-1]
                self.nns.append(Sequential(Linear(input_emb_dim, out_emb_dim), BatchNorm1d(out_emb_dim), ReLU(),
                                      Linear(out_emb_dim, out_emb_dim), BatchNorm1d(out_emb_dim), ReLU()))
                self.convs.append(GINConv(self.nns[-1], train_eps=train_eps))  # Eq. 4.2

                self.linears.append(Linear(out_emb_dim, dim_target))

        self.nns = torch.nn.ModuleList(self.nns)
        self.convs = torch.nn.ModuleList(self.convs)
        self.linears = torch.nn.ModuleList(self.linears)  # has got one more for initial input

    def forward(self, x, edge_index, batch):
        # x, edge_index, batch = data.x, data.edge_index, data.batch

        out = 0

        for layer in range(self.no_layers):
            if layer == 0:
                x = self.first_h(x)
                out += F.dropout(self.pooling(self.linears[layer](x), batch), p=self.dropout)
            else:
                # Layer l ("convolution" layer)
                x = self.convs[layer-1](x, edge_index)
                out += F.dropout(self.linears[layer](self.pooling(x, batch)), p=self.dropout, training=self.training)

        return out

In [29]:
# Model
class GCN(torch.nn.Module):
    def __init__(self, hidden_channels, input_dim, targets):
        super(GCN, self).__init__()
        # torch.manual_seed(12345)
        self.conv1 = GCNConv(input_dim, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.conv4 = GCNConv(hidden_channels, hidden_channels)
        self.mlp = Linear(hidden_channels, targets)
        self.mlp = Sequential(Linear(hidden_channels, hidden_channels), BatchNorm1d(hidden_channels), ReLU(), Linear(hidden_channels, hidden_channels), BatchNorm1d(hidden_channels), ReLU(), Linear(hidden_channels, targets))

    def forward(self, x, edge_index, batch):
        # Convolution layers 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        # x = x.relu()
        # x = self.conv3(x, edge_index)
        # x = x.relu()
        # x = self.conv4(x, edge_index)
        
    
        # pooling
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.mlp(x)
        
        return x

In [30]:
import sys
sys.path.insert(0, '/Users/maxperozek/GNN-research/HGP-SL')


In [31]:
sys.path

['/Users/maxperozek/GNN-research/HGP-SL',
 '/Users/maxperozek/GNN-research/GNN-exp-pipeline',
 '/Users/maxperozek/opt/anaconda3/envs/comp_gr_thy/lib/python39.zip',
 '/Users/maxperozek/opt/anaconda3/envs/comp_gr_thy/lib/python3.9',
 '/Users/maxperozek/opt/anaconda3/envs/comp_gr_thy/lib/python3.9/lib-dynload',
 '',
 '/Users/maxperozek/opt/anaconda3/envs/comp_gr_thy/lib/python3.9/site-packages']

In [32]:
from models import Model

In [35]:
class Args:
    seed = 777
    batch_size = 128
    lr = 0.001
    weight_decay = 0.001
    nhid = 128
    sample_neighbor = True
    sparse_attention = True
    structure_learning = True
    pooling_ratio = 0.5
    dropout_ratio = 0.0
    lamb = 1.0
    device = 'cpu'
    epochs = 1000
    patience = 100
    num_features = 3
    num_classes = 2

args=Args()

model = Model(args)

In [34]:
model = GCN(hidden_channels=32, input_dim=3, targets=2)

In [None]:
config = {
    'hidden_units':[32,32,32],
    'train_eps': False,
    'aggregation':'sum',
    'dropout':0.0
         }
model = GIN(3, 2,config)

In [36]:

# optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = torch.nn.CrossEntropyLoss(weight=torch.Tensor(weights))
epochs = 1000

In [None]:
for e in tqdm_notebook(range(epochs)):
    
    model.train()
    loss_sum = []
    batches = 0
    for _, batch in enumerate(train_loader):
        optimizer.zero_grad()

        # model_out = model(batch.x.float(), batch.edge_index, batch.batch)
        model_out = model(batch)
        y = keras.utils.to_categorical(batch.y, 2)
        loss = loss_fn(model_out, torch.Tensor(y))
        loss_sum.append(float(loss.detach()))
        batches += 1
        loss.backward()
        optimizer.step()
    
    if e % 5 == 0:
        
        model.eval()
        correct = 0
        with torch.no_grad():
            for data in test_loader:
                # out = model(data.x.float(), data.edge_index, data.batch)
                out = model(data)
                # print(f'out {out}')
                # print(f'argmax {torch.argmax(out, dim=1).numpy()}')
                cor_list = (torch.argmax(out, dim=1).numpy() == data.y.numpy())
                # print(f'bools {cor_list}')
                # print(f'labels {data.y}')
                correct += cor_list.sum()
                f1 = f1_score(data.y.numpy(), torch.argmax(out, dim=1).numpy())

        print(f'\neval acc: {correct / len(test_loader.dataset)} \n f1 score: {f1}\n')
        
    print(f'epoch {e} loss: {sum(loss_sum)/batches}')     

  0%|          | 0/1000 [00:00<?, ?it/s]


eval acc: 0.8593481989708405 
 f1 score: 0.0

epoch 0 loss: 0.1654699068320425
epoch 1 loss: 0.16212909472616097
epoch 2 loss: 0.16005229949951172
epoch 3 loss: 0.15845600476390437
epoch 4 loss: 0.15711144004997454

eval acc: 0.6518010291595198 
 f1 score: 0.3333333333333333

epoch 5 loss: 0.15598332960354655
epoch 6 loss: 0.15499134479384674
epoch 7 loss: 0.1540942517550368
epoch 8 loss: 0.1533148951436344
epoch 9 loss: 0.1525225455039426

eval acc: 0.6209262435677531 
 f1 score: 0.3157894736842105

epoch 10 loss: 0.15204647771622004
epoch 11 loss: 0.1512065129844766
epoch 12 loss: 0.15066308920320712
epoch 13 loss: 0.15008881256768578
epoch 14 loss: 0.14950530466280484

eval acc: 0.6295025728987993 
 f1 score: 0.3243243243243243

epoch 15 loss: 0.14893485566503123
epoch 16 loss: 0.1483229300693462
epoch 17 loss: 0.14773595333099365
epoch 18 loss: 0.14713866459695915
epoch 19 loss: 0.14655591194566928

eval acc: 0.6518010291595198 
 f1 score: 0.3333333333333333

epoch 20 loss: 0.1460

In [333]:
tens = torch.Tensor(-1 * np.random.random(5,))
tens

tensor([-0.6791, -0.6866, -0.3721, -0.5100, -0.0794])

In [334]:
tens.relu()

tensor([0., 0., 0., 0., 0.])

In [335]:
wico

WICO(2914)

In [352]:
misinfo_size = []
misinfo_nedges = []
for w in [g for g in wico if g.y == 1]:
    misinfo_size.append(w.x.shape[0])
    misinfo_nedges.append(len(w.edge_index.T))

In [356]:
print(f'avg nodes misinfo: {sum(misinfo_size)/len(misinfo_size)}')
print(f'avg edges misinfo: {sum(misinfo_nedges)/len(misinfo_nedges)}')


avg nodes misinfo: 46.29126213592233
avg edges misinfo: 142.3131067961165


In [357]:
realinfo_size = []
realinfo_nedges = []
for w in [g for g in wico if g.y == 0]:
    realinfo_size.append(w.x.shape[0])
    realinfo_nedges.append(len(w.edge_index.T))

In [358]:
print(f'avg nodes misinfo: {sum(realinfo_size)/len(realinfo_size)}')
print(f'avg edges misinfo: {sum(realinfo_nedges)/len(realinfo_nedges)}')

avg nodes misinfo: 62.80815347721823
avg edges misinfo: 132.29056754596323
