In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

from dgl.data.citation_graph import CiteseerGraphDataset
import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl

from src.models.gcn import GCN
from src.data.datasets import CiteSeerDataset

Using backend: pytorch


In [3]:
train_ds = CiteSeerDataset(subset="train")[0]
test_ds = CiteSeerDataset(subset="test")[0]
feats = train_ds["feats"]
labels = train_ds["labels"]
train_mask = train_ds["node_mask"]
test_mask = test_ds["node_mask"]

X_train, y_train = feats[train_mask].numpy(), labels[train_mask].numpy()
X_test, y_test = feats[test_mask].numpy(), labels[test_mask].numpy()

Loading from cache failed, re-processing.


  r_inv = np.power(rowsum, -1).flatten()


Finished data loading and preprocessing.
  NumNodes: 3327
  NumEdges: 9228
  NumFeats: 3703
  NumClasses: 6
  NumTrainingSamples: 120
  NumValidationSamples: 500
  NumTestSamples: 1000
Done saving data into cached files.
Loading from cache failed, re-processing.
Finished data loading and preprocessing.
  NumNodes: 3327
  NumEdges: 9228
  NumFeats: 3703
  NumClasses: 6
  NumTrainingSamples: 120
  NumValidationSamples: 500
  NumTestSamples: 1000
Done saving data into cached files.


# Baseline models

In [4]:
rows = []

log_reg = LogisticRegression(C=1e4)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
acc = accuracy_score(y_test, y_pred)
rows.append({"model": "logistic reg",
             "accuracy": acc})

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [5]:
svc = SVC(C=1.0)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
acc = accuracy_score(y_test, y_pred)
rows.append({"model": "SVM",
             "accuracy": acc})

# GCN with identity features

In [6]:
def collate_fn(batch):
    return batch[0]

test_dataloader = DataLoader(
    CiteSeerDataset(subset="test", no_node_features=True),
    batch_size=1,
    collate_fn=collate_fn,
)

model = GCN.load_from_checkpoint("models/gcn_no_features/checkpoints/epoch=53.ckpt")
trainer = pl.Trainer(logger=False)
results = trainer.test(model, test_dataloaders=test_dataloader)[0]
rows.append({"model": "GCN w/ identity feats",
            "accuracy": results["test/accuracy"]})

Loading from cache failed, re-processing.


  r_inv = np.power(rowsum, -1).flatten()
GPU available: False, used: False
TPU available: False, using: 0 TPU cores


Finished data loading and preprocessing.
  NumNodes: 3327
  NumEdges: 9228
  NumFeats: 3703
  NumClasses: 6
  NumTrainingSamples: 120
  NumValidationSamples: 500
  NumTestSamples: 1000
Done saving data into cached files.
Testing: 0it [00:00, ?it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test/accuracy': tensor(0.3600), 'test/loss': tensor(1.5829)}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 1/1 [00:00<00:00, 23.11it/s]




# GCN with node features

In [7]:
test_dataloader = DataLoader(
    CiteSeerDataset(subset="test", no_node_features=False),
    batch_size=1,
    collate_fn=collate_fn,
)

model = GCN.load_from_checkpoint("models/gcn_with_features/checkpoints/epoch=209.ckpt")
results = trainer.test(model, test_dataloaders=test_dataloader)[0]
rows.append({"model": "GCN w/ node feats",
            "accuracy": results["test/accuracy"]})

Loading from cache failed, re-processing.
Finished data loading and preprocessing.
  NumNodes: 3327
  NumEdges: 9228
  NumFeats: 3703
  NumClasses: 6
  NumTrainingSamples: 120
  NumValidationSamples: 500
  NumTestSamples: 1000
Done saving data into cached files.
Testing: 0it [00:00, ?it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test/accuracy': tensor(0.7150), 'test/loss': tensor(1.0257)}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 1/1 [00:00<00:00, 22.13it/s]


In [8]:
pd.DataFrame(rows)

Unnamed: 0,model,accuracy
0,logistic reg,0.59
1,SVM,0.593
2,GCN w/ identity feats,0.36
3,GCN w/ node feats,0.715
