# Exercise Sheet 1

### 1.1 Load a Graph

In [1]:
import igraph
import numpy as np

In [2]:
# Load the ogbn-arxiv graph from a pickle file in the working directory.
# NOTE(review): Read_Pickle presumably unpickles the file, which can execute
# arbitrary code - only load pickles that come from a trusted source.
g = igraph.Graph.Read_Pickle('ogbn-arxiv.pickle')

#### 1.

In [3]:
# report the size of the graph
n_vertices, n_edges = g.vcount(), g.ecount()
print('number of vertices:', n_vertices)
print('number of edges:', n_edges)

number of vertices: 169343
number of edges: 1166243


#### 2.

In [4]:
# in-degree of every vertex (= number of citations the article received)
indegree_list = g.indegree()

# ids of the 5 most cited articles (argsort is ascending, so the last five
# entries are the vertices with the highest in-degree)
most_cited = np.argsort(indegree_list)[-5:]

# print their subject areas
print('subjects of 5 most cited articles:')
# fetch the label column once instead of rebuilding it for every lookup
label_by_vertex = g.vs['label']
for vertex_id in most_cited:  # not named `id`, which would shadow the builtin
    # labels are iterable, so unpack them for printing
    print(*label_by_vertex[vertex_id])

subjects of 5 most cited articles:
16
24
16
16
24


#### 3.

In [5]:
# get nodes with an edge to the most cited articles (= articles citing the most cited ones);
# np.unique removes articles that cite more than one of them
citing_ids = np.unique(
    [citing for cited in most_cited for citing in g.predecessors(cited)]
)

# get the subject area of each citing article
citing_subj = np.array(g.vs['label'])[citing_ids]

# count occurrences of each subject area
subjects, counts = np.unique(citing_subj, return_counts=True)

# print subject area with maximum occurrences
print('subject area that most articles referencing the 5 most cited articles belong to:',
      subjects[np.argmax(counts)])

subject area that most articles referencing the 5 most cited articles belong to: 16


## 2 Split the Graph into Training and Test Data Set

In [6]:
# first publication year that belongs to the second vertex set
SPLIT_YEAR = 2019

# publication year of every vertex as a flat array
vert_year = np.squeeze(np.array(g.vs['year']))

# separate vertices into two sets V_1 (before SPLIT_YEAR) and V_2 (SPLIT_YEAR and later);
# np.argwhere returns the matching ids as a column vector
published_before_split = vert_year < SPLIT_YEAR
published_from_split = vert_year >= SPLIT_YEAR
vert_ids_1 = np.argwhere(published_before_split)
vert_ids_2 = np.argwhere(published_from_split)

## 3 Vertex Features

### 3.1 Triangle counts


#### 1.
Let $G$ be a graph and $v \in V(G)$.

**Claim**: $\Delta(v) = |E(G[\mathcal{N}(v)])|$

**Proof**:

For each pair
$$u, w \in \mathcal{N}(v): \{u, v\}, \{v, w\} \in E(G)$$
by definition of a neighborhood.

Therefore
$$\{u, v, w\}\; \text{form a triangle} \;\Leftrightarrow \;\{u, w\} \in E(G[\mathcal{N}(v)]),$$

so the number of triangles that include $v$ is equal to the number of edges in the subgraph induced by the neighborhood of $v$:

$$\Delta(v) = |E(G[\mathcal{N}(v)])|.$$





#### 2.

In [19]:
# Triangle count per vertex: the number of edges in the subgraph induced by
# the vertex's neighborhood.
# Edge directions are ignored, so the undirected version of the graph is built
# once up front instead of converting every neighborhood subgraph separately
# (the default conversion collapses reciprocal/multiple edges into one edge).
g_undirected = g.as_undirected()

n_triangles = np.array([
    # edges among the neighbors of this vertex
    g_undirected.induced_subgraph(g_undirected.neighbors(vertex_id)).ecount()
    for vertex_id in range(g_undirected.vcount())
])

### 3.2 Other Vertex Features

In [18]:
from time import time

In [23]:
def _timed(feature_name, compute):
    """Run ``compute()``, print how long it took, and return its result.

    feature_name: text inserted into the printed timing message.
    compute: zero-argument callable that produces the feature values.
    """
    start = time()
    values = compute()
    print(f'computing {feature_name} took {time() - start} seconds')
    return values


# 1. degree
degree = _timed('degree', lambda: g.degree(range(g.vcount())))

# 2. page rank score
page_rank = _timed('page rank', lambda: g.pagerank())

# 3. coreness
coreness = _timed('coreness', lambda: g.coreness())

# 4. eigenvector centrality
ev_centrality = _timed('eigenvector centrality', lambda: g.eigenvector_centrality())

# 5. indegree
indegree = _timed('indegree', lambda: g.indegree(range(g.vcount())))

# 6. outdegree
outdegree = _timed('outdegree', lambda: g.outdegree(range(g.vcount())))

computing degree took 0.018128156661987305 seconds
computing page rank took 0.3403968811035156 seconds
computing coreness took 0.153914213180542 seconds
computing eigenvector centrality took 1.0849132537841797 seconds
computing indegree took 0.018941640853881836 seconds
computing outdegree took 0.01842641830444336 seconds


In [9]:
# combine the per-vertex features into a matrix (one row per vertex, one column per feature);
# the triangle counts are stored in `n_triangles`
# NOTE(review): outdegree is computed above but not included here - confirm whether that is intended
feature_mat = np.stack((degree, page_rank, coreness, ev_centrality, indegree, n_triangles), axis=1)

## 4 Node Classification

In [10]:
from sklearn import tree
from tqdm import tqdm

In [11]:
# let's use a decision tree as the classifier;
# a fixed random_state makes the fitted tree (and the reported accuracy) reproducible across runs
classifier = tree.DecisionTreeClassifier(random_state=0)

#### 1.

In [12]:
# prepare the data:
# document representation as predictor (x), subject as response (y)

# convert each vertex attribute to an array once and reuse it for both splits
attr_all = np.array(g.vs['attr'])
label_all = np.array(g.vs['label'])

# vert_ids_1 / vert_ids_2 are column vectors (from np.argwhere), so indexing
# adds a length-1 axis that squeeze() removes again
x_train = attr_all[vert_ids_1].squeeze()
y_train = label_all[vert_ids_1].squeeze()

x_test = attr_all[vert_ids_2].squeeze()
y_test = label_all[vert_ids_2].squeeze()

In [13]:
# fit the tree to the train data
# (the result of fit is bound back to `classifier`, which is scored in the next step)
classifier = classifier.fit(x_train, y_train)

In [14]:
# test the tree on the test data and report accuracy
# (x_test / y_test hold the vertices with year >= 2019, which were not used for fitting)
accuracy = classifier.score(x_test, y_test)
print('accuracy:', accuracy)

accuracy: 0.2679258481986709


#### 2.

In [49]:
# prepare the data
# induced subgraphs on the two vertex sets, so that neighborhoods are taken
# within the train graph and within the test graph separately
g_train = g.induced_subgraph(vert_ids_1.squeeze())
g_test = g.induced_subgraph(vert_ids_2.squeeze())


def _raw_and_neighborhood_mean(graph):
    """Return the raw vertex attributes with neighborhood-mean attributes appended.

    For every vertex of ``graph`` the mean is taken over the 'attr' vectors of
    its neighbors (as returned by ``graph.neighbors``) and of the vertex itself.
    The result has one row per vertex: raw attributes first, then the means.
    """
    raw = np.array(graph.vs['attr'])
    means = [
        np.mean(raw[graph.neighbors(v) + [v]], axis=0)
        for v in range(graph.vcount())
    ]
    return np.concatenate((raw, means), axis=1)


# one feature matrix per subgraph, in the order (train, test)
x = [_raw_and_neighborhood_mean(subgraph) for subgraph in (g_train, g_test)]

# separate into train and test splits
x_train, x_test = x[0], x[1]


In [50]:
# fit a fresh classifier on the extended features;
# a fixed random_state makes the fitted tree (and the reported accuracy) reproducible across runs
classifier = tree.DecisionTreeClassifier(random_state=0)
classifier = classifier.fit(x_train, y_train)

In [51]:
# test and report accuracy
# (scored on x_test / y_test, i.e. the features and labels of the second vertex set)
accuracy = classifier.score(x_test, y_test)
print('accuracy:', accuracy)

accuracy: 0.28625804991461434
