# Exercise Sheet 1

### 1.1 Load a Graph

In [79]:
import igraph
import numpy as np

In [80]:
# Load the graph from a pickle file; the relative path is resolved against the
# current working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load this file if
# it comes from a trusted source.
g = igraph.Graph.Read_Pickle('ogbn-arxiv.pickle')

#### 1.

In [81]:
# Report the size of the loaded graph: vertex count first, then edge count.
n_vertices = g.vcount()
n_edges = g.ecount()
print('number of vertices:', n_vertices)
print('number of edges:', n_edges)

number of vertices: 169343
number of edges: 1166243


#### 2.

In [82]:
# In-degree of every vertex (= number of citations each article receives)
indegree_list = g.indegree()

# Vertex ids of the 5 most cited articles. argsort is ascending, so the last
# five entries are the top five, ordered from least to most cited.
most_cited = np.argsort(indegree_list)[-5:]

# Fetch the label attribute once; g.vs['label'] builds the full per-vertex list,
# so looking it up inside the loop would rebuild it on every iteration.
labels = g.vs['label']

# Print the subject area of each of these articles.
# The loop variable is deliberately not called `id`: at notebook scope that
# would shadow the builtin id() for every later cell.
print('subjects of 5 most cited articles:')
for vertex_id in most_cited:
    # the label is unpacked, so its elements are printed rather than the container
    print(*labels[vertex_id])

subjects of 5 most cited articles:
16
24
16
16
24


#### 3.

In [83]:
# Collect every article that cites at least one of the most cited articles,
# i.e. all predecessors (sources of incoming edges) of those vertices.
# The loop variable is deliberately not called `id`: at notebook scope that
# would shadow the builtin id() for every later cell.
citing_ids = []
for cited_id in most_cited:
    citing_ids.extend(g.predecessors(cited_id))

# An article that cites several of the top articles must be counted only once
citing_ids = np.unique(citing_ids)

# Subject area (label) of each citing article
citing_subj = np.array(g.vs['label'])[citing_ids]

# Count how often each subject area occurs among the citing articles
subjects, counts = np.unique(citing_subj, return_counts=True)

# Report the most frequent subject area. np.unique returns the subjects sorted
# and argmax returns the first maximum, so ties resolve to the smallest label.
print('subject area that most articles referencing the 5 most cited articles belong to:',
      subjects[np.argmax(counts)])

subject area that most articles referencing the 5 most cited articles belong to: 16


## 2 Split the Graph into Training and Test Data Set

In [84]:
# Publication year of every vertex, in vertex order
vert_year = np.array(g.vs['year'])

# Separate the vertices into two sets: V_1 (year before 2019) and V_2 (2019 or later).
# NOTE(review): np.argwhere returns a 2-D array with one row per match, not a
# flat vector of vertex ids -- confirm that downstream code expects this shape.
before_2019 = vert_year < 2019
from_2019_on = vert_year >= 2019
vert_id_1 = np.argwhere(before_2019)
vert_id_2 = np.argwhere(from_2019_on)

## 3 Vertex Features

### 3.1 Triangle counts


#### 1.
Let $G$ be a graph and $v \in V(G)$.

Claim: $\Delta(v) = |E(G[\mathcal{N}(v)])|$

Proof:

For each pair
$$u, w \in \mathcal{N}(v): \{u, v\}, \{v, w\} \in E(G)$$
by definition of a neighborhood.

Therefore
$$\{u, v, w\}\; \text{form a triangle} \;\Leftrightarrow \;\{u, w\} \in E(G[\mathcal{N}(v)]),$$

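so every triangle containing $v$ corresponds to exactly one edge of the subgraph induced by the neighborhood of $v$, and vice versa. Hence the number of triangles that include $v$ is equal to the number of edges in the neighborhood of $v$: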

$$\Delta(v) = |E(G[\mathcal{N}(v)])|.$$





#### 2.

In [85]:
# Number of triangles each vertex is part of, computed as the number of edges
# in the subgraph induced by the vertex's neighborhood (edge directions ignored).

# Build the undirected graph once instead of converting one subgraph per vertex.
# mode='collapse' merges all edges between the same pair of vertices, so two
# articles connected in both directions still contribute a single edge.
g_undirected = g.as_undirected(mode='collapse')

# For each vertex: induce the subgraph on its neighbors and count its edges.
# The loop variable is deliberately not called `id`: at notebook scope that
# would shadow the builtin id() for every later cell.
n_triangles = np.array([
    g_undirected.induced_subgraph(g_undirected.neighbors(vertex_id)).ecount()
    for vertex_id in range(g_undirected.vcount())
])

### 3.2 Other Vertex Features

In [103]:
# Per-vertex structural features, each computed for all vertices of g.
all_vertices = range(g.vcount())

# 1. degree
degree = g.degree(all_vertices)

# 2. page rank score
page_rank = g.pagerank()

# 3. coreness
coreness = g.coreness()

# 4. eigenvector centrality
# NOTE(review): g appears to be directed (in-degrees and predecessors are used
# elsewhere); eigenvector centrality can be degenerate on directed graphs with
# few or no cycles -- confirm the resulting values are meaningful.
ev_centrality = g.eigenvector_centrality()

# 5. indegree
indegree = g.indegree(all_vertices)

# 6. number of triangles a vertex is part of
# (an alias for the n_triangles array computed earlier, not a copy)
triangleness = n_triangles

In [109]:
# Assemble the feature matrix by stacking the feature sequences along axis 1,
# so each feature becomes one column (in the order listed below).
feature_columns = (
    degree,
    page_rank,
    coreness,
    ev_centrality,
    indegree,
    triangleness,
)
feature_mat = np.stack(feature_columns, axis=1)