In [1]:
import random
import networkx as nx

In [2]:
# Load the network from its edge-list file; node labels are parsed as integers
# so that they can be compared with min()/max() later on.
G = nx.read_edgelist(
    "../data/25_1.txt",
    nodetype=int,
)

In [3]:
# Seed the module-level RNG so that the seed node picked below, and every later
# random.choice() call made by the walk, is the same on each Restart & Run All.
random.seed(42)

# We need to keep track of the nodes and the edges we have sampled, so we store them in sets.
sampled_nodes = set()
sampled_edges = set()

# The exploration needs a seed node to start from; we simply pick one at random.
curnode = random.choice(list(G.nodes))

In [4]:
# Random-walk sampling: keep walking until exactly 2000 distinct nodes have been sampled.
# The comparison is a strict `<`: with `<=` the loop would take one more step after
# reaching 2000 and end up with 2001 sampled nodes.
# NOTE: the loop only ends once that many distinct nodes have been visited, so the walk
# must be able to reach at least 2000 nodes from the seed node.
while len(sampled_nodes) < 2000:
    # First we get the neighbors of the node we're currently exploring.
    neighbors = list(G.neighbors(curnode))
    if curnode not in sampled_nodes:
        # First visit to this node: remember it, and record all of its connections.
        sampled_nodes.add(curnode)
        # The network is undirected, so each edge is stored in a canonical
        # (smaller id, larger id) form. This way an edge already recorded from its
        # other endpoint is not stored a second time with the endpoints swapped.
        sampled_edges.update(
            (min(curnode, neighbor), max(curnode, neighbor)) for neighbor in neighbors
        )
    # Random-walk step: move to a uniformly chosen neighbor of the current node.
    curnode = random.choice(neighbors)


In [5]:
# Build the sampled graph from the collected edges. Every edge of a sampled node is
# kept, so the graph also contains the neighbors that were never visited by the walk
# and therefore has more nodes than were sampled.
G_smpl = nx.Graph()
G_smpl.add_edges_from(sampled_edges)
print(G_smpl.number_of_nodes())

63167
