# Make a dataset of subgraphs sampled from a larger network

This notebook shows how to format the Wikipedia link dataset gathered from `get_wiki_data.ipynb` as a network using NetworkX and sample egocentric subgraphs from it.

In [None]:
import networkx as nx
import pickle as pkl
import random

In [None]:
def egoGraph(graph, node=None, returnNodeName=False):
    """
    Get the ego graph centred at the specified node, or at a random node if
    no node is specified. Only nodes at distance 1 from the centre are included.

    graph: the graph to sample from
    node: the name of the node at the centre of the graph, or None to choose
        a node uniformly at random
    returnNodeName: whether to also return the name of the centre node. If
        randomly selecting a node, this is necessary to know exactly which
        one was selected

    Returns the ego graph, or the tuple (ego graph, centre node name) when
    returnNodeName is True.
    """
    if node is None:
        node = random.choice(list(graph.nodes()))
    # Use the resolved centre node here; the previous code passed an undefined
    # name, which raised NameError on every call.
    subgraph = nx.ego_graph(graph, node, radius=1)
    if returnNodeName:
        return subgraph, node
    return subgraph

## Load link dataset and format as edgelist

In [None]:
# NOTE: pickle can execute arbitrary code while loading; only load files that
# you produced yourself (here: the output of get_wiki_data.ipynb).
# The `with` blocks make sure the file handles are closed after loading.
with open('pageLinks.pkl', 'rb') as linksFile:
    pageLinks = pkl.load(linksFile)
with open('pageTitles.pkl', 'rb') as titlesFile:
    pageTitles = pkl.load(titlesFile)

Create edgelist from all articles:

In [None]:
# Each entry of pageLinks is indexed as (source page, linked pages); emit one
# (source, target) pair per link.
edgeList1 = [
    (entry[0], linkedPage)
    for entry in pageLinks
    for linkedPage in entry[1]
]

Remove edges that link to a page that wasn't in the original list (optional):

In [None]:
# Build a set once so each membership test is a hash lookup rather than a scan
# of the whole title collection (assumes the titles are hashable, e.g. strings).
pageTitleSet = set(pageTitles)
# Keep only edges whose source and target are both known pages.
edgeList2 = [
    edge for edge in edgeList1
    if edge[0] in pageTitleSet and edge[1] in pageTitleSet
]

Remove self-edges:

In [None]:
# Drop every edge that links a page to itself.
edgeList3 = []
for edge in edgeList2:
    source, target = edge[0], edge[1]
    if source != target:
        edgeList3.append(edge)

Format as undirected NetworkX Graph:

In [None]:
# Start from an empty undirected graph and add every edge; duplicate and
# reversed pairs collapse into a single undirected edge.
graph = nx.Graph()
graph.add_edges_from(edgeList3)

## Sample subgraphs

Single example:

In [None]:
# No centre node is given, so egoGraph picks one at random; ask for its name
# back so we know which page the example is centred on.
egoNet, centrePage = egoGraph(graph, returnNodeName=True)
print(centrePage)
nx.draw(egoNet)

#### Make graph dataset

No node is used as the centre of more than one ego graph.

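- numGraphs = number of graphs to sample
- minNodes = lower bound on the number of nodes in each ego graph (exclusive: graphs must have more than this many nodes)
- maxNodes = upper bound on the number of nodes in each ego graph (exclusive: graphs must have fewer than this many nodes)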

In [None]:
numGraphs = 150
minNodes = 15
maxNodes = 50

# Visit each node at most once, in random order. This selects centre pages
# uniformly at random without repeats, and is guaranteed to terminate even if
# fewer than numGraphs nodes have an ego graph of a suitable size (sampling
# with replacement until enough are found would loop forever in that case).
candidatePages = list(graph.nodes())
random.shuffle(candidatePages)

nets = []
centrePages = []
for candidatePage in candidatePages:
    if len(nets) >= numGraphs:
        break
    tempNet, tempPage = egoGraph(graph = graph, node = candidatePage, returnNodeName = True)
    # Both bounds are exclusive: the ego graph must have strictly more than
    # minNodes and strictly fewer than maxNodes nodes.
    if minNodes < len(tempNet) < maxNodes:
        nets.append(tempNet)
        centrePages.append(tempPage)

if len(nets) < numGraphs:
    print('Only found {} of the {} requested graphs with between {} and {} nodes.'.format(
        len(nets), numGraphs, minNodes, maxNodes))

In [None]:
# save dataset
#pkl.dump(nets, open('graphDataset.pkl', 'wb'))