# Path lengths
The first thing we want to take a look at is path lengths. NetworkX allows us to calculate the shortest path between any pair of articles. We begin by comparing the lengths of human paths and shortest paths.

*Exercises*

For each source/target pair in the list of human navigation paths, calculate the shortest path using NetworkX. Plot the distribution of path lengths.

In [72]:
import csv

# Load articles.tsv into the list 'articles': one parsed row (a list of
# fields) per data line of the file.
# newline='' is how the csv module asks for files to be opened.
with open('./articles.tsv', 'r', newline='') as f:
    # Skip the 12 header lines without reading the whole file into memory
    for _ in range(12):
        next(f, None)
    # Parse as tab-separated, like the links.tsv and paths_finished.tsv
    # loaders do; with csv's default comma delimiter a comma inside a field
    # would split it into several fields.
    reader = csv.reader(f, delimiter='\t')
    # csv.reader yields an empty list for a blank line; leave those out
    articles = [row for row in reader if row]

In [73]:
import networkx as nx

# Create a directed graph
G = nx.DiGraph()

# Add every article as a node, identified by the first field of its row in
# 'articles'. The edges are later added using the string fields read from
# links.tsv, so using list indices as node ids would leave a separate set of
# integer nodes that no edge ever touches.
# Empty rows are skipped because they carry no article.
G.add_nodes_from(row[0] for row in articles if row)

In [74]:
# Read links.tsv and insert one directed edge per row into 'G', pointing
# from the article in the first column to the article in the second column
with open('./links.tsv', 'r') as f:
    # Everything after the first 12 lines (the header) is link data
    link_lines = f.readlines()[12:]
    G.add_edges_from(
        (row[0], row[1]) for row in csv.reader(link_lines, delimiter='\t')
    )

In [75]:
# Build 'human_navigation_paths' from paths_finished.tsv: one
# [source, target, length] entry per row of the file
with open('./paths_finished.tsv', 'r') as f:
    # The first 16 lines are the header; keep only what follows
    data_lines = f.readlines()[16:]
    human_navigation_paths = []
    for record in csv.reader(data_lines, delimiter='\t'):
        # The fourth column holds the ';'-separated sequence of steps
        steps = record[3].split(';')
        # Length = number of steps that are not the back-step marker '<'
        step_count = sum(1 for step in steps if step != '<')
        # Source is the first step, target is the last step
        human_navigation_paths.append([steps[0], steps[-1], step_count])

In [77]:
# Undirected copy of 'G', used for the shortest-path queries below.
# NOTE(review): this discards link direction, so a shortest path found here
# may traverse a link backwards - confirm that this is intended.
G_undirected = G.to_undirected()

# For every human path with a recorded length of at least 3, compute the
# shortest path length between its source and target; shorter human paths
# are left out.
shortest_path = [
    nx.shortest_path_length(G_undirected, source=start, target=end)
    for start, end, human_length in human_navigation_paths
    if human_length >= 3
]

In [112]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Print how many of the computed shortest paths have length 1, 2, 3 and 4,
# one count per line
for hops in range(1, 5):
    print(shortest_path.count(hops))

790
29578
19929
212
