In [1]:
import networkx as nx
import os
import re

In [None]:
def extract_title_link(match):
    """Return the link target of a wiki link with spaces turned into underscores.

    The target is the text that follows the opening ``[[`` up to (but not
    including) the first ``|`` or ``]``.

    Args:
        match: Text containing a wiki link such as ``[[Target]]`` or
            ``[[Target|label]]``.

    Returns:
        The target with every space replaced by ``_``, or ``None`` when no
        ``[[`` followed by at least one target character is found.
    """
    found = re.search(r'\[\[([^\|\]]+)', match)
    return found.group(1).replace(" ", "_") if found else None

def findLinks(wikipage):
    """Collect the distinct wiki-link targets found in a page.

    A link is ``[[target]]`` or ``[[target|label]]`` where both parts consist
    only of word characters, whitespace, hyphens and parentheses. Each match
    is reduced to its target by ``extract_title_link`` (spaces become
    underscores).

    Args:
        wikipage: Full text of the page.

    Returns:
        A sorted list of unique link targets. Matches without a target
        (e.g. ``[[]]`` or ``[[|label]]``) are left out.
    """
    pattern = r'\[{2}[\w\-\s\(\)]*\|?[\w\s\-\(\)]*\]{2}'
    # Deduplicate on the extracted title rather than on the raw match, so
    # that [[Foo]] and [[Foo|bar]] count as a single link to "Foo".
    titles = {extract_title_link(raw) for raw in re.findall(pattern, wikipage)}
    # The pattern also accepts links with an empty target, for which
    # extract_title_link returns None; callers expect strings only.
    titles.discard(None)
    # Sorting makes the result reproducible from run to run.
    return sorted(titles)

def build_graph_from_files(path):
    """Build a directed wiki-link graph from the ``.txt`` pages in ``path``.

    Every ``<name>.txt`` file in ``path`` becomes a node ``<name>`` carrying a
    ``contentlength`` attribute (number of whitespace-separated words in the
    file). An edge ``A -> B`` is added when page ``A`` contains a wiki link
    whose title, as returned by ``findLinks``, matches an existing ``B.txt``
    file. Isolated nodes are then removed and only the largest weakly
    connected component is kept.

    Args:
        path: Directory containing the page files.

    Returns:
        networkx.DiGraph: A copy of the largest weakly connected component,
        or an empty graph when no nodes remain after removing isolated ones.
    """
    # Set membership keeps the "does the target page exist?" check O(1).
    txt_files = {f for f in os.listdir(path) if f.endswith(".txt")}

    G = nx.DiGraph()
    edges = []

    # Single pass over the files (sorted for a reproducible node order):
    # each page is read once for both its word count and its links.
    for file in sorted(txt_files):
        page = os.path.splitext(file)[0]
        with open(os.path.join(path, file), "r", encoding="utf-8") as f:
            wikipage = f.read()
        G.add_node(page, contentlength=len(wikipage.split()))

        for link in findLinks(wikipage):
            # findLinks may yield None for links with an empty target; skip
            # those instead of failing on `None + ".txt"`.
            if link is not None and link + ".txt" in txt_files:
                edges.append((page, link))

    # Edges are added once every page node exists, so no endpoint is created
    # without its 'contentlength' attribute.
    G.add_edges_from(edges)

    # Remove isolated nodes
    G.remove_nodes_from(list(nx.isolates(G)))

    # Connectivity is undefined for an empty graph (networkx raises
    # NetworkXPointlessConcept), so hand back the empty graph directly.
    if G.number_of_nodes() == 0:
        return G

    # Keep the largest weakly connected component; when the graph is already
    # connected this is simply a copy of the whole graph.
    largest_cc = max(nx.weakly_connected_components(G), key=len)
    return G.subgraph(largest_cc).copy()

# Build the link graph from the .txt pages in "downloads/" (a path relative to
# the current working directory). S holds the graph returned by
# build_graph_from_files and is reused by the cells below.
path = "downloads/"
S = build_graph_from_files(path)

In [3]:
# Report the size of the graph S (node and edge counts).
print(f"Number of nodes: {S.number_of_nodes()}")
print(f"Number of edges: {S.number_of_edges()}")

Number of nodes: 1366
Number of edges: 10850


In [6]:
import pickle

GRAPH_PICKLE = "graph.pkl"

# Save the graph locally so it does not have to be rebuilt each time.
# The context manager guarantees the file is flushed and closed before it is
# read back below.
with open(GRAPH_PICKLE, "wb") as fh:
    pickle.dump(S, fh)

# Load the graph back.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# were produced by this notebook or come from a trusted source.
with open(GRAPH_PICKLE, "rb") as fh:
    S = pickle.load(fh)

print(f"Number of nodes: {S.number_of_nodes()}")
print(f"Number of edges: {S.number_of_edges()}")

Number of nodes: 1465
Number of edges: 11202
