# Prelude

Contains the following:
1. Imports and defining helper functions
2. Scraping data from Wikipedia and downloading it
3. Building the network either from
    - (A) Downloaded files
    - (B) Local pickle file (created from last time A was run)
4. Simple preliminary data analysis of the network

## 1. Imports and defining helper functions

In [1]:
import urllib.request
import urllib.parse  # to handle special characters in the title
import json
import shutil
import re
import os
import networkx as nx
import csv
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import pickle

# Directory (relative to the working directory) that holds one downloaded wikitext file per page
DOWNLOADS_DIR = "downloads"
# NOTE(review): presumably a JSON file listing the scraped title links; it is not
# referenced in the visible part of this notebook — confirm it is still needed
TITLE_LINKS_FILE = "title_links.json"

def getJsonResponse(title):
  """Fetch the latest wikitext of an English Wikipedia page via the MediaWiki API.

  Args:
    title: Page title. It is percent-encoded before being placed in the query.

  Returns:
    The wikitext stored in the "main" slot of the page's first returned
    revision, or None when the title is empty, the request fails, the HTTP
    status is not 200, the body is not valid JSON, or the page has no
    revisions.
  """
  # An empty or missing title cannot name a page; fail the same way as any other lookup error
  if not title:
    return None

  # Define the components of the query
  baseurl = "https://en.wikipedia.org/w/api.php?"
  action = "action=query"
  titles_param = f"titles={urllib.parse.quote(title)}"  # kept separate so `title` stays the raw title
  content = "prop=revisions&rvprop=content"
  dataformat = "format=json"
  rvslots = "rvslots=main"

  # Construct the query URL
  query = "{}{}&{}&{}&{}&{}".format(baseurl, action, content, titles_param, dataformat, rvslots)

  try:
    # The context manager closes the HTTP response on every exit path
    with urllib.request.urlopen(query) as wikiresponse:
      # Check if the HTTP status is OK (200)
      if wikiresponse.getcode() != 200:
        print(f"Error: Received non-200 HTTP status code {wikiresponse.getcode()}")
        return None

      wikidata = wikiresponse.read()

    # Parse the JSON response
    try:
      wikiJson = json.loads(wikidata)
    except json.JSONDecodeError:
      print("Error: Failed to decode JSON response")
      return None

    # Only one title is requested, so the 'pages' mapping holds a single entry
    page = next(iter(wikiJson['query']['pages'].values()))

    # Pages without revisions (e.g. missing pages) carry no wikitext
    if 'revisions' in page and len(page['revisions']) > 0:
      return page['revisions'][0]['slots']['main']['*']  # wikitext from the "main" slot
    return None

  except urllib.error.URLError as e:
    print(f"Network error: {e.reason}")
    return None
  except Exception as e:
    # Boundary handler: callers rely on None (not an exception) for any failed lookup
    print(f"Unexpected error: {str(e)}")
    return None

## Convert a wiki link to its link title, e.g. [[John McCain (fictional)|John McCain]] => John_McCain_(fictional)
def extract_title_link(match):
  """Return the target title of a wiki link with spaces turned into underscores.

  The target is the text between the opening "[[" and the first "|" or "]".
  When no target can be found, an error line is printed and None is returned.
  """
  # Capture everything after "[[" up to (not including) the first "|" or "]"
  found = re.search(r'\[\[([^\|\]]+)', match)
  if not found:
    print("ERROR FINDING ", match)
    return None

  target = found.group(1)
  return target.replace(" ", "_")

def findLinks(wikipage):
  """Return the unique link titles found in a page of wikitext.

  A link looks like [[Target]] or [[Target|Label]], where target and label
  consist of word characters, whitespace, hyphens and parentheses. Each target
  is returned once, with spaces replaced by underscores, in order of first
  appearance.

  Args:
    wikipage: Wikitext to scan. None or an empty string yields an empty list.

  Returns:
    A list of link titles without duplicates and without empty entries.
  """
  # A failed download is represented by None; treat it like a page without links
  if not wikipage:
    return []

  # Same link syntax as e.g. [[John McCain (fictional)|John McCain]]; the group captures the target
  pattern = r'\[{2}([\w\-\s\(\)]*)\|?[\w\s\-\(\)]*\]{2}'
  targets = re.findall(pattern, wikipage)

  # dict.fromkeys de-duplicates on the final title while keeping first-seen order;
  # links with an empty target such as [[]] or [[|label]] name no page and are skipped
  titles = dict.fromkeys(
    target.replace(" ", "_") for target in targets if target.strip()
  )
  return list(titles)

def build_graph_from_files(path):
    """Build the directed link graph from the downloaded wikitext files.

    Every ``<page>.txt`` file in ``path`` becomes a node named ``<page>`` with a
    ``contentlength`` attribute (whitespace-separated word count of the file).
    An edge ``a -> b`` is added when the text of ``a`` links to ``b`` and
    ``b.txt`` exists in ``path``. Isolated nodes are removed and only the
    largest weakly connected component is kept.

    Args:
        path: Directory containing the ``.txt`` files.

    Returns:
        A new ``nx.DiGraph`` holding the largest weakly connected component,
        or an empty ``nx.DiGraph`` when no connected pages remain.
    """
    # A set gives O(1) "does this page exist as a file" checks
    available_files = set(os.listdir(path))

    word_counts = {}     # page -> number of whitespace-separated words
    outgoing_links = {}  # page -> linked pages that exist as files

    # Read each file once, collecting both its word count and its outgoing links
    for file in available_files:
        if not file.endswith(".txt"):
            continue

        page = os.path.splitext(file)[0]
        with open(os.path.join(path, file), "r", encoding="utf-8") as f:
            wikipage = f.read()

        word_counts[page] = len(wikipage.split())
        for link in findLinks(wikipage):
            # Skip unparsable links (None) and links to pages that were not downloaded
            if link is None or link + ".txt" not in available_files:
                continue
            outgoing_links.setdefault(page, []).append(link)

    G = nx.DiGraph()

    # Add all pages to the graph with the 'contentlength' attribute
    for page, word_count in word_counts.items():
        G.add_node(page, contentlength=word_count)

    # Add edges based on outgoing links
    for page, links in outgoing_links.items():
        for link in links:
            G.add_edge(page, link)

    # Remove isolated nodes
    G.remove_nodes_from(list(nx.isolates(G)))

    # Connectivity is undefined for a graph without nodes, so return it as is
    if G.number_of_nodes() == 0:
        return G

    # Keep the largest weakly connected component (the whole graph when it is connected)
    largest_cc = max(nx.weakly_connected_components(G), key=len)
    return G.subgraph(largest_cc).copy()

## 2. Scraping data
Fetches philosopher data from their Wikipedia pages and saves each page as `{philosopher_name}.txt` in a `downloads/` directory.

>**NOTES**
> 1. This takes a while to run
> 2. It deletes all previous content in `downloads`
> 3. Downloads all pages but skips pages with *no content* and pages that are *redirects*.

In [2]:
wiki_links = ["List of philosophers (A–C)", "List of philosophers (D–H)", "List of philosophers (I–Q)", "List of philosophers (R–Z)"]
title_links = []

verbose = False  # Debug output during loops
invalid_links = []  # Track titles that could not be saved
redirect_links = []  # Track titles that are redirects

# Delete and recreate the downloads directory so files from earlier runs cannot end up in the graph
if os.path.exists(DOWNLOADS_DIR):
    shutil.rmtree(DOWNLOADS_DIR)  # Delete the directory and all its contents
os.makedirs(DOWNLOADS_DIR, exist_ok=True)  # Recreate the directory

# Collect the links from every list page
for wiki_link in wiki_links:
    wiki_markup = getJsonResponse(wiki_link)
    if not wiki_markup:
        # getJsonResponse returns None on failure; skip the page instead of crashing while parsing it
        print(f"Warning: could not fetch '{wiki_link}'; its links are missing from this run.")
        continue
    title_links.extend(findLinks(wiki_markup))

# Links that appear on the list pages but are not philosophers
unwanted_titles = {"List_of_philosophers", "Philosopher", "Stanford_Encyclopedia_of_Philosophy", "Encyclopedia_of_Philosophy", "Routledge_Encyclopedia_of_Philosophy", "The_Cambridge_Dictionary_of_Philosophy", "The_Oxford_Companion_to_Philosophy"}

# Keep each usable title once (first occurrence wins) so no page is downloaded twice,
# and drop every occurrence of the irrelevant links
title_links = [link for link in dict.fromkeys(title_links) if link and link not in unwanted_titles]

# Writing to files (warning this takes a while)
for title_link in title_links:
    all_wikitext = getJsonResponse(title_link)
    if not all_wikitext:
        if verbose: print(f"Skipping '{title_link}' as it has no content.")
        invalid_links.append(title_link)  # Track invalid pages without modifying the list directly
        continue

    # Skip redirects; MediaWiki accepts the keyword in any letter case (e.g. "#redirect")
    if all_wikitext.strip().upper().startswith("#REDIRECT"):
        if verbose: print(f"Skipping '{title_link}' as it is a redirect.")
        redirect_links.append(title_link)  # Track redirect pages
        continue

    filename = os.path.join(DOWNLOADS_DIR, f"{title_link}.txt")
    with open(filename, "w", encoding="utf-8") as file:
        file.write(all_wikitext)  # save all the wikitext into one file

# A set makes the membership test constant-time instead of rebuilding a list per title
skipped_links = set(invalid_links) | set(redirect_links)
title_links = [link for link in title_links if link not in skipped_links]
print(f"Downloaded {len(title_links)} pages.")
print(f"Skipped {len(invalid_links)} pages with no content.")
print(f"Skipped {len(redirect_links)} redirect pages.")

Downloaded 1485 pages.
Skipped 32 pages with no content.
Skipped 249 redirect pages.


## 3. Building the network 

### (A) Create from scratch 
From the `downloads/` directory (saves a local pickle file for later)

In [3]:
S = build_graph_from_files(DOWNLOADS_DIR)
# Saved as local version for later use for (B); the with-block flushes and closes the file
with open("graph.pkl", "wb") as graph_file:
    pickle.dump(S, graph_file)


### (B) OR use local version 
From `pickle` file created last time you ran (A)

In [5]:
# Load the graph saved by (A). Only unpickle files you created yourself:
# pickle can execute arbitrary code while loading.
with open("graph.pkl", "rb") as graph_file:
    S = pickle.load(graph_file)

## 4. Preliminary data analysis

In [6]:
# Basic size of the network
node_total = S.number_of_nodes()
print(f"Number of nodes: {node_total}")
edge_total = S.number_of_edges()
print(f"Number of edges: {edge_total}")

# Add up the on-disk size of every downloaded .txt file
download_size = 0
for entry in os.listdir(DOWNLOADS_DIR):
    if not entry.endswith(".txt"):
        continue
    download_size += os.path.getsize(os.path.join(DOWNLOADS_DIR, entry))

bytes_per_mb = 1024 * 1024
download_size_mb = download_size / bytes_per_mb  # Convert bytes to MB
print(f"Size of downloaded data: {download_size_mb:.2f} MB")

Number of nodes: 1366
Number of edges: 10850
Size of downloaded data: 48.80 MB
