# `Prelude`

## Step 1: Imports

In [None]:
import urllib.request
import urllib.parse  # to handle special characters in the title
import json
import re
import os
import networkx as nx
import csv
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

## Step 2: Data Fetching
This takes a while to run!

In [None]:
def getJsonResponse(title):
  # Define the components of the query
  baseurl = "https://en.wikipedia.org/w/api.php?"
  action = "action=query"
  title = f"titles={urllib.parse.quote(title)}"
  content = "prop=revisions&rvprop=content"
  dataformat = "format=json"
  rvslots = "rvslots=main"

  # Construct the query URL
  query = "{}{}&{}&{}&{}&{}".format(baseurl, action, content, title, dataformat, rvslots)

  try:
    # Make the request to Wikipedia API
    wikiresponse = urllib.request.urlopen(query)

    # Check if the HTTP status is OK (200)
    if wikiresponse.getcode() != 200:
      print(f"Error: Received non-200 HTTP status code {wikiresponse.getcode()}")
      return None

    wikidata = wikiresponse.read()

    # Parse the JSON response
    try:
      wikiJson = json.loads(wikidata)
    except json.JSONDecodeError:
      print("Error: Failed to decode JSON response")
      return None

    # Get the page from the JSON response
    page = next(iter(wikiJson['query']['pages'].values()))  # extract the single page

    # Check if the page has revisions and extract the latest wikitext content
    if 'revisions' in page and len(page['revisions']) > 0:
      wikitext = page['revisions'][0]['slots']['main']['*']  # extract wikitext from "main" slot
      return wikitext
    else:
      #print(f"Error: Page '{title}' does not contain revisions.")
      return None

  except urllib.error.URLError as e:
    print(f"Network error: {e.reason}")
    return None
  except Exception as e:
    print(f"Unexpected error: {str(e)}")
    return None

## Convert the list to link titles e.g. John McCain (fictional) => John_McCain_(fictional)
def extract_title_link(match):
  # Regular expression to match the content between [[ and | (the first part of the link)
  title = re.search(r'\[\[([^\|\]]+)', match)
  if title:
    # Replace all whitespaces in the title with underscores
    return title.group(1).replace(" ", "_")
  else:
    print("ERROR FINDING ", match)
    return None

def findLinks(wikipage):
  pattern = r'\[{2}[\w\-\s\(\)]*\|?[\w\s\-\(\)]*\]{2}' ## regex for finding links e.g.: [[John McCain (fictional)|John McCain]]
  matches = re.findall(pattern, wikipage)
  # Convert the list to a set to keep only unique matches
  unique_matches = set(matches)

  links = [extract_title_link(unique_match) for unique_match in unique_matches]
  return links

## Step 3: Creating the network
From fetched data

In [None]:
# Set the directory to downloads
DOWNLOADS_DIR = "downloads"
os.makedirs(DOWNLOADS_DIR, exist_ok=True)  # Ensure the folder exists

wiki_links = ["List of philosophers (A–C)", "List of philosophers (D–H)", "List of philosophers (I–Q)", "List of philosophers (R–Z)"]
title_links = []
for wiki_link in wiki_links:
  wiki_markup = getJsonResponse(wiki_link)
  title_links.extend(findLinks(wiki_markup))
#wiki_markup_a_c = getJsonResponse("List of philosophers (A–C)")
#wiki_markup_d_h = getJsonResponse("List of philosophers (D–H)")
#wiki_markup_i_q = getJsonResponse("List of philosophers (I–Q)")
#wiki_markup_r_z = getJsonResponse("List of philosophers (R–Z)")

#title_links = findLinks(wiki_markup_a_c)
#title_links.extend(findLinks(wiki_markup_d_h))
#title_links.extend(findLinks(wiki_markup_i_q))
#title_links.extend(findLinks(wiki_markup_r_z))

# Remove irrelevant links if they exist
for unwanted in ["List_of_philosophers", "Philosopher", "Stanford_Encyclopedia_of_Philosophy", "Encyclopedia_of_Philosophy", "Routledge_Encyclopedia_of_Philosophy", "The_Cambridge_Dictionary_of_Philosophy", "The_Oxford_Companion_to_Philosophy"]:
    if unwanted in title_links:
        title_links.remove(unwanted)

# Writing to files (warning this takes a while)
invalid_links = []  # Track titles that could not be saved
for title_link in title_links:
  all_wikitext = getJsonResponse(title_link)
  if not all_wikitext:
    print(f"Skipping '{title_link}' as it has no content.")
    invalid_links.append(title_link)  # Track invalid pages without modifying the list directly
    continue
  filename = os.path.join(DOWNLOADS_DIR, f"{title_link}.txt")
  with open(filename, "w", encoding="utf-8") as file:
    file.write(all_wikitext) # save all the wikitext into one file

# Remove invalid links from title_links after iteration
title_links = [link for link in title_links if link not in invalid_links]

In [14]:
files = [f for f in os.listdir(DOWNLOADS_DIR) if f.endswith(".txt")]

outgoing_links = {}
for file in files:
  if not file.endswith(".txt"): # makes sure only the txt files are opened
    continue
  file_path = os.path.join(DOWNLOADS_DIR, file)
  with open(file_path, "r", encoding="utf-8") as f:
    wikipage = f.read()
    wikipage_links = findLinks(wikipage)
    withoutExtension = os.path.splitext(file)[0]

    for link in wikipage_links:
      if link in title_links:
        outgoing_links.setdefault(withoutExtension, []).append(link)

G = nx.DiGraph()
for page in title_links:
  file_path = os.path.join(DOWNLOADS_DIR, f"{page}.txt")
  with open(file_path, "r", encoding="utf-8") as f:
    content = f.read()
  word_count = len(content.split()) # splits the content into words by whitespace.
  G.add_node(page, contentlength=word_count)
  for link in outgoing_links.get(page) or []:
    G.add_edge(page, link)

## Removing isolated nodes
isolated_nodes = [node for node, degree in dict(G.degree()).items() if degree == 0]
if (isolated_nodes): G.remove_nodes_from(isolated_nodes)

## Getting the largest connected component
largest_cc = max(nx.weakly_connected_components(G), key=len)
S = G.subgraph(largest_cc).copy() # copying it to another variable
S_undirected = S.to_undirected() # making it undirected

### Prelimenary data analysis

In [18]:
import os
# General graph analysis
print(G) # whole graph
print(S) # largest connected component

# Calculating total data size
total_size = sum(os.path.getsize(os.path.join(DOWNLOADS_DIR, f"{title_link}.txt")) for f in os.listdir(DOWNLOADS_DIR) if f.endswith(".txt"))
total_size_mb = total_size / (1024 * 1024)  # Convert bytes to MB
print(f"Total Size of Data: {total_size_mb:.2f} MB")

# Number of rows
num_rows = len(title_links)
print(f"Number of Rows (Philosophers): {num_rows}")

# Number of nodes and links
num_nodes = S.number_of_nodes()
num_links = S.number_of_edges()
print(f"Number of Nodes: {num_nodes}")
print(f"Number of Links: {num_links}")

DiGraph with 1470 nodes and 11203 edges
DiGraph with 1464 nodes and 11200 edges
Total Size of Data: 45.90 MB
Number of Rows (Philosophers): 1734
Number of Nodes: 1464
Number of Links: 11200
