# Community Extraction

In [1]:
import pickle
import os
import networkx as nx
import re
from wiki_utils import get_category_data, filter_graph_by_attribute

DOWNLOADS_DIR = "downloads"

# load graph:
S = pickle.load(open("graph.pkl", "rb"))

## (A) Using `region` (Not used)

### Creating `S_region`
Finds region in wikitext downloaded and adds it to the network (if multiple exist it takes the first region)

In [3]:
S_subfield = S.copy()

region_pattern = r'\|\s*region\s*=\s*\[\[([^\]]+)\]\]'
nodes_to_remove = []  # Collect nodes to remove

for node in list(S_subfield.nodes):
	node_file_name = DOWNLOADS_DIR + "/" + node + ".txt"
	if os.path.exists(node_file_name):
		try:
			with open(node_file_name, 'r', encoding='utf-8') as f:
				text = f.read()
				matches = re.search(region_pattern, text)
				if matches:
					# Extract all individual regions from the matched string
					regions = re.findall(r'\[\[([^\]]+)\]\]', matches.group(0))
					S_subfield.nodes[node]['region'] = regions[0]
				else:
					nodes_to_remove.append(node)  # Mark node for removal
		except Exception as e:
			print(f"Error processing node {node}: {e}")
	else:
		nodes_to_remove.append(node)  # Mark node for removal if file does not exist
S_subfield.remove_nodes_from(nodes_to_remove)

In [None]:
print("Original graph: ", S)
print("Region graph: ", S_subfield)
print(list(S_subfield.nodes(data=True))[143])

Original graph:  DiGraph with 1366 nodes and 10850 edges
Region graph:  DiGraph with 557 nodes and 4415 edges
('Anaximander', {'contentlength': 7671, 'region': 'Western philosophy'})


### Generating the communities

In [4]:
region_partition = {}

for node, data in S_subfield.nodes(data=True):
		genre = data.get('region')
		if genre not in region_partition:
			region_partition[genre] = []
		region_partition[genre].append(node)

print("Region communities with length:")
for genre, nodes in region_partition.items():
		print(f"Region: {genre}, Length: {len(nodes)}")

Region communities with length:
Region: Western philosophy, Length: 496
Region: Russian philosophy, Length: 10
Region: Persia, Length: 2
Region: Western Philosophy, Length: 3
Region: Western philosophy|Western, Length: 1
Region: Jewish philosophy, Length: 2
Region: East Asian philosophy, Length: 1
Region: Indian philosophy, Length: 1
Region: Greater Iran|Persia, Length: 1
Region: Eastern philosophy, Length: 5
Region: Chinese philosophy, Length: 6
Region: British Unitarianism, Length: 1
Region: Africana philosophy, Length: 1
Region: Middle Eastern philosophy, Length: 4
Region: Teng (state)|Teng, Length: 1
Region: Mithila region, Length: 1
Region: Middle East, Length: 1
Region: Greek philosophy, Length: 1
Region: Islamic philosophy, Length: 3
Region: Byzantine Empire, Length: 1
Region: Roman Egypt, Length: 1
Region: Spanish philosophy, Length: 1
Region: Al-Andalus, Length: 1
Region: African philosophy, Length: 1
Region: Ancient Greek philosophy, Length: 1
Region: German philosophy, Lengt

Only western philosophy is the big one.

## (B) Using `subfield`
https://en.wikipedia.org/wiki/Lists_of_philosophers

### (B.1) Extracting subfield information

First we fetch the data from Wikipedia 
(takes around 5 seconds)

In [2]:
subfield_links = [
		"List of aestheticians",
		"List of critical theorists",
		"List of environmental philosophers",
		"List of epistemologists",
		"List of ethicists",
		"List of existentialists",
		"List of feminist philosophers",
		"List of secular humanists", #List of Humanists redirect
		"List of logicians",
		"List of metaphysicians",
		"Index of sociopolitical thinkers", #List of Social and Political Philosophers redirect
		"List of phenomenologists",
		"List of philosophers of language",
		"List of philosophers of mind",
		"List of philosophers of religion",
		"List of philosophers of science",
		"List of political philosophers",
		"List of political theorists",
		"List of rationalists",
		"List of utilitarians"
]
replace_terms_subfields = [("List of", ""), ("Index of", "")]
clean_subfield_data, philosopher_subfield_info = get_category_data(
    subfield_links, replace_terms_subfields, verbose=True
)
print("Total =", len(philosopher_subfield_info))

- aestheticians: 86
- critical theorists: 107
- environmental philosophers: 80
- epistemologists: 122
- ethicists: 287
- existentialists: 60
- feminist philosophers: 134
- secular humanists: 413
- logicians: 291
- metaphysicians: 184
- sociopolitical thinkers: 245
- phenomenologists: 24
- philosophers of language: 141
- philosophers of mind: 140
- philosophers of religion: 110
- philosophers of science: 111
- political philosophers: 199
- political theorists: 137
- rationalists: 27
- utilitarians: 104
Total = 2183


### (B.2) Creating `S.subfield` network

In [3]:
S_subfield = filter_graph_by_attribute(
    graph=S,
    attribute_info=philosopher_subfield_info,
    attribute_name='subfields',
    verbose=True
)

Did not find subfields for: 771 philosophers (therefore not included in the filtered graph)
 -> Example of removed nodes: ['Georgy_Shchedrovitsky', 'James_McCosh', 'Zeno_of_Sidon']
Original graph: 1366 nodes and 10850 edges
Filtered 'subfields' graph: 595 nodes and 5208 edges


## (C) Using `western` `eastern` divide list

### (C.1) Extracting information from wikipedia

In [4]:
tradition_links = [
	"Timeline of Eastern philosophers",
	"Timeline of Western philosophers"
]

replace_terms_traditions = [("Timeline of", "")]
clean_tradition_data, philosopher_tradition_info = get_category_data(
    tradition_links, replace_terms_traditions, verbose=True
)
print("Total =", len(philosopher_tradition_info))

- Eastern philosophers: 343
- Western philosophers: 565
Total = 893


### (C.2) Creating `S_tradition` network

In [5]:
S_tradition = filter_graph_by_attribute(
    graph=S,
    attribute_info=philosopher_tradition_info,
    attribute_name='tradition',
    verbose=True
)

Did not find tradition for: 1020 philosophers (therefore not included in the filtered graph)
 -> Example of removed nodes: ['Abraham_Joshua_Heschel', 'Georgy_Shchedrovitsky', 'Andrea_Dworkin']
Original graph: 1366 nodes and 10850 edges
Filtered 'tradition' graph: 346 nodes and 3199 edges
