In [4]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import matplotlib.pyplot as plt
import re
from collections import Counter
import os

In [5]:
# Load twentieth-century text file
with open("key_events_20th_century.txt", "r", encoding="utf-8", errors="ignore") as file:
    text_data = file.read()

In [7]:
# show some suspicious characters often found in scraped text / PDFs
suspects = ["\u2018", "\u2019", "\u201c", "\u201d", "\u2013", "\u2014", "\u2026", "\ufeff"]  # ‘ ’ “ ” – — … BOM
labels = {
    "\u2018": "left single quote ‘",
    "\u2019": "right single quote ’",
    "\u201c": "left double quote “",
    "\u201d": "right double quote ”",
    "\u2013": "en dash –",
    "\u2014": "em dash —",
    "\u2026": "ellipsis …",
    "\ufeff": "BOM \\ufeff",
}

found = {labels[c]:text_data.count(c) for c in suspects if text_data.count(c) > 0}
found

{'en dash –': 101, 'em dash —': 4}

In [9]:
clean_text = text_data.replace("–", "-").replace("—", "-")

In [10]:
# optional: normalize whitespace
clean_text = re.sub(r"\s+", " ", clean_text).strip()  

In [11]:
# check output
{"en dash –": clean_text.count("–"), "em dash —": clean_text.count("—")}

{'en dash –': 0, 'em dash —': 0}

In [14]:
with open("key_events_20th_century.txt", "w", encoding="utf-8") as f:
    f.write(clean_text)

#  Text wrangling observations

- The text contains special dash characters: an en dash (–) appears 101 times and an em dash (—) appears 4 times.
- These characters can affect consistent text matching and tokenization.

# Fix applied

- I replaced all en dashes (–) and em dashes (—) with a standard hyphen (-).
- I saved the cleaned version as `twentieth_century_cleaned.txt` for use in the next steps.

In [15]:
# loading cleaned text
with open("key_events_20th_century.txt", "r", encoding="utf-8", errors="ignore") as f:
    clean_text = f.read()

In [17]:
countries = [
    "United States",
    "United Kingdom",
    "Germany",
    "France",
    "Russia",
    "Soviet Union",
    "Japan",
    "China",
    "Italy",
    "Spain",
    "India",
    "Poland",
    "Austria",
    "Hungary",
    "Turkey"
]

In [18]:
text_lower = clean_text.lower()

def country_in_text(country):
    return re.search(r"\b" + re.escape(country.lower()) + r"\b", text_lower) is not None

present = [c for c in countries if country_in_text(c)]
missing = [c for c in countries if not country_in_text(c)]

In [19]:
print("Countries found in text:", present)
print("Countries not found in text:", missing)

Countries found in text: ['United States', 'United Kingdom', 'Germany', 'France', 'Russia', 'Soviet Union', 'Japan', 'China', 'Italy', 'Spain', 'India', 'Poland', 'Austria', 'Hungary']
Countries not found in text: ['Turkey']


In [20]:
for term in ["Turkey", "Turkish", "Ottoman", "Ottoman Empire", "Türkiye", "Constantinople", "Istanbul"]:
    print(term, bool(re.search(r"\b" + re.escape(term.lower()) + r"\b", clean_text.lower())))

Turkey False
Turkish False
Ottoman True
Ottoman Empire True
Türkiye False
Constantinople False
Istanbul False


# Text wrangling and country name consistency

- I compared the country names in my predefined list with the names used in the text.
- All countries except Turkey were found explicitly.
- Further inspection showed that the text refers to Turkey using historical terms such as “Ottoman” and “Ottoman Empire.”
- Since this reflects historically accurate terminology for the time period, no changes were made to the text. The cleaned text was retained for further analysis.

In [21]:
NER = spacy.load("en_core_web_sm")

In [24]:
doc = NER(clean_text)

In [23]:
type(doc)

spacy.tokens.doc.Doc

In [25]:
df_sentences = []

In [26]:
for sent in doc.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({
        "sentence": sent.text,
        "entities": entity_list
    })

In [27]:
df_sentences = pd.DataFrame(df_sentences)

In [28]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,Key events of the 20th century - Wikipedia Jum...,"[the 20th century - Wikipedia Jump, Main, Navi..."
1,The rise of dictatorship 1.4 Global war: World...,"[1.4, World War II, 1939-1945, 1.4.1]"
2,The war in Europe 1.4.2,[Europe]
3,Blitzkrieg 1.4.3 Operation Barbarossa 1.4.4,"[Blitzkrieg 1.4.3 Operation, 1.4.4]"
4,Turning tides 1.4.5 Operation Overlord 1.4.6 F...,[Operation Overlord 1.4.6 Final]
5,the Pacific 1.4.7.1 Background 1.4.8 Japanese ...,"[Japanese, 1.4.9, Allied]"
6,Final days,[]
7,1.4.11 The Holocaust 1.4.12,[]
8,The Nuclear Age begins 1.5 The post-war world ...,"[1.5, 1.5.1, 1.5.2]"
9,The Cold War (1947-1991),"[The Cold War, 1947-1991]"


In [29]:
def filter_countries(entity_list, countries):
    return [ent for ent in entity_list if ent in countries]

In [30]:
df_sentences["country_entities"] = df_sentences["entities"].apply(lambda x: filter_countries(x, countries))

In [31]:
df_sentences.head(10)

Unnamed: 0,sentence,entities,country_entities
0,Key events of the 20th century - Wikipedia Jum...,"[the 20th century - Wikipedia Jump, Main, Navi...",[]
1,The rise of dictatorship 1.4 Global war: World...,"[1.4, World War II, 1939-1945, 1.4.1]",[]
2,The war in Europe 1.4.2,[Europe],[]
3,Blitzkrieg 1.4.3 Operation Barbarossa 1.4.4,"[Blitzkrieg 1.4.3 Operation, 1.4.4]",[]
4,Turning tides 1.4.5 Operation Overlord 1.4.6 F...,[Operation Overlord 1.4.6 Final],[]
5,the Pacific 1.4.7.1 Background 1.4.8 Japanese ...,"[Japanese, 1.4.9, Allied]",[]
6,Final days,[],[]
7,1.4.11 The Holocaust 1.4.12,[],[]
8,The Nuclear Age begins 1.5 The post-war world ...,"[1.5, 1.5.1, 1.5.2]",[]
9,The Cold War (1947-1991),"[The Cold War, 1947-1991]",[]


In [32]:
def clean_entity(ent):
    ent = str(ent)
    ent = re.sub(r"[\[\]]", "", ent)          # remove brackets
    ent = re.sub(r"^\d+(\.\d+)*\s*", "", ent) # remove leading 1.4.1 etc.
    ent = re.sub(r"\s+\d+(\.\d+)*$", "", ent) # remove trailing 1.4.1
    ent = ent.strip()
    return ent

In [33]:
df_sentences["entities_clean"] = df_sentences["entities"].apply(lambda lst: [clean_entity(x) for x in lst])

In [34]:
df_sentences.head(10)

Unnamed: 0,sentence,entities,country_entities,entities_clean
0,Key events of the 20th century - Wikipedia Jum...,"[the 20th century - Wikipedia Jump, Main, Navi...",[],"[the 20th century - Wikipedia Jump, Main, Navi..."
1,The rise of dictatorship 1.4 Global war: World...,"[1.4, World War II, 1939-1945, 1.4.1]",[],"[, World War II, -1945, ]"
2,The war in Europe 1.4.2,[Europe],[],[Europe]
3,Blitzkrieg 1.4.3 Operation Barbarossa 1.4.4,"[Blitzkrieg 1.4.3 Operation, 1.4.4]",[],"[Blitzkrieg 1.4.3 Operation, ]"
4,Turning tides 1.4.5 Operation Overlord 1.4.6 F...,[Operation Overlord 1.4.6 Final],[],[Operation Overlord 1.4.6 Final]
5,the Pacific 1.4.7.1 Background 1.4.8 Japanese ...,"[Japanese, 1.4.9, Allied]",[],"[Japanese, , Allied]"
6,Final days,[],[],[]
7,1.4.11 The Holocaust 1.4.12,[],[],[]
8,The Nuclear Age begins 1.5 The post-war world ...,"[1.5, 1.5.1, 1.5.2]",[],"[, , ]"
9,The Cold War (1947-1991),"[The Cold War, 1947-1991]",[],"[The Cold War, -1991]"


In [35]:
df_sentences_filtered = df_sentences[df_sentences["country_entities"].map(len) > 0].copy()
df_sentences_filtered = df_sentences_filtered.reset_index(drop=True)

In [36]:
df_sentences_filtered.head()

Unnamed: 0,sentence,entities,country_entities,entities_clean
0,After a period of diplomatic and military esca...,"[the July Crisis, the end of July 1914, the Br...","[France, Austria]","[the July Crisis, the end of July, the British..."
1,[ 1 ] [ 2 ] In 1917 Russia ended hostile actio...,"[1, 2, 1917, Russia, the Central Powers, Tsar]",[Russia],"[, , , Russia, the Central Powers, Tsar]"
2,The Bolsheviks negotiated the Treaty of Brest-...,"[Bolsheviks, the Treaty of Brest-Litovsk, Germ...","[Germany, Russia]","[Bolsheviks, the Treaty of Brest-Litovsk, Germ..."
3,"In the treaty, Bolshevik Russia ceded the Balt...","[Bolshevik Russia, Baltic, Germany, Kars Oblas...",[Germany],"[Bolshevik Russia, Baltic, Germany, Kars Oblas..."
4,Although Germany shifted huge forces from the ...,"[Germany, Allied, American, 1918]",[Germany],"[Germany, Allied, American, ]"


In [37]:
relationships = []
window_size = 5

for i in range(len(df_sentences_filtered)):
    end_i = min(i + window_size, len(df_sentences_filtered) - 1)

In [41]:
# combine countries across the window of sentences
country_list = sum(df_sentences_filtered.loc[i:end_i, "country_entities"], [])

In [44]:
# remove immediate duplicates
country_unique = [country_list[j]
        for j in range(len(country_list))
        if (j == 0) or (country_list[j] != country_list[j - 1])
    ]

In [56]:
relationships = []
window_size = 5

for i in range(len(df_sentences_filtered)):
    end_i = min(i + window_size, len(df_sentences_filtered) - 1)

    country_list = sum(
        df_sentences_filtered.loc[i:end_i, "country_entities"],
        []
    )

    country_unique = [
        country_list[j]
        for j in range(len(country_list))
        if (j == 0) or (country_list[j] != country_list[j - 1])
    ]

    if len(country_unique) > 1:
        for idx, a in enumerate(country_unique[:-1]):
            b = country_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [57]:
relationship_df = pd.DataFrame(relationships)

In [58]:
relationship_df.head()

Unnamed: 0,source,target
0,France,Austria
1,Austria,Russia
2,Russia,Germany
3,Germany,Russia
4,Russia,Germany


In [59]:
relationship_df.to_csv("relationships.csv", index=False)