In [4]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import matplotlib.pyplot as plt
import re

Matplotlib is building the font cache; this may take a moment.


In [6]:
with open(r"C:\Users\moein\20th-century\data\raw\key_events_20th_century_raw.txt", "r", errors="ignore") as file:
    text = file.read().replace("\n", " ")

In [7]:
# Remove special characters like em-dashes, ellipses
text = re.sub(r"[^A-Za-z0-9,.!?;:'\"\s]", " ", text)

# Normalize multiple spaces
text = re.sub(r"\s+", " ", text)

In [9]:
with open(r"C:\Users\moein\20th-century\data\raw\20th_century_cleaned.txt", "w", encoding="utf-8") as f:
    f.write(text)

In [10]:
# Create a NER object
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

In [11]:
# Split sentence entities
df_sentences = []
for sent in doc.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent.text, "entities": entity_list})

df_sentences = pd.DataFrame(df_sentences)
df_sentences.head()

Unnamed: 0,sentence,entities
0,SOURCE: https: en.wikipedia.org wiki Key event...,"[the 20th century, 2025, 10:45:33, UTC]"
1,Wikipedia content is under CC BY SA 4.0.,"[Wikipedia, SA 4.0]"
2,Historic events in the 20th century World at t...,"[the 20th century, the beginning of the centur..."
3,The war in Europe Blitzkrieg Operation Barbaro...,"[Europe, Operation Overlord Final]"
4,The war in the Pacific Japanese Expansion Alli...,"[the Pacific Japanese Expansion, The Holocaust..."


In [12]:
# Load the countries list
countries_df = pd.read_csv(
    r"C:\Users\moein\20th-century\data\reference\countries_list_20th_century_1.5.csv"
)

def filter_entities(ent_list, country_df):
    return [ent for ent in ent_list if ent in list(country_df['country_name'])]

# Apply filter to keep only entities that are countries
df_sentences['country_entities'] = df_sentences['entities'].apply(
    lambda x: filter_entities(x, countries_df)
)

# Keep only rows with at least one recognized country
df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

In [15]:
print("Total sentences:", len(df_sentences))
print("Rows w/ any entities:", (df_sentences['entities'].map(len) > 0).sum())
print("Rows w/ country entities:", (df_sentences['country_entities'].map(len) > 0).sum())
print(df_sentences.head(3))
print(df_sentences['entities'].head(10))

Total sentences: 572
Rows w/ any entities: 518
Rows w/ country entities: 0
                                            sentence  \
0  SOURCE: https: en.wikipedia.org wiki Key event...   
1           Wikipedia content is under CC BY SA 4.0.   
2  Historic events in the 20th century World at t...   

                                            entities country_entities  
0            [the 20th century, 2025, 10:45:33, UTC]               []  
1                                [Wikipedia, SA 4.0]               []  
2  [the 20th century, the beginning of the centur...               []  
0              [the 20th century, 2025, 10:45:33, UTC]
1                                  [Wikipedia, SA 4.0]
2    [the 20th century, the beginning of the centur...
3                   [Europe, Operation Overlord Final]
4    [the Pacific Japanese Expansion, The Holocaust...
5                       [the Cold War, the Space Race]
6                            [the 21st century, today]
7                          

In [16]:
def norm(s: str) -> str:
    # lower, remove dots/commas, collapse spaces
    s = s.lower()
    s = re.sub(r"[^\w\s-]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Build a fast lookup set from your country list
countries_df = pd.read_csv(r"C:\Users\moein\20th-century\data\reference\countries_list_20th_century_1.5.csv")

# If your CSV has a column 'country_name'
country_names = set(norm(x) for x in countries_df['country_name'].astype(str))

# Add common aliases (extend as needed)
aliases = {
    "u s": "united states",
    "u s a": "united states",
    "us": "united states",
    "usa": "united states",
    "u k": "united kingdom",
    "uk": "united kingdom",
    "great britain": "united kingdom",
    "britain": "united kingdom",
    "russia": "russian federation",
    "soviet union": "ussr",
    "ussr": "ussr",
    "iran": "iran",
    "pr china": "china",
    "peoples republic of china": "china",
}
# Merge aliases into the lookup
country_names |= set(aliases.values())

def normalize_and_map(ent: str) -> str | None:
    n = norm(ent)
    if n in aliases: n = aliases[n]
    return n if n in country_names else None

# Rebuild country_entities with normalization
def filter_entities(ent_list):
    out = []
    for ent in ent_list:
        mapped = normalize_and_map(ent)
        if mapped:
            out.append(mapped.title())  # title-case for readability
    return out

df_sentences['country_entities'] = df_sentences['entities'].apply(filter_entities)
df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0].copy()
df_sentences_filtered.reset_index(drop=True, inplace=True)

print("Rows after normalization:", len(df_sentences_filtered))

Rows after normalization: 137


In [17]:
relationships = []
n = len(df_sentences_filtered)
if n == 0:
    print("No country co-occurrences found after normalization. Expand aliases or check text cleaning.")
else:
    window = 5  # sentences
    for i in range(n):
        end_i = min(i + window, n - 1)
        # flatten list of lists in the window
        country_list = [ent for ents in df_sentences_filtered.loc[i:end_i, 'country_entities'] for ent in ents]

        if not country_list:
            continue

        # remove immediate duplicates to reduce noise
        country_unique = [c for idx, c in enumerate(country_list) if idx == 0 or c != country_list[idx - 1]]

        # record adjacent pairs in the window; sort pair so A-B == B-A
        for a, b in zip(country_unique, country_unique[1:]):
            pair = tuple(sorted((a, b)))
            if pair[0] != pair[1]:  # skip self-pairs
                relationships.append({"source": pair[0], "target": pair[1]})

    relationships_df = pd.DataFrame(relationships)

    if relationships_df.empty:
        print("No pairs formed. Try a larger window or broaden aliases.")
    else:
        # aggregate frequencies
        relationships_df["value"] = 1
        relationships_df = relationships_df.groupby(["source", "target"], as_index=False)["value"].sum()
        display(relationships_df.head(10))

Unnamed: 0,source,target,value
0,Albania,Bulgaria,6
1,Albania,Greece,12
2,Albania,Ussr,5
3,Algeria,Italy,5
4,Algeria,Morocco,6
5,Angola,Cape Verde,6
6,Angola,Mozambique,6
7,Australia,Canada,6
8,Australia,Japan,5
9,Australia,Philippines,6


In [22]:
# when creating relationships:
relationships.append({
    "source": min(a, b),
    "target": max(a, b)
})

# Ensure strings, drop bad rows
tmp = relationships_df[['source','target']].dropna().astype(str).copy()

# Build sorted-pair key so A–B == B–A
tmp['pair'] = tmp.apply(lambda r: tuple(sorted((r['source'], r['target']))), axis=1)

# Count occurrences
out = tmp.groupby('pair').size().reset_index(name='value')

# Split pair back to two columns
out[['source','target']] = pd.DataFrame(out['pair'].tolist(), index=out.index)
out = out[['source','target','value']].sort_values('value', ascending=False).reset_index(drop=True)

display(out.head(10))

Unnamed: 0,source,target,value
0,Albania,Bulgaria,1
1,India,Pakistan,1
2,Japan,Poland,1
3,Japan,Philippines,1
4,Italy,United Kingdom,1
5,Italy,Japan,1
6,Israel,South Africa,1
7,Israel,Pakistan,1
8,Israel,Libya,1
9,Iran,United States,1


In [23]:
# Save and export dataframe
relationships_df.to_csv(
    r"C:\Users\moein\20th-century\data\reference\country_relationships.csv",
    index=False,
    encoding="utf-8"
)