In [1]:
import pandas as pd
import numpy as np
import spacy
import matplotlib.pyplot as plt
import networkx as nx
import scipy


In [2]:
from pathlib import Path

# Raw page dump, addressed relative to the notebook's working directory.
TEXT_RAW_PATH = Path("../Data/20th_century_page.txt")

# Read the whole file as UTF-8; bytes that cannot be decoded are replaced
# instead of raising, so a partially corrupt dump still loads.
with TEXT_RAW_PATH.open(mode="r", encoding="utf-8", errors="replace") as raw_file:
    text_raw = raw_file.read()

In [3]:
import re
import unicodedata
from pathlib import Path

def clean_text(s: str) -> str:
    """Normalise scraped page text into a tidy form with plain punctuation.

    Steps, in order:
      1. Unicode NFKC normalisation.
      2. Curly quotes become straight quotes; en/em dashes become "-".
      3. Runs of spaces/tabs collapse to a single space.
      4. Whitespace directly before a newline is removed.
      5. Three or more consecutive newlines collapse to one blank line.
      6. Leading/trailing whitespace is stripped from the whole text.

    Args:
        s: Raw text to clean.

    Returns:
        The cleaned text. Blank lines separating paragraphs are preserved
        (at most one blank line in a row).
    """
    s = unicodedata.normalize("NFKC", s)
    s = s.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    s = s.replace("–", "-").replace("—", "-")
    s = re.sub(r"[ \t]+", " ", s)
    # Match only non-newline whitespace before a newline. Using \s+ here would
    # also match "\n" itself, swallowing blank lines and turning the
    # paragraph-collapsing step below into a no-op.
    s = re.sub(r"[^\S\n]+\n", "\n", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()


# Normalise the raw page text once; the cleaned string is what gets saved
# and previewed below.
text_clean = clean_text(text_raw)

# Persist the cleaned text next to the raw file (the directory is created
# if it does not exist yet).
clean_path = Path("../Data/twentieth_century_clean.txt")
Path("../Data/").mkdir(parents=True, exist_ok=True)
clean_path.write_text(text_clean, encoding="utf-8")

# Quick sanity check: show the first 500 characters of the cleaned text.
print(text_clean[:500])

Key events of the 20th century - Wikipedia Close November 10th, 1:03 pm: "Wikipedia still can't be sold." - An important update for readers in Germany. Please don't skip this 1-minute read. It's Wednesday and if you're like us, you've used Wikipedia countless times. To settle an argument with a friend. To satisfy a curiosity. Whether it's 3 in the morning or the afternoon, Wikipedia is useful in your life. We do not run ads - we rely on the support of everyday readers. This is the 15th day of ou


In [4]:
# CSV holding the list of country names (column "country_name").
COUNTRIES_PATH = Path("../Data/countries_list_20th_century_1.5.csv")

df_countries = pd.read_csv(filepath_or_buffer=COUNTRIES_PATH)

# Preview the first ten rows as the cell's rich output.
df_countries.iloc[:10]

Unnamed: 0.1,Unnamed: 0,country_name
0,1,Afghanistan
1,2,Albania
2,3,Algeria
3,4,Andorra
4,5,Angola
5,6,Antigua and Barbuda
6,7,Argentina
7,8,Armenia
8,9,Australia
9,10,Austria


In [5]:
# Re-read the CSV so this cell is idempotent: the in-place column clean-up
# below always starts from the file contents, not from a previous run.
COUNTRIES_PATH = Path("../Data/countries_list_20th_century_1.5.csv")

df_countries = pd.read_csv(COUNTRIES_PATH)
assert "country_name" in df_countries.columns

# Remember which rows actually carry a name *before* casting: astype(str)
# turns a missing value into the literal string "nan", which must not end up
# in the lookup set.
has_name = df_countries["country_name"].notna()

df_countries["country_name"] = df_countries["country_name"].astype(str).str.strip()

# Lookup set for exact-match filtering of entity text; empty names are
# excluded as well.
countries_set = set(df_countries.loc[has_name, "country_name"]) - {""}


In [6]:
import spacy, time
from itertools import combinations
import pandas as pd

def _iter_text_chunks(text, max_chars=200000):
    """Yield consecutive pieces of `text`, each at most `max_chars` long.

    A plain fixed-width slice can cut through a word, an entity or a
    sentence. Here each cut is moved back to just after the last newline,
    ". " or space found in the second half of the window (tried in that
    order). If the window has no such break point, a hard cut at
    `max_chars` is used. Joining the yielded pieces reproduces `text`.
    """
    start = 0
    total = len(text)
    while start < total:
        end = min(start + max_chars, total)
        if end < total:
            window = text[start:end]
            for sep in ("\n", ". ", " "):
                pos = window.rfind(sep, len(window) // 2)
                if pos != -1:
                    end = start + pos + len(sep)
                    break
        yield text[start:end]
        start = end


# Only the components needed for entity recognition are loaded. The parser is
# excluded, so a sentencizer is added to provide `doc.sents`.
nlp = spacy.load("en_core_web_sm", exclude=["tagger","parser","lemmatizer","attribute_ruler"])
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")

rows = []
sent_id = 0  # running sentence counter across all chunks (counts every sentence)
t0 = time.time()  # start timestamp; not reported in this cell

for doc in nlp.pipe(_iter_text_chunks(text_clean, 200000), batch_size=4):
    for sent in doc.sents:
        # Keep geopolitical/location entities whose text is exactly a listed country.
        ents = [e.text for e in sent.ents if e.label_ in ("GPE", "LOC")]
        matched = [e for e in ents if e in countries_set]
        if matched:
            rows.append({
                "sent_id": sent_id,
                "sentence": sent.text.strip(),
                # dict.fromkeys de-duplicates while keeping first-seen order
                "countries_in_sentence": list(dict.fromkeys(matched))
            })
        sent_id += 1

# Pin the columns so the frame keeps its schema even when nothing matched.
df_sent_countries = pd.DataFrame(
    rows, columns=["sent_id", "sentence", "countries_in_sentence"]
)
print("Sentences with ≥1 country:", len(df_sent_countries))
df_sent_countries.head()

Sentences with ≥1 country: 146


Unnamed: 0,sent_id,sentence,countries_in_sentence
0,1,An important update for readers in Germany.,[Germany]
1,30,Technology and IT work in Germany In order for...,[Germany]
2,36,We also provide funding for local spaces in va...,[Germany]
3,61,Annual revenue worldwide - a comparison Google...,[Germany]
4,75,After a period of diplomatic and military esca...,"[France, Austria, Hungary]"


In [31]:
from itertools import combinations
import pandas as pd

# Build one row per unordered country pair that co-occurs in a sentence.
rel_rows = []
for row in df_sent_countries.itertuples(index=False):
    cs = row.countries_in_sentence
    # combinations() yields nothing when a sentence has fewer than two
    # countries, so no explicit length check is needed.
    for a, b in combinations(cs, 2):
        # Sort the pair so (A, B) and (B, A) are recorded as the same edge.
        s, t = sorted([a, b])
        rel_rows.append({
            "sent_id": row.sent_id,
            "source": s,
            "target": t,
            "sentence": row.sentence
        })

# Pin the columns so an empty result still has the expected schema (and a
# header row when written to CSV).
df_relationships = pd.DataFrame(
    rel_rows, columns=["sent_id", "source", "target", "sentence"]
)
df_relationships.head(10)

Unnamed: 0,sent_id,source,target,sentence
0,70,Austria,France,After a period of diplomatic and military esca...
1,70,France,Hungary,After a period of diplomatic and military esca...
2,70,Austria,Hungary,After a period of diplomatic and military esca...
3,72,Germany,Russia,The Bolsheviks negotiated the Treaty of Brest-...
4,112,Germany,Italy,"Germany, 1933 Fascism first appeared in Italy ..."
5,120,Austria,Germany,"33 ] Hitler began to put his plan in motion, a..."
6,131,France,Poland,The signing of the Molotov-Ribbentrop Pact in ...
7,133,France,Germany,"Britain and France, much to Hitler's surprise,..."
8,133,France,Poland,"Britain and France, much to Hitler's surprise,..."
9,133,Germany,Poland,"Britain and France, much to Hitler's surprise,..."


In [33]:
from pathlib import Path

# Output locations for the relationship table and the optional pair counts.
OUT_REL = Path("../Data/relationships_countries.csv")
OUT_PAIR = Path("../Data/country_pairs_counts.csv")

saved_paths = []

df_relationships.to_csv(OUT_REL, index=False, encoding="utf-8")
saved_paths.append(OUT_REL)

# `pair_counts` is written only if some other cell has defined it.
if 'pair_counts' in globals():
    pair_counts.to_csv(OUT_PAIR, index=False, encoding="utf-8")
    saved_paths.append(OUT_PAIR)

# Report the absolute location of every file that was written.
print("Saved:")
for saved_path in saved_paths:
    print("-", saved_path.resolve())

Saved:
- D:\Data_Projects\20th_century_project\Data\relationships_countries.csv
