# 1.6 Intro to NLP and Network Analysis

In [125]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [126]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

## Data wrangling

In [127]:
with open('Key events of the 20th century_without references.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

In [155]:
# The text contained newline characters (\n), numbered references in square brackets (e.g., [1], [2]), and extra spaces, which could interfere with tokenization.
# There were also a few special characters (such as *) that were not relevant for analysis.

In [128]:
# Clean text: remove newlines, brackets, extra spaces
cleaned_data = re.sub(r'\n+', ' ', data)
cleaned_data = re.sub(r'\[[0-9]+\]', '', cleaned_data)     # remove [1], [2], etc.
cleaned_data = re.sub(r'\s+', ' ', cleaned_data).strip()   # normalize spaces
cleaned_data = re.sub(r'[^A-Za-z0-9\s.,;:\'\"-]', '', cleaned_data)  # remove odd symbols

In [129]:
with open("Key_events_cleaned.txt", "w") as f:
    f.write(cleaned_data)

## NER

In [130]:
book = NER(cleaned_data)

In [131]:
# Visualize identified entities

displacy.render(book[273:20000], style = "ent", jupyter = True)

## Get named entity list per sentence

In [132]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [133]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(Key, events, of, the, 20th, century, -, Wikip...","[the 20th century - WikipediaJump, Navigation ..."
1,"(World, War, II, 193919451.4.1The, war, in, Eu...","[World War II, Pacific1.4.7.1Background1.4.8Ja..."
2,"(decolonization1.5.2The, Cold, War, 194719911....",[Cold War 194719911.5.3War]
3,"(race1.5.5The, end, of, the, Cold, War1.5.6Inf...","[the Cold War1.5.6Information, 20th, Actions, ..."
4,"(The, World, Wars, sparked, tension, between, ...","[the Cold War, the Space Race]"
5,"(These, advancements, have, played, a, signifi...","[the 21st century, today]"
6,"(Historic, events, in, the, 20th, centuryeditW...",[the 20th]
7,"(Edwardian, eraThe, new, beginning, of, the, 2...","[Edwardian, the 20th century]"
8,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]"
9,"(1914, saw, the, completion, of, the, Panama, ...","[1914, the Panama Canal]"


## Mapping and Normalisation

In [134]:
country_map = {
    # Soviet Union variations
    'Soviet Union': 'Russia',
    'Soviet': 'Russia',
    'USSR': 'Russia',
    'Union of Soviet Socialist Republics': 'Russia',

    # United States variations
    'United States of America': 'United States',
    'United States': 'United States',
    'America': 'United States',
    'US': 'United States',
    'U.S.': 'United States',

    # United Kingdom variations
    'UK': 'United Kingdom',
    'U.K.': 'United Kingdom',
    'Britain': 'United Kingdom',
    'Great Britain': 'United Kingdom',
    'England': 'United Kingdom'
}

In [135]:
from typing import Sequence, List

def normalise_countries(entities: Sequence[str], country_map: dict[str, str]) -> List[str]:
    """Apply manual normalisations used in the original notebook."""
    return [country_map.get(name, name) for name in entities]

In [136]:
normalised = normalise_countries(["US", "UK"], country_map)


In [137]:
#  country_map
entities_normalised = []
for entities in entities_per_sentence:
    normalised = [country_map.get(e, e) for e in entities]
    entities_normalised.append(normalised)


In [138]:
nlp = spacy.load("en_core_web_sm")   

entities_per_sentence = []
for sent in nlp(book).sents:
    doc = nlp(sent.text)
    entities = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
    entities_per_sentence.append(entities)

In [139]:
df_sentences = pd.DataFrame({
    'Sentence': [sent.text for sent in nlp(book).sents],
    'Country': entities_per_sentence
})

## Load country names

In [140]:
df_countries = pd.read_csv("List of Countries_2.0.csv", index_col = 0)

In [141]:
df_countries.head()

Unnamed: 0,Country
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola


In [142]:
valid_countries = df_countries['Country'].tolist()

entities_filtered = []
for entities in entities_normalised:
    filtered = [e for e in entities if e in entities_normalised]
    entities_filtered.append(filtered)


In [143]:
# Apply normalization and filtering using lambda

# Normalize country names
df_sentences['Country'] = df_sentences['Country'].apply(
    lambda lst: [country_map.get(e, e) for e in lst]
)

# Keep only countries that exist in the CSV country list
df_sentences['Country'] = df_sentences['Country'].apply(
    lambda lst: [e for e in lst if e in valid_countries]
)


In [144]:
# Filter out rows that don't contain any country entities
df_sentences_filtered = df_sentences[df_sentences['Country'].map(len) > 0]

In [145]:
df_sentences_filtered.head(10)

Unnamed: 0,Sentence,Country
15,After a period of diplomatic and military esca...,"[France, Austria, Hungary]"
16,"In 1917, Russia ended hostile actions against ...",[Russia]
17,The Bolsheviks negotiated the Treaty of Brest-...,"[Germany, Russia]"
18,"In the treaty, Bolshevik Russia ceded the Balt...",[Germany]
19,It also recognized the independence of Ukraine.,[Ukraine]
20,Although Germany shifted huge forces from the ...,[Germany]
29,"Germany was never occupied by Allied troops, y...",[Germany]
58,"Germany, 1933Fascism first appeared in Italy w...","[Germany, Italy]"
60,When Adolf Hitler came to power in Germany in ...,"[Germany, Germany]"
61,The Nazi Party in Germany was dedicated to the...,[Germany]


## Create relationships

In [146]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    char_list = sum((df_sentences_filtered.loc[i: end_i].Country), [])
    
    # Remove duplicated characters that are next to each other
    char_unique = [char_list[i] for i in range(len(char_list)) 
                   if (i==0) or char_list[i] != char_list[i-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [147]:
relationship_df = pd.DataFrame(relationships)

In [148]:
relationship_df

Unnamed: 0,source,target
0,France,Austria
1,Austria,Hungary
2,France,Austria
3,Austria,Hungary
4,Hungary,Russia
...,...,...
871,South Africa,Rwanda
872,South Africa,Rwanda
873,South Africa,Rwanda
874,Rwanda,North Korea


In [149]:
# Sort the cases with a->b and b->a

relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head()

Unnamed: 0,source,target
0,Austria,France
1,Austria,Hungary
2,Austria,France
3,Austria,Hungary
4,Hungary,Russia


In [150]:
# Summarize the interactions

relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [151]:
relationship_df.head(10)

Unnamed: 0,source,target,value
0,Austria,France,6
1,Austria,Hungary,6
2,Hungary,Russia,5
3,Germany,Russia,33
4,Germany,Ukraine,10
5,Germany,Italy,26
6,Austria,Germany,10
7,France,United Kingdom,24
8,France,Poland,6
9,Poland,United Kingdom,5


In [152]:
relationship_df.shape

(114, 3)

In [153]:
relationship_df.to_csv('NER_countries_relationship.csv')