# Import libraries

In [92]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [93]:
# Load spaCy's small English pipeline; it is applied to the article text below
# to obtain sentences and named entities.
NER = spacy.load("en_core_web_sm")

# Creating an NER Object

In [95]:
# Load the book

# Read the article with an explicit encoding so the decoded text does not depend
# on the platform default. Undecodable bytes are still dropped (errors='ignore'),
# and newlines are flattened to spaces before the text is parsed.
with open('20th_century_article_wiki.txt', 'r', encoding='utf-8', errors='ignore') as file:
    data = file.read().replace('\n', ' ')

# Run the spaCy pipeline over the full text; `book` is the parsed document used
# by every later cell.
book = NER(data)

In [96]:
# Visualize identified entities

# Render only a slice of the parsed document rather than the whole article.
preview_span = book[273:20000]
displacy.render(preview_span, style="ent", jupyter=True)

## Data Wrangling

In [98]:
# Import the countries data set; the first CSV column is used as the index.
# NOTE(review): this is an absolute, machine-specific path, so the notebook will
# not run on another machine without editing it.
countries_df=pd.read_csv(r'/Users/yasersouri/Desktop/data analysis/specialization 1/20th-century/countries-df.csv',index_col=0)

In [99]:
# Highlight only the geopolitical entities (GPE) across the whole document.
gpe_only = {'ents': ['GPE']}
displacy.render(book, style='ent', options=gpe_only, jupyter=True)

In [100]:
# Define alias mappings
# Written as canonical name -> aliases and inverted into the alias -> canonical
# lookup, so every spelling of one country sits on a single line.
# NOTE(review): "Korea" is ambiguous (North/South), and the "Czech Republic" and
# "Iran" entries map a modern name to a historical one - confirm these match the
# names used in the reference country list.
alias_map = {
    alias: canonical
    for canonical, aliases in {
        "United States": ("USA", "US"),
        "United Kingdom": ("UK", "U.K."),
        "Russia": ("Soviet Union", "USSR"),
        "Czechoslovakia": ("Czech Republic",),
        "Democratic Republic of the Congo": ("DRC",),
        "South Korea": ("Korea",),
        "Persia": ("Iran",),
    }.items()
    for alias in aliases
}

In [101]:
# Collect the text of every entity spaCy labelled as a geopolitical entity (GPE).
# This list was not defined in any earlier cell, so the cell failed on a fresh
# kernel; it is built here from the parsed document.
gpe_entities = [ent.text for ent in book.ents if ent.label_ == "GPE"]

# Apply alias normalization to GPEs: known aliases are replaced by their
# standard name, everything else is kept unchanged.
normalized_gpes = [alias_map.get(gpe, gpe) for gpe in gpe_entities]


In [102]:
# Names found in the text that are not yet in the reference list.
# NOTE(review): `new_countries` was not defined in any visible cell. Deriving it
# from `normalized_gpes` is an assumption - it includes every unmatched GPE
# (cities and mis-tagged names too), so curate the list by hand if only real
# countries should be added.
known_countries = set(countries_df['Country'])
new_countries = sorted(set(normalized_gpes) - known_countries)

# Create a DataFrame from new_countries (columns are fixed so that an empty
# result still has the expected shape)
new_rows = pd.DataFrame(
    [{'Country': country, 'Mentions': 0} for country in new_countries],
    columns=['Country', 'Mentions'],
)

# Concatenate the new rows to the existing DataFrame. Skipped when nothing is
# new, so re-running the cell does not append duplicates.
if not new_rows.empty:
    countries_df = pd.concat([countries_df, new_rows], ignore_index=True)


In [103]:
# Write the extended country list back to the same CSV it was loaded from,
# overwriting the original file.
countries_df.to_csv(r'/Users/yasersouri/Desktop/data analysis/specialization 1/20th-century/countries-df.csv')

**Country Name Inconsistencies**

We compared GPE (Geopolitical Entity) mentions from the text using spaCy to the reference country list (countries-df.csv).

Several country aliases and abbreviations were found that don’t match the reference list:

USA, US → should map to United States

UK, U.K. → should map to United Kingdom

USSR, Soviet Union → should map to Russia

Korea → could refer to South Korea or North Korea (context-dependent); the alias map below resolves it to South Korea

These inconsistencies could affect downstream matching or frequency analysis.

**Corrections Made**

A mapping (alias_map) was created to convert inconsistent GPE mentions into standardized names.

The GPE mentions were normalized using this map.

Missing but valid country names from the GPE list were appended to the reference list using pandas.concat().

## Splitting Sentence Entities

In [106]:
# Build one record per sentence: the sentence itself plus the text of every
# entity found in it.
sentence_records = [
    {"sentence": sent, "entities": [ent.text for ent in sent.ents]}
    for sent in book.sents
]

df_sentences = pd.DataFrame(sentence_records)

In [107]:
# Preview the first ten sentences together with their extracted entities.
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"( , Key, events, of, the, 20th, century, -,...",[the 20th century -]
1,"(articleAbout, WikipediaContact, us, \t\t...",[Search Search ...
2,"(The, rise, of, dictatorship, , 1.4,...","[1.4, World War II, 1939–1945, 1.4.1]"
3,"(The, war, in, Europe, , 1.4.2, Blitzk...","[Europe, 1.4.2, Blitzkrieg 1.4.3, Oper..."
4,"(Turning, tides, )",[]
5,"(1.4.5, Operation, Overlord, , 1.4.6, ...",[Operation Overlord 1.4.6 Final days ...
6,"(Allied, offensive, , 1.4.10, Final, d...",[Allied offensive 1.4.10 Final days ...
7,"(The, Holocaust, , 1.4.12, The, Nuclea...","[The Nuclear Age, 1.5]"
8,"(The, post, -, war, world, , 1.5.1)",[1.5.1]
9,"(The, end, of, empires, :, decolonization, ...","[The Cold War, 1947–1991]"


## Filtering Data Using the Main Countries

In [109]:
# Import the countries data set as a pandas dataframe (first CSV column as index).
# This re-reads the file that was written back earlier in the notebook.
# NOTE(review): absolute, machine-specific path.
countries_df=pd.read_csv('/Users/yasersouri/Desktop/data analysis/specialization 1/20th-century/countries-df.csv',index_col=0)

In [128]:
# Function to filter out entities not of interest
def filter_entity(ent_list, countries_df):
    """Return the entities from ``ent_list`` that match a name in the country list.

    Matching ignores letter case and surrounding whitespace. Matching entities
    are returned with their original text and in their original order.

    Parameters
    ----------
    ent_list : iterable of str
        Entity texts to check.
    countries_df : pandas.DataFrame
        Reference table; only its ``'Country'`` column is read.

    Returns
    -------
    list of str
        The entries of ``ent_list`` whose normalized text is a known country name.
    """
    # Missing names (NaN/None, e.g. from a blank CSV row) have no .strip(), so
    # they are dropped before normalizing instead of raising AttributeError.
    country_names = countries_df['Country'].dropna()

    # Create a normalized set of country names for faster lookup
    country_set = {str(name).strip().lower() for name in country_names}

    # Filter entities with normalized comparison
    return [ent for ent in ent_list if ent.strip().lower() in country_set]


In [130]:
# Create a normalized set of country names for faster lookup
# NOTE(review): filter_entity builds its own set from countries_df, so this
# variable is not used by the call below.
country_set = {name.strip().lower() for name in countries_df['Country']}

# For every sentence, keep only the entities that match a name in the country list
df_sentences['country_entities'] = df_sentences['entities'].map(
    lambda entities: filter_entity(entities, countries_df)
)


In [132]:
# Keep only the sentences that mention at least one country entity

has_country = df_sentences['country_entities'].map(len).gt(0)
df_sentences_filtered = df_sentences.loc[has_country]

df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
1480,"(^, "", History, of, Software, Patents, ,, from...","[Benson, Flook, Diehr, Bilski, Prometheus]",[Bilski]
1490,"(The, Ultimate, History, of, Video, Games, :, ...","[The Ultimate History of Video Games, Pong, Po...",[Pong]
1499,"(Hongkiat, ., 13, April, 2011, .)","[Hongkiat, 13 April 2011]",[Hongkiat]
1531,"(Estrada, ,, Susan, (, 1992, ), .)","[Estrada, Susan, 1992]",[Estrada]
1556,"(Fraser, ,, Nick, (, 2, November, 2014, ), .)","[Fraser, Nick, November 2014]",[Fraser]
1563,"("", Anti, -, American, Behavior, in, the, Midd...","[Anti-American Behavior, the Middle East, a Fi...",[Lebanon]
1569,"(The, Rise, of, China, and, India, :, A, New, ...","[India, New Asian]",[India]
1570,"(Singapore, :, World, Scientific, ., doi:10.11...","[Singapore, World Scientific]",[Singapore]
1580,"("", The, SARS, epidemic, in, Hong, Kong, "", .)",[Hong Kong],[Hong Kong]
1632,"(Privacy, policy, About, Wikipedia, Disclaimer...","[Mobile, the 20th century, 2]",[Mobile]


## Creating Relationships

In [135]:
# Defining relationships

# Look-ahead distance of the sliding window. `.loc` slices by index label and
# includes both ends, so each window covers sentence labels i .. i + WINDOW_SIZE.
WINDOW_SIZE = 5
relationships = []  # one {"source", "target"} record per adjacent pair

# An empty frame has no last label (index[-1] would raise IndexError); fall back
# to 0 so the loop below simply does not run.
last_label = df_sentences_filtered.index[-1] if len(df_sentences_filtered) else 0

for i in range(last_label):
    end_i = min(i + WINDOW_SIZE, last_label)
    window_entities = df_sentences_filtered.loc[i:end_i, 'country_entities']

    # Flatten the per-sentence lists in one pass (sum(lists, []) re-copies the
    # accumulated list for every row).
    countries_list = [country for entities in window_entities for country in entities]

    # Remove duplicated countries that are next to each other
    countries_unique = [
        country for j, country in enumerate(countries_list)
        if j == 0 or country != countries_list[j - 1]
    ]

    # Link each country to the one that follows it; fewer than two names
    # produce no pairs.
    relationships.extend(
        {"source": a, "target": b}
        for a, b in zip(countries_unique, countries_unique[1:])
    )

In [137]:
# Turn the list of source/target records into a DataFrame and display it.
relationships_df = pd.DataFrame(relationships)

relationships_df

Unnamed: 0,source,target
0,Sarajevo,the British Empire
1,the British Empire,France
2,France,the Russian Empire
3,the Russian Empire,the German Empire
4,the German Empire,Austria
...,...,...
1944,India,Singapore
1945,India,Singapore
1946,India,Singapore
1947,India,Singapore


In [139]:
# Sort the two names within each row so that a -> b and b -> a end up as the
# same (source, target) pair; the groupby() that follows can then count both
# directions together.
pairs_sorted = np.sort(relationships_df.to_numpy(), axis=1)
relationships_df = pd.DataFrame(pairs_sorted, columns=relationships_df.columns)
relationships_df.head(5)

Unnamed: 0,source,target
0,Sarajevo,the British Empire
1,France,the British Empire
2,France,the Russian Empire
3,the German Empire,the Russian Empire
4,Austria,the German Empire


In [141]:
# Give every row a weight of 1, then add the weights up per distinct
# (source, target) pair, keeping the pairs in order of first appearance.
pair_keys = ["source", "target"]
relationships_df = (
    relationships_df
    .assign(value=1)
    .groupby(pair_keys, sort=False, as_index=False)
    .sum()
)

relationships_df.head(10)

Unnamed: 0,source,target,value
0,Sarajevo,the British Empire,5
1,France,the British Empire,6
2,France,the Russian Empire,6
3,the German Empire,the Russian Empire,6
4,Austria,the German Empire,6
5,Austria,Russia,5
6,Russia,Tsar,6
7,Bolsheviks,Tsar,5
8,Bolsheviks,Germany,6
9,Germany,Russia,16


In [143]:
# Save the aggregated pair counts to CSV.
# NOTE(review): absolute, machine-specific path.
relationships_df.to_csv(r'/Users/yasersouri/Desktop/data analysis/specialization 1/20th-century/relationships_df.csv')

**Key Findings**:

The relationships_df contains pairs of countries that frequently appeared near each other in the text.

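The value column shows how often each pair appeared next to each other within a sliding window of consecutive sentences. Because the windows overlap, the same pair of mentions is counted several times, so the values are relative weights rather than exact sentence-level co-occurrence counts.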

This can indicate:

Historical alliances (e.g., United States and United Kingdom)

Conflicts or geopolitical dynamics (e.g., Germany and Soviet Union)

Decolonization or Cold War era interactions.

**Analytical Use**:

These relationships can be visualized as a network graph (nodes: countries, edges: co-occurrence frequency).

High-frequency links may reveal:

Key geopolitical clusters or events.

Prominent rivalries or collaborations in the 20th century.

You can filter this graph further by setting a minimum frequency threshold for clarity.
