## Import Libraries

In [83]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [84]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 640.0 kB/s eta 0:00:20
     --------------------------------------- 0.0/12.8 MB 487.6 kB/s eta 0:00:27
     ---------------------------------------- 0.1/12.8 MB 1.1 MB/s eta 0:00:12
     - -------------------------------------- 0.5/12.8 MB 3.7 MB/s eta 0:00:04
     ---- ----------------------------------- 1.6/12.8 MB 7.7 MB/s eta 0:00:02
     ------------ --------------------------- 4.1/12.8 MB 16.2 MB/s eta 0:00:01
     ------------------- -------------------- 6.2/12.8 MB 20.9 MB/s eta 0:00:01
     -------------------------- ------------- 8.4/12.8 MB 24.4 MB/s eta 0:00:01
     -------------------------------- ------ 10.6/12.8 MB 40.9 MB/s eta 0:00:01
     ------------------------------

In [85]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

## Load Key Events of 20th Century book

In [86]:
# Load the book

with open('Key events of 20th century.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

## Data Wrangling

- I decided to clean the data as there are stop words and unnecessary punctuation in the data. This will give cleaner version of the text to analyze the data

In [146]:
# Sentence tokenization

from nltk.tokenize import sent_tokenize
tokenized_sent = sent_tokenize(data)

In [147]:
# Word tokenization

from nltk.tokenize import word_tokenize
tokenized_word = word_tokenize(data)

In [148]:
# Defining stopwords

from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

In [149]:
# Removing stopwords in words

filtered_words = [] # creates an empty list
for word in tokenized_word:
    if word not in stop_words:
        filtered_words.append(word)

In [150]:
# Substitute all punctuation marks with a space

sans_punc = re.sub("[^a-zA-Z]",  # Search for all non-letters
                        " ",        # Replace all non-letters with spaces
                        str(filtered_words))

In [151]:
# Word tokenization

tokenized_word_2 = word_tokenize(sans_punc)

In [152]:
new_stopwords = ["And", "Then", 'n', 't', 's', 'The', 'In', 'would', 'S', 'II', 'would', 'p']

In [153]:
filtered = []
for word in tokenized_word_2:
    if word not in new_stopwords:
        filtered.append(word)

In [186]:
#Combine the words
listToStr = ' '.join([str(elem) for elem in filtered])

In [187]:
#Exporting Text File
with open('KeyEvents_20thCentury_WrangledData.txt', 'w') as textfile:
    textfile.write(listToStr)

## Creating  NER Objects

In [188]:
# Creating the NER object
book = NER(listToStr)

In [189]:
# Visualize identified entities
displacy.render(book[27:200], style = "ent", jupyter = True)

## Splitting Sentence Entities

In [161]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [162]:
df_sentences.head(20)

Unnamed: 0,sentence,entities
0,"(Jump, contentMain, menuSearchCreate, accountL...",[Wikipedia]
1,"(encyclopediaThe, th, century, changed, world,...","[World Wars, Cold War led Space Race, World Wi..."
2,"(Kaiser, Wilhelm, Much, map, Europe, redrawn, ...","[Kaiser Wilhelm, Europe, Yugoslavia, Czechoslo..."
3,"(However, European, revolutions, defeated, Vla...","[European, Vladimir Lenin, Joseph Stalin, Leon..."
4,"(Many, people, saw, first, stage, end, capital...","[first, Soviet Union, Dorothea Lange, Migrant ..."
5,"(economic, downturn, Hitler, began, put, plan,...","[Hitler, Austria Anschluss, Austria Germany]"
6,"(He, negotiated, annexation, Sudetenland, Germ...","[Sudetenland German, Czechoslovakia Munich Con..."
7,"(This, treaty, gave, Stalin, free, rein, take,...","[Stalin, Baltic]"
8,"(Estonia, Latvia, Lithuania)","[Estonia, Latvia, Lithuania]"
9,"(well, Eastern, Poland, remain, Soviet, posses...","[Eastern Poland, Soviet, Stalin, Finland, Sovi..."


## Loading country names

In [163]:
# Import countries
country_df = pd.read_csv("country names mentioned times.csv", index_col = 0)

In [164]:
country_df.head()

Unnamed: 0,Country_name,Times mentioned
0,Afghanistan,1
1,Albania,2
2,Algeria,1
4,Angola,1
8,Australia,2


## Filtering entities from the book

In [165]:
# Function to filter out entities not of interest

def filter_entity(ent_list, country_df):
    return [ent for ent in ent_list 
            if ent in list(country_df['Country_name'])]

In [166]:
# Check

filter_entity(["Australia"], country_df)

['Australia']

In [167]:
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, country_df))

In [168]:
df_sentences['country_entities'].head(20)

0                                                    []
1     [France, Italy, Russia, Germany, Austria, Hung...
2                                                    []
3                                                    []
4     [Germany, Italy, Germany, Germany, Germany, Ge...
5                                                    []
6     [Spain, France, Poland, Poland, France, German...
7                                                    []
8                          [Estonia, Latvia, Lithuania]
9     [Finland, Germany, Poland, Luxembourg, Norway,...
10            [Denmark, Sweden, France, France, France]
11                                             [France]
12                              [France, Italy, Greece]
13                   [Germany, Greece, Albania, Greece]
14                    [Libya, Egypt, Egypt, Iraq, Iran]
15                                                   []
16                                                   []
17                                              

In [169]:
# Filter out sentences that don't have any character entities

df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

In [170]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
90,"(We, Can, Still, Learn, From, It, Time, Retrie...",[Sino Japanese War Anti Japanese War Eight Yea...,"[Italy, Japan]"
91,"(Retrieved, Forgotten, Reason, Why, Japan, Att...","[Japan, Pearl Harbor National Interest, United...","[Japan, Japan]"
98,"(ONLINE, Retrieved, Andrew, Glass, August, Hir...","[Andrew Glass, August, Hirohito, Japan, Aug PO...",[Japan]
104,"(Retrieved, Kirsch, Adam, April, System, Two, ...","[Two, Nazi, December, Perspectives Clinical Re...",[Israel]
113,"(Preparatory, Commission, Comprehensive, Nucle...","[BBC News, Retrieved Nuclear, Stockholm Intern...",[Iran]
114,"(Retrieved, Why, Did, League, Nations, Fail, w...","[Retrieved Why Did League Nations Fail, un, St...",[India]
115,"(Portuguese, Africa, Oxford, Research, Encyclo...","[Portuguese, Africa Oxford Research Encycloped...",[Afghanistan]
117,"(Africa, Retrieved, Yalta, Conference, Cold, W...","[Cold War, March, Roberts Geoffrey, World War ...",[Russia]
124,"(Fall, Saigon, Time, Retrieved, Woollacott, Ma...","[Saigon Time, Woollacott Martin, April, Forty ...",[Vietnam]
139,"(Guardian, ISSN, Retrieved, Corstange, Daniel,...","[Guardian ISSN Retrieved Corstange, Middle Eas...",[India]


In [171]:
# Take only the first name of the characters

##df_sentences_filtered['country_entities'] = df_sentences_filtered['country_entities'].apply(lambda x: [item.split()[0]
                                                                                                   ## for item in x])

In [172]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
90,"(We, Can, Still, Learn, From, It, Time, Retrie...",[Sino Japanese War Anti Japanese War Eight Yea...,"[Italy, Japan]"
91,"(Retrieved, Forgotten, Reason, Why, Japan, Att...","[Japan, Pearl Harbor National Interest, United...","[Japan, Japan]"
98,"(ONLINE, Retrieved, Andrew, Glass, August, Hir...","[Andrew Glass, August, Hirohito, Japan, Aug PO...",[Japan]
104,"(Retrieved, Kirsch, Adam, April, System, Two, ...","[Two, Nazi, December, Perspectives Clinical Re...",[Israel]
113,"(Preparatory, Commission, Comprehensive, Nucle...","[BBC News, Retrieved Nuclear, Stockholm Intern...",[Iran]
114,"(Retrieved, Why, Did, League, Nations, Fail, w...","[Retrieved Why Did League Nations Fail, un, St...",[India]
115,"(Portuguese, Africa, Oxford, Research, Encyclo...","[Portuguese, Africa Oxford Research Encycloped...",[Afghanistan]
117,"(Africa, Retrieved, Yalta, Conference, Cold, W...","[Cold War, March, Roberts Geoffrey, World War ...",[Russia]
124,"(Fall, Saigon, Time, Retrieved, Woollacott, Ma...","[Saigon Time, Woollacott Martin, April, Forty ...",[Vietnam]
139,"(Guardian, ISSN, Retrieved, Corstange, Daniel,...","[Guardian ISSN Retrieved Corstange, Middle Eas...",[India]


## Creating Relationships

In [173]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    char_list = sum((df_sentences_filtered.loc[i: end_i].country_entities), [])
    
    # Remove duplicated characters that are next to each other
    char_unique = [char_list[i] for i in range(len(char_list)) 
                   if (i==0) or char_list[i] != char_list[i-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [174]:
relationship_df = pd.DataFrame(relationships)

In [175]:
relationship_df

Unnamed: 0,source,target
0,France,Italy
1,Italy,Russia
2,Russia,Germany
3,Germany,Austria
4,Austria,Hungary
...,...,...
564,India,Afghanistan
565,Afghanistan,Russia
566,India,Afghanistan
567,Afghanistan,Russia


In [176]:
# Sort the cases with a- >b and b- >a
relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,France,Italy
1,Italy,Russia
2,Germany,Russia
3,Austria,Germany
4,Austria,Hungary


In [177]:
relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

relationship_df.head(10)

Unnamed: 0,source,target,value
0,France,Italy,11
1,Italy,Russia,2
2,Germany,Russia,10
3,Austria,Germany,2
4,Austria,Hungary,2
5,Bulgaria,Hungary,2
6,Bulgaria,Russia,2
7,Germany,Ukraine,4
8,Germany,Italy,19
9,Germany,Spain,4


In [191]:
# Export Relationship
relationship_df.to_csv('keyevents_20thcentury_countries_relationship.csv')