## 01. Import Libraries

In [5]:
# import libraries:

import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [9]:
# download English module:

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [35]:
# load spacy English Module

NER = spacy.load("en_core_web_sm")

## 02. Import Data

In [26]:
# import 20th century text:

with open('/Users/piperdutcher/Documents/Data-Visualizations/20th-Century/Data/20th-Century.txt', 'r', errors='ignore') as file:
    text = file.read().replace('\n', '')

In [23]:
# import countries list (cleaned):

countries_df = pd.read_csv("/Users/piperdutcher/Documents/Data-Visualizations/20th-Century/Data/countries_df_clean.csv", index_col = 0)

## 03. Begin to Use NER Model

In [37]:
# NER Model:
# named entity recognition model will help us build a relationship network between countries

fulltext = NER(text)

In [43]:
# visualize some of the entities to verify that this function worked:

displacy.render(fulltext[273:400], style = "ent", jupyter = True)

In [51]:
# now that we can see that the NER model has indeed worked, it is time to create a list of entities per sentence.
# we will later use our countries list as a filter:

df_sentences = [] # empty set to store results

# Loop through sentences, get entity list for each sentence
for sentence in fulltext.sents:
    entities_list = [ent.text for ent in sentence.ents]
    df_sentences.append({"sentence": sentence, "entities": entities_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [56]:
# double check that this worked:

df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(Key, events, of, the, 20th, century, -, Wikip...","[the 20th century - WikipediaJump, Navigation\..."
1,"(the, wars1.2.1Economic)",[]
2,"(depression1.2.2The, rise, of, dictatorship1.3...",[]
3,"(World, War, II, (, 1939–1945)1.3.1The, war, i...",[World War II]
4,"(days1.3.7The, war, in, the, Pacific1.3.7.1Bac...","[Pacific1.3.7.1Background1.3.8Japanese, Holoca..."
5,"(decolonization1.4.2The, Cold, War, (, 1947–19...","[decolonization1.4.2The Cold War, 1947–1991)1...."
6,"(The, World, Wars, sparked, tension, between, ...","[the Cold War, the Space Race, the World Wide ..."
7,"(These, advancements, have, played, a, signifi...","[the 21st century, today]"
8,"(Historic, events, in, the, 20th, century[edit...","[the 20th, Edwardian, the 20th century]"
9,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]"


In [58]:
df_sentences.shape

(1197, 2)

In [215]:
# there are 1197 sentences in the text, now we need to filter for the sentences we actually want.
# the sentences we are interested in will utilize our countries list.
# lets begin by checking that out:

countries_df.head(15)

Unnamed: 0,Country Name,Count
0,Afghanistan,1
1,Albania,2
2,Algeria,1
3,Andorra,0
4,Angola,1
5,Antigua and Barbuda,0
6,Argentina,0
7,Armenia,0
8,Australia,2
9,Austria,5


In [75]:
# we can ignore the count column for now... let's filter our sentences
# we only want sentences with entities from our countries list:

def filtered_entities(entities_list, countries_df):
    return [entity for entity in entities_list 
            if entity in list(countries_df['Country Name'])]

# this is defining a new filter, only returning entity names if they occur in both the entities list AND the countries_df

In [77]:
# now that we have created our filter, it is time to apply it to our sentences
# using a lambda function, we can iterate through our sentences df, then ammend results to a new column
# the new column will show us which of our desired entities are in any given sentence.

df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filtered_entities(x, countries_df))

In [219]:
df_sentences.head(20)

Unnamed: 0,sentence,entities,country_entities
0,"(Key, events, of, the, 20th, century, -, Wikip...","[the 20th century - WikipediaJump, Navigation\...",[]
1,"(the, wars1.2.1Economic)",[],[]
2,"(depression1.2.2The, rise, of, dictatorship1.3...",[],[]
3,"(World, War, II, (, 1939–1945)1.3.1The, war, i...",[World War II],[]
4,"(days1.3.7The, war, in, the, Pacific1.3.7.1Bac...","[Pacific1.3.7.1Background1.3.8Japanese, Holoca...",[]
5,"(decolonization1.4.2The, Cold, War, (, 1947–19...","[decolonization1.4.2The Cold War, 1947–1991)1....",[]
6,"(The, World, Wars, sparked, tension, between, ...","[the Cold War, the Space Race, the World Wide ...",[]
7,"(These, advancements, have, played, a, signifi...","[the 21st century, today]",[]
8,"(Historic, events, in, the, 20th, century[edit...","[the 20th, Edwardian, the 20th century]",[]
9,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]",[]


In [84]:
# Filter out sentences that don't have any countries:
# using len > 0 to filter out any rows that have empty country entities 

df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

df_sentences_filtered.shape

(122, 3)

In [142]:
# it looks like we have filtered out a LOT of sentences! moving in the right direction.
# lets take a closer look at the result:

df_sentences_filtered.head(15)

Unnamed: 0,sentence,entities,country_entities
17,"(The, Allies, ,, known, initially, as, "", The,...","[The Triple Entente, the British Empire, Franc...","[France, Russia]"
18,"(Germany, ,, Austria, -, Hungary, ,, Bulgaria,...","[Germany, Austria-Hungary, Bulgaria, the Ottom...","[Germany, Bulgaria, Russia]"
19,"(The, Bolsheviks, negotiated, the, Treaty, of,...","[Bolsheviks, the Treaty of Brest-Litovsk, Germ...","[Germany, Russia]"
20,"(In, the, treaty, ,, Bolshevik, Russia, ceded,...","[Bolshevik Russia, Baltic, Germany, Kars Oblas...",[Germany]
22,"(Although, Germany, shifted, huge, forces, fro...","[Germany, Allied, American]",[Germany]
28,"(Many, Germans, felt, these, reparations, were...","[Germans, Germany, Allied, Kaiser, Europe]",[Germany]
45,"(Germany, ,, 1933Fascism, first, appeared, in,...","[Germany, Italy, Benito Mussolini, 1922.[25]","[Germany, Italy]"
46,"(The, ideology, was, supported, by, a, large, ...","[Adolf Hitler, Germany, 1933, Nazism, Germany,...","[Germany, Germany]"
47,"(The, Nazi, Party, in, Germany, was, dedicated...","[The Nazi Party, Germany, German, German, Cent...",[Germany]
49,"(They, could, see, nothing, wrong, with, a, st...",[Germany],[Germany]


In [89]:
# nice, this definitely looks like a desired output.
# time to create relationships

## 04. Create Relationships

we need to define the relationships between the entities (countries)
in order to do this, we will need to see how frequently each countries appear to one another throughout the text.

In [277]:
# in order to define our relationsjip, we will need to start w an empty list
# we will also start with a window size of 5 (but will reasses if needed)
# this window is the # of sentences compared at one given time

# Define window size
window = 5

# Create an empty list for relationships:
relationships = []

# then we need to use a for-loop to iterate through our filtered sentences dataframe:
# we include -1 to ensure that the loop runs through all rows.
# in defining end_index, we make sure the loop doesn't attempt to have a window go beyond the last entry.
# we are using the min function to take the smaller of the two possible indexes for the end_index
# [-1] + 1 indicates one space beyond the last row of the dataframe.

for i in range(df_sentences_filtered.index[-1] - window + 1):
    end_index = min(i + window, df_sentences_filtered.index[-1] + 1)
    
# Extract country entities from the current window
    country_list = sum(df_sentences_filtered.loc[i: end_index].country_entities, [])
    
# Remove duplicated countries that are next to each other
# the reason we do this, is because a country won't have a relationship with itself
# to do this we will eliminate any country if it is already listed, but will always list the first country:
    
    countries_unique = [country_list[i] for i in range(len(country_list)) 
                        if i == 0 or country_list[i] != country_list[i - 1]]
    
# if the length of countries_unique is greater than one, then we know a relationship is present (what we're interested in)
# since we are interested in the relationship, we then want to list the countries in our respective columns
# the relationships are recorded consecutibely, hence a never being able to be in the last indexed position [-1]
    
    if len(countries_unique) > 1:
        for index, a in enumerate(countries_unique[:-1]):
            b = countries_unique[index + 1]
            relationships.append({"source_country": a, "target_country": b})
    

In [254]:
# now let's turn this into a dataframe:

relationships_df = pd.DataFrame(relationships)

relationships_df.head(100)

Unnamed: 0,source_country,target_country
0,France,Russia
1,France,Russia
2,Russia,Germany
3,Germany,Bulgaria
4,Bulgaria,Russia
...,...,...
95,Lithuania,Finland
96,Finland,Germany
97,Germany,Poland
98,Poland,Germany


In [256]:
relationships_df.shape

(605, 2)

In [281]:
# export this df for future use!

relationships_df.to_csv('20th_century_relationships.csv')