# 1.6 Intro to NLP and Network Analysis

In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [3]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --- ------------------------------------ 1.0/12.8 MB 10.1 MB/s eta 0:00:02
     ------------- -------------------------- 4.5/12.8 MB 14.1 MB/s eta 0:00:01
     --------------------------- ------------ 8.9/12.8 MB 17.3 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 17.8 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

# Load the twentieth-century text file

In [8]:
# Import txt file

with open('20th_century_scrape.txt', 'r', errors='ignore') as file: 
   data = file.read().replace( '\n', ' ')

In [10]:
print(data)



# Load the countries file

In [13]:
# Read CSV file and skip initial spaces in the columns
countries = pd.read_csv('countries_list_20th_century_1.5.csv', index_col = 0, skipinitialspace=True)

# Strip leading and trailing spaces from a specific column
countries['country_name'] = countries['country_name'].str.strip()

# Text file wrangling (checking for special characters and consistency)

In [16]:
# Check for special characters and country name consistency
special_characters = set()
inconsistent_countries = set()

for line in data:
    special_characters.update(set(line) - set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 \t\n'))
    
# Check for country names, assuming country names are in a specific column 
columns = line.strip().split('\t') # Adjust delimiter as needed 
if len(columns) > 1: # Ensure there are at least 2 columns 
    if columns[1] not in known_countries: # Assuming country names are in the second column
        inconsistent_countries.add(columns[1])

# Display results
print("Special Characters Found:", special_characters)
print("Inconsistent Country Names:", inconsistent_countries)


Special Characters Found: {'§', '^', '“', '\xa0', '?', '±', '¨', '¶', '®', 'â', '[', '€', 'ç', ',', '°', 'Œ', '²', '=', '_', 'Ù', 'Ð', '&', '-', '!', '–', '©', ')', '£', 'ï', ']', 'Ã', '½', 'Š', '¼', '—', '.', 'æ', 'º', '(', 'Â', 'Ø', '™', '"', '„', '/', ';', 'Ñ', 'œ', ':', '”', '\xad', '¸', '|', "'", '¹'}
Inconsistent Country Names: set()


The above code revealed special characters in our text but no inconsistent country names. The next step is to remove the special characters from our text.

In [19]:
#Define a regular expression to match allowed characters
allowed_pattern = re.compile(r'[^a-zA-Z0-9 \t\n.,;:\'\"-()[]{}!?]')

# Remove special characters while preserving spaces
cleaned_data = []
for line in data:
    cleaned_line = re.sub(allowed_pattern, '', line)  # Remove disallowed characters
    cleaned_data.append(cleaned_line)

#Save the cleaned data
with open('cleaned_20th_century_scrape.txt', 'w') as file:
    file.writelines(cleaned_data)  # Write the cleaned lines to the file


In [21]:
# Import txt file

with open('cleaned_20th_century_scrape.txt', 'r', errors='ignore') as file: 
   cleaned_data = file.read().replace( '\n', ' ')

In [23]:
print(cleaned_data)



In [27]:
article = NER(cleaned_data)

In [31]:
# Visualize identified entities

displacy.render(article[273:20000], style = "ent", jupyter = True)

# Split the sentence entities from the NER object

In [36]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in article.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [39]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,( ),[]
1,"(Key, events, of, the, 20th, century, -, Wikip...","[the 20th century, Navigation \t Main, Contr..."
2,"(Donate, Create, account, Log, in, \t\t, ...","[the 20th century, the 20th century, the begin..."
3,"("", The, war, to, end, all, wars, "", :, World,...","[World War I, 1.3 Global, World War II]"
4,"(1.3.1, The, war, in, Europe, , 1.3.2,...","[1.3.1, Europe, 1.3.2, Blitzkrieg 1.3...."
5,(1.3.4),[]
6,"(Turning, tides, , 1.3.5, Operation, O...",[]
7,"(The, war, in, the, Pacific, , 1.3.7.1, ...","[1.3.8, 1.3.9, Allied]"
8,"(The, Holocaust, , 1.3.12)",[]
9,"(The, Nuclear, Age, begins, , 1.4, T...","[1.4, 1.4.1]"


# Filter the entities so we end up only with the ones from countries list

In [45]:
countries.head()

Unnamed: 0,country_name
1,Afghanistan
2,Albania
3,Algeria
4,Andorra
5,Angola


In [48]:
# Function to filter out entities not of interest

def filter_entity(ent_list, countries):
    return [ent for ent in ent_list 
            if ent in list(countries['country_name'])]

In [52]:
# Check

filter_entity(["Poland", "CF", "2"], countries)

['Poland']

In [57]:
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, countries))

In [65]:
df_sentences['country_entities'].head(20)

0     []
1     []
2     []
3     []
4     []
5     []
6     []
7     []
8     []
9     []
10    []
11    []
12    []
13    []
14    []
15    []
16    []
17    []
18    []
19    []
Name: country_entities, dtype: object

In [68]:
# Filter out sentences that don't have any character entities

df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

In [71]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
1043,"("", The, forgotten, violence, that, helped, In...",[India],[India]
1047,"("", Indian, Independence, Day, :, everything, ...","[Indian Independence Day, India, Pakistan, 70 ...","[India, Pakistan]"
1055,"("", The, Philippines, ,, 1898â€“1946, |, US, H...","[Philippines, 1898â€“1946, US, Art & Archives]",[Philippines]
1083,"("", Colonial, Cartographies, ,, Postcolonial, ...","[Colonial Cartographies, Enduring Failures of ...",[Afghanistan]
1118,"(The, Moldovans, :, Romania, ,, Russia, ,, and...","[Moldovans, Romania, Russia, the Politics of C...","[Romania, Russia]"
1174,"("", Selling, "", Operation, Passage, to, Freedo...","[Selling ""Operation Passage to Freedom, Thomas...",[Vietnam]
1203,"("", Stuck, in, Endless, Preliminaries, :, Viet...","[Vietnam, the Battle of the Paris Peace Table,...",[Vietnam]
1434,"("", Anti, -, American, Behavior, in, the, Midd...","[Anti-American, the Middle East, a Field Exper...",[Lebanon]
1439,"(The, Rise, of, China, and, India, :, A, New, ...","[The Rise of China, India]",[India]
1440,"(Singapore, :, World, Scientific, .)",[Singapore],[Singapore]


# Create the relationships dataframe

In [78]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    char_list = sum((df_sentences_filtered.loc[i: end_i].country_entities), [])
    
    # Remove duplicated countries that are next to each other
    char_unique = [char_list[i] for i in range(len(char_list)) 
                   if (i==0) or char_list[i] != char_list[i-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [81]:
relationship_df = pd.DataFrame(relationships)

In [84]:
relationship_df

Unnamed: 0,source,target
0,France,Austria
1,Austria,Hungary
2,France,Austria
3,Austria,Hungary
4,Hungary,Russia
...,...,...
616,India,Singapore
617,India,Singapore
618,India,Singapore
619,India,Singapore


In [88]:
# Sort the cases with a->b and b->a

relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,Austria,France
1,Austria,Hungary
2,Austria,France
3,Austria,Hungary
4,Hungary,Russia


In [91]:
# Summarize the interactions

relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [95]:
relationship_df.head(10)

Unnamed: 0,source,target,value
0,Austria,France,6
1,Austria,Hungary,6
2,Hungary,Russia,5
3,Germany,Russia,21
4,Germany,Italy,25
5,Austria,Germany,11
6,Germany,Spain,2
7,France,Poland,11
8,France,Germany,29
9,Germany,Poland,28


# Save and export dataframe

In [99]:
relationship_df.to_csv('20th_century_relationship.csv')