The objective of this notebook is to identify suspect words that appear on the Wikipedia pages of 'bad actors', i.e. slave traders.
We will use the examples found by cross-referencing the Wikipedia list of slave traders with the Wikidata entries that have the 'Legacies of a British Slave Trader' property.

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

import re
from collections import Counter

import pandas as pd
import wikipedia

In [None]:
# CSV of plaque subjects (per its filename: Open Plaques, London), located
# relative to this notebook's working directory.
FILENAME = '../data/open-plaques-subjects-london.csv'

# Load the whole file into the DataFrame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer=FILENAME)

Focus on the records that have a Wikipedia URL: rows with a missing `en_wikipedia_url` are dropped.

In [None]:
# Keep only rows that have a Wikipedia URL. Rebinding `df` (rather than using
# inplace=True) keeps the operation chainable and avoids mutating the frame
# object that was loaded from disk.
df = df.dropna(subset=['en_wikipedia_url'])

In [None]:
def get_wiki_contents(url):
    """Return the text content of the Wikipedia article that ``url`` points to.

    The article title is taken from the last path segment of ``url`` (a
    trailing slash is ignored) and percent-decoded before the lookup, so that
    encoded titles such as ``Charlotte_Bront%C3%AB`` are requested under their
    real name.

    Parameters
    ----------
    url : str
        URL of a Wikipedia article, e.g. ``https://en.wikipedia.org/wiki/William_Penn``.

    Returns
    -------
    str
        The page content, or an empty string if the retrieval failed (in which
        case a message naming the title is printed).
    """
    # Local import so this cell does not depend on edits to the imports cell.
    from urllib.parse import unquote

    name = unquote(url.rstrip('/').split('/')[-1])
    try:
        contents = wikipedia.page(name).content
        return contents
    except Exception:
        # Best-effort retrieval: report and carry on with an empty document.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long-running cell can still be interrupted.
        print('Wiki retrieval failed {0}'.format(name))
        return ''

In [None]:
# Names flagged as slave traders by cross-referencing the Wikipedia list of
# slave traders with the Wikidata attribute.
know_names = ['James Brown', 'William Penn', 'John Marshall', 'Robert Milligan',
              'Martin Van Buren', 'Benjamin Franklin',
              'Quintin Hogg', 'William Ewart Gladstone', 'Elizabeth Barrett Browning']

# Collect the lower-cased articles in a list and join them once at the end.
# Joining with a newline guarantees that the last word of one article cannot
# fuse with the first word of the next one when the text is tokenised.
article_texts = []

for name in know_names:
    matches = df.loc[df['full_name'] == name, 'en_wikipedia_url']
    if matches.empty:
        # No row with this name (and a Wikipedia URL): report it and move on
        # instead of failing the whole cell with an IndexError.
        print('No Wikipedia URL found for {0}'.format(name))
        continue
    url = matches.iloc[0]
    n_content = get_wiki_contents(url)
    article_texts.append(n_content.lower())
    # Length of the retrieved article; 0 means the retrieval failed.
    print(len(n_content))

content = '\n'.join(article_texts)

In [None]:
# English stop words, held in a set so membership checks are cheap.
stop_words = set(stopwords.words('english'))

# Split the combined article text into tokens.
word_tokens = word_tokenize(content)

# Keep every token that is not a stop word, preserving the original order.
filtered_sentences = [token for token in word_tokens if token not in stop_words]

In [None]:
# Discard tokens that contain no ASCII letter at all (pure punctuation, numbers).
nonPunct = re.compile('.*[A-Za-z].*')  # a token must contain at least one letter
filtered = list(filter(nonPunct.match, filtered_sentences))

# Tally how often each remaining token occurs.
counts = Counter(filtered)

In [None]:
# Rank (token, count) pairs from most to least frequent: sort ascending by
# count, then flip the list (this also reverses the order of tied tokens).
ranked_counts = sorted(counts.items(), key=lambda pair: pair[1])
ranked_counts.reverse()
ranked_counts

In [None]:
# Names flagged as slave traders by cross-referencing the Wikipedia list of
# slave traders with the Wikidata attribute.
know_names = ['James Brown','William Penn','John Marshall','Robert Milligan','Martin Van Buren','Benjamin Franklin',
              'Quintin Hogg', 'William Ewart Gladstone']#, 'Elizabeth Barrett Browning']

# Loop-invariant helpers: build the stop-word set and the regex once rather
# than once per person.
stop_words = set(stopwords.words('english'))
nonPunct = re.compile('.*[A-Za-z].*')  # must contain a letter

# One entry per person: that person's distinct tokens, most frequent first.
all_freqs = []
for name in know_names:
    matches = df.loc[df['full_name'] == name, 'en_wikipedia_url']
    if matches.empty:
        # No row with this name (and a Wikipedia URL): report it and move on
        # instead of failing the whole cell with an IndexError.
        print('No Wikipedia URL found for {0}'.format(name))
        continue
    url = matches.iloc[0]
    content = get_wiki_contents(url).lower()

    word_tokens = word_tokenize(content)

    # Drop stop words, then drop tokens without any letter (punctuation, numbers).
    filtered_sentences = [w for w in word_tokens if w not in stop_words]
    filtered = [w for w in filtered_sentences if nonPunct.match(w)]
    counts = Counter(filtered)

    # Ascending sort by count followed by a reversal gives most-frequent-first.
    freqs = [pair[0] for pair in sorted(counts.items(), key=lambda item: item[1])][::-1]
    all_freqs.append(freqs)
    # Preview the ten most frequent tokens for this person.
    print(freqs[:10])

In [None]:
# Every distinct token seen in any person's article.
all_words = set([item for sublist in all_freqs for item in sublist])

# Convert each per-person token list to a set once, so the membership test
# below is a hash lookup instead of a linear scan of the list for every word.
vocabularies = [set(freq) for freq in all_freqs]

# inter_count[word] = number of people whose article contains `word`.
inter_count = {}
for word in all_words:
    inter_count[word] = sum(1 for vocabulary in vocabularies if word in vocabulary)

In [None]:
# Show the 200 words shared by the most articles: sort ascending by the
# number of articles, flip to descending, then truncate.
by_article_count = sorted(inter_count.items(), key=lambda entry: entry[1])
by_article_count.reverse()
print(by_article_count[:200])