In [1]:
# The objective of this file is to identify words likely to appear on the Wikipedia pages of 'bad actors', i.e. slave traders.
# We use the examples found by cross-referencing the Wikipedia list of slave traders with the Wikidata entries that have
# the property 'legacies of a british slave trader'.

In [2]:
from wikipedia_scrape import get_wiki_contents
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

import re
from collections import Counter

import pandas as pd

In [3]:
# CSV holding the plaque subjects that the rest of the notebook works from
FILENAME = 'open-plaques-subjects-london.csv'

# Load the table and discard subjects that have no English Wikipedia URL,
# since later cells use that column to fetch each page.
df = pd.read_csv(FILENAME).dropna(subset=['en_wikipedia_url'])

In [4]:
# Known slave traders, taken from the Wikipedia list cross-reference and the Wikidata attribute
# NOTE(review): rows are matched on 'full_name' alone and the first match is used, so a
# namesake could be picked up instead of the intended person - verify each match.
know_names = ['James Brown','William Penn','John Marshall','Robert Milligan','Martin Van Buren','Benjamin Franklin',
              'Quintin Hogg', 'William Ewart Gladstone', 'Elizabeth Barrett Browning']

# Lower-cased text of each page, joined once at the end instead of growing a string in the loop
pages = []

for name in know_names:
    # First row whose full_name matches; .iloc[0] raises IndexError when there is no such row
    url = df.loc[df['full_name'] == name, 'en_wikipedia_url'].iloc[0]
    n_content = get_wiki_contents(url)
    pages.append(n_content.lower())
    # Size of the raw page text, as a quick sanity check on the scrape
    print(len(n_content))

# Join with a newline so the last word of one page and the first word of the next
# are not fused into a single bogus token when the text is tokenized.
content = '\n'.join(pages)

76740
49631
52324
2084
67155
103151
4172
87404
28044


In [5]:
# English stopwords held in a set so that each membership test is cheap
stop_words = set(stopwords.words('english'))

# Break the combined page text into tokens
word_tokens = word_tokenize(content)

# Keep every token that is not a stopword
# (despite the name, this list holds individual tokens, not sentences)
filtered_sentences = [token for token in word_tokens if token not in stop_words]

In [6]:
# Discard tokens made up only of punctuation/digits: a token is kept when
# it contains at least one ASCII letter somewhere.
nonPunct = re.compile('.*[A-Za-z].*')
filtered = list(filter(nonPunct.match, filtered_sentences))

# Number of occurrences of each remaining token
counts = Counter(filtered)

In [7]:
# (token, count) pairs ordered from most to least frequent: sort ascending by count, then flip
list(reversed(sorted(counts.items(), key=lambda pair: pair[1])))

[("'s", 841),
 ('franklin', 459),
 ('brown', 408),
 ('gladstone', 283),
 ('van', 256),
 ('buren', 250),
 ('penn', 241),
 ('new', 207),
 ('marshall', 204),
 ('first', 186),
 ('would', 161),
 ('william', 153),
 ('james', 147),
 ('state', 145),
 ('also', 145),
 ('american', 139),
 ('court', 136),
 ('one', 128),
 ('states', 121),
 ('time', 116),
 ('benjamin', 110),
 ('government', 104),
 ('united', 104),
 ('john', 104),
 ('later', 103),
 ('party', 101),
 ('became', 101),
 ('london', 100),
 ('president', 100),
 ('two', 95),
 ('pennsylvania', 94),
 ('many', 94),
 ('years', 94),
 ('life', 93),
 ('york', 92),
 ('house', 91),
 ('political', 88),
 ('jackson', 88),
 ('election', 79),
 ('man', 78),
 ('war', 76),
 ('philadelphia', 75),
 ('father', 75),
 ('family', 72),
 ('british', 71),
 ('national', 71),
 ('elizabeth', 69),
 ('made', 69),
 ('barrett', 68),
 ('public', 68),
 ('university', 66),
 ('may', 66),
 ('wrote', 65),
 ('case', 63),
 ('browning', 62),
 ('england', 62),
 ('great', 62),
 ('law'

In [19]:
# Known slave traders, taken from the Wikipedia list cross-reference and the Wikidata attribute
# ('Elizabeth Barrett Browning' is left out of this per-person pass.)
# NOTE(review): rows are matched on 'full_name' alone and the first match is used; the top
# words printed for 'James Brown' (band, music, song) suggest a namesake may have been
# picked up - verify each match.
know_names = ['James Brown','William Penn','John Marshall','Robert Milligan','Martin Van Buren','Benjamin Franklin',
              'Quintin Hogg', 'William Ewart Gladstone']

# Loop-invariant helpers, built once rather than once per person
stop_words = set(stopwords.words('english'))
nonPunct = re.compile('.*[A-Za-z].*')  # must contain a letter

# One entry per person: that page's distinct words, most frequent first
# (despite the name, the entries are words, not frequencies)
all_freqs = []
for name in know_names:
    # First row whose full_name matches; .iloc[0] raises IndexError when there is no such row
    url = df.loc[df['full_name'] == name, 'en_wikipedia_url'].iloc[0]
    content = get_wiki_contents(url).lower()

    word_tokens = word_tokenize(content)

    # Remove stopwords, then anything without a letter (punctuation, bare numbers)
    filtered_sentences = [w for w in word_tokens if w not in stop_words]
    filtered = [w for w in filtered_sentences if nonPunct.match(w)]
    counts = Counter(filtered)

    # Sort ascending by count and reverse, so the most frequent word comes first
    freqs = [word for word, _ in sorted(counts.items(), key=lambda item: item[1])][::-1]
    all_freqs.append(freqs)
    print(freqs[:10])

['brown', "'s", 'james', 'band', 'also', 'music', 'b', 'r', 'song', 'new']
['penn', "'s", 'william', 'pennsylvania', 'quaker', 'quakers', 'new', 'father', 'king', 'admiral']
['marshall', 'court', "'s", 'state', 'supreme', 'john', 'states', 'case', 'virginia', 'federal']
['milligan', 'docks', 'west', 'london', 'india', "'s", 'street', 'statue', 'also', 'george']
['van', 'buren', "'s", 'jackson', 'new', 'party', 'state', 'president', 'election', 'york']
['franklin', "'s", 'benjamin', 'american', 'first', 'philadelphia', 'pennsylvania', 'new', 'one', 'became']
['hogg', 'polytechnic', 'london', 'quintin', "'s", 'street', 'school', 'university', 'regent', 'oxford']
['gladstone', "'s", 'government', 'first', 'lord', 'one', 'william', 'would', 'house', 'liberal']


In [20]:
# One set per person, so each membership test below is a hash lookup
# instead of a scan through that person's whole word list.
freq_sets = [set(freq) for freq in all_freqs]

# Vocabulary: every word that occurs on at least one page
all_words = {item for sublist in all_freqs for item in sublist}

# For each word, the number of pages whose word list contains it
inter_count = {word: sum(word in words for words in freq_sets) for word in all_words}

In [27]:
# Show the 200 words found on the most pages: sort ascending by page count, flip, then truncate
print(list(reversed(sorted(inter_count.items(), key=lambda pair: pair[1])))[:200])

[('called', 8), ('george', 8), ('day', 8), ('later', 8), ('two', 8), ('public', 8), ('links', 8), ('external', 8), ('death', 8), ('may', 8), ("'s", 8), ('also', 8), ('years', 8), ('prominent', 8), ('born', 7), ('known', 7), ('three', 7), ('member', 7), ('living', 7), ('richard', 7), ('find', 7), ('old', 7), ('near', 7), ('second', 7), ('james', 7), ('owned', 7), ('named', 7), ('political', 7), ('education', 7), ('father', 7), ('street', 7), ('daughter', 7), ('built', 7), ('early', 7), ('local', 7), ('wrote', 7), ('july', 7), ('press', 7), ('son', 7), ('lord', 7), ('life', 7), ('school', 7), ('first', 7), ('left', 7), ('charles', 7), ('married', 7), ('part', 7), ('served', 7), ('time', 7), ('references', 7), ('february', 7), ('following', 7), ('personal', 7), ('family', 7), ('still', 7), ('york', 7), ('national', 7), ('eventually', 7), ('due', 7), ('died', 7), ('around', 7), ('january', 7), ('year', 7), ('june', 7), ('william', 7), ('history', 7), ('university', 7), ('see', 7), ('many',