In [1]:
#use base: conda for kernel root
import bs4 as bs
import urllib.request
import regex as re

# 1. Use beautifulsoup4 and lxml to scrape each Wikipedia page

In [2]:
def get_wiki_text(url):
    #retrieve webpage data
    scraped_data = urllib.request.urlopen(url)

    #read in data on web page
    article = scraped_data.read()

    #parse data using lmxl protocol
    parsed_article = bs.BeautifulSoup(article, 'lxml')

    #retrieve all the paragraphs contained on the parsed document
    paragraphs = parsed_article.find_all('p')

    article_text = ""

    #extract text from each paragraph and insert into article_text string
    for p in paragraphs:
        article_text += p.text
    
    return article_text
article_text = get_wiki_text('https://en.wikipedia.org/wiki/Green_Arrow')

# 2. Remove Wiki references and extra spaces

In [3]:
def rm_refs_and_spaces(article_text):
    #substitute references (numbers in bracket) with nothing
    rm_refs = re.sub(r'\[[0-9]*\]', '', article_text)

    #remove extra spaces
    rm_xtra_space = re.sub(r'\s+', ' ', rm_refs)

    return rm_xtra_space

rm_xtra_space = rm_refs_and_spaces(article_text)


# 3. Use the NLTK sentence tokenizer to break up the text into sentences. Count the number of sentences in each article. Report this number in a comment or text block.

In [4]:
import nltk
from nltk.tokenize import sent_tokenize

def count_sentences(text):
    sentences = sent_tokenize(text) #source: https://www.guru99.com/tokenize-words-sentences-nltk.html

    return sentences
    #301 Sentences

len(count_sentences(rm_xtra_space))

301

# 4. Use spaCy's "en_core_web_sm" model to tokenize the text of your article. 

In [5]:
import spacy 
nlp = spacy.load('en_core_web_sm')

def make_spacy_tkns(text):
    return nlp(text)


spacy_tokens_doc = make_spacy_tkns(rm_xtra_space)

# 5. Use displacy to display the named entities in your article. 

In [6]:
def display_entities(spacy_tkn_obj):
    entities = spacy.displacy.render(spacy_tkn_obj, style = 'ent', jupyter=True)
    return entities

display_entities(spacy_tokens_doc)

# 6. Build a complete list of the entities from your article that are labeled as ORG.

In [7]:
def list_orgs(doc):
    org_list = []

    for i in doc.ents:
        if i.label_ == 'ORG':
            org_list.append(i)

    return set([i.text for i in org_list ])

org_list_arrow = list_orgs(spacy_tokens_doc)


org_list_arrow

{'Action Comics',
 'Adams',
 'Adventure Comics',
 'Arrow',
 'Arrow Clan',
 'Arrow-Cave',
 'Arrow-Plane',
 'Arrow-Signal',
 'Athena',
 'Batman, Arsenal, Emiko',
 "Black Canary's",
 'Black Lantern Corps',
 'Bold',
 "Bull's Eye",
 'CCO of DC Entertainment',
 'CIA',
 'CW',
 'Connor',
 'Count Vertigo',
 'Crisis',
 'Cupid',
 'Cupid, Brick',
 'DC',
 'DC Comics',
 "DC Comics'",
 'DC Universe Animated Original Movies',
 'DC comics',
 'Diggle',
 'Dinah',
 'Emiko',
 'Felicity',
 'Felicity Smoak',
 'Flashpoint',
 'Green Arrow',
 'Green Arrow Industries',
 'Green Arrow/Black Canary',
 'Grell',
 'Hartley',
 'Hawke',
 'Infinite Crisis',
 'Insurgency',
 'JLA',
 "Jock's Green Arrow",
 'Justice',
 'Justice League',
 'Justice League Dark',
 'Justice League Elite',
 'Justice League United',
 'Justice League of America',
 'Justice League:',
 'Justice Society',
 'Justin Hartley',
 'Komodo',
 'Leading Comics',
 'League',
 'League of Justice',
 'Lemire',
 'Lian Harper',
 'Longbow Hunters',
 'LuthorCorp',
 "Ma

# 7. Count the number of times each ORG entity occurs in the list from the previous step. 
# Document the most frequently occurring organization in a text box or comment. 
# If each ORG is mentioned only once, document this fact and name one ORG that is earliest alphabetically.

In [8]:
#get frequencies of each organization
def org_freqs(doc, subdoc_list):
    freq_list = []
    
    doc_text = doc.text
    
    for i in subdoc_list:

        #get string value of each entity
        k = i

        #get count of text in document
        v = doc_text.count(k)
        
        #append values to a doc
        freq_list.append((k, v))

    #sort by value
    freq_list.sort(reverse=True, key = lambda x: x[1])

    return freq_list


out = org_freqs(spacy_tokens_doc, org_list_arrow)

out

#Justice is the most common text labled as an organization

[('Arrow', 137),
 ('Green Arrow', 113),
 ('Oliver', 112),
 ('Queen', 52),
 ('DC', 37),
 ('Justice', 31),
 ('League', 28),
 ('Justice League', 24),
 ("Oliver's", 21),
 ('Connor', 18),
 ('Superman', 15),
 ('Grell', 15),
 ('Hartley', 10),
 ('Smallville', 9),
 ('DC Comics', 9),
 ("O'Neil", 8),
 ('JLA', 8),
 ('Emiko', 8),
 ('the Justice League', 7),
 ('Hawke', 7),
 ('Sin', 6),
 ('Green Arrow/Black Canary', 6),
 ('Mia', 6),
 ('Crisis', 6),
 ('Lemire', 5),
 ('More Fun Comics', 5),
 ('Diggle', 5),
 ('Komodo', 5),
 ('Adams', 5),
 ('Red Arrow', 5),
 ('Longbow Hunters', 4),
 ('Felicity', 4),
 ('Multiverse', 4),
 ('Queen Industries', 4),
 ('Count Vertigo', 4),
 ('Justin Hartley', 4),
 ('Dinah', 4),
 ('the Green Arrow', 3),
 ('Flashpoint', 3),
 ('Insurgency', 3),
 ('The Longbow Hunters', 3),
 ('The Green Arrow/', 3),
 ('The Green Arrow/Black Canary', 3),
 ('Adventure Comics', 3),
 ('the Green Lantern/Green Arrow', 2),
 ('Justice League United', 2),
 ('The Dark Knight Strikes Again', 2),
 ("World's 

# 8. Use the LatentDirichletAllocation from sklearn.decomposition to conduct a Latent Dirichlet Analysis topic model from your article with 12 topics. 
# Use sentences as documents (see step 3). 
# Find one topic that contains, among its top 10 words, one of the organizations from the previous step, preferably the one that is mentioned most frequently. 
# If none of the ORG tokens appear in the top 10 list, add more topics and repeat your analysis. 
# Add a text block or comment that explains this topic, including a list of the top 10 words in that topic.

In [9]:
from nltk.tokenize.casual import casual_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords


# take out stopwords
my_stopwords = stopwords.words('english') + list('`~!@#$%^&*(),.<>;:/-_\'"')

# create an instance of the count vectorizer
# min_df: word has to appear in at least 2 documents
# max_df: word can't appear in more than 90% of docs
vectorized_text = CountVectorizer(min_df = 2, max_df = 0.95, tokenizer = casual_tokenize, stop_words = my_stopwords)

#generate a sparse matrix of word counts
count_matrix = vectorized_text.fit_transform(raw_documents=count_sentences(rm_xtra_space)).toarray() 

In [10]:
import pandas as pd

#convert matrix to dataframe to perform calculations
count_df = pd.DataFrame(count_matrix, columns=vectorized_text.get_feature_names())

count_df.head(n=5)

Unnamed: 0,#137,#73,#85,#89,1,1940s,1941,1969,2,2000s,...,would,writer,writers,written,wrote,year,years,young,zero,–
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Apply Latent Dirichlet Allocation fitting algorithm 
from sklearn.decomposition import LatentDirichletAllocation as LDIA
from numpy import random as rnd
rnd.seed(123)

num_topics = 12

# Create an instance of the LDIA analyzer
ldia_model = LDIA(n_components = num_topics, learning_method = 'batch')

# provide the data of word counts to the algorithm
ldia_model = ldia_model.fit(count_df)

ldia_model.components_.shape

(12, 701)

In [12]:

def selected_topics(model, vectorizer, top_words=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx)) # One of these headers for each topic
        
        # This uses a list comprehension to iterate over the words
        # in each topic, picking out the highest coefficient values.
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_words - 1:-1]]) 

In [13]:
selected_topics(ldia_model, vectorized_text, top_words=10)

Topic 0:
[('green', 16.39740321943574), ('arrow', 13.5216830668348), ('oliver', 11.407752714040175), ('justice', 8.083335074567485), ('series', 7.029486391641485), ('league', 6.083332780301784), ('dc', 5.083340406049663), ('also', 5.083338425179472), ('events', 5.0833377435222955), ("o'neil", 5.083334323440212)]
Topic 1:
[('oliver', 7.900642427326662), ('green', 7.463577115348534), ('arrow', 6.911115662890421), ('clark', 6.083346255907429), ('justice', 6.012633548383003), ('series', 5.790046294404257), ('story', 5.083351757657934), ('well', 5.083351401998135), ('league', 4.996359797457434), ('part', 3.083330570116737)]
Topic 2:
[('oliver', 7.887565502639482), ('arrow', 7.3366744044818555), ('green', 7.255706816481496), ('earth', 5.520299198678286), ('issue', 5.083354042846047), ('also', 4.755203310092466), ('character', 4.083342204438433), ('batman', 4.083341368940408), ('final', 4.083337637205982), ('one', 3.98457510634869)]
Topic 3:
[('green', 20.350426909549146), ('oliver', 18.31404

# 8. Response
## The first topic includes the entity 'Justice' as well as the word League. This is referring to the Green Arrow, a DC comic book series with the main protagonist named Oliver Queen.

# 9. Oliver Twist

In [14]:
#read in Wiki Page

article_text = get_wiki_text('https://en.wikipedia.org/wiki/Oliver_%26_Company#Cast_and_characters')



In [15]:
# Remove references and extra spaces

rm_xtra_space = rm_refs_and_spaces(article_text)

In [16]:
# Break text into sentences

len(count_sentences(rm_xtra_space))

# 54 Sentences

113

In [17]:
# Tokenize the Text

spacy_tokens_doc = make_spacy_tkns(rm_xtra_space)


In [18]:
# Display named entities in article

display_entities(spacy_tokens_doc)

In [19]:
# build list of entities labled ORG

org_list_oliver = list_orgs(spacy_tokens_doc)
org_list_oliver

{'ABC',
 'Alexa',
 'Animation',
 'Buena Vista International',
 'Burger King',
 'DeSoto',
 'Disney',
 'Disney re-',
 'Eisner',
 'Fagin',
 "Halliwell's Film Guide called Oliver & Company",
 'J.A.C. Redford',
 'Katzenberg',
 'Kodak',
 'McDonald',
 "McDonald's",
 'Nevertheless, Oliver & Company',
 'Oliver & Company',
 'Oliver’s',
 'Owings & Merrill',
 'Paramount',
 'Paramount Pictures',
 'Rotten Tomatoes',
 'Ryder',
 'Sears',
 'Siskel & Ebert',
 'Sony',
 'Sykes',
 'The Black Cauldron',
 'The Magical World',
 'The Ren & Stimpy Show',
 'The San Francisco Examiner',
 'The Washington Post',
 'Tito',
 'Tramp',
 'VHS',
 'Walt Disney Feature Animation',
 'Walt Disney Pictures',
 'Warner Bros. Buena Vista International',
 'Washington Post',
 'Yamaha',
 'the Oliver & Company'}

In [20]:
# Get organization frequencies

out = org_freqs(spacy_tokens_doc, org_list_oliver)

out

# Disney is the highest frequency organization

[('Disney', 26),
 ('Fagin', 18),
 ('Oliver & Company', 16),
 ('Sykes', 10),
 ('Katzenberg', 8),
 ('Tito', 6),
 ('Animation', 4),
 ('DeSoto', 3),
 ('Eisner', 3),
 ('Tramp', 2),
 ('McDonald', 2),
 ('The Black Cauldron', 2),
 ('Paramount', 2),
 ('Walt Disney Feature Animation', 2),
 ('Buena Vista International', 2),
 ('Washington Post', 2),
 ("McDonald's", 2),
 ('Sears', 1),
 ('The Ren & Stimpy Show', 1),
 ("Halliwell's Film Guide called Oliver & Company", 1),
 ('Disney re-', 1),
 ('The Magical World', 1),
 ('The San Francisco Examiner', 1),
 ('Yamaha', 1),
 ('Burger King', 1),
 ('Ryder', 1),
 ('Owings & Merrill', 1),
 ('ABC', 1),
 ('J.A.C. Redford', 1),
 ('VHS', 1),
 ('The Washington Post', 1),
 ('Alexa', 1),
 ('Rotten Tomatoes', 1),
 ('the Oliver & Company', 1),
 ('Walt Disney Pictures', 1),
 ('Siskel & Ebert', 1),
 ('Nevertheless, Oliver & Company', 1),
 ('Oliver’s', 1),
 ('Sony', 1),
 ('Paramount Pictures', 1),
 ('Warner Bros. Buena Vista International', 1),
 ('Kodak', 1)]

In [21]:
#generate a sparse matrix of word counts
count_matrix = vectorized_text.fit_transform(raw_documents=count_sentences(rm_xtra_space)).toarray() 

In [22]:
#convert matrix to dataframe to perform calculations
count_df = pd.DataFrame(count_matrix, columns=vectorized_text.get_feature_names())

count_df.head(n=5)

Unnamed: 0,10,18,1985,1988,1996,2,2002,2009,2013,20th,...,working,world,worldwide,worry,would,writing,written,year,york,young
0,0,1,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [23]:
# Apply Latent Dirichlet Allocation fitting algorithm 

# Create an instance of the LDIA analyzer
ldia_model = LDIA(n_components = num_topics, learning_method = 'batch')

# provide the data of word counts to the algorithm
ldia_model = ldia_model.fit(count_df)

ldia_model.components_.shape

(12, 256)

In [24]:
# pull out top n words for each row (topic) and their coefficient values
selected_topics(ldia_model, vectorized_text, top_words=10)

Topic 0:
[('new', 5.083338981321957), ('dodger', 4.0833406574377635), ('film', 4.083330364376132), ('york', 3.083334489834802), ('disney', 3.0833308098954357), ('dogs', 3.083329640499173), ('chihuahua', 3.083325895165385), ('released', 2.6083566983515225), ('rita', 2.0833341682845616), ('animated', 2.0833338813220994)]
Topic 1:
[('film', 10.083341542364456), ('story', 5.083334820472589), ('gave', 3.08333754114428), ('siskel', 3.083333333302751), ('oliver', 3.0833311044894764), ('scribner', 2.98270611898182), ('little', 2.0833374084945775), ('classics', 2.083334651615018), ('young', 2.083334089505422), ('production', 2.083333888889052)]
Topic 2:
[('fagin', 4.887819167105025), ('sykes', 4.083335234907533), ('jenny', 3.5534317383937966), ('ransom', 3.0833296478933834), ('oliver', 2.5130411907483645), ('dogs', 2.499378703863648), ('however', 2.083337253145749), ('later', 2.083334828941785), ('would', 2.0833336252907), ('york', 2.0833330580753215)]
Topic 3:
[('katzenberg', 3.083342497097395

# Topic Interpretation:
## the first topic contains Disney, the top most mentioned organization. It contains information about the film location (new , york), meta information (released, animated, film), and character information (dodger, dogs, chihuahua, rita)

# 10. compare the results from each of your two articles. In particular, which article mentions the most ORGs? Do the two articles have any ORGs in common? When examining the topics that mention an ORG from each article, are there any linguistic similarities in the word lists?

In [25]:
# which article mentions the most orgs?

print('Green Arrow organizations:',str(len(org_list_arrow)))
print('Oliver & Company organizations:',str(len(org_list_oliver)))

# Oliver & company mentions less organizations, probably due to different lengths in referenced content 
# (stand-alone movie vs. comic book series with TV show)

# orgs in common:
common_orgs = set(org_list_arrow).intersection(org_list_oliver)

print(str(len(common_orgs))+' orgs in common.')


# Similar topics that include their respective characters (O&C topic 2 and GA topic 4) 
# as well as the social groups they formed (O&C topic 4 and GA topic 0)
# 
# They both group together named entities like characters, organizations and descriptors of the content they're referring to. 

Green Arrow organizations: 103
Oliver & Company organizations: 42
0 orgs in common.
