# HW2: Text Mining

## Read data for processing

In [1]:
text = None

with open('./../data/data.txt', 'r') as f:
    text = f.read()

## NLTK

In [2]:
import nltk
# nltk.download()
# Download Corpora -> brown webtext words stopwords
# Download Models -> punkt averaged_perceptron_tagger maxent_ne_chunker vader_lexicon wordnet tagsets
nltk.download(["brown","webtext", "words", "stopwords"] )
nltk.download(["punkt", "averaged_perceptron_tagger", "maxent_ne_chunker", "vader_lexicon", "wordnet", "tagsets"])

[nltk_data] Downloading package brown to /home/wiedzmin/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package webtext to /home/wiedzmin/nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package words to /home/wiedzmin/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/wiedzmin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/wiedzmin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/wiedzmin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/wiedzmin/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Dow

True

## Custom functions

In [3]:
from collections import Counter

def tokenCounts(tokens):
    counts = Counter(tokens)
    sortedCounts = sorted(counts.items(), key=lambda count:count[1], reverse=True)
    return sortedCounts

def listCounts(text, lst):
    res = []
    for item in lst:
        item = item + (text.count(item[0]),)
        item = [item[0], item[1], item[2]]
        res.append(item)
    res = sorted(res, key=lambda item: item[2], reverse=True)
    return res

def dictCounts(text, d):
    items = d.items()
    res = []
    for item in items:
        item = item + (text.count(item[0]),)
        item = [item[0], item[1], item[2]]
        res.append(item)
    res = sorted(res, key=lambda item: item[2], reverse=True)
    return res

## Text preprocessing

### Tokenize text

In [4]:
tokens = nltk.word_tokenize(text)
print('Words total: {}'.format(len(tokens)))
print(tokenCounts(tokens)[:20])

Words total: 12781
[(',', 714), ('the', 643), ('.', 533), ('of', 333), ('to', 330), ('and', 268), ('in', 218), ('a', 208), ('that', 149), ('is', 143), ("'s", 112), ("''", 97), ('are', 93), ('it', 93), ('``', 92), ('for', 90), ('says', 84), ('be', 73), ('The', 73), ('as', 72)]


### Filter punctation and stop words

In [5]:
from nltk.corpus import stopwords
from string import punctuation

def filterTokens(t):
    # Filter custom special characters from text
    extra_punctuation = punctuation + "–''``’“"
    stops = stopwords.words('english')
    filtered_tokens = [token for token in t if token not in extra_punctuation]
    filtered_tokens = [token for token in filtered_tokens if token not in stops]
    return filtered_tokens

filtered_tokens = filterTokens(tokens)
print('Filtered words cnt: {}'.format(len(filtered_tokens)))
print(tokenCounts(filtered_tokens)[:30])

Filtered words cnt: 6741
[("'s", 112), ('says', 84), ('The', 73), ('geothermal', 41), ('Ireland', 36), ('sargassum', 36), ('land', 33), ('ground', 32), ('In', 31), ('years', 28), ('also', 27), ('Halligen', 27), ('steam', 26), ('permafrost', 26), ('But', 24), ('Olkaria', 24), ('could', 24), ('one', 23), ('It', 23), ("n't", 22), ('water', 22), ('would', 21), ('people', 20), ('sea', 20), ('climate', 19), ('heat', 18), ('power', 18), ('We', 18), ('energy', 17), ('much', 17)]


### Lemmatize tokens

In [6]:
# from nltk.corpus.reader.wordnet import NOUN,VERB
# from nltk.stem import WordNetLemmatizer

# lemmatizer = WordNetLemmatizer()
# lemmas = {token:lemmatizer.lemmatize(token, pos=VERB) for token in filtered_tokens}
# print('Lemmas count: {}'.format(len(lemmas)))

# tf = {}

# for key, val in lemmas.items():
#     print([key, val])

## POS tagging

In [7]:
tagged = nltk.pos_tag(filtered_tokens)
print('POST results cnt: {}'.format(len(tagged)))
print('Top 50 POS results:')
print(listCounts(text, tagged[:50]))

POST results cnt: 6741
Top 50 POS results:
[["'s", 'POS', 112], ["'s", 'POS', 112], ["'s", 'POS', 112], ['In', 'IN', 43], ['D', 'NNP', 38], ['steam', 'NN', 28], ['road', 'NN', 22], ['energy', 'NN', 18], ['see', 'VB', 18], ['Kenya', 'NNP', 17], ['past', 'IN', 13], ['region', 'NN', 12], ['Africa', 'NNP', 7], ['tectonic', 'JJ', 7], ['sometimes', 'RB', 7], ['clean', 'VBP', 6], ['Hell', 'NNP', 6], ['National', 'NNP', 6], ['Rift', 'NNP', 5], ['apart', 'RB', 5], ['Gate', 'NNP', 5], ['East', 'NNP', 4], ['Great', 'NNP', 4], ['Valley', 'NNP', 4], ['continent', 'JJ', 4], ['along', 'IN', 4], ['must', 'MD', 4], ['releasing', 'VBG', 3], ['winds', 'VBZ', 3], ['park', 'NN', 3], ['volcanic', 'JJ', 2], ['shifts', 'NNS', 2], ['quantities', 'NNS', 2], ['giraffes', 'VBP', 2], ["'ll", 'MD', 2], ['avoid', 'JJ', 2], ['tearing', 'VBG', 1], ['unimaginable', 'JJ', 1], ['Drive', 'NNP', 1], ['dusty', 'JJ', 1], ['dirt', 'NN', 1], ['zebra', 'NN', 1], ['gazelles', 'NNS', 1], ['plume', 'JJ', 1], ['shooting', 'VBG', 1]

## NER with entity classification (using nltk.ne_chunk)

In [8]:
ne_chunked_binary = nltk.ne_chunk(tagged, binary=True)
ne_chunked = nltk.ne_chunk(tagged, binary=False)

def extractEntities(ne_chunked):
    data = {}
    for entity in ne_chunked:
        if isinstance(entity, nltk.tree.Tree):
            text = " ".join([word for word, tag in entity.leaves()])
            ent = entity.label()
            data[text] = ent
        else:
            continue
    return data

ne_binary = extractEntities(ne_chunked_binary)
ne_binary_cnts = dictCounts(text, ne_binary)
print('NEW binary results cnt: {}'.format(len(ne_binary_cnts)))
print('Top 50 NER binary entities:')
print(ne_binary_cnts[:50])

ne = extractEntities(ne_chunked)
ne_cnts = dictCounts(text, ne)
print('NER results cnt: {}'.format(len(ne_cnts)))
print('Top 50 NER results:')
print(ne_cnts[:50])

NEW binary results cnt: 184
Top 50 NER binary entities:
[['Ireland', 'NE', 36], ['Hallig', 'NE', 32], ['Halligen', 'NE', 27], ['Olkaria', 'NE', 24], ['Kenya', 'NE', 17], ['Earth', 'NE', 13], ['Atlantic', 'NE', 12], ['German', 'NE', 11], ['Newson', 'NE', 10], ['Karingithi', 'NE', 9], ['Daltun', 'NE', 9], ['Hooge', 'NE', 9], ['Mexico', 'NE', 9], ['Mwangi', 'NE', 8], ['County', 'NE', 8], ['Germany', 'NE', 8], ['Nordstrandischmoor', 'NE', 8], ['Africa', 'NE', 7], ['Arctic', 'NE', 7], ['Hell', 'NE', 6], ['Maasai', 'NE', 6], ['Iceland', 'NE', 6], ['Tibet', 'NE', 6], ['Doré', 'NE', 6], ['Irish', 'NE', 6], ['North Sea', 'NE', 6], ['Deicke', 'NE', 6], ['Morrison', 'NE', 6], ['Olkaria V', 'NE', 5], ['CO2', 'NE', 5], ['Northern Ireland', 'NE', 5], ['Fogarty', 'NE', 5], ['Rösner', 'NE', 5], ['Caribbean', 'NE', 5], ['Wang', 'NE', 5], ['Olkaria VI', 'NE', 4], ['KenGen', 'NE', 4], ['Geothermal', 'NE', 4], ['Rift Valley', 'NE', 4], ['Nyaga', 'NE', 4], ['Canada', 'NE', 4], ['Highway', 'NE', 4], ['Hanse

## NER with custom patterns

In [9]:
# print(tagged)
grammar = "NP: {<DT>?<JJ>*<NNP|NNPS>+}"
cp = nltk.RegexpParser(grammar)
ne_custom_tmp = extractEntities(cp.parse(tagged))
ne_custom = {}
for entity, phr in ne_custom_tmp.items():
    if len(entity) > 1:
        ne_custom[entity] = phr 
print('NER custom results cnt: {}'.format(len(ne_custom)))
print('NER custom top results:')
# print(dictCounts(text, ne_custom)[:50])
ne_custom_cnts = dictCounts(text, ne_custom)
print(ne_custom_cnts[:50])

NER custom results cnt: 303
NER custom top results:
[['Ireland', 'NP', 36], ['Hallig', 'NP', 32], ['Halligen', 'NP', 27], ['Olkaria', 'NP', 24], ['Kenya', 'NP', 17], ['As', 'NP', 17], ['Earth', 'NP', 13], ['Atlantic', 'NP', 12], ['Newson', 'NP', 10], ['Hartwig-Kruse', 'NP', 10], ['Karingithi', 'NP', 9], ['Daltun', 'NP', 9], ['Hooge', 'NP', 9], ['Mexico', 'NP', 9], ['Mwangi', 'NP', 8], ['All', 'NP', 8], ['Germany', 'NP', 8], ['Nordstrandischmoor', 'NP', 8], ['Africa', 'NP', 7], ['Arctic', 'NP', 7], ['Reid', 'NP', 7], ['Hell', 'NP', 6], ['Maasai', 'NP', 6], ['Iceland', 'NP', 6], ['Doré', 'NP', 6], ['Republic', 'NP', 6], ['North Sea', 'NP', 6], ['Deicke', 'NP', 6], ['Allen', 'NP', 6], ['Morrison', 'NP', 6], ['Gate', 'NP', 5], ['Olkaria V', 'NP', 5], ['CO2', 'NP', 5], ['Northern Ireland', 'NP', 5], ['Fogarty', 'NP', 5], ["O'Connell", 'NP', 5], ['Rösner', 'NP', 5], ['Caribbean', 'NP', 5], ['Wang', 'NP', 5], ['Olkaria VI', 'NP', 4], ['KenGen', 'NP', 4], ['Geothermal', 'NP', 4], ['Rift Valley

## Custom entity classification

### NER using nltk.ne_chunk

In [10]:
import wikipedia
from difflib import SequenceMatcher

def nearestTerm(term, options):
    maxRatio = -1.0
    nearest = options[0]
    for opt in options:
        # Use longest contiguous matching subsequence
        ratio = SequenceMatcher(None, term, opt).ratio()
        if ratio > maxRatio:
            maxRatio = ratio
            nearest = opt
    return nearest

def classFromSummary(summary, generic):
    first_sent = nltk.sent_tokenize(summary)[0]
    first_sent_tokens = nltk.word_tokenize(first_sent)
    first_sent_tagged = nltk.pos_tag(first_sent_tokens)
#     print(first_sent)
#     print(first_sent_tagged)
    grammar = "NP: {<VBZ|VBP|VBD><DT><JJ|JJR|JJS|CC|NNP|IN|,>*<NN|NNS>+(<IN><DT>?<JJ|JJR|JJS|CC|NNP|IN|,>*<NN|NNS|NNP|NNPS>+)?}"
    c = nltk.RegexpParser(grammar)
    classification = extractEntities(c.parse(first_sent_tagged))
    if len(classification) < 1:
        grammar = "NP: {<VBZ|VBP|VBD><DT>?<JJ|JJR|JJS|CC|NNP|IN|,>*<NN|NNS>+(<IN><DT>?<JJ|JJR|JJS|CC|NNP|IN|,>*<NN|NNS|NNP|NNPS>+)?}"
        c = nltk.RegexpParser(grammar)
        classification = extractEntities(c.parse(first_sent_tagged))
        if len(classification) < 1:
            return generic
    classification = next(iter(classification))
#     print(classification)
    classification_tagged = nltk.pos_tag(nltk.word_tokenize(classification))
    grammar = "NP: {<JJ|JJR|JJS|CC|NNP|IN|,>*<NN|NNS>+(<IN><DT>?<JJ|JJR|JJS|CC|NNP|IN|,>*<NN|NNS|NNP|NNPS>+)?}"
    c = nltk.RegexpParser(grammar)
    classification = extractEntities(c.parse(classification_tagged))
    classification = next(iter(classification))
    return classification

def wikipediaClassification(entity):
    generic_class = 'generic'
    try:
        results = wikipedia.search(entity)
        if len(results) < 1:
            return generic_class
        page = wikipedia.page(results[0])
        return classFromSummary(page.summary, generic_class)
    except wikipedia.DisambiguationError as e:
        # Handle ambiguous search result - find nearest option
        nearest = nearestTerm(entity, e.options)
        try:
            page = wikipedia.page(nearest)
            return classFromSummary(page.summary, generic_class)
        except (wikipedia.DisambiguationError, wikipedia.PageError):
            # In case of multiple ambiguousity, return generic classification
            return generic_class
    except wikipedia.PageError:
        return generic_class

for entity in ne_cnts[:50]:
    classification = wikipediaClassification(entity[0])
    print([entity, classification])

[['Ireland', 'GPE', 36], 'piece of subcontinental land']
[['Hallig', 'PERSON', 32], 'small islands without protective dikes']
[['Halligen', 'PERSON', 27], 'small islands without protective dikes']
[['Olkaria', 'GPE', 24], 'region']




  lis = BeautifulSoup(html).find_all('li')


[['Kenya', 'PERSON', 17], 'generic']
[['Earth', 'PERSON', 13], 'generic']
[['Atlantic', 'ORGANIZATION', 12], 'second-largest of the world']
[['German', 'GPE', 11], 'country at the intersection']
[['Newson', 'ORGANIZATION', 10], 'generic']
[['Karingithi', 'PERSON', 9], 'generic']
[['Mexico', 'GPE', 9], 'country in the southern portion']
[['Daltun', 'PERSON', 9], 'pair of agreements']
[['Hooge', 'PERSON', 9], 'generic']
[['Mwangi', 'PERSON', 8], 'Kenyan photojournalist']
[['All', 'PERSON', 8], 'sorosilicate group of minerals']
[['Germany', 'GPE', 8], 'country at the intersection']
[['Nordstrandischmoor', 'GPE', 8], 'generic']
[['Africa', 'PERSON', 7], 'country']
[['Arctic', 'ORGANIZATION', 7], 'generic']
[['Reid', 'PERSON', 7], 'technique wherein']
[['Hell', 'PERSON', 6], 'generic']
[['Maasai', 'PERSON', 6], 'Nilotic ethnic group']
[['Iceland', 'GPE', 6], 'island in the North Atlantic']
[['Doré', 'PERSON', 6], 'generic']
[['Irish', 'GPE', 6], 'islands of Ireland and Great Britain']
[['No

### NER with custom patterns

In [11]:
for entity in ne_custom_cnts[:50]:
    classification = wikipediaClassification(entity[0])
    print([entity, classification])

[['Ireland', 'NP', 36], 'piece of subcontinental land']
[['Hallig', 'NP', 32], 'small islands without protective dikes']
[['Halligen', 'NP', 27], 'small islands without protective dikes']
[['Olkaria', 'NP', 24], 'region']
[['Kenya', 'NP', 17], 'generic']
[['As', 'NP', 17], 'first letter']
[['Earth', 'NP', 13], 'generic']
[['Atlantic', 'NP', 12], 'second-largest of the world']
[['Newson', 'NP', 10], 'generic']
[['Hartwig-Kruse', 'NP', 10], 'German politician for the populist Alternative for Germany']
[['Karingithi', 'NP', 9], 'generic']
[['Daltun', 'NP', 9], 'pair of agreements']
[['Hooge', 'NP', 9], 'generic']
[['Mexico', 'NP', 9], 'country in the southern portion']
[['Mwangi', 'NP', 8], 'Kenyan photojournalist']
[['All', 'NP', 8], 'sorosilicate group of minerals']
[['Germany', 'NP', 8], 'country at the intersection']
[['Nordstrandischmoor', 'NP', 8], 'generic']
[['Africa', 'NP', 7], 'country']
[['Arctic', 'NP', 7], 'generic']
[['Reid', 'NP', 7], 'technique wherein']
[['Hell', 'NP', 6]