In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import nltk
# download packages for nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
import networkx as nx
import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout

[nltk_data] Downloading package punkt to /home/martilad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/martilad/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/martilad/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/martilad/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:
# load the text
texts = []
for i in range(1, 16, 1):
    text = None
    with open('../data/t'+ str(i) + '.txt', 'r') as f:
        text = f.read()
    texts.append(text)

In [4]:
def add_to_graph(G, node1, node2):
    """Function for add edge and nodes to graph.
    - sort the name of node - it is non oriented graph
    - id node not in in graph add
    - each edge add to same nodes increase weight on the edge    
    """
    if node1 > node2:
        tmp = node1
        node1 = node2
        node2 = tmp
    if node1 not in G:
        G.add_node(node1)
        
    if node2 not in G:
        G.add_node(node2)
        
    if node2 not in G[node1]:
        G.add_edge(node1, node2)
        G[node1][node2]['weight'] = 0
    
    G[node1][node2]['weight'] += 1

In [5]:
# Do POS tagging on the whole text 
all_text = ''
for i in texts:
    all_text += i
tokens = nltk.word_tokenize(all_text)
tagged = nltk.pos_tag(tokens)
print("Number of words in dataset: ", len(tokens))

Number of words in dataset:  15267


In [6]:
# Load text to sentences and do POS tag on it
sentences_in_text = []
sent_tokens_in_text = []
sent_tagged_in_text = []
cnt = 0
for i in texts:
    sentences = nltk.sent_tokenize(i)
    cnt += len(sentences)
    sentences_in_text.append(sentences)
    sent_tokens = [nltk.word_tokenize(sent) for sent in sentences]
    sent_tokens_in_text.append(sent_tokens)
    sent_tagged = [nltk.pos_tag(sent) for sent in sent_tokens]
    sent_tagged_in_text.append(sent_tagged)
print("Number of sentences in dataset: ", cnt)

Number of sentences in dataset:  619


In [7]:
# Counting words per category in dict
def count_words_per_category(counter_dict, category, word):
    if category not in counter_dict:
        counter_dict[category] = {}
    if word not in counter_dict[category]:
        counter_dict[category][word] = 0
    counter_dict[category][word] += 1
    
# Print n best for each category from counter dict|
def print_best_n_for_each_category(cnt, n):
    for i in cnt:
        sorted_x = sorted(cnt[i].items(), key=lambda kv: kv[1])
        print(i,": ", sep='')
        for j in range(n if len(sorted_x)>=n else len(sorted_x)):
            print(' ', sorted_x[-(j+1)][0],'-',sorted_x[-(j+1)][1])

In [8]:
# Extract entities from ne_chunked format to dict
def extractEntities(ne_chunked):
    data = {}
    for entity in ne_chunked:
        if isinstance(entity, nltk.tree.Tree):
            text = " ".join([word for word, tag in entity.leaves()])
            ent = entity.label()
            data[text] = ent
        else:
            continue
    return data

### Best word in POS tagging

In [9]:
# Count the words in each tags
cnt = {}
for text in sent_tagged_in_text:
    for sent in text:
        for word in sent:
            count_words_per_category(cnt, word[1], word[0])
print_best_n_for_each_category(cnt, 5)

NN: 
  hydrogen - 32
  deal - 29
  time - 23
  electricity - 21
  manager - 20
:: 
  - - 86
  : - 41
  ; - 2
  ... - 1
IN: 
  of - 334
  in - 245
  on - 110
  for - 104
  at - 101
NNP: 
  United - 83
  Solskjaer - 68
  ’ - 39
  Mourinho - 28
  League - 25
VBP: 
  have - 54
  are - 45
  do - 10
  ’ - 9
  say - 7
DT: 
  the - 704
  a - 307
  The - 96
  an - 46
  this - 39
VBZ: 
  is - 110
  has - 84
  says - 24
  ’ - 9
  does - 9
TO: 
  to - 332
VB: 
  be - 81
  have - 18
  make - 10
  get - 10
  happen - 8
JJ: 
  first - 36
  last - 17
  new - 15
  final - 14
  clean - 13
NNS: 
  people - 29
  islands - 22
  years - 18
  villages - 12
  results - 12
.: 
  . - 578
  ? - 41
  ! - 2
NNPS: 
  Orcadians - 4
  Commons - 3
  Rights - 1
  States - 1
  Indians - 1
VBD: 
  was - 101
  had - 64
  said - 59
  were - 33
  did - 18
RB: 
  not - 53
  n't - 31
  also - 25
  only - 23
  just - 19
VBG: 
  including - 15
  according - 11
  winning - 7
  being - 7
  looking - 5
CD: 
  one - 26
  two - 15
 

### Best word in NER using ne_chunk

In [10]:
# Find the entities for each sentence and for each article
NER_nltk = []
for sent_tag in sent_tagged_in_text:
    # Find the entities for each sentence
    NER_nltk_sent = []
    ne_chunked_sent = nltk.ne_chunk_sents(sent_tagged)
    for i in ne_chunked_sent:
        NER_nltk_sent.append(extractEntities(i))
    NER_nltk.append(NER_nltk_sent)    

In [11]:
cnt_NER = {}
for text in NER_nltk:
    for sent in text:
        for i in sent:
            count_words_per_category(cnt_NER, sent[i], i)
print_best_n_for_each_category(cnt_NER, 5)

GPE: 
  Shapinsay - 60
  Orkney - 60
  Scottish - 30
  Stromness - 30
  Scotland - 30
PERSON: 
  Orkney - 150
  Bews - 45
  Lidderdale - 30
  Stockan - 15
  Clipsham - 15
ORGANIZATION: 
  EMEC - 75
  UK - 60
  IMO - 30
  CCS - 30
  Orcadians - 30
LOCATION: 
  North Sea - 15
  Scotland - 15


### My own paterns

In [12]:
def check_entity_sent(tagged, pattern):
    sent_entity = []
    cp = nltk.RegexpParser(pattern)
    for j in tagged:
        sent_entity.append(extractEntities(cp.parse(j)))
    return sent_entity

In [13]:
# Find the entities for each sentence and for each article
NER_nltk2 = []
for sent_tag in sent_tagged_in_text:
    # Find the entities for each sentence
    NER_nltk_sent = []
    ne_chunked_sent = check_entity_sent(sent_tag, """
                        NOUNS: {<N.*>{2,}}
                        NOUN WITH ADJECTIVE: {<DT>?<JJ*><NN|NNS>}
                            {<DT|PP\$>?<JJ><NN>}
                        PROPER NOUN: {<NNP*>+} 
                        """)
    NER_nltk2.append(ne_chunked_sent)

In [14]:
cnt_NER2 = {}
for text in NER_nltk2:
    for sent in text:
        for i in sent:
            count_words_per_category(cnt_NER2, sent[i], i)
print_best_n_for_each_category(cnt_NER2, 10)

PROPER NOUN: 
  United - 51
  Solskjaer - 47
  Mourinho - 23
  hydrogen - 21
  deal - 21
  Huawei - 15
  job - 14
  % - 13
  MPs - 13
  way - 12
NOUNS: 
  Old Trafford - 11
  Champions League - 9
  Premier League - 8
  Manchester United - 8
  home draw - 6
  Mrs May - 6
  Ole Gunnar Solskjaer - 5
  Orkney ’ - 4
  % possession - 4
  Louis van Gaal - 4
NOUN WITH ADJECTIVE: 
  last year - 7
  the first time - 7
  clean energy - 6
  ’ t - 5
  fossil fuels - 4
  the first leg - 4
  the 21st manager - 3
  final game - 3
  young players - 3
  surplus electricity - 2


### Find results in wikipedia

In [15]:
import wikipedia

In [16]:
# Extract first entity from text
def extractFirstEntities(ne_chunked):
    for entity in ne_chunked:
        if isinstance(entity, nltk.tree.Tree):
            text = " ".join([word for word, tag in entity.leaves()])
            ent = entity.label()
            return text, ent
    return None

In [17]:
def check_entity(tagged, pattern):
    #print(tagged)
    cp = nltk.RegexpParser(pattern)
    return extractFirstEntities(cp.parse(tagged))

In [19]:
def category_detection_from_wikipedia(data_dict):
    result = []
    cnt = 0
    for cat in data_dict:
        cnt += 1
        for entity in data_dict[cat]:
            try:
                results = wikipedia.search(entity)
                desc = ""
                if len(results) > 0:
                    desc = check_entity(nltk.pos_tag(nltk.word_tokenize(wikipedia.page(results[0]).summary)),
                                       """IS: {<VB.*>+<DT>+<JJ.*>*<NN.*>+}""")[0]
                    desc = check_entity(nltk.pos_tag(nltk.word_tokenize(desc)),
                                       """IS: {<JJ.*>*<NN.*>+}""")[0]
                if len(desc) == 0:
                        desc = 'Thing'
                result.append([entity, desc, cat, data_dict[cat][entity]])
            except:
                continue
    return result

In [37]:
def print_some_from_wiki(data, n):
    cnt = 0
    for i in data:
        if cnt == n:
            break
        cnt += 1
        print(i[0], "Description: " + i[1], "Entity: " + i[2], "Number in text: " + str(i[3]), sep='\n   ' )

#### Find wikipedia results in entity recognition by nltk

In [20]:
wiki_anotated_nltk_entity = category_detection_from_wikipedia(cnt_NER)
wiki_anotated_nltk_entity.sort(key=lambda x: x[3])
wiki_anotated_nltk_entity = wiki_anotated_nltk_entity[::-1]

In [39]:
print_some_from_wiki(wiki_anotated_nltk_entity, 10)

Orkney
   Description: archipelago
   Entity: PERSON
   Number in text: 150
EMEC
   Description: UKAS
   Entity: ORGANIZATION
   Number in text: 75
UK
   Description: sovereign country
   Entity: ORGANIZATION
   Number in text: 60
Shapinsay
   Description: eighth largest island
   Entity: GPE
   Number in text: 60
Orkney
   Description: archipelago
   Entity: GPE
   Number in text: 60
Bews
   Description: inherent limitations
   Entity: PERSON
   Number in text: 45
Orcadians
   Description: people
   Entity: ORGANIZATION
   Number in text: 30
Orkney
   Description: archipelago
   Entity: ORGANIZATION
   Number in text: 30
Stromness
   Description: second-most populous town
   Entity: GPE
   Number in text: 30
Scotland
   Description: country
   Entity: GPE
   Number in text: 30


#### Find wikipedia results in entity recognition by my own patterns

In [None]:
# del from entity take by my pattern entity with some count for get from wikipedia -> this take a long time
size = 4
delel = []
for i in cnt_NER2:
    for j in cnt_NER2[i]:
        if cnt_NER2[i][j] < 3:
            delel.append((i, j))
for i in delel:
    del cnt_NER2[i[0]][i[1]]

In [31]:
wiki_anotated_my_entity = category_detection_from_wikipedia(cnt_NER2)
wiki_anotated_my_entity.sort(key=lambda x: x[3])
wiki_anotated_my_entity = wiki_anotated_my_entity[::-1]

In [40]:
print_some_from_wiki(wiki_anotated_my_entity, 10)

Solskjaer
   Description: Norwegian football manager
   Entity: PROPER NOUN
   Number in text: 47
Mourinho
   Description: Portuguese professional football coach
   Entity: PROPER NOUN
   Number in text: 23
hydrogen
   Description: chemical element
   Entity: PROPER NOUN
   Number in text: 21
Huawei
   Description: Chinese multinational telecommunications equipment
   Entity: PROPER NOUN
   Number in text: 15
job
   Description: person
   Entity: PROPER NOUN
   Number in text: 14
%
   Description: symbol
   Entity: PROPER NOUN
   Number in text: 13
way
   Description: eighth studio album
   Entity: PROPER NOUN
   Number in text: 12
Old Trafford
   Description: football stadium
   Entity: NOUNS
   Number in text: 11
Dumbo
   Description: mouse
   Entity: PROPER NOUN
   Number in text: 11
UK
   Description: sovereign country
   Entity: PROPER NOUN
   Number in text: 11
