In [1]:
import numpy as np 
import pandas as pd
import tqdm
from collections import defaultdict

In [2]:
TRAIN_SIZE = 0.8

In [3]:
train_data = pd.read_csv('../input/name-entity-recognition-ner-dataset/NER dataset.csv', encoding = 'unicode_escape')

In [4]:
train_data['Sentence #'].ffill(inplace=True)
train_data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [5]:
def get_list_from_data(col_name, data):
    return data.groupby('Sentence #')[col_name].apply(list)

def split_train_test_data(d_list):
    train_size = int(TRAIN_SIZE * len(d_list))
    return d_list[:train_size].to_list(), d_list[train_size:].to_list()
def create_vocab(corpus):
    vocab_dict = {}
    vocab = []
    vocab_freq = {}
    
    for i in corpus:
        vocab.extend(i)
        
    for word in sorted(vocab):
        vocab_freq[word] = vocab_freq.get(word,0) + 1
        
    for i, word in enumerate(vocab_freq):
        if vocab_freq[word] > 1:
            vocab_dict[word] = i
    
    return vocab_dict

In [6]:
def combine_lists(list_of_lists):
    c_list = []
    for i in list_of_lists:
        c_list.extend(i)
    return c_list

def create_corpus(word_l_array,pos_l_array):

    corpus = []
    
    for idx_sentence in range(len(word_l_array)):
        
        sentence = word_l_array[idx_sentence]
        pos_list = pos_l_array[idx_sentence]
        word_pos_l = []
        for idx_word in range(len(sentence)):
            word_pos_l.append((sentence[idx_word], pos_list[idx_word]))
            
        corpus.append(word_pos_l)
        
    return corpus

def create_transition_emission_tag_counts(corpus, vocab):
    
    transition_dict = defaultdict(int)
    emission_dict = defaultdict(int)
    tag_dict = defaultdict(int)

    for tagged_sentence in tqdm.tqdm_notebook(corpus, total = len(corpus)):
        
        prev_tag = '-start-'
        
        for word_tag in tagged_sentence:
            word, tag = word_tag
        
            transition_dict[(prev_tag, tag)] += 1
            emission_dict[(tag,word)] += 1
        
            tag_dict[tag] += 1
        
            prev_tag = tag
        
    return transition_dict, emission_dict, tag_dict

def get_accuracy(corpus, emission_counts, vocab, states):
    
    num_correct = 0
    total = 0
    for tagged_sentence in tqdm.tqdm_notebook(corpus, total=len(corpus)):
        total += len(tagged_sentence)
        
        for word_tag in tagged_sentence:
            word, tag = word_tag
            
            highest_freq = 1
            found_pos = ''
            
            if word in vocab:
                for state in states:
                    key = (state, word)
                    
                    freq = emission_counts[key]
            
                    if freq > highest_freq:
                        highest_freq = freq
                        found_pos = state
                        
                if found_pos == tag:
                    num_correct+=1
   
    print(f'Correctly Predicted : {num_correct}, Total Predictions :{total}')
    return num_correct / total
            

In [7]:
word_l_array = get_list_from_data('Word', train_data)
pos_l_array = get_list_from_data('POS', train_data)

train_wl, test_wl = split_train_test_data(word_l_array)
train_pos, test_pos = split_train_test_data(pos_l_array)

In [8]:
vocab = create_vocab(train_wl)

In [9]:
train_corpus = create_corpus(train_wl, train_pos)
test_corpus = create_corpus(test_wl, test_pos)

In [10]:
transition_counts, emission_counts, tag_counts = create_transition_emission_tag_counts(train_corpus, vocab)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/38367 [00:00<?, ?it/s]

In [11]:
states = sorted(tag_counts.keys())

In [12]:
accuracy  = {'TRAIN' : get_accuracy(train_corpus,emission_counts, vocab, states),
            'TEST' : get_accuracy(test_corpus,emission_counts, vocab, states)}

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/38367 [00:00<?, ?it/s]

Correctly Predicted : 787329, Total Predictions :839790


  0%|          | 0/9592 [00:00<?, ?it/s]

Correctly Predicted : 192683, Total Predictions :208785


In [13]:
df_accuracy = pd.DataFrame(accuracy, index=['Simple POS Tagger'])
df_accuracy

Unnamed: 0,TRAIN,TEST
Simple POS Tagger,0.937531,0.922878
