In [1]:
import re

import numpy as np
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

try:
    # scikit-learn >= 0.18 exposes the splitter here.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for older releases; this module was removed in scikit-learn 0.20.
    from sklearn.cross_validation import train_test_split



In [2]:
def process_string(string):
    """Tokenize ``string`` on whitespace after blanking out unwanted characters.

    Every run of characters other than ASCII letters, digits, ``-``, ``/`` and
    the space character is replaced by a single space; the result is then
    split on whitespace.

    Args:
        string: Text to tokenize.

    Returns:
        list of str: The surviving tokens in order (empty if none survive).
    """
    # Raw string: '\-' and '\/' are not valid string escapes, so the non-raw
    # form of this pattern triggers an invalid-escape warning on current
    # Python versions. The regex itself is unchanged.
    # str.split() with no argument never yields tokens with surrounding
    # whitespace, so no per-token strip() is needed.
    return re.sub(r'[^A-Za-z0-9\-\/ ]+', ' ', string).split()

In [3]:
def parse_raw(filename):
    """Turn a tag-annotated text file into parallel token and label lists.

    The file is parsed with BeautifulSoup's ``html.parser`` and its
    ``prettify()`` output is scanned line by line:

    * a line starting with ``</`` is skipped;
    * a line starting with ``<`` records the text between ``<`` and the first
      ``>`` as the pending label;
    * the line right after such a tag line is tokenized with
      ``process_string`` and every token receives the pending label;
    * any other line is tokenized and its tokens are labelled ``'OTHER'``.

    Args:
        filename: Path of the annotated file to read.

    Returns:
        tuple: ``(texts, labels)``, two lists of equal length.

    Side effects:
        Prints the number of tokens collected.
    """
    with open(filename, 'r') as handle:
        raw_markup = handle.read()
    soup = BeautifulSoup(raw_markup, 'html.parser')

    texts, labels = [], []
    pending_tag = ''
    for line in soup.prettify().split('\n'):
        if pending_tag:
            # Only the single line following an opening tag is labelled with it.
            tokens = process_string(line)
            texts.extend(tokens)
            labels.extend([pending_tag] * len(tokens))
            pending_tag = ''
            continue
        if line.startswith('</'):
            # Closing tag at column 0: contributes nothing.
            continue
        if line.startswith('<'):
            # Opening tag at column 0: keep what sits between '<' and '>'.
            pending_tag = line.partition('>')[0][1:]
            continue
        tokens = process_string(line)
        texts.extend(tokens)
        labels.extend(['OTHER'] * len(tokens))

    assert (len(texts)==len(labels)), "length texts and labels are not same"
    print('len texts and labels: ', len(texts))
    return texts,labels

In [4]:
# Tokenize the training file into parallel token / label lists.
train_texts, train_labels = parse_raw('data_train.txt')

len texts and labels:  34012


In [5]:
# Parse the test file and fold its tokens and labels into the training lists
# in place. NOTE: re-running this cell appends the test data a second time.
test_texts, test_labels = parse_raw('data_test.txt')
train_texts.extend(test_texts)
train_labels.extend(test_labels)

len texts and labels:  9249


In [6]:
# Distinct labels and how often each occurs after merging both files.
np.unique(train_labels,return_counts=True)

(array(['OTHER', 'location', 'organization', 'person', 'quantity', 'time'],
       dtype='<U12'), array([35613,  1536,  1592,  2358,  1336,   826]))

In [7]:
# Load the entity lexicon: each non-blank line is split on whitespace, the
# first field being the word and the second its tag.
with open('entities-bm-normalize-v3.txt','r') as fopen:
    # Filter blank lines rather than unconditionally dropping the last element,
    # so the final record is kept when the file lacks a trailing newline.
    entities_bm = [line.split() for line in fopen.read().split('\n') if line.strip()]
# Force the tag of the word 'jam' to 'TIME'. This must be an equality test:
# `word in 'jam'` is a substring check and would also relabel the words
# 'j', 'a', 'm', 'ja' and 'am'.
entities_bm = [[row[0], 'TIME' if row[0] == 'jam' else row[1]] for row in entities_bm]

In [8]:
# Translate the lexicon's tag set into the label names used by the parsed data.
replace_by = {'LOC':'location','PRN':'person','NORP':'organization','ORG':'organization','LAW':'law',
             'EVENT':'OTHER','FAC':'organization','TIME':'time','O':'OTHER','ART':'person','DOC':'law'}
for word, tag in entities_bm:
    # Tokenize once; entries that clean down to nothing are skipped.
    tokens = process_string(word)
    if not tokens:
        continue
    if tag not in replace_by:
        # Unknown tag: report it and skip the entry. Checking explicitly
        # (instead of catching every Exception) keeps real errors visible.
        print(repr(tag))
        continue
    # Only the first token of the entry is added as a training sample.
    train_labels.append(replace_by[tag])
    train_texts.append(tokens[0])

assert (len(train_texts)==len(train_labels)), "length texts and labels are not same"

'KN'
'KA'


In [9]:
# Distinct labels and their counts once the lexicon entries have been appended.
np.unique(train_labels,return_counts=True)

(array(['OTHER', 'law', 'location', 'organization', 'person', 'quantity',
        'time'], dtype='<U12'),
 array([47406,   107,  2010,  2435,  3913,  1336,  1240]))

In [10]:
# Integer-encode the labels, then learn character 2- to 4-gram counts
# (case preserved) over every token and vectorize the tokens in one pass.
target = LabelEncoder().fit_transform(train_labels)
bow_chars = CountVectorizer(analyzer='char', ngram_range=(2, 4), lowercase=False)
vectors = bow_chars.fit_transform(train_texts)
vectors.shape

(58447, 21197)

In [11]:
# Hold out 20% of the samples for evaluation. A fixed random_state makes the
# split, and therefore the metrics reported below, reproducible across runs.
train_X, test_X, train_Y, test_Y = train_test_split(vectors, target, test_size=0.2, random_state=42)
# Drop the reference to the full feature matrix; the cells shown after this
# one only use the splits. Re-running this cell requires rebuilding `vectors`.
del vectors

In [12]:
from sklearn import metrics

In [13]:
# Fit a multinomial naive Bayes classifier on the training split and report
# how well it reproduces the labels it was trained on.
multinomial = MultinomialNB()
multinomial.fit(train_X, train_Y)
print(
    metrics.classification_report(
        train_Y,
        multinomial.predict(train_X),
        target_names = np.unique(train_labels),
    )
)

              precision    recall  f1-score   support

       OTHER       0.95      0.93      0.94     37870
         law       0.81      0.33      0.47        87
    location       0.68      0.73      0.70      1613
organization       0.53      0.70      0.61      1957
      person       0.74      0.83      0.78      3174
    quantity       0.61      0.42      0.50      1094
        time       0.69      0.66      0.67       962

 avg / total       0.90      0.89      0.89     46757



In [14]:
# Same report, computed on the held-out split.
print(
    metrics.classification_report(
        test_Y,
        multinomial.predict(test_X),
        target_names = np.unique(train_labels),
    )
)

              precision    recall  f1-score   support

       OTHER       0.95      0.93      0.94      9536
         law       0.56      0.25      0.34        20
    location       0.60      0.67      0.63       397
organization       0.46      0.62      0.53       478
      person       0.66      0.75      0.70       739
    quantity       0.47      0.33      0.39       242
        time       0.69      0.59      0.64       278

 avg / total       0.88      0.87      0.88     11690



In [15]:
import pickle

# Persist the fitted classifier and the fitted vectorizer, in that order.
for path, artifact in (
    ('multinomial-entities.pkl', multinomial),
    ('bow-entities.pkl', bow_chars),
):
    with open(path, 'wb') as fopen:
        pickle.dump(artifact, fopen)