In [1]:
'''
    Reimplementation of bam2014named
    Nepali NER using SVM
    
    Author - Oyesh Mann Singh
    Date - 05/24/2019
    
    Reimplementation on server
    Date - 05/31/2019
'''

import os
import pandas as pd
import numpy as np
import csv

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import sklearn.svm as svm

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter

## Convert the dataset into Kaggle format (for convenience)

In [3]:
# Dataset selector: 'umbc' points at the stemmed combined corpus,
# any other value falls back to the 'bal' corpus directory.
data_set = 'umbc'

data_root_path = (
    '../data/ner/combined/after_stemming/'
    if data_set == 'umbc'
    else '../data/ner/bal/'
)

# Raw corpus file and the Kaggle-format file derived from it.
datapath = data_root_path + 'total.txt'
outpath = data_root_path + 'total_kaggle.txt'

def kaggle_converter(datapath, outpath):
    """Rewrite a space-delimited corpus with a sentence number on every token line.

    Every non-blank input line is written out as
    ``<sentence no> <field 1> <field 2> <field 3>``; only the first three
    space-separated fields of the line are copied. The sentence counter starts
    at 1 and is incremented on every blank input line, which is also written
    through as a blank line.

    Args:
        datapath: Path of the UTF-8 input file.
        outpath: Path of the UTF-8 output file (created or overwritten).

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    # The context manager closes both files, so no explicit close() is needed.
    with open(datapath, 'r', encoding='utf-8') as in_file, \
            open(outpath, 'w', encoding='utf-8') as out_file:
        reader = csv.reader(in_file, delimiter=' ', quoting=csv.QUOTE_NONE)
        sent_counter = 1
        for row in reader:
            if row:
                fields = [str(sent_counter), row[0], row[1], row[2]]
                out_file.write(' '.join(fields) + '\n')
            else:
                # A blank line marks a sentence boundary.
                sent_counter += 1
                out_file.write('\n')
    
# To convert the data into Kaggle format, uncomment the kaggle_converter call below.
# Kaggle Format: <SENTENCE NO> <WORD> <POS_TAG> <ENTITY_TAG>
# kaggle_converter(datapath, outpath)

In [4]:
# Load the Kaggle-format file: one token per row with four space-separated columns.
# keep_default_na=False stops pandas from silently converting tokens such as
# "NA", "null" or "None" into NaN, which would break the string-based feature
# extraction further down.
df = pd.read_csv(
    outpath,
    sep=' ',
    encoding='utf-8',
    names=['SENTENCE', 'WORDS', 'POS', 'TAGS'],
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)

# Drop POS, since we are not going to use it for this experiment
df = df.drop(columns=['POS'])
df.head(5)

Unnamed: 0,SENTENCE,WORDS,TAGS
0,1,वकिल,O
1,1,साहेव,O
2,1,ले,O
3,1,यत्ति,O
4,1,भन्न,O


In [5]:
# Corpus summary: number of distinct sentence ids, word types and tag types.
print("Total unique sentences = ", df['SENTENCE'].nunique())
print("Total unique WORDS = ", df['WORDS'].nunique())
print("Total unique TAGS = ", df['TAGS'].nunique())

Total unique sentences =  3606
Total unique WORDS =  12473
Total unique TAGS =  4


In [6]:
# Number of tokens carrying each tag, shown as a two-column table (TAGS, counts).
tag_counts = df.groupby('TAGS').size()
tag_counts.reset_index(name='counts')

Unnamed: 0,TAGS,counts
0,LOC,2313
1,O,82775
2,ORG,3811
3,PER,5059


In [7]:
# Read the text-only corpus and split it into whitespace-separated tokens
corpus_path = data_root_path + 'text_tag_only/text_only.txt'
with open(corpus_path, mode='r', encoding='utf-8') as in_file:
    corpus = in_file.read().split()

# Sorted list of distinct tokens; a token's position here is its word index
vocab = sorted(set(corpus))
print('{} unique word'.format(len(vocab)))

12473 unique word


In [8]:
# Map every vocabulary word to its integer index, and keep the inverse
# lookup (index -> word) as a NumPy array.
word2idx = {word: index for index, word in enumerate(vocab)}
idx2word = np.array(vocab)

## Read gazetteer lists

In [9]:
gaz_path='../data/ner/NER_surya/Archive/Final_gazetteer/'
ent_path='../data/ner/NER_surya/Archive/Final_dictionary/'


def _read_word_list(path):
    """Return the whitespace-separated tokens of a text file as a list.

    The file is decoded as 'utf-8-sig', so a leading byte-order mark is
    dropped. Because the content is split on any whitespace, a multi-word
    entry ends up as several separate tokens.
    """
    # The context manager closes the file; no explicit close() is required.
    with open(path, 'r', encoding='utf-8-sig') as in_file:
        return in_file.read().strip().split()


# Gazetteer word lists
verb_list = _read_word_list(gaz_path + 'Action verb.txt')
comm_loc_list = _read_word_list(gaz_path + 'Common location word.txt')
day_list = _read_word_list(gaz_path + 'Day name.txt')
desig_list = _read_word_list(gaz_path + 'Designation words.txt')
mid_name_list = _read_word_list(gaz_path + 'Middle name.txt')
month_name_list = _read_word_list(gaz_path + 'Month name.txt')
org_suff_list = _read_word_list(gaz_path + 'Organization suffix word.txt')
per_prefix_list = _read_word_list(gaz_path + 'Person prefix word.txt')
surname_list = _read_word_list(gaz_path + 'Surname.txt')

# Entity dictionaries
per_list = _read_word_list(ent_path + '1_person.txt')
loc_list = _read_word_list(ent_path + '2_location.txt')
org_list = _read_word_list(ent_path + '3_organization.txt')
misc_list = _read_word_list(ent_path + '4_misc.txt')

## Get the sentence

In [10]:
class SentenceGetter(object):
    """Group a token-level DataFrame into per-sentence lists of (word, tag) pairs.

    Attributes:
        n_sent: Sentence number that the next ``get_next()`` call looks up (starts at 1).
        data: The DataFrame passed to the constructor.
        empty: Flag initialised to False; not otherwise used by this class.
        grouped: Series indexed by the SENTENCE value; each element is the
            list of (word, tag) tuples of that sentence.
        sentences: The elements of ``grouped`` as a plain Python list.
    """

    def __init__(self, data):
        """Build the grouped sentences.

        Args:
            data: DataFrame with 'SENTENCE', 'WORDS' and 'TAGS' columns.
        """
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s['WORDS'].values.tolist(),
                                                           s['TAGS'].values.tolist())]
        self.grouped = self.data.groupby('SENTENCE').apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence labelled ``n_sent`` and advance the counter.

        The SENTENCE column may hold plain integers (1, 2, ...) or labels of
        the form 'Sentence: 1'; both are tried, integer first.

        Returns:
            The list of (word, tag) tuples for the sentence, or None when no
            sentence with the current number exists (the counter is then left
            unchanged).
        """
        for key in (self.n_sent, 'Sentence: {}'.format(self.n_sent)):
            try:
                s = self.grouped.loc[key]
            except KeyError:
                # This label style is not present in the index; try the next one.
                continue
            self.n_sent += 1
            return s
        return None

# Group the token table by sentence; `sentences` is a list with one entry per
# sentence, each entry being a list of (word, tag) tuples.
getter = SentenceGetter(df)
sentences = getter.sentences

## Assign features

In [11]:
def isdigitandcomma(word):
    """Return True if ``word`` contains a comma and is all digits once commas are removed."""
    return ',' in word and word.replace(',', '').isdigit()
    
def isdigitandhyphen(word):
    """Return True if ``word`` contains a hyphen and is all digits once hyphens are removed."""
    return '-' in word and word.replace('-', '').isdigit()
    
def isdigitandpercentage(word):
    """Return True if ``word`` contains '%' and is all digits once every '%' is removed."""
    return '%' in word and word.replace('%', '').isdigit()

def isdigitandslash(word):
    """Return True if ``word`` is all digits once its slashes are removed.

    A forward slash takes precedence: when the word contains '/', only '/' is
    stripped before the digit check. Backslashes are stripped only when the
    word has no forward slash. Words with neither separator yield False.
    """
    for separator in ('/', '\\'):
        if separator in word:
            return word.replace(separator, '').isdigit()
    return False

# Assign features for every word
def word2features(sent, i):
    """Build the feature dict for the i-th token of a sentence.

    Args:
        sent: Sequence of (word, tag) pairs; only the word of position ``i`` is read.
        i: Index of the token inside ``sent``.

    Returns:
        A dict with the word's vocabulary index under 'word' and boolean
        flags for position, length, digit patterns and gazetteer membership.

    Raises:
        KeyError: If the word is not present in ``word2idx``.
    """
    word = sent[i][0]
    all_digits = word.isdigit()

    return {
        'word': word2idx[word],
        'BOS': i == 0,
        # Despite the key name, this flag is True only for words shorter than 2 characters.
        'word.length()': len(word) < 2,
        'word.isdigit()': all_digits,
        'word.isfourdigit()': all_digits and len(word) == 4,
        'word.istwodigit()': all_digits and len(word) == 2,
        'word.isdigitandcomma()': isdigitandcomma(word),
        'word.isdigitandslash()': isdigitandslash(word),
        'word.isdigitandhyphen()': isdigitandhyphen(word),
        'word.isdigitandpercentage()': isdigitandpercentage(word),
        'gaz.isperson()': word in per_list,
        'gaz.isloc()': word in loc_list,
        'gaz.isorg()': word in org_list,
        'gaz.ismonth()': word in month_name_list,
        'gaz.isday()': word in day_list,
        'gaz.isperprefix()': word in per_prefix_list,
        'gaz.ismidname()': word in mid_name_list,
        'gaz.issurname()': word in surname_list,
        'gaz.iscommlocword()': word in comm_loc_list,
        'gaz.isverb()': word in verb_list,
        'gaz.isdesignation()': word in desig_list,
        'gaz.isorgsuffix()': word in org_suff_list,
    }

def sent2features(sent):
    """Return the list of feature dicts for a sentence, one per token, in order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the tags of a sentence given as (token, tag) pairs."""
    return [tag for _, tag in sent]

def sent2tokens(sent):
    """Return the tokens of a sentence given as (token, tag) pairs."""
    return [word for word, _ in sent]

In [12]:
# Per-sentence feature dicts (X) and per-sentence tag lists (y)
X = [sent2features(sentence) for sentence in sentences]
y = [sent2labels(sentence) for sentence in sentences]

In [13]:
# Flatten X and y down to token level, since this model does not use sentence context
flat_X = [token_features for sentence_features in X for token_features in sentence_features]
flat_y = [tag for sentence_tags in y for tag in sentence_tags]

In [14]:
# Inspect the feature dict of the first token as a sanity check.
flat_X[0]

{'word': 9993,
 'BOS': True,
 'word.length()': False,
 'word.isdigit()': False,
 'word.isfourdigit()': False,
 'word.istwodigit()': False,
 'word.isdigitandcomma()': False,
 'word.isdigitandslash()': False,
 'word.isdigitandhyphen()': False,
 'word.isdigitandpercentage()': False,
 'gaz.isperson()': False,
 'gaz.isloc()': False,
 'gaz.isorg()': False,
 'gaz.ismonth()': False,
 'gaz.isday()': False,
 'gaz.isperprefix()': False,
 'gaz.ismidname()': False,
 'gaz.issurname()': False,
 'gaz.iscommlocword()': False,
 'gaz.isverb()': False,
 'gaz.isdesignation()': True,
 'gaz.isorgsuffix()': False}

### Vectorize the features

In [15]:
from sklearn.feature_extraction import DictVectorizer

# Turn the per-token feature dicts into a dense numeric matrix, one row per token.
v = DictVectorizer(sparse=False)
X_vec = v.fit_transform(flat_X)

# X_vec is already a 2-D array, so build the preview frame directly with the
# vectorizer's feature names as column labels (DataFrame.from_dict is meant
# for dict input).
vec_df = pd.DataFrame(X_vec, columns=v.feature_names_)

vec_df.head()

Unnamed: 0,BOS,gaz.iscommlocword(),gaz.isday(),gaz.isdesignation(),gaz.isloc(),gaz.ismidname(),gaz.ismonth(),gaz.isorg(),gaz.isorgsuffix(),gaz.isperprefix(),...,gaz.isverb(),word,word.isdigit(),word.isdigitandcomma(),word.isdigitandhyphen(),word.isdigitandpercentage(),word.isdigitandslash(),word.isfourdigit(),word.istwodigit(),word.length()
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9993.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,11518.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9891.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9048.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,8079.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Hold out 33% of the flattened token rows for testing, with a fixed seed so the split is repeatable.
# NOTE(review): the split is over individual tokens, so tokens of one sentence
# can land on both sides — confirm this matches the intended evaluation protocol.
X_train, X_test, y_train, y_test = train_test_split(X_vec, flat_y, test_size=0.33, random_state=163)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

In [17]:
# Column order of X_vec as assigned by the vectorizer.
v.feature_names_

['BOS',
 'gaz.iscommlocword()',
 'gaz.isday()',
 'gaz.isdesignation()',
 'gaz.isloc()',
 'gaz.ismidname()',
 'gaz.ismonth()',
 'gaz.isorg()',
 'gaz.isorgsuffix()',
 'gaz.isperprefix()',
 'gaz.isperson()',
 'gaz.issurname()',
 'gaz.isverb()',
 'word',
 'word.isdigit()',
 'word.isdigitandcomma()',
 'word.isdigitandhyphen()',
 'word.isdigitandpercentage()',
 'word.isdigitandslash()',
 'word.isfourdigit()',
 'word.istwodigit()',
 'word.length()']

In [18]:
# Support-vector classifier with scikit-learn's default hyper-parameters.
# NOTE(review): the 'word' feature is a raw vocabulary index (values in the
# thousands) next to 0/1 flags, and nothing is scaled before fitting —
# confirm this is intended.
svec = svm.SVC()

In [19]:
%%time
# Fit the classifier on the training split (about 6 minutes of wall time in the recorded run).
svec.fit(X_train, y_train)

CPU times: user 6min 4s, sys: 300 ms, total: 6min 4s
Wall time: 6min 4s


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

### Save model

In [21]:
import pickle
model_path='../data/models/'
model_file=model_path+'svm_ner_models.pkl'

# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes, instead of leaving an open handle behind.
with open(model_file, 'wb') as model_out:
    pickle.dump(svec, model_out)

### Load model

In [22]:
# NOTE: unpickling can execute arbitrary code; only load model files you created yourself.
# The context manager closes the file handle once the model has been read.
with open(model_file, 'rb') as model_in:
    svec = pickle.load(model_in)

In [23]:
# Predict a tag for every held-out token row.
y_pred = svec.predict(X_test)

In [24]:
# NOTE: this rebinds `y` (until now the per-sentence tag lists) to the flat tag column.
y = df.TAGS.values

classes = np.unique(y)
classes = classes.tolist()
# Report on entity tags only: drop the 'O' tag by value rather than by a
# hard-coded position, so the result does not depend on which dataset is
# loaded or on where 'O' falls in the sorted tag list.
new_classes = [tag for tag in classes if tag != 'O']
new_classes

['LOC', 'ORG', 'PER']

In [25]:
# Convert the gold tags of the test split into a NumPy array.
y_test=np.asarray(y_test)

### Accuracy reporting

In [26]:
from sklearn.metrics import accuracy_score

# Token-level accuracy over all tags, including the dominant 'O' tag.
print(accuracy_score(y_test, y_pred))

0.9375302351082013


### Reporting F1 score

In [27]:
from sklearn.metrics import classification_report
# Per-tag precision/recall/F1 restricted to the tags in new_classes, printed with 4 decimals.
print(classification_report(y_test, y_pred, labels=new_classes, target_names=new_classes, digits=4))

             precision    recall  f1-score   support

        LOC     0.7574    0.5553    0.6407       742
        ORG     0.7826    0.5131    0.6198      1263
        PER     0.8314    0.6107    0.7041      1631

avg / total     0.7993    0.5655    0.6619      3636



### Write results to file for CoNLL evaluation

In [28]:
def write_to_file(output_file='../data/ner/results/ner_svm_umbc.txt'):
    """Write one "<word> <gold tag> <predicted tag>" line per test token.

    Reads the notebook-level ``X_test``, ``y_test``, ``y_pred``, ``idx2word``
    and the fitted vectorizer ``v``.

    Args:
        output_file: Destination path (UTF-8, overwritten). Defaults to the
            path this notebook has always written to.
    """
    # Look the 'word' column up in the vectorizer instead of hard-coding its
    # position, which would shift whenever a feature is added or renamed.
    word_column = v.feature_names_.index('word')
    # The column stores the vocabulary index as a float; map it back to the word.
    words = [idx2word[int(row[word_column])] for row in X_test]
    # np.asarray makes this work whether the labels are lists or arrays.
    gold_tags = np.asarray(y_test).tolist()
    predicted_tags = np.asarray(y_pred).tolist()
    with open(output_file, 'w', encoding='utf-8') as f:
        for word, gold, predicted in zip(words, gold_tags, predicted_tags):
            f.write(word + ' ' + gold + ' ' + predicted + '\n')
            
# Export the test-split words with their gold and predicted tags.
write_to_file()