# Creating NER Models
## Purpose:
* Create sample Name Entity Recognition model from scratch (i.e. not NLTK implementation)
* Phase 2: Machine Learning

### Relevant Links:
- Data Repo
    * https://github.com/davidsbatista/NER-datasets/tree/master/CONLL2003
- Description of datasets used historically for NER
    * https://www.clips.uantwerpen.be/conll2003/ner/

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [67]:
# read in all data
train_df = pd.read_csv('train_df.csv')
test_df = pd.read_csv('test_df.csv')
val_df = pd.read_csv('valid_df.csv')

In [79]:
# start with simple Naive Bayes multi-class classification
# let's combine the pre-defined train/test/val sets so that we
# can have more train/test data (i.e. no val data)

# val data will be more useful if we want to train model
# with a neural network approach
df = pd.concat([train_df, test_df, val_df],axis=0)\
       .reset_index(drop=True)

In [29]:
def split_X_y(df):
    """Docstring: quickly parse through data to split vectorized text as feature vector X
    and label data (NER labels) y for machine learning models"""
    # vectorize words - this is essentially a one hot encoding for each word
    # into it's own n-dimensional space
    v = DictVectorizer(sparse=False)
    # fit/transform our data
    X = v.fit_transform(df.to_dict('records'))
    # return label values
    y = df.NER.values
    
    return X, y

In [46]:
# create feature vectors with encoded vectors and corresponding labels
X, y = split_X_y(df)

In [47]:
# perform test train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)

In [83]:
# Let's start with a simple Naive Bayes classifier
nb = MultinomialNB(alpha=0.01) # instantiate model
# train with a partial fit to not load all data into memory. can rerun this later with more data
nb.partial_fit(X_train, y_train, classes) 

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [84]:
# class names used for classification report
classes = df.NER.unique().tolist()

In [85]:
print(classification_report(y_pred=nb.predict(X_test), y_true=y_test, labels = classes))

              precision    recall  f1-score   support

       B-ORG       1.00      1.00      1.00      3053
      B-MISC       1.00      1.00      1.00      1711
       B-PER       1.00      1.00      1.00      3360
       I-PER       1.00      1.00      1.00      2240
       B-LOC       1.00      1.00      1.00      3498
       I-ORG       1.00      1.00      1.00      1602
      I-MISC       1.00      1.00      1.00       524
       I-LOC       1.00      1.00      1.00       549

    accuracy                           1.00     16537
   macro avg       1.00      1.00      1.00     16537
weighted avg       1.00      1.00      1.00     16537



In [86]:
nb.score(X_test, y_test)

1.0

## Model has high precision, recall and accuracy
* Multinomial Naive Bayes worked fairly well with this dataset

# Functionalize model so that we can run in Flask API

In [164]:
import joblib

In [165]:
filename = "nb_ner.sav"
joblib.dump(nb, filename)

['nb_ner.sav']

In [97]:
# vectorize words into feature vector set
vectorized = split_X_y(df)[0]
# convert to a pandas DataFrame and add the original word to use as a look-up table
word_vectors = pd.DataFrame(vectorized)
word_vectors['Word'] = df['Word']

In [163]:
# save for quick uploading into Flask API
word_vectors.to_csv('word_vectors.csv',index=False)

In [173]:
# 
f = 'word_vectors.joblib'
joblib.dump(word_vectors, f)

['word_vectors.joblib']

In [166]:
# load naive bayes model from disk
model = joblib.load(filename)

def make_NER_prediction(string, word_vectors):
    """Docstring: make a prediction on a target word. If the word is in our corpus, 
    the model provides the NER. Otherwise, the model provides'O'.
    
    This function uses the Naive Bayes model trained previously. 
    
    Inputs: string - input string to find NER
            word_vectors - word vectors set (saved as word_vectors.csv)
            
    Outputs: type string object with either NER prediction or 'O' for out of scope"""
    
    if string in word_vectors.Word.tolist():
        # if the word is in our corpus, grab the vector. there could be multiple occurances
        # so we are grabbing the mean of this vector
        x = word_vectors.loc[word_vectors.Word==string].mean().values
        # this vector is of the right dimensions that the model was fit on
        # now we can make a prediction
        pred = model.predict(x.reshape(1,-1))[0]
        return pred
    else:
        return "O"


In [171]:
# Sample implementation

sentence = "Jack lives in London".split(' ')
NER = []
for word in sentence:
    # make prediction
    NER.append(make_NER_prediction(word, word_vectors))
    

In [172]:
NER

['B-PER', 'O', 'O', 'B-LOC']