In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifierCV, LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [4]:
data = pd.read_csv("ner.csv", index_col=0)
data

Unnamed: 0_level_0,Word,Pos_tag,Tag
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,in,IN,O
1,in,IN,O
2,pedestrians,NNP,O
3,April Sosa,NNP,PERSON
4,Keith Golden,NNP,PERSON
...,...,...,...
14175,Motaz,NNP,PERSON
14176,Ahmed,NNP,PERSON
14177,Amar,NNP,PERSON
14178,Moaz,NNP,PERSON


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14180 entries, 0 to 14179
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Word     14180 non-null  object
 1   Pos_tag  14180 non-null  object
 2   Tag      14180 non-null  object
dtypes: object(3)
memory usage: 443.1+ KB


# **Preprocessing**

In [6]:
data.dropna(inplace=True)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14180 entries, 0 to 14179
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Word     14180 non-null  object
 1   Pos_tag  14180 non-null  object
 2   Tag      14180 non-null  object
dtypes: object(3)
memory usage: 443.1+ KB


In [8]:
pd.DataFrame({"Number of Duplicates Rows":data.duplicated().sum()},index=[''])

Unnamed: 0,Number of Duplicates Rows
,7311


In [9]:
data = data.drop_duplicates()

In [10]:
pd.DataFrame({"Number of Duplicates Rows":data.duplicated().sum()},index=[''])

Unnamed: 0,Number of Duplicates Rows
,0


In [11]:
data.shape

(6869, 3)

In [12]:
data['Tag'].unique()

array(['O', 'PERSON', 'DATE', 'P-NUMBER', 'CARDINAL', 'COMPANY', 'ORG',
       'COLOR', 'Org', 'CITY', 'COUNTRY'], dtype=object)

In [13]:
data['Tag_Encoding'] = data['Tag'].factorize()[0]
data['Tag_Encoding']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Tag_Encoding'] = data['Tag'].factorize()[0]


Index
0        0
2        0
3        1
4        1
5        0
        ..
14175    1
14176    1
14177    1
14178    1
14179    1
Name: Tag_Encoding, Length: 6869, dtype: int64

In [14]:
data['Tag_Encoding'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int64)

In [15]:
Tag_label={'O':0, 'PERSON':1, 'DATE':2, 'P-NUMBER':3, 'CARDINAL':4, 'COMPANY':5, 'ORG':6,
       'COLOR':7, 'Org':8, 'CITY':9, 'COUNTRY':10}

In [16]:
Tag_label

{'O': 0,
 'PERSON': 1,
 'DATE': 2,
 'P-NUMBER': 3,
 'CARDINAL': 4,
 'COMPANY': 5,
 'ORG': 6,
 'COLOR': 7,
 'Org': 8,
 'CITY': 9,
 'COUNTRY': 10}

In [17]:
df = data.Tag.explode().to_frame().reset_index(drop=True)
dfc = data.groupby('Tag').Tag.count().reset_index(name = 'Count').sort_values(['Count'],ascending=False)
dfc.plot.bar(x='Tag')

<Axes: xlabel='Tag'>

In [18]:
data.dtypes

Word            object
Pos_tag         object
Tag             object
Tag_Encoding     int64
dtype: object

In [19]:
data['Word'].unique()

array(['in', 'pedestrians', 'April Sosa', ..., 'Amar', 'Moaz', 'Amr'],
      dtype=object)

In [20]:
X = data[["Word", "Pos_tag"]]
Y = data["Tag"]

word_vectorizer = CountVectorizer()
X_words = word_vectorizer.fit_transform(X["Word"])

pos_vectorizer = CountVectorizer()
X_pos = pos_vectorizer.fit_transform(X["Pos_tag"])
X_combined = pd.concat(
    [pd.DataFrame(X_words.toarray()), pd.DataFrame(X_pos.toarray())], axis=1
)

In [21]:
x_train, x_test, y_train, y_test = train_test_split(X_combined, Y, test_size=0.3,random_state=42,shuffle=True)

In [22]:
Model = {"SVC": SVC(kernel="linear"), "LogisticRegression": LogisticRegression(), "RidgeClassifierCV": RidgeClassifierCV(alphas=[0.1,1,10]),
         "SGDClassifier": SGDClassifier(), "DecisionTreeClassifier": DecisionTreeClassifier(max_depth=150 
                                                                                            ,criterion='entropy'),
              
         "RandomForestClassifier": RandomForestClassifier(n_estimators=100,
                                                          max_depth=150,
                                                          criterion='entropy', 
                                                          n_jobs=2),
         "mlp_Classifier":MLPClassifier(learning_rate_init=0.0005, max_iter=300)}
def BuildModel(training_padded, training_labels, testing_padded, testing_labels, model):
  dic_test_evluation_metrics={}
  dic_train_evluation_metrics={}
  dic_models={}

  for model_name in model:
      current_model = model[model_name]
      current_model.fit(x_train, y_train)
      dic_models[model_name] = current_model

      y_test_pre = current_model.predict(x_test)
      y_train_pre = current_model.predict(x_train)

      dic_test_evluation_metrics[model_name] = dic_evaluation_test = {
          "Model_name": model_name,
          "accuracy": round(accuracy_score(y_test, y_test_pre) * 100, 3),
          "precision": round(precision_score(y_test, y_test_pre, average='macro') * 100, 3),
          "recall_score": round(recall_score(y_test, y_test_pre, average='macro') * 100, 3),
          "f1_score": round(f1_score(y_test, y_test_pre, average='macro') * 100, 3),
          "confusion_matrix": confusion_matrix(y_test, y_test_pre)}
      
      
      dic_train_evluation_metrics[model_name] = dic_evaluation_train = {
          "Model_name": model_name,
          "accuracy":  round(accuracy_score(y_train, y_train_pre)* 100, 3),
          "precision": round(precision_score(y_train, y_train_pre, average='macro')* 100, 3),
          "recall_score":  round(recall_score(y_train, y_train_pre, average='macro')* 100, 3),
          "f1_score":  round(f1_score(y_train, y_train_pre, average='macro')* 100, 3),
          "confusion_matrix": confusion_matrix(y_train, y_train_pre)}
  return dic_test_evluation_metrics, dic_train_evluation_metrics, dic_models
    

In [23]:
dic_test_evluation_metrics, dic_train_evluation_metrics, dic_models = BuildModel(x_train, y_train, x_test, y_test, Model)

  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  ret = a @ b
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
pd.DataFrame(dic_test_evluation_metrics.values()).drop(['confusion_matrix'], axis = 1)

Unnamed: 0,Model_name,accuracy,precision,recall_score,f1_score
0,SVC,76.274,72.605,49.141,52.201
1,LogisticRegression,73.217,76.654,44.683,46.681
2,RidgeClassifierCV,78.651,70.925,50.134,52.417
3,SGDClassifier,78.991,67.452,51.337,53.409
4,DecisionTreeClassifier,67.88,70.847,41.072,43.458
5,RandomForestClassifier,67.103,73.612,40.878,42.328
6,mlp_Classifier,80.107,62.511,50.769,52.054


In [33]:
pd.DataFrame(dic_train_evluation_metrics.values()).drop('confusion_matrix', axis=1)

Unnamed: 0,Model_name,accuracy,precision,recall_score,f1_score
0,SVC,99.314,98.747,99.174,98.953
1,LogisticRegression,86.314,76.872,52.025,55.332
2,RidgeClassifierCV,99.334,99.042,99.372,99.199
3,SGDClassifier,99.21,98.13,99.08,98.554
4,DecisionTreeClassifier,77.704,95.009,58.17,62.878
5,RandomForestClassifier,73.461,75.941,45.199,49.417
6,mlp_Classifier,99.563,99.61,99.296,99.45


In [24]:
text1 = """The fact that Henry Armstrong was buried did not seem to him to prove that he was dead: he had always been a hard man to convince. That 
he really was buried, the testimony of his senses compelled him to admit. His posture -- flat upon his back, with his hands crossed upon his 
stomach and tied with something that he easily broke without profitably altering the situation -- the strict confinement of his entire person, the 
black darkness and profound silence, made a body of evidence impossible to controvert and he accepted it without cavil.
But dead -- no; he was only very, very ill. He had, withal, the invalid's apathy and did not greatly concern himself about the uncommon fate that 
had been allotted to him. No philosopher was he -- just a plain, commonplace person gifted, for the time being, with a pathological 
indifference: the organ that he feared consequences with was torpid. So, with no particular apprehension for his immediate future, he fell 
asleep and all was peace with Henry Armstrong.
But something was going on overhead. It was a dark summer night, shot through with infrequent shimmers of lightning silently firing a cloud 
lying low in the west and portending a storm. These brief, stammering illuminations brought out with ghastly distinctness the monuments and 
headstones of the cemetery and seemed to set them dancing. It was not a night in which any credible witness was likely to be straying about a 
cemetery, so the three men who were there, digging into the grave of Henry Armstrong, felt reasonably secure.
Two of them were young students from a medical college a few miles away; the third was a gigantic negro known as Jess. For many years 
Jess had been employed about the cemetery as a man-of-all-work and it was his favourite pleasantry that he knew 'every soul in the place.' 
From the nature of what he was now doing it was inferable that the place was not so populous as its register may have shown it to be.
Outside the wall, at the part of the grounds farthest from the public road, were a horse and a light wagon, waiting.
The work of excavation was not difficult: the earth with which the grave had been loosely filled a few hours before offered little resistance and 
was soon thrown out. Removal of the casket from its box was less easy, but it was taken out, for it was a perquisite of Jess, who carefully 
unscrewed the cover and laid it aside, exposing the body in black trousers and white shirt. At that instant the air sprang to flame, a cracking 
shock of thunder shook the stunned world and Henry Armstrong tranquilly sat up. With inarticulate cries the men fled in terror, each in a 
different direction. For nothing on earth could two of them have been persuaded to return. But Jess was of another breed.
In the grey of the morning the two students, pallid and haggard from anxiety and with the terror of their adventure still beating tumultuously in 
their blood, met at the medical college.
'You saw it?' cried one.
'God! yes -- what are we to do?'
They went around to the rear of the building, where they saw a horse, attached to a light wagon, hitched to a gatepost near the door of the 
dissecting-room. Mechanically they entered the room. On a bench in the obscurity sat the negro Jess. He rose, grinning, all eyes and teeth.
'I'm waiting for my pay,' he said.
Stretched naked on a long table lay the body of Henry Armstrong, the head defiled with blood and clay from a blow with a spade."""

In [25]:
import nltk
from nltk import tokenize
from nltk.corpus import stopwords

In [26]:
sentences = tokenize.sent_tokenize(text1)
words = [tokenize.word_tokenize(sent) for sent in sentences]
words

[['The',
  'fact',
  'that',
  'Henry',
  'Armstrong',
  'was',
  'buried',
  'did',
  'not',
  'seem',
  'to',
  'him',
  'to',
  'prove',
  'that',
  'he',
  'was',
  'dead',
  ':',
  'he',
  'had',
  'always',
  'been',
  'a',
  'hard',
  'man',
  'to',
  'convince',
  '.'],
 ['That',
  'he',
  'really',
  'was',
  'buried',
  ',',
  'the',
  'testimony',
  'of',
  'his',
  'senses',
  'compelled',
  'him',
  'to',
  'admit',
  '.'],
 ['His',
  'posture',
  '--',
  'flat',
  'upon',
  'his',
  'back',
  ',',
  'with',
  'his',
  'hands',
  'crossed',
  'upon',
  'his',
  'stomach',
  'and',
  'tied',
  'with',
  'something',
  'that',
  'he',
  'easily',
  'broke',
  'without',
  'profitably',
  'altering',
  'the',
  'situation',
  '--',
  'the',
  'strict',
  'confinement',
  'of',
  'his',
  'entire',
  'person',
  ',',
  'the',
  'black',
  'darkness',
  'and',
  'profound',
  'silence',
  ',',
  'made',
  'a',
  'body',
  'of',
  'evidence',
  'impossible',
  'to',
  'controver

In [27]:
len(words)

32

In [28]:
word_tags = [nltk.pos_tag(sent) for sent in words]
word_tags

[[('The', 'DT'),
  ('fact', 'NN'),
  ('that', 'IN'),
  ('Henry', 'NNP'),
  ('Armstrong', 'NNP'),
  ('was', 'VBD'),
  ('buried', 'VBN'),
  ('did', 'VBD'),
  ('not', 'RB'),
  ('seem', 'VB'),
  ('to', 'TO'),
  ('him', 'PRP'),
  ('to', 'TO'),
  ('prove', 'VB'),
  ('that', 'IN'),
  ('he', 'PRP'),
  ('was', 'VBD'),
  ('dead', 'JJ'),
  (':', ':'),
  ('he', 'PRP'),
  ('had', 'VBD'),
  ('always', 'RB'),
  ('been', 'VBN'),
  ('a', 'DT'),
  ('hard', 'JJ'),
  ('man', 'NN'),
  ('to', 'TO'),
  ('convince', 'VB'),
  ('.', '.')],
 [('That', 'IN'),
  ('he', 'PRP'),
  ('really', 'RB'),
  ('was', 'VBD'),
  ('buried', 'VBN'),
  (',', ','),
  ('the', 'DT'),
  ('testimony', 'NN'),
  ('of', 'IN'),
  ('his', 'PRP$'),
  ('senses', 'NNS'),
  ('compelled', 'VBD'),
  ('him', 'PRP'),
  ('to', 'TO'),
  ('admit', 'VB'),
  ('.', '.')],
 [('His', 'PRP$'),
  ('posture', 'NN'),
  ('--', ':'),
  ('flat', 'JJ'),
  ('upon', 'IN'),
  ('his', 'PRP$'),
  ('back', 'NN'),
  (',', ','),
  ('with', 'IN'),
  ('his', 'PRP$'),
  ('h

In [29]:
word=[]
pos=[]
for idx in word_tags:
    for idj in idx:
        word.append(idj[0])

for idx in word_tags:
    for idj in idx:
        pos.append(idj[1])

frame={'Word':word,
       'Pos_Tag':pos}
frame = pd.DataFrame(frame)
frame

Unnamed: 0,Word,Pos_Tag
0,The,DT
1,fact,NN
2,that,IN
3,Henry,NNP
4,Armstrong,NNP
...,...,...
690,blow,NN
691,with,IN
692,a,DT
693,spade,NN


In [30]:
X = frame[["Word", "Pos_Tag"]]


X_words = word_vectorizer.transform(X["Word"])


X_pos = pos_vectorizer.transform(X["Pos_Tag"])
X_combined = pd.concat(
    [pd.DataFrame(X_words.toarray()), pd.DataFrame(X_pos.toarray())], axis=1
)

In [31]:
classifier = dic_models['SVC']
text_pred = classifier.predict(X_combined)

display(text_pred)

array(['O', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O

In [32]:
word = list(frame.Word)
print(word)
print(len(word))
print(len(text_pred))

['The', 'fact', 'that', 'Henry', 'Armstrong', 'was', 'buried', 'did', 'not', 'seem', 'to', 'him', 'to', 'prove', 'that', 'he', 'was', 'dead', ':', 'he', 'had', 'always', 'been', 'a', 'hard', 'man', 'to', 'convince', '.', 'That', 'he', 'really', 'was', 'buried', ',', 'the', 'testimony', 'of', 'his', 'senses', 'compelled', 'him', 'to', 'admit', '.', 'His', 'posture', '--', 'flat', 'upon', 'his', 'back', ',', 'with', 'his', 'hands', 'crossed', 'upon', 'his', 'stomach', 'and', 'tied', 'with', 'something', 'that', 'he', 'easily', 'broke', 'without', 'profitably', 'altering', 'the', 'situation', '--', 'the', 'strict', 'confinement', 'of', 'his', 'entire', 'person', ',', 'the', 'black', 'darkness', 'and', 'profound', 'silence', ',', 'made', 'a', 'body', 'of', 'evidence', 'impossible', 'to', 'controvert', 'and', 'he', 'accepted', 'it', 'without', 'cavil', '.', 'But', 'dead', '--', 'no', ';', 'he', 'was', 'only', 'very', ',', 'very', 'ill', '.', 'He', 'had', ',', 'withal', ',', 'the', 'invalid'

In [33]:
for idx in range(len(text_pred)):
      if text_pred[idx] == "O":
            continue
            
      elif text_pred[idx] == "PERSON":
            print(f"{word[idx]}: PERSON")
      elif text_pred[idx] == "DATE":
            print(f"{word[idx]}: DATE")  
      elif text_pred[idx] == "P-NUMBER":
            print(f"{word[idx]}: P-NUMBER")  
      elif text_pred[idx] == "CARDINAL":
            print(f"{word[idx]}: CARDINAL")
      elif text_pred[idx] == "COMPANY":
            print(f"{word[idx]}: COMPANY")
      elif text_pred[idx] == "ORG":
            print(f"{word[idx]}: ORG")
      elif text_pred[idx] == "COLOR":
            print(f"{word[idx]}: COLOR")
      elif text_pred[idx] == "Org":
            print(f"{word[idx]}: Org")
      elif text_pred[idx] == "CITY":
            print(f"{word[idx]}: CITY")
      elif text_pred[idx] == "COUNTRY":
            print(f"{word[idx]}: COUNTRY")
      

Henry: PERSON
Henry: PERSON
Henry: PERSON
Two: CARDINAL
Henry: PERSON
two: CARDINAL
two: CARDINAL
'You: CARDINAL
'God: CARDINAL
Henry: PERSON


In [34]:
from joblib import dump, load     # save model
model = dic_models['mlp_Classifier']
dump(model,"mlp_Classifier.joblib")

['mlp_Classifier.joblib']

In [35]:
loaded_model = load('mlp_classifier.joblib')

# GUI

In [37]:
import streamlit as st
from joblib import dump, load
from sklearn.neural_network import MLPClassifier
import nltk
from nltk import tokenize
from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import pickle

model_loaded = load("mlp_Classifier.joblib")
st.sidebar.header("Name Entity Recognition Task: ")

text = st.sidebar.text_area("Enter Your Text:")

def preprocess_text(text):
    sentences = tokenize.sent_tokenize(text)
    words = [tokenize.word_tokenize(sent) for sent in sentences]
    word_tags = [nltk.pos_tag(sent) for sent in words]
    
    word=[]
    pos=[]
    for idx in word_tags:
        for idj in idx:
            word.append(idj[0])

    for idx in word_tags:
        for idj in idx:
            pos.append(idj[1])

    frame={'Word':word,
        'Pos_Tag':pos}
    frame = pd.DataFrame(frame)
    X = frame[["Word", "Pos_Tag"]]
    
    with open('word_vectorizer.pkl', 'rb') as f:
        word_vectorizer = pickle.load(f)

    with open('pos_vectorizer.pkl', 'rb') as f:
        pos_vectorizer = pickle.load(f)
        
    X_words = word_vectorizer.transform(X["Word"])

    X_pos = pos_vectorizer.transform(X["Pos_Tag"])
    X_combined = pd.concat(
        [pd.DataFrame(X_words.toarray()), pd.DataFrame(X_pos.toarray())], axis=1
    )
    
    return [ X_combined , frame.Word] 

if text:
   Text_Features , word = preprocess_text(text)
   word = list(word)
   text_pred = model_loaded.predict(Text_Features)
   
   st.header("The Result:")
   
   for idx in range(len(text_pred)):
      if text_pred[idx] == "O":
            continue
            # print (f"{word[idx]}: O") 
      elif text_pred[idx] == "PERSON":
            st.write(f"{word[idx]}: PERSON")
      elif text_pred[idx] == "DATE":
            st.write(f"{word[idx]}: DATE")  
      elif text_pred[idx] == "P-NUMBER":
            st.write(f"{word[idx]}: P-NUMBER")  
      elif text_pred[idx] == "CARDINAL":
            st.write(f"{word[idx]}: CARDINAL")
      elif text_pred[idx] == "COMPANY":
            st.write(f"{word[idx]}: COMPANY")
      elif text_pred[idx] == "ORG":
            st.write(f"{word[idx]}: ORG")
      elif text_pred[idx] == "COLOR":
            st.write(f"{word[idx]}: COLOR")
      elif text_pred[idx] == "Org":
            st.write(f"{word[idx]}: Org")
      elif text_pred[idx] == "CITY":
            st.write(f"{word[idx]}: CITY")
      elif text_pred[idx] == "COUNTRY":
            st.write(f"{word[idx]}: COUNTRY")