In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Directory (relative to this notebook) that holds the preprocessed .npy inputs
# loaded in the next cell.
DATA_DIR = "../data/"

In [3]:
def _load_array(file_name):
    """Load one preprocessed .npy file from DATA_DIR."""
    # allow_pickle=True lets NumPy unpickle object arrays, which can execute
    # arbitrary code: only point this at files you produced yourself.
    return np.load(os.path.join(DATA_DIR, file_name), allow_pickle=True)


# Tokens, BIO tags and POS tags; each is iterated sentence by sentence below.
processedSentences = _load_array("processed_sents.npy")
processedTags = _load_array("processed_tags.npy")
POS = _load_array("pos.npy")

In [4]:
# The POS tags and the words are the features; the BIO tags are the output.
# They feed the Perceptron, Passive Aggressive, SGD and Multinomial Naive Bayes
# models below. Those models need one row per word, with exactly one POS tag
# and one BIO tag each (columns: POS, words, tags), so the nested
# per-sentence arrays are flattened into plain lists here.
def _flatten(nested):
    """Concatenate the inner sequences of `nested` into a single flat list."""
    flat = []
    for inner in nested:
        flat.extend(inner)
    return flat


pos_flat = _flatten(POS)
print(len(pos_flat))

sentences_flat = _flatten(processedSentences)
print(len(sentences_flat))

tags_flat = _flatten(processedTags)
print(len(tags_flat))

42630
42630
42630


In [5]:
# One row per token: its POS tag, the word itself and its BIO tag.
token_columns = {
    'pos': pos_flat,
    'words': sentences_flat,
    'tags': tags_flat,
}
data = pd.DataFrame(token_columns)

In [6]:
# Display the assembled token-level frame (columns: pos, words, tags).
data

Unnamed: 0,pos,words,tags
0,PRON,it,O
1,DET,the,O
2,NOUN,view,O
3,ADP,from,O
4,ADV,where,O
...,...,...,...
42625,AUX,been,O
42626,VERB,made,O
42627,VERB,sat,O
42628,NOUN,jan,O


In [7]:
# Build the feature matrix X from the 'pos' and 'words' columns and the label
# vector y from 'tags', then split both into training and validation sets.
feature_records = data.drop(columns='tags').to_dict('records')

# sparse=False makes fit_transform return a dense NumPy array.
# NOTE(review): the vectorizer is fitted on all rows before the split, so its
# vocabulary also includes words that only occur in the validation rows.
v = DictVectorizer(sparse=False)
X = v.fit_transform(feature_records)

y = data['tags'].values
classes = np.unique(y).tolist()

# Hold out 20% of the tokens for validation; the fixed seed keeps the split stable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

for split_name, split in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_val", X_val),
    ("y_val", y_val),
):
    print(split_name, split.shape)

X_train (34104, 7696)
y_train (34104,)
X_val (8526, 7696)
y_val (8526,)


In [8]:
# Fit a linear classifier trained with stochastic gradient descent.
# SGDClassifier shuffles the training data each epoch and defaults to
# random_state=None, so an unseeded fit gives different scores on every run.
# The seed is fixed (same value as the train/validation split) so that
# Restart & Run All reproduces the reported numbers.
sgd = SGDClassifier(random_state=0)
sgd.fit(X_train, y_train)

SGDClassifier()

In [9]:

# Entity labels only: every class except the 'O' (outside) tag. `classes` is
# left untouched; a ValueError is raised if 'O' is not among the labels.
outside_idx = classes.index('O')
new_classes = classes[:outside_idx] + classes[outside_idx + 1:]
print(new_classes)

['B-corporation', 'B-creative-work', 'B-group', 'B-location', 'B-person', 'B-product', 'I-corporation', 'I-creative-work', 'I-group', 'I-location', 'I-person', 'I-product']


In [10]:
# Evaluate the SGD classifier on the validation split over all classes,
# including the dominant 'O' tag.
# Some labels are never predicted, which makes their precision undefined and
# triggered an UndefinedMetricWarning. zero_division=0 reports those scores as
# 0.00 (the same values as before) without emitting the warning.
sgd_val_pred = sgd.predict(X_val)
print(classification_report(y_true=y_val, y_pred=sgd_val_pred, zero_division=0))

                 precision    recall  f1-score   support

  B-corporation       0.94      0.38      0.54        45
B-creative-work       0.00      0.00      0.00        29
        B-group       0.00      0.00      0.00        47
     B-location       1.00      0.02      0.04        99
       B-person       1.00      0.10      0.18       138
      B-product       1.00      0.07      0.12        30
  I-corporation       0.00      0.00      0.00        14
I-creative-work       0.00      0.00      0.00        43
        I-group       0.00      0.00      0.00        19
     I-location       1.00      0.03      0.05        40
       I-person       1.00      0.05      0.10        55
      I-product       0.00      0.00      0.00        30
              O       0.94      1.00      0.97      7937

       accuracy                           0.94      8526
      macro avg       0.53      0.13      0.15      8526
   weighted avg       0.92      0.94      0.91      8526



  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Evaluate the SGD classifier on every class except 'O'. Because 'O' dominates
# the data, this is the meaningful measure of performance.
# SGD does very poorly here: the weighted F1 score was only 0.11 in the recorded run.
# zero_division=0 reports the undefined precision of never-predicted labels as
# 0.00 (the same values as before) without emitting an UndefinedMetricWarning.
sgd_val_pred = sgd.predict(X_val)
print(classification_report(y_true=y_val, y_pred=sgd_val_pred, labels=new_classes, zero_division=0))

                 precision    recall  f1-score   support

  B-corporation       0.94      0.38      0.54        45
B-creative-work       0.00      0.00      0.00        29
        B-group       0.00      0.00      0.00        47
     B-location       1.00      0.02      0.04        99
       B-person       1.00      0.10      0.18       138
      B-product       1.00      0.07      0.12        30
  I-corporation       0.00      0.00      0.00        14
I-creative-work       0.00      0.00      0.00        43
        I-group       0.00      0.00      0.00        19
     I-location       1.00      0.03      0.05        40
       I-person       1.00      0.05      0.10        55
      I-product       0.00      0.00      0.00        30

      micro avg       0.97      0.07      0.12       589
      macro avg       0.50      0.05      0.09       589
   weighted avg       0.69      0.07      0.11       589



  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Save the fitted SGD classifier to disk.
# The context manager closes the file handle (and flushes the pickle) even if
# pickle.dump raises; the previous bare open() left the handle unclosed.
filename = '../models/sgd.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(sgd, model_file)

In [13]:
# Train a Perceptron for at most 50 epochs; verbose=10 prints the per-epoch
# training log shown below. fit() returns the estimator itself, so the fitted
# model is bound directly and then echoed as the cell output.
per = Perceptron(max_iter=50, verbose=10).fit(X_train, y_train)
per

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


-- Epoch 1
Norm: 16.91, NNZs: 202, Bias: -2.000000, T: 34104, Avg. loss: 0.006832
Total training time: 0.39 seconds.
-- Epoch 2
Norm: 25.77, NNZs: 274, Bias: -2.000000, T: 68208, Avg. loss: 0.003607
Total training time: 0.77 seconds.
-- Epoch 3
Norm: 31.27, NNZs: 329, Bias: -2.000000, T: 102312, Avg. loss: 0.002346
Total training time: 1.16 seconds.
-- Epoch 4
Norm: 34.50, NNZs: 357, Bias: -3.000000, T: 136416, Avg. loss: 0.001965
Total training time: 1.50 seconds.
-- Epoch 5
Norm: 36.61, NNZs: 379, Bias: -2.000000, T: 170520, Avg. loss: 0.001965
Total training time: 1.83 seconds.
-- Epoch 6
Norm: 38.39, NNZs: 402, Bias: -3.000000, T: 204624, Avg. loss: 0.001701
Total training time: 2.16 seconds.
-- Epoch 7
Norm: 40.10, NNZs: 415, Bias: -3.000000, T: 238728, Avg. loss: 0.001730
Total training time: 2.53 seconds.
-- Epoch 8
Norm: 41.71, NNZs: 431, Bias: -2.000000, T: 272832, Avg. loss: 0.001877
Total training time: 2.86 seconds.
Convergence after 8 epochs took 2.86 seconds
-- Epoch 1


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.8s remaining:    0.0s


Norm: 11.58, NNZs: 125, Bias: -3.000000, T: 34104, Avg. loss: 0.006480
Total training time: 0.36 seconds.
-- Epoch 2
Norm: 18.49, NNZs: 181, Bias: -2.000000, T: 68208, Avg. loss: 0.005337
Total training time: 0.77 seconds.
-- Epoch 3
Norm: 23.49, NNZs: 220, Bias: -3.000000, T: 102312, Avg. loss: 0.004369
Total training time: 1.14 seconds.
-- Epoch 4
Norm: 26.65, NNZs: 240, Bias: -2.000000, T: 136416, Avg. loss: 0.004134
Total training time: 1.50 seconds.
-- Epoch 5
Norm: 29.15, NNZs: 261, Bias: -3.000000, T: 170520, Avg. loss: 0.004252
Total training time: 1.86 seconds.
-- Epoch 6
Norm: 30.92, NNZs: 274, Bias: -3.000000, T: 204624, Avg. loss: 0.004340
Total training time: 2.23 seconds.
-- Epoch 7
Norm: 33.26, NNZs: 290, Bias: -2.000000, T: 238728, Avg. loss: 0.003812
Total training time: 2.61 seconds.
Convergence after 7 epochs took 2.61 seconds
-- Epoch 1


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.4s remaining:    0.0s


Norm: 19.29, NNZs: 315, Bias: -2.000000, T: 34104, Avg. loss: 0.011113
Total training time: 0.38 seconds.
-- Epoch 2
Norm: 30.79, NNZs: 452, Bias: -3.000000, T: 68208, Avg. loss: 0.007096
Total training time: 0.77 seconds.
-- Epoch 3
Norm: 39.24, NNZs: 552, Bias: -2.000000, T: 102312, Avg. loss: 0.004193
Total training time: 1.15 seconds.
-- Epoch 4
Norm: 43.66, NNZs: 594, Bias: -3.000000, T: 136416, Avg. loss: 0.003401
Total training time: 1.51 seconds.
-- Epoch 5
Norm: 46.58, NNZs: 623, Bias: -4.000000, T: 170520, Avg. loss: 0.003900
Total training time: 1.85 seconds.
-- Epoch 6
Norm: 49.54, NNZs: 653, Bias: -3.000000, T: 204624, Avg. loss: 0.003812
Total training time: 2.20 seconds.
-- Epoch 7
Norm: 51.48, NNZs: 690, Bias: -3.000000, T: 238728, Avg. loss: 0.003460
Total training time: 2.55 seconds.
-- Epoch 8
Norm: 53.63, NNZs: 713, Bias: -3.000000, T: 272832, Avg. loss: 0.003196
Total training time: 2.92 seconds.
Convergence after 8 epochs took 2.92 seconds
-- Epoch 1


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    8.3s remaining:    0.0s


Norm: 28.50, NNZs: 576, Bias: -2.000000, T: 34104, Avg. loss: 0.021904
Total training time: 0.38 seconds.
-- Epoch 2
Norm: 43.03, NNZs: 796, Bias: -3.000000, T: 68208, Avg. loss: 0.012872
Total training time: 0.75 seconds.
-- Epoch 3
Norm: 52.29, NNZs: 937, Bias: -3.000000, T: 102312, Avg. loss: 0.008709
Total training time: 1.10 seconds.
-- Epoch 4
Norm: 57.88, NNZs: 1028, Bias: -3.000000, T: 136416, Avg. loss: 0.007858
Total training time: 1.45 seconds.
-- Epoch 5
Norm: 61.81, NNZs: 1094, Bias: -4.000000, T: 170520, Avg. loss: 0.007301
Total training time: 1.80 seconds.
-- Epoch 6
Norm: 65.68, NNZs: 1154, Bias: -3.000000, T: 204624, Avg. loss: 0.006627
Total training time: 2.20 seconds.
-- Epoch 7
Norm: 68.80, NNZs: 1201, Bias: -3.000000, T: 238728, Avg. loss: 0.007477
Total training time: 2.61 seconds.
-- Epoch 8
Norm: 71.62, NNZs: 1243, Bias: -2.000000, T: 272832, Avg. loss: 0.006803
Total training time: 3.00 seconds.
Convergence after 8 epochs took 3.00 seconds
-- Epoch 1


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   11.3s remaining:    0.0s


Norm: 31.11, NNZs: 682, Bias: -2.000000, T: 34104, Avg. loss: 0.021992
Total training time: 0.38 seconds.
-- Epoch 2
Norm: 46.48, NNZs: 945, Bias: -2.000000, T: 68208, Avg. loss: 0.013048
Total training time: 0.72 seconds.
-- Epoch 3
Norm: 57.69, NNZs: 1145, Bias: -2.000000, T: 102312, Avg. loss: 0.006011
Total training time: 1.06 seconds.
-- Epoch 4
Norm: 63.44, NNZs: 1238, Bias: -2.000000, T: 136416, Avg. loss: 0.005014
Total training time: 1.40 seconds.
-- Epoch 5
Norm: 67.36, NNZs: 1308, Bias: -2.000000, T: 170520, Avg. loss: 0.005043
Total training time: 1.78 seconds.
-- Epoch 6
Norm: 71.08, NNZs: 1372, Bias: -2.000000, T: 204624, Avg. loss: 0.004692
Total training time: 2.11 seconds.
-- Epoch 7
Norm: 74.50, NNZs: 1428, Bias: -2.000000, T: 238728, Avg. loss: 0.004750
Total training time: 2.45 seconds.
-- Epoch 8
Norm: 77.19, NNZs: 1466, Bias: -2.000000, T: 272832, Avg. loss: 0.004750
Total training time: 2.81 seconds.
Convergence after 8 epochs took 2.81 seconds
-- Epoch 1


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.1s remaining:    0.0s


Norm: 14.14, NNZs: 150, Bias: -2.000000, T: 34104, Avg. loss: 0.004838
Total training time: 0.37 seconds.
-- Epoch 2
Norm: 21.54, NNZs: 206, Bias: -2.000000, T: 68208, Avg. loss: 0.003079
Total training time: 0.72 seconds.
-- Epoch 3
Norm: 26.31, NNZs: 248, Bias: -2.000000, T: 102312, Avg. loss: 0.001847
Total training time: 1.06 seconds.
-- Epoch 4
Norm: 29.15, NNZs: 266, Bias: -3.000000, T: 136416, Avg. loss: 0.001701
Total training time: 1.40 seconds.
-- Epoch 5
Norm: 31.21, NNZs: 281, Bias: -2.000000, T: 170520, Avg. loss: 0.001730
Total training time: 1.74 seconds.
-- Epoch 6
Norm: 33.02, NNZs: 289, Bias: -3.000000, T: 204624, Avg. loss: 0.001437
Total training time: 2.07 seconds.
-- Epoch 7
Norm: 34.21, NNZs: 302, Bias: -3.000000, T: 238728, Avg. loss: 0.001554
Total training time: 2.41 seconds.
-- Epoch 8
Norm: 35.75, NNZs: 312, Bias: -3.000000, T: 272832, Avg. loss: 0.001407
Total training time: 2.77 seconds.
Convergence after 8 epochs took 2.77 seconds
-- Epoch 1


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   16.9s remaining:    0.0s


Norm: 7.35, NNZs: 51, Bias: -2.000000, T: 34104, Avg. loss: 0.001525
Total training time: 0.36 seconds.
-- Epoch 2
Norm: 10.77, NNZs: 63, Bias: -2.000000, T: 68208, Avg. loss: 0.001290
Total training time: 0.71 seconds.
-- Epoch 3
Norm: 13.71, NNZs: 74, Bias: -2.000000, T: 102312, Avg. loss: 0.000880
Total training time: 1.05 seconds.
-- Epoch 4
Norm: 14.90, NNZs: 80, Bias: -3.000000, T: 136416, Avg. loss: 0.001056
Total training time: 1.44 seconds.
-- Epoch 5
Norm: 16.49, NNZs: 87, Bias: -2.000000, T: 170520, Avg. loss: 0.000880
Total training time: 1.79 seconds.
-- Epoch 6
Norm: 18.00, NNZs: 96, Bias: -1.000000, T: 204624, Avg. loss: 0.000821
Total training time: 2.12 seconds.
Convergence after 6 epochs took 2.12 seconds
-- Epoch 1


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   19.0s remaining:    0.0s


Norm: 13.42, NNZs: 162, Bias: -3.000000, T: 34104, Avg. loss: 0.008973
Total training time: 0.38 seconds.
-- Epoch 2
Norm: 18.92, NNZs: 202, Bias: -3.000000, T: 68208, Avg. loss: 0.008914
Total training time: 0.73 seconds.
-- Epoch 3
Norm: 23.19, NNZs: 244, Bias: -4.000000, T: 102312, Avg. loss: 0.008210
Total training time: 1.06 seconds.
-- Epoch 4
Norm: 26.80, NNZs: 265, Bias: -3.000000, T: 136416, Avg. loss: 0.008152
Total training time: 1.41 seconds.
-- Epoch 5
Norm: 28.98, NNZs: 295, Bias: -5.000000, T: 170520, Avg. loss: 0.008181
Total training time: 1.76 seconds.
-- Epoch 6
Norm: 31.65, NNZs: 315, Bias: -3.000000, T: 204624, Avg. loss: 0.007888
Total training time: 2.12 seconds.
Convergence after 6 epochs took 2.12 seconds
-- Epoch 1


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   21.1s remaining:    0.0s


Norm: 13.71, NNZs: 153, Bias: -3.000000, T: 34104, Avg. loss: 0.007272
Total training time: 0.36 seconds.
-- Epoch 2
Norm: 19.80, NNZs: 199, Bias: -2.000000, T: 68208, Avg. loss: 0.006451
Total training time: 0.71 seconds.
-- Epoch 3
Norm: 25.10, NNZs: 244, Bias: -2.000000, T: 102312, Avg. loss: 0.005307
Total training time: 1.09 seconds.
-- Epoch 4
Norm: 29.22, NNZs: 280, Bias: -2.000000, T: 136416, Avg. loss: 0.004545
Total training time: 1.44 seconds.
-- Epoch 5
Norm: 31.27, NNZs: 308, Bias: -4.000000, T: 170520, Avg. loss: 0.005043
Total training time: 1.78 seconds.
-- Epoch 6
Norm: 33.20, NNZs: 312, Bias: -2.000000, T: 204624, Avg. loss: 0.005043
Total training time: 2.13 seconds.
-- Epoch 7
Norm: 34.55, NNZs: 328, Bias: -4.000000, T: 238728, Avg. loss: 0.005161
Total training time: 2.51 seconds.
-- Epoch 8
Norm: 36.30, NNZs: 334, Bias: -4.000000, T: 272832, Avg. loss: 0.004779
Total training time: 2.87 seconds.
Convergence after 8 epochs took 2.87 seconds
-- Epoch 1


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   24.0s remaining:    0.0s


Norm: 18.44, NNZs: 259, Bias: -2.000000, T: 34104, Avg. loss: 0.010703
Total training time: 0.42 seconds.
-- Epoch 2
Norm: 27.78, NNZs: 371, Bias: -3.000000, T: 68208, Avg. loss: 0.007712
Total training time: 0.77 seconds.
-- Epoch 3
Norm: 35.47, NNZs: 447, Bias: -2.000000, T: 102312, Avg. loss: 0.005923
Total training time: 1.13 seconds.
-- Epoch 4
Norm: 39.47, NNZs: 484, Bias: -3.000000, T: 136416, Avg. loss: 0.005337
Total training time: 1.48 seconds.
-- Epoch 5
Norm: 42.38, NNZs: 521, Bias: -3.000000, T: 170520, Avg. loss: 0.005659
Total training time: 1.82 seconds.
-- Epoch 6
Norm: 45.23, NNZs: 546, Bias: -4.000000, T: 204624, Avg. loss: 0.005161
Total training time: 2.21 seconds.
-- Epoch 7
Norm: 48.06, NNZs: 585, Bias: -3.000000, T: 238728, Avg. loss: 0.004633
Total training time: 2.54 seconds.
-- Epoch 8
Norm: 49.84, NNZs: 601, Bias: -3.000000, T: 272832, Avg. loss: 0.005366
Total training time: 2.91 seconds.
Convergence after 8 epochs took 2.91 seconds
-- Epoch 1
Norm: 23.24, 

[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:   36.4s finished


Perceptron(max_iter=50, verbose=10)

In [14]:
# The Perceptron beats SGD on the entity classes but is still far from ideal:
# its weighted F1 score is 0.35.
per_val_pred = per.predict(X_val)
print(classification_report(y_val, per_val_pred, labels=new_classes))

                 precision    recall  f1-score   support

  B-corporation       0.69      0.53      0.60        45
B-creative-work       0.43      0.10      0.17        29
        B-group       0.69      0.19      0.30        47
     B-location       0.44      0.33      0.38        99
       B-person       0.67      0.39      0.49       138
      B-product       0.70      0.23      0.35        30
  I-corporation       0.12      0.14      0.13        14
I-creative-work       0.33      0.05      0.08        43
        I-group       0.33      0.05      0.09        19
     I-location       0.62      0.20      0.30        40
       I-person       0.53      0.16      0.25        55
      I-product       0.47      0.27      0.34        30

      micro avg       0.54      0.27      0.36       589
      macro avg       0.50      0.22      0.29       589
   weighted avg       0.55      0.27      0.35       589



In [15]:
# Save the fitted Perceptron to disk.
# The context manager closes the file handle (and flushes the pickle) even if
# pickle.dump raises; the previous bare open() left the handle unclosed.
filename = '../models/perceptron.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(per, model_file)

In [16]:
# Train a Multinomial Naive Bayes model with smoothing parameter alpha=0.01.
# fit() returns the estimator itself, so the fitted model is bound directly
# and then echoed as the cell output.
nb = MultinomialNB(alpha=0.01).fit(X_train, y_train)
nb

MultinomialNB(alpha=0.01)

In [17]:
# Naive Bayes scores poorly on F1 for the entity classes (every label except 'O').
nb_val_pred = nb.predict(X_val)
print(classification_report(y_val, nb_val_pred, labels=new_classes))

                 precision    recall  f1-score   support

  B-corporation       0.75      0.53      0.62        45
B-creative-work       0.33      0.03      0.06        29
        B-group       0.67      0.21      0.32        47
     B-location       0.19      0.37      0.25        99
       B-person       0.71      0.37      0.49       138
      B-product       0.70      0.23      0.35        30
  I-corporation       0.00      0.00      0.00        14
I-creative-work       0.50      0.05      0.09        43
        I-group       0.11      0.05      0.07        19
     I-location       0.29      0.28      0.28        40
       I-person       0.05      0.65      0.10        55
      I-product       0.38      0.17      0.23        30

      micro avg       0.18      0.31      0.23       589
      macro avg       0.39      0.25      0.24       589
   weighted avg       0.45      0.31      0.30       589



In [18]:
# Save the fitted Multinomial Naive Bayes model to disk.
# The context manager closes the file handle (and flushes the pickle) even if
# pickle.dump raises; the previous bare open() left the handle unclosed.
filename = '../models/nb.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(nb, model_file)

In [19]:
# Fit a Passive Aggressive classifier.
# It shuffles the training data each epoch and defaults to random_state=None,
# so an unseeded fit gives different scores on every run. The seed is fixed
# (same value as the train/validation split) so that Restart & Run All
# reproduces the reported numbers.
pa = PassiveAggressiveClassifier(random_state=0)
pa.fit(X_train, y_train)

PassiveAggressiveClassifier()

In [20]:
# The Passive Aggressive classifier performs about as well as the Perceptron on
# the entity classes (every label except 'O'): micro-averaged F1 of 0.34 in the
# recorded run.
pa_val_pred = pa.predict(X_val)
print(classification_report(y_val, pa_val_pred, labels=new_classes))

                 precision    recall  f1-score   support

  B-corporation       0.74      0.51      0.61        45
B-creative-work       0.33      0.07      0.11        29
        B-group       0.39      0.26      0.31        47
     B-location       0.56      0.33      0.42        99
       B-person       0.67      0.38      0.49       138
      B-product       0.25      0.27      0.26        30
  I-corporation       0.00      0.00      0.00        14
I-creative-work       0.38      0.07      0.12        43
        I-group       0.12      0.05      0.07        19
     I-location       0.50      0.12      0.20        40
       I-person       0.58      0.13      0.21        55
      I-product       0.29      0.13      0.18        30

      micro avg       0.52      0.26      0.34       589
      macro avg       0.40      0.19      0.25       589
   weighted avg       0.50      0.26      0.33       589



In [21]:
# Save the fitted Passive Aggressive classifier to disk.
# Bug fix: this cell used to pickle `per`, so the file actually contained the
# Perceptron; it now stores `pa`.
# The context manager closes the file handle (and flushes the pickle) even if
# pickle.dump raises.
# NOTE(review): unlike the other models this path is not under '../models/';
# it is kept unchanged in case something already loads it from here - confirm.
filename = '../pass-agg-cls.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(pa, model_file)