In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import random
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Directory (relative to the notebook's working directory) that holds the
# preprocessed .npy inputs loaded in the following cells.
DATA_DIR = "../data/"

In [3]:
# Load the preprocessed sentences; later cells iterate each entry as a sequence of words.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects, which can
# execute code -- only load this file if it comes from a trusted source. Presumably required
# because the array stores variable-length sentences as objects -- confirm.
sentences = np.load(os.path.join(DATA_DIR,"processed_sents_dl.npy"),allow_pickle=True)

In [4]:
# Load the BIO tags; tags[i][j] is used later as the tag of word j in sentences[i].
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects, which can
# execute code -- only load this file if it comes from a trusted source.
tags = np.load(os.path.join(DATA_DIR,"processed_tags_dl.npy"),allow_pickle=True)


The idea behind using KNN and random forests is as follows:
Every word in the given data has a BIO tag. I took the words from the cleaned data and represented each one by its GloVe
embedding. Then I attempted to train a KNN or random forest classifier on the word embeddings, using the BIO tags as labels.
The idea is that if two words are similar, they have similar embeddings and hence are likely to have similar tags. I got this idea
from the word cloud I created.


In [5]:
# Length of each word vector. It must match the GloVe file loaded below (glove.6B.50d)
# and is also the length of the all-zero vector used for words missing from GloVe.
EMBEDDING_DIM=50

In [6]:
# Load the GloVe embeddings into the dictionary embeddings_index (word -> float32 vector).
# Each line of the file is parsed as "<word> <v1> <v2> ...", separated by single spaces.
embeddings_index = {}
# The context manager guarantees the file handle is closed even if a line fails to parse.
with open("../embeddings/glove.6B.50d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.strip().split(' ')
        word = values[0]  # the first entry is the word
        # the remaining entries are the components of the vector representing the word
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [7]:
# Build the token-level dataset:
#   X -- one embedding per word of the cleaned data (all zeros when GloVe has no entry)
#   y -- the BIO tag of that word
X = []
y = []
for sent_idx in range(len(sentences)):
    sent_tags = tags[sent_idx]
    for word_idx, token in enumerate(sentences[sent_idx]):
        # The tag is recorded first, exactly one per word, so X and y stay aligned.
        y.append(sent_tags[word_idx])
        vector = embeddings_index.get(token)
        X.append([0] * EMBEDDING_DIM if vector is None else vector)

In [8]:
# Turn the accumulated Python lists into NumPy arrays for scikit-learn.
X, y = np.array(X), np.array(y)

In [9]:
# Split into training and validation sets: 25% of the tokens are held out for validation.
# The fixed random_state makes the split reproducible across runs.
# NOTE(review): the split is over individual tokens, so words from the same sentence can
# land on both sides -- confirm this is intended.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=98)

In [10]:
# Sanity check: show the shape of each split, one per line.
for split in (X_train, X_val, y_train, y_val):
    print(split.shape)

(34952, 50)
(11651, 50)
(34952,)
(11651,)


In [11]:
# Train a 5-nearest-neighbour classifier on the training embeddings
# (fit returns the fitted estimator), then tag the validation words.
neigh = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

y_pred = neigh.predict(X_val)

In [12]:
# 'O' dominates the data, so its F1 score is always going to be high and evaluating on it
# says little. Build the list of labels to evaluate on: every class except 'O'.
all_classes = list(np.unique(y_train))

# index() raises ValueError if 'O' is missing, just as list.remove() would.
o_position = all_classes.index('O')
new_classes = all_classes[:o_position] + all_classes[o_position + 1:]

In [13]:
# Per-class precision/recall/F1 for the KNN predictions, restricted to the non-'O' labels.
# zero_division=0 reports undefined metrics (e.g. precision of a class with no predicted
# samples) as 0 -- the same values the default produces -- without emitting the
# UndefinedMetricWarning that the default triggers.
report = classification_report(y_val, y_pred, digits=4, labels=new_classes, zero_division=0)

  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# KNN doesn't do very well: in the saved output below, F1 over the non-'O' tags is only
# 0.3533 (micro) / 0.2271 (macro) / 0.3218 (weighted).
# NOTE(review): this comment previously quoted an F1 of 0.267, which matches none of the
# averages in the saved output -- presumably it came from an earlier run; confirm.
print(report)

                 precision    recall  f1-score   support

  B-corporation     0.6786    0.3878    0.4935        49
B-creative-work     0.3333    0.0263    0.0488        38
        B-group     0.2857    0.1754    0.2174        57
     B-location     0.5000    0.3937    0.4405       127
       B-person     0.6043    0.5153    0.5563       163
      B-product     0.3462    0.2903    0.3158        31
  I-corporation     0.0000    0.0000    0.0000        12
I-creative-work     0.1111    0.0185    0.0317        54
        I-group     0.0000    0.0000    0.0000        27
     I-location     0.3704    0.1538    0.2174        65
       I-person     0.3538    0.2875    0.3172        80
      I-product     0.5000    0.0476    0.0870        42

      micro avg     0.4772    0.2805    0.3533       745
      macro avg     0.3403    0.1914    0.2271       745
   weighted avg     0.4219    0.2805    0.3218       745



In [22]:
# Save the fitted KNN model to disk with pickle.
# NOTE: only unpickle this file from a trusted location -- pickle.load can execute arbitrary code.
import pickle
filename = '../models/knn.sav'
# The context manager flushes and closes the file; a bare open() here would leave
# the handle open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(neigh, model_file)

In [17]:
# Fit a random forest classifier (50 trees) to the training data.
# random_state fixes the randomness used while building the trees, so the fitted model and
# its metrics are reproducible across runs; 98 reuses the seed of the train/validation split.
clf = RandomForestClassifier(n_estimators=50, random_state=98)

clf.fit(X_train, y_train)

# Note: this overwrites the y_pred produced by the KNN model earlier in the notebook.
y_pred = clf.predict(X_val)

In [18]:
# Per-class precision/recall/F1 for the random forest predictions, restricted to the non-'O' labels.
# zero_division=0 reports undefined metrics (e.g. precision of a class with no predicted
# samples) as 0 -- the same values the default produces -- without emitting the
# UndefinedMetricWarning that the default triggers.
report = classification_report(y_val, y_pred, digits=4, labels=new_classes, zero_division=0)

  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# In the saved outputs the random forest does somewhat better than KNN on macro F1
# (0.2605 vs 0.2271), while micro F1 is nearly unchanged (0.3577 vs 0.3533). The gain comes
# from precision (macro 0.4383 vs 0.3403); micro recall is actually lower (0.2564 vs 0.2805).
# NOTE(review): this comment previously said the forest does "much better" because ensembles
# handle class imbalance better -- the numbers here do not demonstrate that; confirm or drop.
print(report)

                 precision    recall  f1-score   support

  B-corporation     0.7500    0.4898    0.5926        49
B-creative-work     0.1111    0.0263    0.0426        38
        B-group     0.7222    0.2281    0.3467        57
     B-location     0.6250    0.3150    0.4188       127
       B-person     0.7010    0.4172    0.5231       163
      B-product     0.8889    0.2581    0.4000        31
  I-corporation     0.0000    0.0000    0.0000        12
I-creative-work     0.2727    0.0556    0.0923        54
        I-group     0.0000    0.0000    0.0000        27
     I-location     0.3000    0.1385    0.1895        65
       I-person     0.6250    0.2500    0.3571        80
      I-product     0.2632    0.1190    0.1639        42

      micro avg     0.5913    0.2564    0.3577       745
      macro avg     0.4383    0.1915    0.2605       745
   weighted avg     0.5351    0.2564    0.3410       745



In [21]:
# Save the fitted random forest model to disk with pickle.
# NOTE: only unpickle this file from a trusted location -- pickle.load can execute arbitrary code.
import pickle
filename = '../models/random-forest.sav'
# The context manager flushes and closes the file; a bare open() here would leave
# the handle open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)