In [2]:
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import codecs
import nltk 
from sklearn.model_selection import train_test_split
import pycrfsuite
import numpy as np
from sklearn.metrics import classification_report


In [3]:
with codecs.open("reuters.xml","r","utf-8") as f:
	soup = bs(f,"html5lib")

In [4]:

docs = []
for element in soup.find_all("document"):
	# print(element)

	texts = []

	for c in element.find("textwithnamedentities").children:
		if type(c) == Tag :
			if c.name == "namedentityintext":
				label = "N"

			else : 
				label = "I"

			for w in c.text.split(" "):
				if len(w) > 0:
					texts.append((w,label))  

	docs.append(texts)


In [5]:
print(docs)

[[('Paxar', 'N'), ('Corp', 'N'), ('said', 'I'), ('it', 'I'), ('has', 'I'), ('acquired', 'I'), ('Thermo-Print', 'N'), ('GmbH', 'N'), ('of', 'I'), ('Lohn', 'N'), (',', 'I'), ('West', 'N'), ('Germany', 'N'), (',', 'I'), ('a', 'I'), ('distributor', 'I'), ('of', 'I'), ('Paxar', 'N'), ('products,', 'I'), ('for', 'I'), ('undisclosed', 'I'), ('terms.', 'I')], [('Key', 'N'), ('Tronic', 'N'), ('corp', 'N'), ('said', 'I'), ('it', 'I'), ('has', 'I'), ('received', 'I'), ('contracts', 'I'), ('to', 'I'), ('provide', 'I'), ('seven', 'I'), ('original', 'I'), ('equipment', 'I'), ('manufacturers', 'I'), ('with', 'I'), ('which', 'I'), ('it', 'I'), ('has', 'I'), ('not', 'I'), ('done', 'I'), ('business', 'I'), ('recently', 'I'), ('with', 'I'), ('over', 'I'), ('300,000', 'I'), ('computer', 'I'), ('keyboards', 'I'), ('for', 'I'), ('delivery', 'I'), ('within', 'I'), ('the', 'I'), ('next', 'I'), ('12', 'I'), ('months.', 'I'), ('The', 'I'), ('company', 'I'), ('said', 'I'), ('The', 'I'), ('new', 'I'), ('contracts

In [9]:
data = []
for i, doc in enumerate(docs):
	tokens = [t for t,label in doc]

	tagged = nltk.pos_tag(tokens)

	data.append([(w,pos,label) for (w,label),(word,pos) in zip(doc,tagged)])



In [10]:
print(data)

[[('Paxar', 'NNP', 'N'), ('Corp', 'NNP', 'N'), ('said', 'VBD', 'I'), ('it', 'PRP', 'I'), ('has', 'VBZ', 'I'), ('acquired', 'VBN', 'I'), ('Thermo-Print', 'NNP', 'N'), ('GmbH', 'NNP', 'N'), ('of', 'IN', 'I'), ('Lohn', 'NNP', 'N'), (',', ',', 'I'), ('West', 'NNP', 'N'), ('Germany', 'NNP', 'N'), (',', ',', 'I'), ('a', 'DT', 'I'), ('distributor', 'NN', 'I'), ('of', 'IN', 'I'), ('Paxar', 'NNP', 'N'), ('products,', 'NN', 'I'), ('for', 'IN', 'I'), ('undisclosed', 'JJ', 'I'), ('terms.', 'NN', 'I')], [('Key', 'NNP', 'N'), ('Tronic', 'NNP', 'N'), ('corp', 'NN', 'N'), ('said', 'VBD', 'I'), ('it', 'PRP', 'I'), ('has', 'VBZ', 'I'), ('received', 'VBN', 'I'), ('contracts', 'NNS', 'I'), ('to', 'TO', 'I'), ('provide', 'VB', 'I'), ('seven', 'CD', 'I'), ('original', 'JJ', 'I'), ('equipment', 'NN', 'I'), ('manufacturers', 'NNS', 'I'), ('with', 'IN', 'I'), ('which', 'WDT', 'I'), ('it', 'PRP', 'I'), ('has', 'VBZ', 'I'), ('not', 'RB', 'I'), ('done', 'VBN', 'I'), ('business', 'NN', 'I'), ('recently', 'RB', 'I'

In [28]:
def word2feature(doc,i):
	word = doc[i][0]
	postag = doc[i][1]

	feature = [
		'bias',
		'word.lower='+word.lower(),
		'word[-3]='+word[-3:],
		'word[-2]='+word[-2:],
		'word.isupper=%s'%word.isupper(),
		'word.isdigit=%s'%word.isdigit(),
		'postag='+postag

		]

	if i > 0:
		word1 = doc[i-1][0]
		postag1 = doc[i-1][1]
		feature.extend([
			'-1:word.lower='+word.lower(),
			'-1:word.isupper=%s'%word.isupper(),
			'-1:word.isdigit=%s'%word.isdigit(),
			'-1:postag='+postag
			])
	else :
		feature.append('BOS')
	
	if i < len(doc)-1:
		word1 = doc[i+1][0]
		postag1 = doc[i+1][1]
		feature.extend([
			'+1:word.lower='+word.lower(),
			'+1:word.isupper=%s'%word.isupper(),
			'+1:word.isdigit=%s'%word.isdigit(),
			'+1:postag='+postag
			])
	else :
		feature.append('EOS')

	return feature


In [47]:
def extract_feature(doc):
	return [word2feature(doc,i) for i in range(len(doc))]


def get_labels(doc):
	return [elem[1] for elem in doc]

In [37]:
x = [extract_feature(doc) for doc in data]

In [38]:
import pandas as pd

In [41]:
reader = pd.read_json('tagged_data.json',lines=True)

In [42]:
data = reader.tags

In [43]:
x = [extract_feature(doc) for doc in data]

In [44]:
x

[[['bias',
   'word.lower=outdoor',
   'word[-3]=oor',
   'word[-2]=or',
   'word.isupper=False',
   'word.isdigit=False',
   'postag=B-brand',
   'BOS',
   '+1:word.lower=outdoor',
   '+1:word.isupper=False',
   '+1:word.isdigit=False',
   '+1:postag=B-brand'],
  ['bias',
   'word.lower=collection',
   'word[-3]=ion',
   'word[-2]=on',
   'word.isupper=False',
   'word.isdigit=False',
   'postag=E-brand',
   '-1:word.lower=collection',
   '-1:word.isupper=False',
   '-1:word.isdigit=False',
   '-1:postag=E-brand',
   '+1:word.lower=collection',
   '+1:word.isupper=False',
   '+1:word.isdigit=False',
   '+1:postag=E-brand'],
  ['bias',
   'word.lower=ballerines',
   'word[-3]=nes',
   'word[-2]=es',
   'word.isupper=False',
   'word.isdigit=False',
   'postag=B-object',
   '-1:word.lower=ballerines',
   '-1:word.isupper=False',
   '-1:word.isdigit=False',
   '-1:postag=B-object',
   '+1:word.lower=ballerines',
   '+1:word.isupper=False',
   '+1:word.isdigit=False',
   '+1:postag=B-obje

In [48]:
y = [get_labels(doc) for doc in data]

In [49]:
y


[['B-brand',
  'E-brand',
  'B-object',
  'E-object',
  'B-colour',
  'B-colour',
  'B-details'],
 ['B-brand', 'E-brand', 'B-object', 'O', 'O', 'O', 'B-colour', 'O'],
 ['B-brand',
  'E-brand',
  'B-object',
  'B-brand',
  'O',
  'B-colour',
  'B-colour',
  'O',
  'O',
  'O'],
 ['B-brand', 'E-brand', 'B-object', 'O', 'O', 'O', 'B-colour', 'O'],
 ['B-brand', 'O', 'O', 'O', 'B-brand'],
 ['B-object', 'E-object', 'O', 'O'],
 ['B-brand',
  'B-object',
  'B-brand',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-brand', 'B-object', 'E-object', 'B-material', 'B-brand'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['B-brand', 'O', 'B-object', 'E-object', 'B-object', 'B-colour', 'O'],
 ['B-brand', 'O', 'B-object', 'I-object', 'E-object', 'B-colour'],
 ['O',
  'B-details',
  'B-object',
  'I-object',
  'E-object',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],


In [50]:
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import codecs
import nltk 
from sklearn.model_selection import train_test_split
import pycrfsuite
import numpy as np
from sklearn.metrics import classification_report

In [51]:
X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size=0.2)

In [52]:
trainer = pycrfsuite.Trainer(verbose=True)
for xseq, yseq in zip(X_train,Y_train):
	trainer.append(xseq,yseq)

trainer.set_params({
	'c1':0.1,
	'c2':0.01,
	'max_iterations':200,
	'feature.possible_transitions':True
	})

trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 109015
Seconds required: 0.365

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 222333.852996
Feature norm: 1.000000
Error norm: 113019.221966
Active features: 108729
Line search trials: 1
Line search step: 0.000004
Seconds required for this iteration: 0.515

***** Iteration #2 *****
Loss: 161125.571795
Feature norm: 1.953979
Error norm: 95921.760867
Active features: 105305
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.268

***** Iteration #3 *****
Loss: 115336.873749
Feature norm: 2.635067
Error norm: 49375.440637
Active features: 106679
Line search trials: 1
Line search step: 1.000000
Seconds requ

***** Iteration #41 *****
Loss: 34.327035
Feature norm: 30.881584
Error norm: 0.028185
Active features: 77
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.258

***** Iteration #42 *****
Loss: 34.326255
Feature norm: 30.872075
Error norm: 0.019415
Active features: 77
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.253

***** Iteration #43 *****
Loss: 34.325289
Feature norm: 30.871261
Error norm: 0.016152
Active features: 77
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.254

***** Iteration #44 *****
Loss: 34.324006
Feature norm: 30.854003
Error norm: 0.021853
Active features: 77
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.250

***** Iteration #45 *****
Loss: 34.323879
Feature norm: 30.882556
Error norm: 0.025205
Active features: 78
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.254



In [53]:
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
Y_pred = [tagger.tag(xseq) for xseq in X_test]

In [55]:
i = 10

for x,y in zip(Y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
	print("%s (%s)"%(y,x))

cellonic (B-brand)
chargeur (B-object)
jvc (B-brand)
gz (O)
mg645 (O)


In [56]:
Y_pred

[['B-brand', 'B-object', 'O', 'B-brand', 'O', 'B-brand'],
 ['B-brand', 'E-brand', 'B-object', 'O', 'O', 'B-details'],
 ['B-brand', 'O', 'O', 'B-object', 'B-details', 'B-brand', 'O', 'O'],
 ['B-brand', 'O', 'O', 'B-object', 'O'],
 ['B-brand', 'O', 'B-object', 'I-object', 'E-object', 'B-colour', 'B-details'],
 ['B-brand', 'B-brand', 'O', 'O', 'B-object'],
 ['B-brand', 'B-brand', 'O', 'B-object', 'E-object', 'O', 'B-details'],
 ['B-brand',
  'B-brand',
  'O',
  'O',
  'B-object',
  'I-object',
  'E-object',
  'O',
  'O'],
 ['B-brand', 'O', 'B-object', 'O', 'O', 'O', 'O', 'O'],
 ['B-brand', 'E-brand', 'O', 'O', 'O', 'B-details', 'O', 'O'],
 ['B-brand', 'B-object', 'B-brand', 'O', 'O'],
 ['B-brand',
  'B-object',
  'B-brand',
  'O',
  'O',
  'O',
  'O',
  'B-details',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-brand', 'O', 'O', 'O'],
 ['B-brand',
  'B-object',
  'B-brand',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  