In [19]:
import sys, codecs, json, math, time, warnings
warnings.simplefilter( action='ignore', category=FutureWarning )

import nltk, scipy, sklearn, sklearn_crfsuite, sklearn_crfsuite.metrics, eli5
from sklearn.metrics import make_scorer
from collections import Counter
import matplotlib.pyplot as plt
from IPython.display import display    

import logging
import tensorflow as tf
import absl.logging
# log line layout: [LEVEL|file:line timestamp] message
formatter = logging.Formatter('[%(levelname)s|%(filename)s:%(lineno)s %(asctime)s] %(message)s')
# apply that layout to the handler absl installs
absl.logging.get_absl_handler().setFormatter(formatter)
# NOTE(review): this pokes a private absl attribute; presumably it silences absl's
# "logging before initialisation goes to stderr" warning - confirm against the installed absl version
absl.logging._warn_preinit_stderr = False
# the helper functions in this notebook all log through TensorFlow's logger, at INFO level
logger = tf.get_logger()
logger.setLevel(logging.INFO)

# number of CRF iterations to train for. Using 150 will provide much better results, but take a lot longer to compute.
# max_iter = 20

# number of ontonotes training files to load. Using a value of None will load the entire dataset, taking the longest
# to train but providing a much larger sentence corpus to train over and thus is able to learn a larger vocabulary.
# max_files = 70

# set of NE label types to display in results. this is simply to limit the amount of logging that is performed later
# when displaying details such as state transitions and top N features per state.
display_label_subset = [ 'B-DATE', 'I-DATE', 'B-GPE', 'I-GPE', 'B-PERSON', 'I-PERSON', 'O' ]

In [20]:
# loads training and testing sets from parsed json
# format = [ [ (token, POS_tag, NER_IOB_Tag) ] ]
# example: [[('Since', 'IN', 'O'), ('then', 'RB', 'O'), (',', ',', 'O'), ('this', 'DT', 'O'), ('area', 'NN', 'O'), ('has', 'VBZ', 'O'), ('become', 'VBN', 'O'), ('a', 'DT', 'O'), ('prohibited', 'VBN', 'O'), ('zone', 'NN', 'O'), ('in', 'IN', 'O'), ('Hong', 'NNP', 'B-GPE'), ('Kong', 'NNP', 'I-GPE'), ('.', '.', 'O')]]  
def _sent_to_iob( dict_sent ) :
	"""Convert one parsed sentence into a list of (token, POS tag, IOB label) tuples.

	dict_sent must provide parallel 'tokens' and 'pos' lists. An optional 'ne' dict maps an
	entity id to { 'type': ..., 'tokens': [token indexes] }; it is ignored entirely when it
	contains the key 'parse_error'.
	"""
	list_tokens = dict_sent['tokens']
	list_pos = dict_sent['pos']

	# entity annotations are optional, and unusable if the parser flagged an error
	dict_ne = dict_sent.get( 'ne', {} )
	if 'parse_error' in dict_ne :
		dict_ne = {}

	list_entry = []
	ne_type_last = None
	for nTokenIndex, strToken in enumerate( list_tokens ) :
		strPOS = list_pos[nTokenIndex]

		# first entity (in dict order) that covers this token decides its type
		ne_type = None
		for dict_entity in dict_ne.values() :
			if nTokenIndex in dict_entity['tokens'] :
				ne_type = dict_entity['type']
				break

		# NOTE: a token continues an entity (I-) whenever the previous token had the same type, so
		# two directly adjacent entities of the same type are labelled as a single span
		if ne_type is None :
			strIOB = 'O'
		elif ne_type == ne_type_last :
			strIOB = 'I-' + ne_type
		else :
			strIOB = 'B-' + ne_type
		ne_type_last = ne_type

		list_entry.append( ( strToken, strPOS, strIOB ) )

	return list_entry

def _files_to_sents( dict_ontonotes, list_files ) :
	"""Build the IOB-labelled sentence list for the given files, skipping sents with non-PENN POS tags."""
	list_sents = []
	for str_file in list_files :
		for dict_sent in dict_ontonotes[str_file].values() :
			# ignore sents with non-PENN POS tags
			list_pos = dict_sent['pos']
			if 'XX' in list_pos or 'VERB' in list_pos :
				continue
			list_sents.append( _sent_to_iob( dict_sent ) )
	return list_sents

def create_dataset( max_files = None, dataset_file = '../../corpus/comp3225/ontonotes_parsed.json' ) :
	"""Load the parsed ontonotes JSON and return ( list_train, list_test ).

	Each returned list holds sentences; each sentence is a list of (token, POS_tag, NER_IOB_tag) tuples.

	max_files    -- maximum number of files (top-level JSON keys) to use; None uses all of them
	dataset_file -- path of the parsed ontonotes JSON file

	The split is made per file, in the key order of the JSON: the first floor(90%) of the files
	form the training set and the remainder the test set.
	"""
	# load parsed ontonotes dataset (the with block closes the handle even if reading fails)
	with codecs.open( dataset_file, 'r', 'utf-8', errors = 'replace' ) as readHandle :
		dict_ontonotes = json.loads( readHandle.read() )

	# make a training and test split
	list_files = list( dict_ontonotes.keys() )
	# None means "no limit"; comparing an int with None would raise TypeError
	if max_files is not None and len(list_files) > max_files :
		list_files = list_files[ :max_files ]
	nSplit = math.floor( len(list_files)*0.9 )

	list_train = _files_to_sents( dict_ontonotes, list_files[ : nSplit ] )
	list_test = _files_to_sents( dict_ontonotes, list_files[ nSplit : ] )

	return list_train, list_test

In [21]:
def sent2features(sent, word2features_func = None):
	"""Return one feature entry per token, obtained by calling word2features_func(sent, index)."""
	features = []
	for position in range(len(sent)):
		features.append(word2features_func(sent, position))
	return features

def sent2labels(sent):
	"""Return the label (third element) of every (token, postag, label) entry of a sentence."""
	labels = []
	for entry in sent:
		_token, _postag, label = entry
		labels.append(label)
	return labels

def print_F1_scores( micro_F1 ) :
	"""Log one line per label with its f1-score, precision and recall.

	micro_F1 maps a label to a dict holding the keys 'f1-score', 'precision' and 'recall'.
	"""
	for label, scores in micro_F1.items() :
		values = ( label, scores['f1-score'], scores['precision'], scores['recall'] )
		logger.info( "%-15s -> f1 %0.2f ; prec %0.2f ; recall %0.2f" % values )

def print_transitions(trans_features):
	"""Log each ((label_from, label_to), weight) transition entry on its own line."""
	for label_pair, weight in trans_features:
		label_from, label_to = label_pair
		message = "%-15s -> %-15s %0.6f" % (label_from, label_to, weight)
		logger.info( message )

def print_state_features(state_features):
	"""Log each ((attr, label), weight) state feature entry as 'weight label attr'."""
	for attr_label, weight in state_features:
		attr, label = attr_label
		message = "%0.6f %-15s %s" % (weight, label, attr)
		logger.info( message )

In [22]:
from sklearn.metrics import f1_score


def exec_task( max_files = None, max_iter = None, display_label_subset = [], word2features_func = None, train_crf_model_func = None ) :
	"""Build the dataset, train a CRF, display its weights and log per-label scores on the test set.

	max_files            -- passed to create_dataset() to limit how many files are loaded
	max_iter             -- passed to train_crf_model_func as the iteration limit
	display_label_subset -- labels passed to eli5 as the targets to display (not modified here)
	word2features_func   -- callable (sent, index) producing the features of one token
	train_crf_model_func -- callable (X_train, Y_train, max_iter, labels) returning a fitted model

	Returns None; results are shown via display() and the notebook logger.
	"""

	# make a dataset from english NE labelled ontonotes sents
	train_sents, test_sents = create_dataset( max_files = max_files )

	# create feature vectors for every sentence
	X_train = [sent2features(s, word2features_func = word2features_func) for s in train_sents]
	Y_train = [sent2labels(s) for s in train_sents]

	X_test = [sent2features(s, word2features_func = word2features_func) for s in test_sents]
	Y_test = [sent2labels(s) for s in test_sents]

	# getting the set of labels that exist in the sentences
	set_labels = set()
	for data in ( Y_train, Y_test ) :
		for sent_labels in data :
			set_labels.update( sent_labels )

	# drop the 'O' label as we are not usually interested in how well 'O' is predicted.
	# filtering (rather than list.remove) avoids a ValueError when no sentence contains 'O'
	labels = [ str_label for str_label in set_labels if str_label != 'O' ]

	# Train CRF model
	crf = train_crf_model_func( X_train, Y_train, max_iter, labels )

	logger.info('Label transition weights learnt from dataset (for a subset of labels)')
	display( eli5.show_weights(crf, top=10, targets = display_label_subset, show=['transition_features']) )

	# NOTE(review): the message says "Top 10" but top=20 is requested - confirm which is intended
	logger.info('Top 10 features per-target (for a subset of labels)')
	display( eli5.show_weights(crf, top=20, targets = display_label_subset, show=['targets']) )

	# compute the per-label scores in the test set; labels are ordered by entity type and
	# then by prefix, so the B- and I- rows of one type sit next to each other
	Y_pred = crf.predict( X_test )
	sorted_labels = sorted( labels, key=lambda name: (name[1:], name[0]) )
	macro_scores = sklearn_crfsuite.metrics.flat_classification_report( Y_test, Y_pred, labels=sorted_labels, digits=3, output_dict = True )
	logger.info( '' )
	logger.info( 'macro F1 scores'  )
	print_F1_scores( macro_scores )

def task2_word2features(sent, i):
	"""Build the feature dict of token i of a sentence of (token, postag, ...) entries.

	The dict holds the token's own word, POS and shape features, plus the same shape features
	of the previous and next token under '-1:' / '+1:' prefixed keys. A missing neighbour is
	marked with 'BOS' (no previous token) or 'EOS' (no next token) set to True instead.
	"""
	token, pos = sent[i][0], sent[i][1]
	token_lower = token.lower()

	feats = {
		'word' : token,
		'postag': pos,

		# token shape
		'word.lower()': token_lower,
		'word.isupper()': token.isupper(),
		'word.istitle()': token.istitle(),
		'word.isdigit()': token.isdigit(),

		# token suffix
		'word.suffix': token_lower[-3:],

		# POS prefix
		'postag[:2]': pos[:2],
	}

	# ( key prefix, neighbour exists?, neighbour index, marker used when it does not exist )
	neighbours = (
		( '-1:', i > 0, i - 1, 'BOS' ),
		( '+1:', i < len(sent)-1, i + 1, 'EOS' ),
	)
	for prefix, exists, j, boundary in neighbours:
		if not exists:
			feats[boundary] = True
			continue
		near_word, near_pos = sent[j][0], sent[j][1]
		near_lower = near_word.lower()
		feats[prefix + 'word.lower()'] = near_lower
		feats[prefix + 'postag'] = near_pos
		feats[prefix + 'word.isupper()'] = near_word.isupper()
		feats[prefix + 'word.istitle()'] = near_word.istitle()
		feats[prefix + 'word.isdigit()'] = near_word.isdigit()
		feats[prefix + 'word.suffix'] = near_lower[-3:]
		feats[prefix + 'postag[:2]'] = near_pos[:2]

	return feats

# Function for training the CRF model taken from the sklearn library
# uses X_Train = sentences features
# uses Y_Train = sentence label
def task1_train_crf_model( X_train, Y_train, max_iter, labels ) :
	"""Fit and return a sklearn_crfsuite CRF with fixed c1/c2 regularisation values.

	X_train  -- per-sentence feature lists
	Y_train  -- per-sentence label lists
	max_iter -- passed to the CRF as max_iterations
	labels   -- not used by this trainer; kept so every trainer shares one signature
	"""
	crf_settings = {
		'algorithm' : 'lbfgs',
		'c1' : 0.3009931321261636,
		'c2' : 0.04367114078367961,
		'max_iterations' : max_iter,
		'all_possible_transitions' : True,
	}
	model = sklearn_crfsuite.CRF( **crf_settings )
	model.fit( X_train, Y_train )
	return model

In [23]:
def task5_train_crf_model( X_train, Y_train, max_iter, labels ) :
	"""Run a randomized search over the CRF c1/c2 values and return the best fitted model.

	X_train  -- per-sentence feature lists
	Y_train  -- per-sentence label lists
	max_iter -- passed to the CRF as max_iterations
	labels   -- labels the F1 scorer is restricted to

	Side effects: logs the search results and draws a scatter plot of the score of every
	sampled (c1, c2) pair on log-log axes.
	"""
	# import the submodules explicitly: a bare 'import scipy' / 'import sklearn' does not
	# guarantee that scipy.stats and sklearn.model_selection have been loaded
	import scipy.stats
	import sklearn.model_selection

	# randomized search to discover best parameters for CRF model
	crf = sklearn_crfsuite.CRF(
		algorithm='lbfgs',
		max_iterations=max_iter,
		all_possible_transitions=True
	)
	params_space = {
		'c1': scipy.stats.expon(scale=0.5),
		'c2': scipy.stats.expon(scale=0.05),
	}

	# score candidates by F1 over the given labels.
	# NOTE(review): the scorer uses average='weighted' while the log messages say "micro" - confirm which is intended
	f1_scorer = make_scorer( sklearn_crfsuite.metrics.flat_f1_score, average='weighted', labels=labels )

	logger.info( 'starting randomized search for hyperparameters' )
	n_folds = 2
	n_candidates = 50
	rs = sklearn.model_selection.RandomizedSearchCV(crf, params_space, cv=n_folds, verbose=1, n_jobs=-1, n_iter=n_candidates, scoring=f1_scorer)
	rs.fit(X_train, Y_train)

	# output the results
	logger.info( 'best params: {}'.format( rs.best_params_ ) )
	logger.info( 'best micro F1 score: {}'.format( rs.best_score_ ) )
	logger.info( 'model size: {:0.2f}M'.format( rs.best_estimator_.size_ / 1000000 ) )
	logger.info( 'cv_results_ = ' + repr(rs.cv_results_) )

	# visualize the results in hyperparameter space: one point per sampled candidate
	_x = [s['c1'] for s in rs.cv_results_['params']]
	_y = [s['c2'] for s in rs.cv_results_['params']]
	_c = list( rs.cv_results_['mean_test_score'] )

	fig, ax = plt.subplots( figsize=(12, 12) )
	ax.set_yscale('log')
	ax.set_xscale('log')
	ax.set_xlabel('C1')
	ax.set_ylabel('C2')
	ax.set_title("Randomized Hyperparameter Search - F1 scores (blue min={:0.2}, red max={:0.2})".format( min(_c), max(_c) ))
	ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])

	# return the best model
	return rs.best_estimator_

In [24]:
# train the fixed-parameter CRF on the first 350 files and report its scores
exec_task(
	max_files = 350,
	max_iter = 100,
	display_label_subset = display_label_subset,
	word2features_func = task2_word2features,
	train_crf_model_func = task1_train_crf_model,
)

INFO:tensorflow:Label transition weights learnt from dataset (for a subset of labels)


From \ To,B-DATE,I-DATE,B-GPE,I-GPE,B-PERSON,I-PERSON,O
B-DATE,-2.747,5.712,0.0,-0.224,0.0,-0.107,0.478
I-DATE,-3.336,6.084,0.0,-0.001,0.915,-0.526,0.718
B-GPE,1.313,-0.895,-3.059,5.043,-1.32,-2.427,1.224
I-GPE,0.051,-0.034,-1.613,6.375,0.168,-0.814,-0.059
B-PERSON,-0.211,-0.51,-0.786,-1.315,-4.485,4.775,1.001
I-PERSON,0.321,-0.23,1.394,0.0,-2.478,5.884,0.065
O,2.364,-5.845,1.683,-5.61,3.215,-4.521,5.421


INFO:tensorflow:Top 10 features per-target (for a subset of labels)


Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6
+5.295,word.suffix:day,,,,,
+4.204,word.lower():years,,,,,
+3.975,word.suffix:ber,,,,,
+3.913,-1:word.lower():frightening,,,,,
+3.788,+1:word.lower():celebrations,,,,,
+3.114,-1:word.lower():department,,,,,
+3.108,word.lower():weeklong,,,,,
+3.108,word:weeklong,,,,,
+3.104,+1:word.lower():eighties,,,,,
+2.988,word:May,,,,,

Weight?,Feature
+5.295,word.suffix:day
+4.204,word.lower():years
+3.975,word.suffix:ber
+3.913,-1:word.lower():frightening
+3.788,+1:word.lower():celebrations
+3.114,-1:word.lower():department
+3.108,word.lower():weeklong
+3.108,word:weeklong
+3.104,+1:word.lower():eighties
+2.988,word:May

Weight?,Feature
+3.038,+1:word.lower():blast
+2.983,word.lower():sixties
+2.983,word:sixties
+2.688,word.suffix:nth
+2.601,word.suffix:ome
+2.587,-1:word.lower():nineteen
+2.552,+1:word.lower():thinking
+2.516,-1:word.lower():night
+2.503,-1:word.lower():late
+2.475,word.lower():years

Weight?,Feature
+4.566,word.suffix:'an
+4.232,+1:word.lower():rejection
+3.454,word.suffix:hai
+3.329,word.suffix:.s.
+3.266,+1:word.lower():places
+3.147,word:LA
+3.129,word.suffix:hou
+3.126,word.suffix:see
+3.033,word.suffix:nta
+3.011,word:anti-Israel

Weight?,Feature
+3.484,+1:word.lower():america
+2.779,+1:word.lower():gave
+2.771,+1:word.lower():hello
+2.568,+1:word.lower():advisor
+2.556,word:State
+2.453,word:Korea
+2.453,word.lower():korea
+2.401,word.lower():village
+2.297,word:city
+2.282,word.lower():province

Weight?,Feature
+3.793,-1:word.lower():secretary
+3.773,-1:word.suffix:tor
+3.418,word.suffix:ert
+3.098,word.suffix:son
+3.086,-1:word.lower():justice
+3.014,-1:word.lower():premier
+2.958,-1:word.lower():governor
+2.920,word.suffix:ick
+2.904,word.suffix:ito
+2.843,-1:word.lower():post

Weight?,Feature
+3.408,-1:word.lower():murtha
+2.911,-1:word.lower():john
+2.869,word.suffix:ald
+2.472,-1:word.lower():abramstein
+2.421,-1:word.suffix:eng
+2.368,word.lower():bush
+1.965,word.suffix:ell
+1.913,word.suffix:son
+1.883,-1:word.lower():bumiller
+1.810,+1:word.lower():bolton

Weight?,Feature
+7.318,BOS
+5.460,EOS
+4.640,postag[:2]:VB
+4.567,word.lower():president
+4.453,-1:word.lower():pill
+3.931,word.suffix:tor
+3.922,+1:word.lower():keep
+3.875,word:End
+3.624,+1:word.lower():primaries
+3.581,word.lower():secretary


INFO:tensorflow:
INFO:tensorflow:macro F1 scores
INFO:tensorflow:B-CARDINAL      -> f1 0.85 ; prec 0.89 ; recall 0.80
INFO:tensorflow:I-CARDINAL      -> f1 0.87 ; prec 1.00 ; recall 0.77
INFO:tensorflow:B-DATE          -> f1 0.86 ; prec 0.91 ; recall 0.81
INFO:tensorflow:I-DATE          -> f1 0.85 ; prec 0.86 ; recall 0.85
INFO:tensorflow:B-EVENT         -> f1 0.00 ; prec 0.00 ; recall 0.00
INFO:tensorflow:I-EVENT         -> f1 0.00 ; prec 0.00 ; recall 0.00
INFO:tensorflow:B-FAC           -> f1 0.00 ; prec 0.00 ; recall 0.00
INFO:tensorflow:I-FAC           -> f1 0.00 ; prec 0.00 ; recall 0.00
INFO:tensorflow:B-GPE           -> f1 0.93 ; prec 0.86 ; recall 1.00
INFO:tensorflow:I-GPE           -> f1 0.94 ; prec 0.94 ; recall 0.94
INFO:tensorflow:B-LANGUAGE      -> f1 0.00 ; prec 0.00 ; recall 0.00
INFO:tensorflow:I-LANGUAGE      -> f1 0.00 ; prec 0.00 ; recall 0.00
INFO:tensorflow:B-LAW           -> f1 0.00 ; prec 0.00 ; recall 0.00
INFO:tensorflow:I-LAW           -> f1 0.00 ; prec 0.00

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
