In [1]:
import pandas as pd
import sys
import re
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
import glob
from sklearn.linear_model import LogisticRegression
#from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn import metrics
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/maximlisnic/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Read

In [2]:
def ReadInput(filename):

    # Open file
    file = open(filename, 'r')
    contents = file.read()
    file.close()

    return contents

def ReadKey(filename):

    # Open file
    file = open(filename, 'r')
    lines = file.read().splitlines()
    file.close()

    data = []

    # Read lines and parse
    for line in lines:
        items = line.split(': ', 1)

        slot = items[0]

        if len(items) == 1:
            continue
        elif items[1] == '---':
            continue
        elif slot == 'TEXT':
            continue
        elif bool(re.match('^[0-9]+\Z', items[1])):
            answer = items[1]
        else:
            answer = items[1].split('"')[1]

        answer = nltk.word_tokenize(answer + ' .')[:-1]
        slot_tags = ['B-' + slot] + ['I-' + slot]*(len(answer)-1)

        entry = {'slot': slot_tags, 'answer': answer}
        data.append(entry)

    return data

def ReadList(filename):
    """
    read and parse the helper lists
    """

    # Init output
    data = []
    
    # Open file
    file = open(filename, 'r')
    lines = file.read().splitlines()
    file.close()

    # Read lines and parse
    for line in lines:
        data.append(line)

    return data

# Helper

In [3]:
# Adapted from https://www.geeksforgeeks.org/kmp-algorithm-for-pattern-searching/
def KMPSearch(pat, txt):
	M = len(pat)
	N = len(txt)

	starting_indices = []

	lps = [0]*M
	j = 0 # index for pat[]

	computeLPSArray(pat, M, lps)

	i = 0 # index for txt[]
	while i < N:
		if pat[j] == txt[i]:
			i += 1
			j += 1

		if j == M:
			starting_indices.append((i-j))
			j = lps[j-1]

		elif i < N and pat[j] != txt[i]:
			if j != 0:
				j = lps[j-1]
			else:
				i += 1
	
	return starting_indices

def computeLPSArray(pat, M, lps):
	len = 0 

	lps[0] 
	i = 1

	while i < M:
		if pat[i]== pat[len]:
			len += 1
			lps[i] = len
			i += 1
		else:
			if len != 0:
				len = lps[len-1]
			else:
				lps[i] = 0
				i += 1


# Clean

In [4]:
def BuildInitialData(text, key):
    
    # Tokenize text
    sentences = nltk.sent_tokenize(text)
    words = [['PHI-2', 'PHI-1', 'PHI'] + nltk.word_tokenize(sentence) + ['OMEGA', 'OMEGA+1', 'OMEGA+2'] for sentence in sentences]
    words = [word for sentence in words for word in sentence]

    # Match all key tags
    bio_tags = ['O'] * len(words)
    for item in key:
        answer_length = len(item['answer'])
        indices = KMPSearch(item['answer'], words)
        if len(indices) == 0:
            continue
        for i in indices:
            bio_tags[i:i+answer_length] = item['slot']

    # Get POS tags
    words = nltk.pos_tag(words)

    # Build a dataframe
    data = pd.DataFrame(
        words,
        columns = ['WORD', 'POS']
    )
    data['LABEL'] = bio_tags
    data.loc[data['WORD'] == 'PHI', 'POS'] = 'PHIPOS'
    data.loc[data['WORD'] == 'OMEGA', 'POS'] = 'OMEGAPOS'
    data.loc[data['WORD'] == 'PHI-1', 'POS'] = 'PHI-1POS'
    data.loc[data['WORD'] == 'OMEGA+1', 'POS'] = 'OMEGA+1POS'
    data.loc[data['WORD'] == 'PHI-2', 'POS'] = 'PHI-2POS'
    data.loc[data['WORD'] == 'OMEGA+2', 'POS'] = 'OMEGA+1POS'

    return data

def BuildFeatures(data, prefixes, prepositions, suffixes, locations_list):
    """
    function to build the features
    """
    lemmatizer = WordNetLemmatizer()

    # Word attributes ------------------------------

    data['LEMMA'] = (
        data
        .apply(
            lambda x: lemmatizer.lemmatize(x['WORD']),
            axis = 1
         )
    )

    data['ABBR'] = (
        data
        .apply(
            lambda x: x['WORD'].endswith('.')
                      and bool(re.match('^[a-zA-Z.]+\Z', x['WORD']))
                      and bool(re.match('[a-zA-Z]', x['WORD']))
                      and len(x['WORD']) in [2,3,4],
            axis = 1
            )
    )

    data['CAP'] = (
        data
        .apply(
            lambda x: x['WORD'][0].isupper(),
            axis = 1
        )
    )

    data['NUM'] = (
        data
        .apply(
            lambda x: bool(re.match('[0-9]', x['WORD'])),
            axis = 1
        )
    )

    # Adjacent words ------------------------------

    data['NUM+1'] = (
        data
        ['NUM']
        .shift(-1)
    )

    data['NUM-1'] = (
        data
        ['NUM']
        .shift(1)
    )

    data['WORD+1'] = (
        data
        ['LEMMA']
        .shift(-1)
    )

    data['WORD-1'] = (
        data
        ['LEMMA']
        .shift(1)
    )

    data['WORD+2'] = (
        data
        ['LEMMA']
        .shift(-2)
    )

    data['WORD-2'] = (
        data
        ['LEMMA']
        .shift(2)
    )

    data['WORD+3'] = (
        data
        ['LEMMA']
        .shift(-3)
    )

    data['WORD-3'] = (
        data
        ['LEMMA']
        .shift(3)
    )

    data['POS+1'] = (
        data
        ['POS']
        .shift(-1)
    )

    data['POS-1'] = (
        data
        ['POS']
        .shift(1)
    )

    data['POS+2'] = (
        data
        ['POS']
        .shift(-2)
    )

    data['POS-2'] = (
        data
        ['POS']
        .shift(2)
    )

    data['POS+3'] = (
        data
        ['POS']
        .shift(-3)
    )

    data['POS-3'] = (
        data
        ['POS']
        .shift(3)
    )

    # Matching lists ------------------------------

    data['LOC'] = (
        data
        .apply(
            lambda x: x['WORD'].lower() in locations_list,
            axis = 1
        )
    )

    data['PREF'] = (
        data
        .apply(
            lambda x: x['WORD-1'] in prefixes,
            axis = 1
        )
    )

    data['SUFF'] = (
        data
        .apply(
            lambda x: x['WORD+1'] in suffixes,
            axis = 1
        )
    )

    # Globals ------------------------------

    data['LOWERCASE'] = (
        data
        .apply(lambda x: x['WORD'].lower(),
               axis = 1)
    )

    data['GLOBCAP'] = (
        data
        .apply(
            lambda x: bool(re.match('^[a-zA-Z]+\Z', x['WORD']))
                      and not x['WORD'].lower() in prepositions
                      and x['WORD-1'] != 'PHI'
                      and x['CAP'],
            axis = 1
        )
    )
    data['GLOBCAP'] = (
        data
        .groupby(['LOWERCASE'])
        ['GLOBCAP']
        .transform('any')
    )

    data['GLOBPREF'] = (
        data
        .apply(
            lambda x: bool(re.match('^[a-zA-Z]+\Z', x['WORD']))
                      and x['WORD-1'] != 'PHI'
                      and x['WORD-1'] in prefixes,
            axis = 1
        )
    )
    data['GLOBPREF'] = (
        data
        .groupby(['WORD'])
        ['GLOBPREF']
        .transform('any')
    )

    data['GLOBSUFF'] = (
        data
        .apply(
            lambda x: bool(re.match('^[a-zA-Z]+\Z', x['WORD']))
                      and x['WORD+1'] != 'OMEGA'
                      and x['WORD+1'] in suffixes,
            axis = 1
        )
    )
    data['GLOBSUFF'] = (
        data
        .groupby(['WORD'])
        ['GLOBSUFF']
        .transform('any')
    )

    # Clean up ------------------------------

    data = (
        data
        .query('WORD != "PHI" & WORD != "OMEGA"')
        .query('WORD != "PHI-1" & WORD != "OMEGA+1"')
        .query('WORD != "PHI-2" & WORD != "OMEGA+2"')
        .drop(columns = 'LOWERCASE')
        .reset_index(drop = True)
    )

    return data


In [5]:
# Read helpers
prefixes       = ReadList('./lists/prefixes.txt')
prepositions   = ReadList('./lists/prepositions.txt')
suffixes       = ReadList('./lists/suffixes.txt')
locations      = pd.read_csv('./lists/locations.csv')
locations_list = list(locations['country']) + list(locations['capital'])
locations_list = [x.lower() for x in locations_list]

all_docs = sorted(glob.glob('./data/development-docs/*'))
all_keys = sorted(glob.glob('./data/development-anskeys/*'))

data = []

for f_doc, f_key in zip(all_docs, all_keys):
    text = ReadInput(f_doc)
    key = ReadKey(f_key)
    x = BuildInitialData(text, key)
    x['DOC'] = f_doc.split('/')[-1]
    data.append(x)

#data = pd.concat(data)
#data = BuildFeatures(data, prefixes, prepositions, suffixes, locations_list)
data = [BuildFeatures(sent, prefixes, prepositions, suffixes, locations_list) for sent in data]

In [10]:
# For logit
all_docs = data['DOC'].unique()
train_docs = all_docs[0:300]
test_docs = all_docs[300:]

X_train = data.loc[:, data.columns != 'LABEL'][data['DOC'].isin(train_docs)]
y_train = data[data['DOC'].isin(train_docs)]['LABEL']

X_test = data.loc[:, data.columns != 'LABEL'][data['DOC'].isin(test_docs)]
y_test = data[data['DOC'].isin(test_docs)]['LABEL']

v = DictVectorizer(sparse=False)
X_train_v = v.fit_transform(X_train.to_dict('records'))
X_test_v = v.transform(X_test.to_dict('records'))

In [6]:
import pycrfsuite

In [27]:
# For CRF
train_docs = [doc.split('/')[-1] for doc in all_docs[0:300]]
test_docs = [doc.split('/')[-1] for doc in all_docs[300:]]

X_train = [sent.loc[:, sent.columns != 'LABEL'].to_dict('records') for sent in data if sent['DOC'][0] in train_docs]
y_train = [sent['LABEL'] for sent in data if sent['DOC'][0] in train_docs]

X_test = [sent.loc[:, sent.columns != 'LABEL'].to_dict('records') for sent in data if sent['DOC'][0] in test_docs]
y_test = [sent['LABEL'] for sent in data if sent['DOC'][0] in test_docs]

In [28]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

CPU times: user 460 ms, sys: 18.4 ms, total: 479 ms
Wall time: 487 ms


In [44]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 300,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [45]:
%%time
trainer.train('test_crf_model.crfsuite')

CPU times: user 1min 12s, sys: 384 ms, total: 1min 12s
Wall time: 1min 13s


In [46]:
trainer.logparser.last_iteration

{'num': 1000,
 'scores': {},
 'loss': 2984.904879,
 'feature_norm': 47.521578,
 'error_norm': 11.279807,
 'active_features': 2407,
 'linesearch_trials': 3,
 'linesearch_step': 0.25,
 'time': 0.121}

In [47]:
tagger = pycrfsuite.Tagger()
tagger.open('test_crf_model.crfsuite')

<contextlib.closing at 0x7faf714d6860>

In [48]:
yhat = [tagger.tag(sent) for sent in X_test]

In [49]:
def ExtractSlot(slot, pred, sent):

    mask = [bool(re.match('[BI]-'+slot, tag)) for tag in pred]
    tags = np.array(pred)[mask]
    words = np.array([word['WORD'] for word in sent])[mask]

    seen = []
    results = []
    for tag, word in zip(tags,words):
        if tag[0] == 'B':
            results.append(' '.join(seen))
            seen = []
            seen.append(word)
        else:
            seen.append(word)
    results.append(' '.join(seen))
    results = list(set([item for item in results if item != '']))

    return results

In [50]:
answers = []

for sent, pred in zip(X_test, yhat):

    phrases = []
    for slot in ['ACQUIRED', 'ACQBUS', 'ACQLOC', 'DLRAMT', 'PURCHASER', 'SELLER', 'STATUS']:
        phrases.append(ExtractSlot(slot, pred, sent))
    
    doc_answer = {
        'TEXT': sent[0]['DOC'],
        'ACQUIRED': phrases[0],
        'ACQBUS': phrases[1],
        'ACQLOC': phrases[2],
        'DLRAMT': phrases[3],
        'PURCHASER': phrases[4],
        'SELLER': phrases[5],
        'STATUS': phrases[6],
    }
    answers.append(doc_answer)

In [145]:
clf = LogisticRegression(random_state=0, tol=0.1, solver='sag', verbose=1, n_jobs=-1).fit(X_train_v, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


convergence after 19 epochs took 979 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 16.3min finished


In [146]:
yhat = clf.predict(X_test_v)

cr = metrics.classification_report(yhat, y_test)
print(cr)

              precision    recall  f1-score   support

    B-ACQBUS       0.07      0.50      0.12         6
    B-ACQLOC       0.14      0.38      0.20         8
  B-ACQUIRED       0.43      0.58      0.49        73
    B-DLRAMT       0.51      0.68      0.58        28
 B-PURCHASER       0.54      0.51      0.52       104
    B-SELLER       0.05      0.40      0.09         5
    B-STATUS       0.48      0.65      0.55        55
    I-ACQBUS       0.15      0.54      0.24        26
    I-ACQLOC       0.07      0.29      0.11         7
  I-ACQUIRED       0.47      0.51      0.49       197
    I-DLRAMT       0.34      0.59      0.43        34
 I-PURCHASER       0.49      0.52      0.50       181
    I-SELLER       0.08      0.35      0.13        17
    I-STATUS       0.35      0.62      0.45        60
           O       0.98      0.95      0.96     10971

    accuracy                           0.92     11772
   macro avg       0.34      0.54      0.39     11772
weighted avg       0.94   

In [147]:
datahat = X_test.copy()
datahat['pred'] = yhat
datahat['true'] = y_test

In [148]:
answers = []

for doc in test_docs:
    sent = datahat[datahat['DOC'] == doc][datahat['pred'] != 'O']
    if len(sent) == 0:
        continue
    sent['type'] = (
        sent
        .apply(
            lambda x: x['pred'].split('-')[1],
            axis = 1
            )
    )

    phrases = []
    for TYPE in ['ACQUIRED', 'ACQBUS', 'ACQLOC', 'DLRAMT', 'PURCHASER', 'SELLER', 'STATUS']:
        phrase = ' '.join(sent[sent['type'] == TYPE]['WORD'])
        if phrase == '':
            phrases.append('---')
        else:
            phrases.append(phrase)
    
    doc_answer = {
        'TEXT': doc,
        'ACQUIRED': phrases[0],
        'ACQBUS': phrases[1],
        'ACQLOC': phrases[2],
        'DLRAMT': phrases[3],
        'PURCHASER': phrases[4],
        'SELLER': phrases[5],
        'STATUS': phrases[6],
    }
    answers.append(doc_answer)

  after removing the cwd from sys.path.


In [26]:
answers[110]

{'TEXT': '16029',
 'ACQUIRED': ['IPCO Corp', 'Prime Medical Products Inc'],
 'ACQBUS': [],
 'ACQLOC': [],
 'DLRAMT': ['4.9 mln dlrs'],
 'PURCHASER': [],
 'SELLER': [],
 'STATUS': []}

In [51]:
original_stdout = sys.stdout

with open('./scorer/test_results_crf.templates', 'w') as f:
    sys.stdout = f 
    for answer in answers:
        for key,value in answer.items():
            if len(value) == 0:
                print('%s: ---' % key)
            elif key == 'TEXT':
                print('%s: %s' % (key, value))
            else:
                for item in value:
                    print('%s: \"%s\"' % (key, item))
        print('')
    sys.stdout = original_stdout
            

In [56]:
# Create gold file
test_keys = ['./data/development-anskeys/' + num + '.key' for num in test_docs]
original_stdout = sys.stdout

with open('./scorer/test_gold.templates', 'w') as f:
    sys.stdout = f 
    for filename in test_keys:
        file = open(filename, 'r')
        lines = file.read()
        file.close()
        print(lines)
        print('')
    sys.stdout = original_stdout

# Create gold file

original_stdout = sys.stdout
with open('./scorer/all_gold.templates', 'w') as f:
    sys.stdout = f 
    for filename in all_keys:
        file = open(filename, 'r')
        lines = file.read()
        file.close()
        print(lines)
        print('')
    sys.stdout = original_stdout



In [55]:
# Build some input files

############################ Docs ###################################
# All -------------------
original_stdout = sys.stdout
with open('./all_docs.txt', 'w') as f:
    sys.stdout = f 
    for filename in all_docs:
        print(filename)
    sys.stdout = original_stdout

# Train -------------------
original_stdout = sys.stdout
train_filenames = ['./data/development-docs/' + num for num in train_docs]
with open('./train_docs.txt', 'w') as f:
    sys.stdout = f 
    for filename in train_filenames:
        print(filename)
    sys.stdout = original_stdout

# Test -------------------
original_stdout = sys.stdout
test_filenames = ['./data/development-docs/' + num for num in test_docs]
with open('./test_docs.txt', 'w') as f:
    sys.stdout = f 
    for filename in test_filenames:
        print(filename)
    sys.stdout = original_stdout

############################ Keys ###################################
# All -------------------
original_stdout = sys.stdout
with open('./all_keys.txt', 'w') as f:
    sys.stdout = f 
    for filename in all_keys:
        print(filename)
    sys.stdout = original_stdout

# Train -------------------
original_stdout = sys.stdout
train_keys = ['./data/development-anskeys/' + num + '.key' for num in train_docs]
with open('./train_keys.txt', 'w') as f:
    sys.stdout = f 
    for filename in train_keys:
        print(filename)
    sys.stdout = original_stdout

In [None]:
original_stdout = sys.stdout

with open('./all_docs.txt', 'w') as f:
    sys.stdout = f 
    for filename in all_docs:
        print(filename)
        print('')
    sys.stdout = original_stdout