In [1]:
# Data Import and Pre-Processing
import csv
import os
import string
import sys

import numpy as np
import pandas as pd

In [21]:
# Read the tab-separated file into two string columns ("word", "label").
# Quote handling is disabled so a token such as `"` is read literally instead of
# opening a quoted field that swallows following lines, and default NA detection is
# disabled so tokens like "null", "NA" or "nan" are kept as strings rather than NaN.
train_words_raw = pd.read_csv(
    './data/train_notypes',
    sep='\t',
    header=None,
    names=['word', 'label'],
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)
# Distinct surface forms seen in the training file.
train_vocab = set(train_words_raw['word'].unique())

In [25]:
import regex as re
from collections import namedtuple
class RegexFeatures(object):
    """Named regular-expression tests applied to a single token.

    ``PATTERNS`` maps a feature name to a compiled pattern.  ``process`` reports
    every pattern that matches a token using ``pattern.match`` (anchored at the
    start of the token only, unless the pattern itself ends in ``$``).

    The patterns use ``\\p{...}`` classes, so ``re`` must be the third-party
    ``regex`` module imported as ``re`` in this notebook, not the stdlib ``re``.
    """

    # NOTE(review): "singleDot", "singleComma", "singleQuote", "isInitCapitalWord"
    # and "endsWithDot" carry no trailing/leading anchors, so with ``match`` they
    # fire on any token that merely *starts* with the pattern (e.g. "..." matches
    # "singleDot").  Left unchanged because downstream features may rely on it —
    # confirm whether that is intended.
    PATTERNS = {
        "isInitCapitalWord": re.compile(r'^[A-Z][a-z]+'),
        "isAllCapitalWord": re.compile(r'^[A-Z][A-Z]+$'),
        "isAllSmallCase": re.compile(r'^[a-z]+$'),
        "isWord": re.compile(r'^[a-zA-Z][a-zA-Z]+$'),
        "isAlphaNumeric": re.compile(r'^\p{Alnum}+$'),
        "isSingleCapLetter": re.compile(r'^[A-Z]$'),
        "containsDashes": re.compile(r'.*--.*'),
        "containsDash": re.compile(r'.*\-.*'),
        "singlePunctuation": re.compile(r'^\p{Punct}$'),
        "repeatedPunctuation": re.compile(r'^[\.\,!\?"\':;_\-]{2,}$'),
        "singleDot": re.compile(r'[.]'),
        "singleComma": re.compile(r'[,]'),
        "singleQuote": re.compile(r'[\']'),
        "isSpecialCharacter": re.compile(r'^[#;:\-/<>\'\"()&]$'),
        "fourDigits": re.compile(r'^\d\d\d\d$'),
        "isDigits": re.compile(r'^\d+$'),
        "isNumber": re.compile(r'^((\p{N}{,2}([,]?\p{N}{3})+)(\.\p{N}+)?)$'),
        "containsDigit": re.compile(r'.*\d+.*'),
        "endsWithDot": re.compile(r'\p{Alnum}+\.$'),
        "isURL": re.compile(r'^http[s]?://'),
        "isMention": re.compile(r'^(RT)?@[\p{Alnum}_]+$'),
        "isHashtag": re.compile(r'^#\p{Alnum}+$'),
        "isMoney": re.compile(r'^\$((\p{N}{,2}([,]?\p{N}{3})+)(\.\p{N}+)?)$'),
    }

    def __init__(self):
        print("Initialized RegexFeature")

    @staticmethod
    def process(word):
        """Return ``{feature_name: True}`` for every pattern matching ``word``.

        Names of patterns that do not match are absent from the result, so a
        token matching nothing yields an empty dict.
        """
        features = dict()
        # ``items()`` works on both Python 2 and 3; ``iteritems()`` does not exist
        # on Python 3 dicts and raised AttributeError on first use.
        for k, p in RegexFeatures.PATTERNS.items():
            if p.match(word):
                features[k] = True
        return features
    
    
def classification_report_to_df(report):
    """Parse a plain-text classification report into a DataFrame.

    Parameters
    ----------
    report : str
        Multi-line report whose first line is a header (it is ignored and
        replaced by fixed column names) and whose remaining non-blank lines are
        ``<label> <precision> <recall> <f1-score> <support>`` rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``class, precision, recall, f1-score, support``.  Cell values
        are the raw strings from the report (no numeric conversion).

    Notes
    -----
    * ``avg / total`` is collapsed to ``avg/total`` as before.
    * A label made of several whitespace-separated words (``macro avg``,
      ``weighted avg``) is kept as one ``class`` value instead of producing a
      row with too many columns.
    * A row with fewer than four metric values (e.g. ``accuracy  0.80  5``) is
      right-aligned: the missing leading metric columns are filled with ``None``.
    """
    columns = ["class", "precision", "recall", "f1-score", "support"]
    n_metrics = len(columns) - 1
    rows = []
    for i, line in enumerate(report.split("\n")):
        if i == 0:
            # The report's own header is replaced by ``columns``.
            continue
        line = line.strip()
        if not line:
            continue
        if line.startswith("avg"):
            line = line.replace("avg / total", "avg/total")
        tokens = line.split()
        if len(tokens) > len(columns):
            # Everything before the last four fields belongs to the label.
            head = len(tokens) - n_metrics
            tokens = [" ".join(tokens[:head])] + tokens[head:]
        elif len(tokens) < len(columns):
            # Keep the values aligned with the right-most columns.
            padding = [None] * (len(columns) - len(tokens))
            tokens = tokens[:1] + padding + tokens[1:]
        rows.append(tuple(tokens))
    return pd.DataFrame(rows, columns=columns)


# Base directories for data files (presumably raw vs. cleaned copies — TODO confirm).
# NOTE(review): neither constant is referenced by the code visible in this notebook,
# which passes literal "data/..." paths instead.
DATA_DIR="data/data/"
CLEANED_DIR="data/cleaned/"

# One annotated token: its surface form and its tag string.
Tag = namedtuple("Tag", ["token", "tag"])


def load_sequences(filename, sep="\t", notypes=False, test_data=False, encoding="utf-8"):
    """Read a one-token-per-line file into a list of tagged sequences.

    Non-blank lines are split on ``sep`` into ``(token, tag)``; blank lines end
    the current sequence.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    sep : str
        Field separator inside a line.
    notypes : bool
        If true, keep only the first character of each tag.
    test_data : bool
        If true, every line must hold a token only; the tag is set to ``"?"``.
    encoding : str
        Text encoding of the file.  Given explicitly so the result does not
        depend on the platform's default encoding.

    Returns
    -------
    list of list of Tag
        One inner list per sequence.  Leading or repeated blank lines never
        produce empty sequences.

    Raises
    ------
    AssertionError
        If ``test_data`` is true and a line contains more than one field.
    TypeError
        If a line does not split into exactly two fields (raised by ``Tag``).
    """
    sequences = []
    seq = []
    with open(filename, encoding=encoding) as fp:
        for line in fp:
            line = line.strip()
            if not line:
                # Sequence boundary; skip it when nothing has been collected so
                # that consecutive blank lines do not yield empty sequences.
                if seq:
                    sequences.append(seq)
                    seq = []
                continue
            fields = line.split(sep)
            if test_data:
                assert len(fields) == 1, "expected a token-only line, got %r" % (line,)
                fields.append("?")
            if notypes:
                fields[1] = fields[1][0]
            seq.append(Tag(*fields))
    # The file may not end with a blank line.
    if seq:
        sequences.append(seq)
    return sequences


def load_vocab(filename):
    """Return the set of lines in ``filename`` with surrounding whitespace removed.

    Every line contributes one entry, so a blank line adds the empty string.
    """
    with open(filename) as handle:
        return {entry.strip() for entry in handle}

    
def print_transitions(trans_features):
    """Print one ``<from> -> <to> <weight>`` line per transition feature.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``.
    """
    row_format = "%-6s -> %-7s %0.6f"
    for (source, target), score in trans_features:
        row = row_format % (source, target, score)
        print(row)
        
def print_state_features(state_features):
    """Print one ``<weight> <label> <attribute>`` line per state feature.

    ``state_features`` is an iterable of ``((attr, label), weight)``.
    """
    row_format = "%0.6f %-8s %s"
    for (attribute, tag), score in state_features:
        row = row_format % (score, tag, attribute)
        print(row)
        
        
def plot_cm(y_test, y_pred, labels=[], axis=1):
    """Plot a normalised confusion matrix as a heat map and return the raw counts.

    Parameters
    ----------
    y_test, y_pred : iterable of iterables of labels
        True and predicted label sequences.  Both are flattened and paired
        position by position.
    labels : sequence
        Label order for the rows/columns.  Every label occurring in ``y_test``
        or ``y_pred`` must be listed, otherwise ``KeyError`` is raised.  The
        default is never mutated.
    axis : int
        Axis summed to obtain the normaliser.  Rows are true labels and columns
        are predictions, so ``axis=1`` divides each row by the number of true
        instances (recall on the diagonal) and ``axis=0`` divides each column
        by the number of predictions (precision on the diagonal).

    Returns
    -------
    numpy.ndarray
        The un-normalised count matrix, ``cm[true, predicted]``.

    Relies on ``np``, ``plt`` (matplotlib.pyplot) and ``sns`` (seaborn) being
    available in the notebook namespace.
    """
    from itertools import chain

    label_index = dict((label, idx) for idx, label in enumerate(labels))
    cm = np.zeros((len(labels), len(labels)))
    # chain.from_iterable flattens in linear time (sum(list_of_lists, []) is quadratic).
    flat_true = chain.from_iterable(y_test)
    flat_pred = chain.from_iterable(y_pred)
    for true_label, pred_label in zip(flat_true, flat_pred):
        cm[label_index[true_label], label_index[pred_label]] += 1

    totals = cm.sum(axis=axis, keepdims=True)
    # A label that never occurs along ``axis`` has a zero total; divide by 1 so
    # its cells show 0% instead of NaN (and no divide-by-zero warning is raised).
    totals[totals == 0] = 1

    with plt.rc_context(rc={'xtick.labelsize': 12, 'ytick.labelsize': 12,
                       'figure.figsize': (16,14)}):
        sns.heatmap(cm * 100 / totals,
                    cmap="Greys",
                    xticklabels=labels,
                    yticklabels=labels)
        plt.ylabel("True labels")
        plt.xlabel("Predicted labels")
        # Row normalisation (axis != 0) yields recall; column normalisation
        # (axis == 0) yields precision.  The titles were previously swapped.
        title = "Recall Plot"
        if axis == 0:
            title = "Precision Plot"
        plt.title(title)
    print(cm.shape)
    return cm


def _to_bio(tag):
    """Rewrite a leading ``U`` to ``B`` and a leading ``E`` to ``I``; other tags pass through."""
    if tag[:1] == "U":
        return "B%s" % tag[1:]
    if tag[:1] == "E":
        return "I%s" % tag[1:]
    return tag


def print_sequences(sequences, predictions, filename, test_data=False, notypes=False):
    """Write tokens with gold and predicted tags to ``filename``.

    Each token becomes one tab-separated line and every sequence is followed by
    a blank line.  Tags starting with ``U``/``E`` are rewritten to start with
    ``B``/``I`` before writing.

    Parameters
    ----------
    sequences : iterable of iterables of (token, tag)
        Gold sequences.
    predictions : iterable of iterables of str
        Predicted tags, aligned with ``sequences``.
    filename : str
        Output path; the file is created or truncated and written as UTF-8 with
        ``\\n`` line endings on every platform.
    test_data : bool
        If true, write ``token<TAB>prediction``; otherwise
        ``token<TAB>gold<TAB>prediction``.
    notypes : bool
        If true, only the first character of each tag is written.
    """
    # Text mode plus ``fp.write`` replaces the Python 2 ``print >> fp`` statements
    # and the "wb+" mode, both of which raise TypeError on Python 3.
    with open(filename, "w", encoding="utf-8", newline="\n") as fp:
        for seq, pred in zip(sequences, predictions):
            for t, p in zip(seq, pred):
                token, tag = t
                tag = _to_bio(tag)
                p = _to_bio(p)
                if notypes:
                    # Slicing keeps an empty tag empty instead of raising IndexError.
                    tag = tag[:1]
                    p = p[:1]
                if test_data:
                    line = "\t".join((token, p))
                else:
                    line = "\t".join((token, tag, p))
                fp.write(line + "\n")
            fp.write("\n")
    print("Done")

In [28]:
# Matches runs of punctuation and/or whitespace; used to normalise phrases.
# Needs the third-party ``regex`` module (imported as ``re``) for ``\p{Punct}``.
WORD_SPLITTER = re.compile(r'[\p{Punct}\s]+')


class DictionaryFeatures:
    """Dictionary (gazetteer) membership features for tokens and hashtags.

    Every regular file in ``dictDir`` is one dictionary with one entry per
    line.  Entries are lower-cased and punctuation/whitespace runs are collapsed
    to a single space.  Two indexes are built:

    * ``word2dictionaries``: normalised phrase -> tab-joined dictionary indices
    * ``word2hashtagdictionaries``: the same phrase with all separators removed
      -> tab-joined dictionary indices

    Indices refer to positions in ``self.dictionaries`` (the file names).
    """

    # Largest number of neighbouring tokens added on either side of the target.
    MAX_WINDOW_SIZE = 6

    def __init__(self, dictDir, encoding="utf-8"):
        """Load every dictionary file found in ``dictDir``.

        ``encoding`` is the text encoding of the dictionary files.
        """
        self.word2dictionaries = {}
        self.word2hashtagdictionaries = {}
        self.dictionaries = []
        # Sorted so that dictionary indices are reproducible between runs.
        for d in sorted(os.listdir(dictDir)):
            path = os.path.join(dictDir, d)
            # Skip before registering the name: previously '.svn' was appended to
            # ``self.dictionaries`` without advancing the index counter, which
            # shifted the name of every dictionary read after it.
            if d == '.svn' or not os.path.isfile(path):
                continue
            sys.stderr.write("read dict %s\n" % d)
            i = len(self.dictionaries)
            self.dictionaries.append(d)
            with open(path, encoding=encoding) as fp:
                for line in fp:
                    word = line.rstrip('\n')
                    word = word.strip(' ').lower()
                    word = WORD_SPLITTER.sub(" ", word)
                    word_hashtag = "".join(WORD_SPLITTER.split(word))
                    # NOTE(review): an entry ending in punctuation keeps a trailing
                    # space here ("u.s." -> "u s "), while lookups strip outer
                    # punctuation first ("u s"), so such entries look unmatchable
                    # — confirm whether the key should be stripped as well.
                    self._register(self.word2dictionaries, word, i)
                    self._register(self.word2hashtagdictionaries, word_hashtag, i)

    @staticmethod
    def _register(index, key, dict_id):
        """Record that ``key`` occurs in dictionary ``dict_id`` (empty keys are ignored)."""
        if not key:
            # Blank lines would otherwise make every pure-punctuation token
            # (which normalises to "") match this dictionary.
            return
        if key not in index:
            index[key] = str(dict_id)
        else:
            index[key] += "\t%s" % dict_id

    def _matching_dictionaries(self, words, start, end):
        """Return the names of dictionaries containing the phrase ``words[start:end]``."""
        phrase = ' '.join(words[start:end]).lower().strip(string.punctuation)
        phrase = WORD_SPLITTER.sub(" ", phrase)
        hits = self.word2dictionaries.get(phrase)
        if hits is None:
            return []
        return [self.dictionaries[int(j)] for j in hits.split('\t')]

    def GetDictFeatures(self, words, i):
        """Return dictionary features for the token at position ``i`` of ``words``.

        Features produced (each at most once, in arbitrary order):

        * ``DICT=<name>``: the token itself is in dictionary ``name``
        * ``DICTFWD[+w]=<name>``: the token plus the next ``w`` tokens
        * ``DICTBCK[-w]=<name>``: the previous ``w`` tokens plus the token
        * ``DICTWIN[w]=<name>``: ``w`` tokens on both sides plus the token

        for ``w`` in ``1..MAX_WINDOW_SIZE``; spans that fall outside ``words``
        are skipped.
        """
        features = set()
        for name in self._matching_dictionaries(words, i, i + 1):
            features.add('DICT=%s' % name)

        n_words = len(words)
        for window in range(1, self.MAX_WINDOW_SIZE + 1):
            spans = (
                ('DICTFWD[+%s]=%s', i, i + window + 1),
                ('DICTBCK[-%s]=%s', i - window, i + 1),
                ('DICTWIN[%s]=%s', i - window, i + window + 1),
            )
            for template, start, end in spans:
                # Only spans lying completely inside the sentence are looked up.
                if start > -1 and end < n_words + 1:
                    for name in self._matching_dictionaries(words, start, end):
                        features.add(template % (window, name))
        return list(features)

    def GetHashtagDictFeatures(self, word):
        """Return ``DICT_HASHTAG=<name>`` features for a ``#hashtag`` token.

        Tokens that do not start with ``#`` or are shorter than two characters
        yield an empty list.
        """
        if len(word) < 2 or word[0] != "#":
            return []
        key = word[1:].lower().strip(string.punctuation)
        hits = self.word2hashtagdictionaries.get(key)
        if hits is None:
            return []
        return list(set('DICT_HASHTAG=%s' % self.dictionaries[int(j)]
                        for j in hits.split('\t')))

In [30]:
# Load the three data splits with load_sequences: each result is a list of
# sequences, and each sequence is a list of Tag(token, tag) tuples.
# notypes=False keeps the full tag strings instead of only their first character.
# NOTE(review): "data/test" is read with test_data=False, so it is assumed to carry
# a tag column like the other two files — confirm.
train_sequences = load_sequences("data/train", sep="\t", notypes=False)
dev_sequences = load_sequences("data/dev", sep="\t", notypes=False)
test_sequences = load_sequences("data/test", sep="\t", notypes=False)

In [33]:
# Token-only view of every train and dev sequence (tags dropped).
all_sentences = [
    [tagged[0] for tagged in sentence]
    for sentence in train_sequences + dev_sequences
]

In [36]:
# Collect, per pattern name, every train/dev token matched by that RegexFeatures
# pattern, then print how many tokens each pattern caught.
other_entities = {
    name: []
    for name in (
        "isHashtag",
        "isMention",
        "isURL",
        "isMoney",
        "isNumber",
        "repeatedPunctuation",
    )
}
for sentence in all_sentences:
    for token in sentence:
        for name, matched_tokens in other_entities.items():
            if RegexFeatures.PATTERNS[name].match(token):
                matched_tokens.append(token)
for name, matched_tokens in other_entities.items():
    print(name, len(matched_tokens))

isHashtag 814
isMention 1950
isURL 955
isMoney 7
isNumber 325
repeatedPunctuation 1294


In [37]:
# Placeholder token for each collected pattern name, e.g. "isURL" -> "__isURL__".
ENTITY_MAPPINGS = dict((name, "__%s__" % name) for name in other_entities)
ENTITY_MAPPINGS

{'isHashtag': '__isHashtag__',
 'isMention': '__isMention__',
 'isURL': '__isURL__',
 'isMoney': '__isMoney__',
 'isNumber': '__isNumber__',
 'repeatedPunctuation': '__repeatedPunctuation__'}

In [39]:
def preprocess_token(x, to_lower=False):
    """Replace a token by its entity placeholder, or optionally lower-case it.

    The patterns named in ``ENTITY_MAPPINGS`` are tried in that dict's order;
    the placeholder of the first one matching ``x`` is returned.  If none
    matches, ``x`` is returned, lower-cased when ``to_lower`` is true.
    """
    for name, placeholder in ENTITY_MAPPINGS.items():
        if RegexFeatures.PATTERNS[name].match(x):
            return placeholder
    return x.lower() if to_lower else x

In [40]:
# Train + dev sentences as lists of normalised tokens (entity placeholders,
# everything else lower-cased); this is the corpus handed to Word2Vec.
word2vec_sentences = [
    [preprocess_token(tagged[0], to_lower=True) for tagged in sentence]
    for sentence in train_sequences + dev_sequences
]

In [43]:
import multiprocessing

from gensim.models import word2vec

# Skip-gram (sg=1) embeddings with negative sampling (hs=0, negative=10),
# 50 dimensions, context window 10, every token kept (min_count=1), 5 passes.
# ``workers`` must be a positive thread count.  gensim does not treat -1 as
# "all cores": with workers=-1 no worker threads are started, so no training
# takes place and the vectors keep their random initial values.
model = word2vec.Word2Vec(
    word2vec_sentences,
    size=50,
    window=10,
    sg=1,
    hs=0,
    min_count=1,
    negative=10,
    workers=multiprocessing.cpu_count(),
    iter=5,
)

In [None]:
# Model Start

In [125]:
import tensorflow as tf
# Start from an empty default graph (TF1 graph API) so the graph-building cells
# below can be re-run; presumably this avoids clashes with variables such as
# "W" and "b" left over from a previous run — TODO confirm.
tf.reset_default_graph()

In [126]:
# Model Params

# Step size handed to AdamOptimizer.
learning_rate = 0.01
# Number of iterations of the outer training loop (each iterates over all batches).
train_epoch = 10000
# Size of the last axis of the `inputs` placeholder, i.e. features per token.
# NOTE(review): the Word2Vec model earlier in the notebook is built with size=50;
# how x_ is derived is not visible here — confirm that 10 is the intended width.
input_size = 10
# Batch size passed to batch(...) in the training loop.
batch_size = 100
# Hidden units of each LSTM cell (forward and backward each get this many).
num_units = 512
# Number of tag classes scored per token by the output projection and the CRF.
num_classes = 5

In [127]:
# Placeholders

# Per-token feature vectors, shape [batch, max_time, input_size]
# (batch-major, as consumed by bidirectional_dynamic_rnn with its defaults).
inputs = tf.placeholder(tf.float32, [None, None, input_size], name='inputs')
# Integer tag ids, shape [batch, max_time].
labels = tf.placeholder(tf.int32, [None, None], name='labels')
# Padded length of the current batch; used to reshape the flat scores back to 3-D.
batch_seq_len = tf.placeholder(tf.int32)
# Length of each sequence in the batch, shape [batch]; fed to the RNN and the CRF ops.
org_seq_len = tf.placeholder(tf.int32, [None])

In [128]:
# Bi-LSTM Cell
# Build the tagger graph: BiLSTM encoder -> per-token linear projection -> CRF.
with tf.name_scope("BiLSTM"):
    # One LSTM cell per direction, each with num_units hidden units.
    with tf.variable_scope('forward'):
        lstm_fw = tf.nn.rnn_cell.LSTMCell(num_units, forget_bias=1.0, state_is_tuple=True)
    with tf.variable_scope('backward'):
        lstm_bw = tf.nn.rnn_cell.LSTMCell(num_units, forget_bias=1.0, state_is_tuple=True)
    
    # sequence_length=org_seq_len passes the per-sequence lengths to the RNN;
    # `states` (the final cell states) is not used below.
    (output_fw, output_bw), states = tf.nn.bidirectional_dynamic_rnn(cell_fw=lstm_fw,
                                                                     cell_bw=lstm_bw,
                                                                     inputs=inputs,
                                                                     sequence_length=org_seq_len,
                                                                     dtype=tf.float32,
                                                                     scope="BiLSTM")



# Concatenate forward and backward outputs on the feature axis -> last dim 2 * num_units.
outputs = tf.concat([output_fw, output_bw], axis=2)

# FC: linear projection from the 2 * num_units features to num_classes scores.
W = tf.get_variable("W", [2 * num_units, num_classes], dtype=tf.float32)
b = tf.get_variable("b", [num_classes], dtype=tf.float32, initializer=tf.zeros_initializer())

# Flatten batch and time so one matmul scores every token, then restore the
# [batch, batch_seq_len, num_classes] layout.
outputs_flat = tf.reshape(outputs, [-1, 2 * num_units])
pred = tf.matmul(outputs_flat, W) + b
scores = tf.reshape(pred, [-1, batch_seq_len, num_classes])

# CRF
# NOTE: despite its name, `log_loss` holds the value returned by
# crf_log_likelihood (a log-likelihood); it is negated and averaged to form the loss.
log_loss, trans_params = tf.contrib.crf.crf_log_likelihood(scores, labels, org_seq_len)
loss = tf.reduce_mean(-log_loss)

# viterbi Seq, score: decoded tag sequence and its score, using the transition
# parameters returned by crf_log_likelihood.
viterbi_seq, viterbi_score = tf.contrib.crf.crf_decode(scores, trans_params, org_seq_len)

# Train Ops: Adam on the negative log-likelihood.
train_opt = tf.train.AdamOptimizer(learning_rate)
train_op = train_opt.minimize(loss)

# Saver: used by the training cell to write the './model_crf' checkpoint.
saver = tf.train.Saver()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [None]:
# Train into Session
#
# Runs `train_epoch` passes over the batches produced by batch(...), applying
# one optimiser step per batch, and saves the variables to './model_crf'.
# `batch`, `x_`, `y_` and `seq_len_train` are not defined in the cells shown
# here and must already exist in the kernel.

# Take From Batch
# Initial values only; the inner loop rebinds these names on every batch.
batch_inputs = []
batch_labels = []
batch_seq_lengths = 0

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for i in range(train_epoch):
        for batch_inputs, batch_labels, batch_len, batch_seq_lengths in batch(x_, y_, seq_len_train, batch_size, input_size):
            tf_viterbi_seq, _ = sess.run([viterbi_seq, train_op],
                                        feed_dict={inputs: batch_inputs,
                                                  labels: batch_labels,
                                                  batch_seq_len:batch_len,
                                                  org_seq_len: batch_seq_lengths})

            # Report token accuracy for every batch of every 30th epoch.
            if i % 30 == 0:
                # mask[r, t] is True for real tokens (t < length of sequence r), so
                # padding positions are not counted.  `np.arange` replaces the
                # misspelt `np.arrange`, which raised AttributeError.
                mask = (np.expand_dims(np.arange(batch_len), axis=0) < np.expand_dims(batch_seq_lengths, axis=1))
                total_labels = np.sum(batch_seq_lengths)
                correct_labels = np.sum((np.asarray(batch_labels) == tf_viterbi_seq) * mask)
                accuracy = 100.0 * correct_labels / float(total_labels)
                print("Epoch ", i, " Accuracy: ", accuracy)

    saver.save(sess, './model_crf')

In [None]:
# Test Accuracy and Pred
#
# Restores the trained checkpoint and decodes the test batches once.
# `batch`, `x_t`, `y_t`, `seq_len_test` and `batch_test_size` are not defined in
# the cells shown here and must already exist in the kernel.

# Take From Batch
# Initial values only; the loop rebinds the names it uses on every batch.
batch_test = []
batch_test_labels = []
batch_seq_t_lengths = 0

with tf.Session() as sess:
    # Load the weights written by the training cell.  Running
    # global_variables_initializer() here would score a randomly initialised model.
    saver.restore(sess, './model_crf')

    overall_correct = 0
    overall_total = 0
    # A single pass over the test data; evaluation does not need `train_epoch` repeats.
    for batch_inputs, batch_labels, batch_len, batch_seq_t_lengths in batch(x_t, y_t, seq_len_test, batch_test_size, input_size):
        batch_labels = np.asarray(batch_labels)
        tf_viterbi_seq = sess.run(viterbi_seq,
                                    feed_dict={inputs: batch_inputs,
                                              labels: batch_labels,
                                              batch_seq_len:batch_len,
                                              org_seq_len: batch_seq_t_lengths})

        # mask[r, t] is True for real tokens of sequence r; it must be built from
        # the lengths of *this* test batch (not the training variable).
        mask = (np.expand_dims(np.arange(batch_len), axis=0) < np.expand_dims(batch_seq_t_lengths, axis=1))
        total_labels = np.sum(batch_seq_t_lengths)
        correct_labels = np.sum((batch_labels == tf_viterbi_seq) * mask)
        accuracy = 100.0 * correct_labels / float(total_labels)
        overall_correct += correct_labels
        overall_total += total_labels
        print("Test Accuracy: ", accuracy)
        print("Label: ", batch_labels[0].astype(int))
        print("Pred: ", tf_viterbi_seq[0])

    # Token-level accuracy over all test batches together.
    if overall_total:
        print("Overall Test Accuracy: ", 100.0 * overall_correct / float(overall_total))