In [1]:
import os
import json

def get_data(folder_name):
    x = []
    y = []
    positions = []
    file_names = []

    for file in os.listdir(folder_name):
        if file.endswith(".txt"):
            file_name = os.path.join(folder_name, file[:-4])

            file_text = open(file_name + '.txt', encoding='utf8')
            try:
                file_truth = open(file_name + '.truth', encoding='utf8')

                try:
                    text = file_text.read()
                    truth = json.load(file_truth)
                    truth_changes = truth['changes']
                    truth_positions = truth['positions']

                    x.append(text)
                    y.append(truth_changes)
                    positions.append(truth_positions)
                    file_names.append(file[:-4])
                finally:
                    file_truth.close()
            finally:
                file_text.close()

    return x, y, positions, file_names

In [2]:
TRAINING_DIR = '../data/training'
X, y, positions, file_names = get_data(TRAINING_DIR)

In [6]:
import re


from nltk.corpus import words as corpus_words

URL_TOKEN = "_URL_"
NUMBER_TOKEN = "_LONG_NUM_"
CHAR_SEQUENCE_TOKEN = "_CHAR_SEQ_"
FILE_PATH_TOKEN = "_FILE_PATH_"
TRANSLITERATION_TOKEN = "_TRANSLITERATION_"
WORD_SPLIT_TOKEN = "_WORD_SPLIT_"
LONG_WORD_TOKEN = "_LONG_WORD_"

def contains_alnum(word):
    for character in word:
        if character.isalnum():
            return True
    return False

class BasicPreprocessor():

    def __init__(self):
        self.params = {
            "replace_long_numbers" : True,
            "long_word_threshold" : 50,
            "replace_long_char_sequences" : True,
            "replace_file_paths" : True,
            "try_split_words" : True,
            "add_split_token" : False,
        }
        self.url_regex = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        # At least five digits are considered long - 4 digits can be a year, which might be interesting on its own
        self.number_regex = re.compile("\d{5,}")
        self.unix_path_regex = re.compile('^(?:/[^/]*)*$')
        self.windows_path_regex = re.compile('^(?:[a-zA-Z]\:|\\\\[\w\.]+\\[\w.$]+)\\(?:[\w]+\\)*\w([\w.])+$')
        self.words = corpus_words.words()

    '''
        This function is to be called before chunking.
        It does some basic processing on the text as
        a whole.
    '''
    def process_text(self, text):
        # Perform URL replacement
        text = self.url_regex.sub(URL_TOKEN, text)
        # Replace long numbers with tag if specified
        if self.params["replace_long_numbers"]:
            text = self.number_regex.sub(NUMBER_TOKEN, text)
        return text

    '''
        This function is to be called after tokenization.
        It goes through the token stream and does some
        filtering, replacement, token addition etc
    '''
    def process_word_list(self, word_list):
        output_words = []
        for word_candidate in word_list:
            word_len = len(word_candidate)
            # Sequence of three characters is ok, like ...
            if word_len < 4:
                output_words.append(word_candidate)
                continue
            # Check for long sequences of characters (no letters or digits) and replace them with tag if specified
            if self.params["replace_long_char_sequences"] and not contains_alnum(word_candidate):
                output_words.append(CHAR_SEQUENCE_TOKEN)
                continue
            if word_len < 10:
                output_words.append(word_candidate)
                continue
            # For words longer than 10 chars check if they are file paths or transliterations
            if self.params["replace_file_paths"]:
                if self.unix_path_regex.match(word_candidate):
                    # Decide whether it's path or transliteration based on unicode characters
                    if all(ord(char) < 128 for char in word_candidate):
                        output_words.append(FILE_PATH_TOKEN)
                        continue
                    else:
                        output_words.append(TRANSLITERATION_TOKEN)
                        continue
                if self.windows_path_regex.match(word_candidate):
                    output_words.append(FILE_PATH_TOKEN)
                    continue
            if word_len < 15:
                output_words.append(word_candidate)
                continue
            # For words longer than 15 chars try to split them into more than two pieces
            if self.params["try_split_words"]:
                word_parts = self.try_split_word(word_candidate)
                if len(word_parts) > 2:
                    if self.params["add_split_token"]:
                        output_words.append(WORD_SPLIT_TOKEN)
                    for part in word_parts:
                        output_words.append(part)
                    continue
            # For super long words replace them with token if requested
            threshold = self.params["long_word_threshold"]
            if threshold > 0 and word_len > threshold:
                output_words.append(LONG_WORD_TOKEN)
            else:
                output_words.append(word_candidate)

        return output_words

    def try_split_word(self, word):
        candidates = word.split('-')
        length = len(candidates)
        num_in_dict = 0
        for candidate in candidates:
            if candidate in self.words:
                num_in_dict = num_in_dict + 1
        if num_in_dict >= length / 2:
            return candidates
        else:
            return [word]

In [8]:
from nltk.tokenize import word_tokenize
import numpy as np
import math

preprocessor = BasicPreprocessor()

def get_segments_merge_last(text, n, chunks, wordFilter=None, process=False):
    segments = []
    words = word_tokenize(text)
    if process:
        words = preprocessor.process_word_list(words)
    x = len(words)
    if chunks:
        n = round(x / chunks)
    n = min(n, x)
    i = 0
    for i in range(0, x-x%n-n, n):
        segments.append(' '.join(words[i:i+n]))
    segments.append(' '.join(words[i+n:]))
    if wordFilter:
        segments = [wordFilter(s) for s in segments]
    return segments

def get_sliding_words(text, n, chunks=None, wordFilter=None, process=False):
    segments = []
    words = word_tokenize(text)
    if process:
        words = preprocessor.process_word_list(words)
    x = len(words)
    if chunks:
        n = round(x / chunks)
    n = min(n, x)
    i = 0
    overlap = round(n/2)
    for i in range(0, x-n-overlap, overlap):
        segments.append(' '.join(words[i:i+n]))
    segments.append(' '.join(words[i+overlap:]))
    if wordFilter:
        segments = [wordFilter(s) for s in segments]
    return segments

def word_chunks(X, n=100, chunks=None, wordFilter=None, sliding=False, process=False):
    if sliding:
        print('Sliding word chunks...')
        return np.array([get_sliding_words(text, n, chunks, wordFilter, process) for text in X])

    print('Word chunks...')
    return np.array([get_segments_merge_last(text, n, chunks, wordFilter, process) for text in X])

In [62]:
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np

def lexical(X, feature_names=[]):
    print('Lexical features...')
    transformed = []

    for doc in X:
        segments = []

        for entry in doc:
            entry_char = list(entry)
            entry_word = word_tokenize(entry)
            entry_word_tagged = pos_tag(entry_word)
            entry_sent = sent_tokenize(entry)

            chars, char_features = lexical_chars(entry_char)
            words, word_features = lexical_words(entry_word_tagged)
            sentences, sentence_features = lexical_sentences(entry_sent)

            segments.append(chars + words + sentences)

        transformed.append(segments)

    feature_names.extend(char_features + word_features + sentence_features)

    return np.array(transformed)

def lexical_chars(chars):
    char_count = len(chars)

    char_analysis = {
        'semicolon_count': 0,
        'colon_count': 0,
        'spaces_count': 0,
        'apostrophes_count': 0,
        'parenthesis_count': 0
    }

    for char in chars:
        if char == ';': char_analysis['semicolon_count'] += 1
        if char == ':': char_analysis['colon_count'] += 1
        if char == ' ': char_analysis['spaces_count'] += 1
        if char == '\'': char_analysis['apostrophes_count'] += 1
        if char == '(': char_analysis['parenthesis_count'] += 1

    feature_names = list(char_analysis.keys())
    return [char_analysis[key]/char_count for key in feature_names], feature_names

def lexical_words(words_tagged):
    word_count = len(words_tagged)

    word_analysis = {
        'pronouns': 0,
        'prepositions': 0,
        'adjectives': 0,
        'adverbs': 0,
        'determiners': 0,
        'modals': 0,
        'nouns': 0,
        'personal_pronouns': 0,
        'verbs': 0,
        'word_len_gte_six': 0,
        'word_len_two_and_three': 0,
        'total_word_length': 0,
        'all_caps': 0,
        'capitalized': 0,
        'quotes_count': 0
    }

    for (word, tag) in words_tagged:
        if tag in ['PRP']: word_analysis['personal_pronouns'] += 1
        if tag.startswith('J'): word_analysis['adjectives'] += 1
        if tag.startswith('N'): word_analysis['nouns'] += 1
        if tag.startswith('V'): word_analysis['verbs'] += 1
        if tag in ['PRP', 'PRP$', 'WP', 'WP$']: word_analysis['pronouns'] += 1
        elif tag in ['IN']: word_analysis['prepositions'] += 1
        elif tag in ['RB', 'RBR', 'RBS']: word_analysis['adverbs'] += 1
        elif tag in ['DT', 'PDT', 'WDT']: word_analysis['determiners'] += 1
        elif tag in ['MD']: word_analysis['modals'] += 1
        if len(word) >= 6: word_analysis['word_len_gte_six'] += 1
        elif len(word) in [2, 3]: word_analysis['word_len_two_and_three'] += 1
        word_analysis['total_word_length'] += len(word)
        if word.isupper(): word_analysis['all_caps'] += 1
        if not word.isupper() and word[0].isupper(): word_analysis['capitalized'] += 1
        word_analysis['quotes_count'] += word.count('"') + word.count('``') + word.count('\'\'')


    feature_names = list(word_analysis.keys())
    return [word_analysis[key]/word_count for key in feature_names], feature_names

def lexical_sentences(sentences):
    sent_count = len(sentences)

    sent_analysis = {
        'question_sentences': 0,
        'period_sentences': 0,
        'exclamation_sentences': 0,
        'short_sentences': 0,
        'long_sentences': 0,
        'sentence_length': 0
    }

    for sent in sentences:
        if sent[len(sent) - 1] == '?': sent_analysis['question_sentences'] += 1
        if sent[len(sent) - 1] == '.': sent_analysis['period_sentences'] += 1
        if sent[len(sent) - 1] == '!': sent_analysis['exclamation_sentences'] += 1
        if len(sent) <= 100: sent_analysis['short_sentences'] += 1
        if len(sent) >= 200: sent_analysis['long_sentences'] += 1
        sent_analysis['sentence_length'] += len(sent)


    feature_names = list(sent_analysis.keys())
    return [sent_analysis[key]/sent_count for key in feature_names], feature_names

In [63]:
def pipeline(X):
    feature_names = []

    X = [preprocessor.process_text(x) for x in X]

    lexical_features = lexical(word_chunks(X, process=True), feature_names)
    
    return lexical_features, feature_names

In [64]:
XV, features = pipeline(X)

Word chunks...
Lexical features...


In [65]:
XV.shape

(2980,)

In [66]:
len(features)

26

In [67]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(XV, y, test_size=0.33, random_state=42)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [68]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
numpy.random.seed(7)

In [69]:
max_doc_length = 200
X_train = sequence.pad_sequences(X_train, maxlen=max_doc_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_doc_length)

In [70]:
# create the model
model = Sequential()
model.add(Bidirectional(LSTM(200, dropout=0.2), input_shape=(max_doc_length, len(features))))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=15, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_12 (Bidirectio (None, 400)               363200    
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 401       
Total params: 363,601
Trainable params: 363,601
Non-trainable params: 0
_________________________________________________________________
None
Train on 1996 samples, validate on 984 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x1fd9cb589e8>

In [71]:
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 63.62%
