In [1]:
import sys
print(sys.executable)

/Users/nuhil/miniconda3/envs/nlp/bin/python


### Linguistics Operations

In [2]:
import spacy
from spacy import displacy

nlp = spacy.load('en')

**Dependency Parse Tree**

In [3]:
doc = nlp(u'He lost \$100K in the share market in last couple of months')
print(doc.text)
for nc in doc.noun_chunks:
    print(nc)
displacy.render(doc, style='dep', jupyter=True, options={'distance' : 100, 'compact' : False})
for t in doc:
    print(t)

He lost \$100K in the share market in last couple of months
He
\$100K
the share market
last couple
months


He
lost
\$100
K
in
the
share
market
in
last
couple
of
months


**Named entity recognition**

In [4]:
displacy.render(doc, style='ent', jupyter=True)

**Dependency Children**

In [5]:
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
          [child for child in token.children])

He nsubj lost VERB []
lost ROOT lost VERB [He, K, in]
\$100 compound K PROPN []
K dobj lost VERB [\$100, in]
in prep K PROPN [market]
the det market NOUN []
share compound market NOUN []
market pobj in ADP [the, share]
in prep lost VERB [couple]
last amod couple NOUN []
couple pobj in ADP [last, of]
of prep couple NOUN [months]
months pobj of ADP []


**Training the Entity Recogniser**

In [6]:
doc = nlp(u'I got flu.')
displacy.render(doc, style='ent', jupyter=True)

In [7]:
import random
from pathlib import Path

**Annotation of Custom Entity**

In [8]:
# Helper Function

# text = 'My son is in depression because of his grades'
# look = 'depression'
# if text.find(look) != -1:    
#     start = text.find(look)
#     end = text.find(look)+len(look)

# print(start, end, text[start:end])

# I have the flu
# I got a fever
# He was identified as diabetic positive after the blood test
# My daughter is having severe toothache 
# Heart disease is what I was afraid of and eventually got that
# My son is in depression because of his grades

**Train and Save the Custom Model**

In [9]:
# new entity label
LABEL = 'HEALTH'

# training data
# Note: If you're using an existing model, make sure to mix in examples of
# other entity types that spaCy correctly recognized before. Otherwise, your
# model might learn the new type, but "forget" what it previously knew.
# https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
TRAIN_DATA = [    
    ("I got flu.", {
        'entities': [(6, 9, 'HEALTH')]
    }),
    ("He has the flu.", {
        'entities': [(7, 14, 'HEALTH')]
    }),    
    ("He is sick.", {
        'entities': [(6, 10, 'HEALTH')]
    }),    
    ("I got a fever", {
        'entities': [(8, 13, 'HEALTH'), (6, 13, 'HEALTH')]
    }),        
    ("He was identified as diabetic positive after the blood test", {
        'entities': [(21, 29, 'HEALTH'), (49, 59, 'HEALTH'), (45, 59, 'HEALTH')]
    }),            
    ("My daughter is having severe toothache", {
        'entities': [(29, 38, 'HEALTH')]
    }),   
    ("Heart disease is what I was afraid of and eventually got that", {
        'entities': [(0, 13, 'HEALTH'), (6, 13, 'HEALTH')]
    }),       
    ("My son is in depression because of his grades", {
        'entities': [(13, 23, 'HEALTH')]
    }),    
    ('Who is Shaka Khan?', {
        'entities': [(7, 17, 'PERSON')]
    }),    
]


# @plac.annotations(
#     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
#     new_model_name=("New model name for model meta.", "option", "nm", str),
#     output_dir=("Optional output directory", "option", "o", Path),
#     n_iter=("Number of training iterations", "option", "n", int))
def add_new_dre(model=None, new_model_name='custom_model', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.load('en')  # create blank Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # test the trained model
    test_text = 'He has the flu. Andrew got flu. I lost $10 bucks.'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)


add_new_dre('en', 'custom_model', 'model')

Loaded model 'en'


  ret = sqrt(sqnorm)


{'ner': 42.270778634674109}
{'ner': 35.745225217444521}
{'ner': 30.057589331588179}
{'ner': 24.21131648940182}
{'ner': 26.694831935074767}
{'ner': 21.809140086647652}
{'ner': 28.685883187580792}
{'ner': 24.399925323408343}
{'ner': 20.425318012377694}
{'ner': 23.608125195151555}
{'ner': 22.002870753829114}
{'ner': 22.756646265962566}
{'ner': 17.117172025916926}
{'ner': 17.920637648543963}
{'ner': 22.060565742804712}
{'ner': 15.519913633222156}
{'ner': 10.782497671418788}
{'ner': 20.20362781767281}
{'ner': 16.393087865454085}
{'ner': 13.660432046351721}
Entities in 'He has the flu. Andrew got flu. I lost $10 bucks.'
HEALTH the flu
PERSON Andrew
MONEY $10
Saved model to model
Loading from model
HEALTH the flu
PERSON Andrew
MONEY $10


**Checking the Newly Trained Entity Recogniser**

In [10]:
nlp = spacy.load('model')
doc = nlp(u'I have the flu. I have $1000 loan. Shaka Khan is sick.')
displacy.render(doc, style='ent', jupyter=True)

In [11]:
doc = nlp(u'I got flu.')
displacy.render(doc, style='ent', jupyter=True)

In [12]:
# Identifying voice of sentence
# Helper function

# def voice_of_sentence(doc):
#     sentence_type = ""
#     for token in doc:
#         if token.dep_ == 'nsubj':
#             sentence_type = 'active'
#             break
#         elif token.dep_ == 'nsubjpass':
#             sentence_type = 'passive'

#     return sentence_type        

**DRE with Noun-Phrase Relation Identification**
Here, we extract DREs and then check the
dependency tree to find the *noun phrase* or possible *subject* it is referring to. By other word to identify `<Subjec, Predicate, Object>` Triplets

In [13]:
def dre_rel(model, dre_type, texts):
    nlp = spacy.load(model)

    for text in texts:
        doc = nlp(text)
        
#         displacy.render(doc, style='dep', jupyter=True, options={'distance' : 100, 'compact' : True})
        displacy.render(doc, style='ent', jupyter=True)
        
#         for token in doc:
#             print(token.text, token.dep_, token.head.text, token.head.pos_,
#                   [child for child in token.children])          
        
        relations = extract_dre_relations(doc, dre_type)
#         print(relations)
        for r1, r2, r3 in relations:
            
            entity_type = check_ent_in_noun_phrase(r3, dre_type)
            entity_type = 'LOCATION' if entity_type in ['LOC', 'GPE', 'ORG', 'FAC'] else entity_type
            print('Possible Subject: {}\t Noun Phrase: {:<10}\t Entity Type: {}\t Entity: {}'.format(str(r1), r2.text, entity_type, r3.text))


**Identifying possible subject**   
Helper function

In [14]:
PERSONS = ['I', 'ME', 'MY', 'MINE', 'YOU', 'YOUR', 'YOURS', 'HE', 'SHE', 
           'HIS', 'HER', 'HIM', 'THEY', 'THEM', 'THEMSELVES', 'OUR', 'WE']

In [15]:
def get_possible_subject(root):
    
    subject = [w for w in root.lefts if w.dep_ == 'nsubj']
    
    
    if subject:
        subject = subject[0]
    else:
        subject = [w for w in root.lefts if w.dep_ == 'nsubjpass']
        if subject:
            subject = subject[0]
        else:    
            possible_subjects = []
            for possible_subject in root.children:
                if possible_subject.dep_ == 'dobj':
                    possible_subjects.append(possible_subject)
                else:
                    possible_subjects.append('')
            subject = possible_subjects[0]      
     
    if type(subject) != str and subject.ent_type_ == 'PERSON':
        return subject
    else:        
        # Check if the subject is a Human
        for person in PERSONS:
            if person in str(subject).upper():        
                return subject   
        
    return 'Not a First/Second Person'    

In [16]:
# Split noun phrase again into ents
def check_ent_in_noun_phrase(w, dre_type=None):
    if type(w) != str and w.ent_type_ != '':
        return w.ent_type_ if w.ent_type_ in dre_type else ''
    else:
        doc = nlp(str(w))
        for token in doc:
            if token.ent_type_ != '':
                return token.ent_type_ if token.ent_type_ in dre_type else ''
    return ''        

def extract_dre_relations(doc, dre_type):
    # merge entities and noun chunks into one token
#     print("Ents + Noun Chunks")
#     print(list(doc.ents))
#     print(list(doc.noun_chunks))
    spans = list(doc.ents) + list(doc.noun_chunks)
    
    for span in spans:
        ms = span.merge()
    
#     test = filter(lambda w: check_ent_in_noun_phrase(w, dre_type), doc)
#     print('list...', list(test))
    
    relations = []
    for dre in filter(lambda w: check_ent_in_noun_phrase(w, dre_type) != '', doc):
#         print('-', dre.text, dre.dep_)    
        if dre.dep_ in ('attr', 'dobj'):
            root = dre.head
            subject = get_possible_subject(root)
            if type(subject) == str and subject == 'Not a First/Second Person':
                root = root
            else:
                root = subject
            relations.append((subject, root, dre)) 
        elif dre.dep_ == 'pobj' and dre.head.dep_ == 'prep':
            if dre.head.head.dep_ == 'pobj' and dre.head.head.head.dep_ == 'prep':
                root = dre.head.head.head.head
                if root.dep_ == 'acomp':
                    root = root.head
                subject = get_possible_subject(root)                 
                relations.append((subject, root, dre))                  
            elif dre.head.head.dep_ in ('attr', 'ROOT'):
                root = dre.head.head
                subject = get_possible_subject(root)                 
                relations.append((subject, root, dre)) 
            elif dre.head.head.head.dep_ in ('attr', 'ROOT'):
                root = dre.head.head.head
                subject = get_possible_subject(root)                 
                relations.append((subject, root, dre))                          
        elif dre.dep_ == 'appos' and dre.head.dep_ == 'pobj':
            if dre.head.head.head.dep_ == 'ROOT':
                root = dre.head.head.head
                subject = get_possible_subject(root)                 
                relations.append((subject, root, dre))
        elif dre.dep_ == 'npadvmod':
            if dre.head.head.head.dep_ == 'ROOT':
                root = dre.head.head.head
                subject = get_possible_subject(root)                 
                relations.append((subject, root, dre))
        elif dre.dep_ == 'amod':
            if dre.head.head.head.dep_ == 'ROOT':
                root = dre.head.head.head
                subject = get_possible_subject(root)                 
                relations.append((subject, root, dre))                
    return relations

**Text to be tested below**

In [17]:
texts = [
#     'I live in W Hale Street',
#     'Meet me in the Coffee Shop',
#     'My office is located at Main St. Boise, Idaho',
#     'I will go to Town Square mall',
#     'Town Square mall is a good place to hangout'
#     'Steve works for Google',
#     'Paris is the capital of France',
    
#     'Our meeting will be at 3pm in the US Bank building',
#     'I went to office yesterday',
#     'I am going to fire him if he does not reply within tomorrow',
#     'We are planing to leave for Paris on 31st December in early morning',

#     'My company lost $1 million dollar revenue in last quarter',
#     'Steve owes 500 dollars to me',
#     'I am happy with my salary of $24,500 per year',
#     'Net income was $9.4 million compared to the prior year of $2.7 million',
    
#     'I have the flu',
#     'I got fever',
#     'My son was identified as diabetic positive',
#     'My daughter is having severe toothache',
#     'My son is in depression because of his grades',
#     'Heart disease is what I was afraid of and eventually got that',

#     'Worldwide production of apples in 2013 was 90.8 million tonnes',
#     'I was not called at Starbucks by him',
#     'My son\'s name is Andrew and got the flu',
    'Okey, I will meet you in Starbucks at 7pm sharp'
]

### Test the Linguistics Approach

In [18]:
dre_rel('model', ['LOC', 'GPE', 'ORG', 'FAC','DATE', 'TIME', 'MONEY', 'HEALTH'], texts)

Possible Subject: I	 Noun Phrase: meet      	 Entity Type: LOCATION	 Entity: Starbucks


**Some Linguistics Factors to be Considered for Rule based manipulation**   
Helper constants

In [19]:
# SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
# OBJECTS = ["dobj", "dative", "attr", "oprd"]
# ADJECTIVES = ["acomp", "advcl", "advmod", "amod", "appos", "nn", "nmod", "ccomp", "complm",
#               "hmod", "infmod", "xcomp", "rcmod", "poss"," possessive"]
# COMPOUNDS = ["compound"]
# PREPOSITIONS = ["prep"]

# PERSONS = ['I', 'ME', 'MY', 'MINE', 'YOU', 'YOUR', 'YOURS', 'HE', 'SHE', 
#            'HIS', 'HER', 'HIM', 'THEY', 'THEM', 'THEMSELVES', 'OUR']

**Finding a verb with a subject otherwise expand the context**
To Do: R&D

In [20]:
# # Because the syntactic relations form a tree, every word has exactly one head.
# from spacy.symbols import nsubj, VERB

# first_sentence = "How is your son?"
# second_sentence = "Got the flu."
# context = ""

# doc = nlp(u''+second_sentence)

# displacy.render(doc, style='dep', jupyter=True, options={'distance' : 200})

# verbs = set()
# for possible_subject in doc:
#     if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
#         verbs.add(possible_subject.head)
        
# if len(verbs) == 0:
#     context = first_sentence + " " + second_sentence
#     print(context)       

# doc = nlp(u''+context.lower())
# displacy.render(doc, style='dep', jupyter=True, options={'distance' : 200})

# context = []
# for token in doc:
#     if token.dep_ == 'nsubj' or token.dep_ == 'dobj' or token.dep_ == 'ROOT':
#         context.append(str(token))
    
# print(' '.join(context))    

## Crawl and Store Data
* Amazon product reviews = 1000
* Amazon food reviews = 1000
* Hotel reviews = 1000
* Posts on medical forum = 1000
* StackOverflow posts = 1000   
**Total: 5000**

In [21]:
# import csv

# with open('/Users/nuhil/Downloads/Questions_2.csv', encoding='utf-8') as csvfile:
#     readCSV = csv.reader(csvfile, delimiter=',')
    
#     i = 0
#     for row in readCSV:
#         if i >= 1001:
#             break
#         review = row[6]
#         with open("data/so_questions/"+str(i).zfill(5)+"_so_questions.txt", "w") as f:
#             f.write(review.strip())
# #         print(i)
#         i += 1         
        

**Classify by Linguistics**

**Hand Craft the Data**

**Prepare Data for ANN Training**

Data | Disclosure/Nondisclosure | Typical Model
--- | ---
I got the flu. | 1 | 1
Flu is a dangerous disease. | 0 | 1
I like eating. | 0 | 0
Lets eat at 7pm in Pizza Hut. | 1 | 1
Flu 7pm Pizza Hut. | 0 | 1
Divorce gives you depression. | 0 | 1
I got divorced. | 1 | 1
Paris is capital of France. | 0 | 1

### A CNN + LSTM Neural Network

In [22]:
# LSTM and CNN for sequence classification in the IMDB dataset
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

from string import punctuation
from os import listdir
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Flatten

from keras.layers import Dropout

Using TensorFlow backend.


In [23]:
number_of_data_in_each_class = 20

# Remaining is the test dataset
train_with_percentage = 50 

split_file_at = int(number_of_data_in_each_class*(train_with_percentage/100))-1
train_untill = split_file_at
test_from = train_untill+1

print("Split files at: ", split_file_at)
print("Train Until: 0 -", train_untill)
print("Test From: ", test_from, "-", number_of_data_in_each_class-1)

Split files at:  9
Train Until: 0 - 9
Test From:  10 - 19


In [24]:
LOCATION = ['LOC', 'GPE', 'ORG', 'FAC']
ALL_ENTITIES = LOCATION + ['PERSON', 'HEALTH', 'MONEY', 'DATE', 'TIME']

In [25]:
def modify_sentence(sen):
    doc = nlp(u''+sen)

    modified_tokens = []
    modified_tokens_dep = []
    modified_tokens_pos = []
        
    for t in doc:
        if t.ent_type_ == '':
            if t.text.upper() in PERSONS:
                modified_tokens.append('PERSON')
            else:
                modified_tokens.append(t.text)
        else:
            modified_tokens.append(t.ent_type_)
            
#     print(modified_tokens, len(modified_tokens))
    
    for t in doc:
        modified_tokens_dep.append(t.dep_)

#     print(modified_tokens_dep, len(modified_tokens_dep))
    
    
    for t in doc:
        modified_tokens_pos.append(t.pos_)
        
#     print(modified_tokens_pos, len(modified_tokens_pos))    
            
    sen_length = len(modified_tokens)

    i = 0   
    start = 0
    for token in modified_tokens:
        if token.upper() in ALL_ENTITIES:
            start = i
            break

        i = i+1

    j = 0
    end = None
    for token in list(reversed(modified_tokens)):
        if token.upper() in ALL_ENTITIES:
            end = j
            break

        j = j+1

    end = (sen_length - (end)) if end != None else 0    

    return start, end, modified_tokens, modified_tokens_dep, modified_tokens_pos

**Getting Boundary and Different Representations in Between that**

In [26]:
sentence = 'Ok, I will meet you in Starbucks at 7pm sharp'
start, end, modified_tokens, modified_tokens_dep, modified_tokens_pos = modify_sentence(sentence)
print(modified_tokens[start: end])
print(modified_tokens_dep[start: end])
print(modified_tokens_pos[start: end])

['PERSON', 'will', 'meet', 'PERSON', 'in', 'GPE']
['nsubj', 'aux', 'ROOT', 'dobj', 'prep', 'pobj']
['PRON', 'VERB', 'VERB', 'PRON', 'ADP', 'PROPN']


In [27]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords


def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text


# load a doc and add to vocab
def add_doc_to_vocab(filename, vocab):
    doc = load_doc(filename)
    
    start, end, modified_tokens, modified_tokens_dep, modified_tokens_pos = modify_sentence(doc)
    
    # print(modified_tokens)
    vocab.update(modified_tokens)
    
    # print(modified_tokens_dep)    
    vocab.update(modified_tokens_dep)
    
    # print(modified_tokens_pos)    
    vocab.update(modified_tokens_pos)    
    
    # print('-'*50)
    
# load all docs in a directory
def process_docs(directory, vocab, is_trian):
    # walk through all files in the folder
    for filename in sorted(listdir(directory)):
        if not filename.endswith(".txt"):
            continue

        # skip any posts in the train set
        # skip any posts in the test set
        file_number = int(filename[:5])
        if is_trian and file_number > train_untill:
            continue
                            
        if not is_trian and file_number < test_from:
            continue               
            
        # create the full path of the file to open
        path = directory + '/' + filename
        # add doc to vocab
        add_doc_to_vocab(path, vocab)
        
        
# define vocab
vocab = Counter()
# add all docs to vocab
process_docs('data/non_disclosure', vocab, True)
process_docs('data/disclosure', vocab, True)

# print the size of the vocab
print("Vocabulary length: ", len(vocab))
# print the top words in the vocab
print("Top 100 vocabulary with count: ", vocab.most_common(100))        

Vocabulary length:  135
Top 100 vocabulary with count:  [('NOUN', 39), ('VERB', 36), ('ADP', 24), ('prep', 21), ('PROPN', 21), ('ROOT', 20), ('pobj', 20), ('nsubj', 18), ('ADJ', 15), ('PERSON', 15), ('MONEY', 13), ('compound', 13), ('det', 10), ('DET', 10), ('aux', 10), ('PRON', 10), ('ORG', 10), ('NUM', 8), ('nummod', 7), ('to', 7), ('DATE', 6), ('amod', 6), ('is', 6), ('the', 5), ('dobj', 5), ('ADV', 5), ('in', 5), ('npadvmod', 4), ('advcl', 4), ('GPE', 4), ('TIME', 4), ('on', 3), ('attr', 3), ('relcl', 3), ('acomp', 3), ('SYM', 3), ('not', 3), ('neg', 3), ('appos', 3), (',', 3), ('a', 3), ('punct', 3), ('PUNCT', 3), ('PART', 3), ('poss', 3), ('if', 3), ('mark', 3), ('LOC', 3), ('will', 3), ('of', 2), ('nmod', 2), ('am', 2), ('nsubjpass', 2), ('auxpass', 2), ('does', 2), ('advmod', 2), ('live', 2), ('cc', 2), ('conj', 2), ('CCONJ', 2), ('are', 2), ('at', 2), ('FAC', 2), ('for', 2), ('xcomp', 2), ('Turing', 1), ('was', 1), ('one', 1), ('persons', 1), ('who', 1), ('worked', 1), ('first

In [28]:
# keep tokens with a min occurrence
min_occurane = 1
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print("Tokens that appeared more than twice: ", len(tokens))

def save_list(lines, filename):
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # open file
    file = open(filename, 'w')
    # write text
    file.write(data)
    # close file
    file.close()

# save tokens to a vocabulary file
save_list(tokens, 'data/vocab/vocab.txt')

Tokens that appeared more than twice:  135


In [29]:
# load doc into memory
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

# turn a doc into clean tokens
def clean_doc(doc, vocab, modify_type=None):
    
    # To Do: Check from Saved Vocab
    tokens = ' '.join(tokens)
    return tokens

# load all docs in a directory
def process_docs(directory, vocab, is_trian, modify_type=None):
    documents = list()
    documents_dep = list()    
    documents_pos = list()   
    
    # walk through all files in the folder
    for filename in sorted(listdir(directory)):
        if not filename.endswith(".txt"):
            continue        
            
        # skip any posts in the test set
        file_number = int(filename[:5])
        if is_trian and file_number > train_untill:
            continue
                            
        if not is_trian and file_number < test_from:
            continue                        
            
        # create the full path of the file to open
        path = directory + '/' + filename
        
        # load the doc            
        doc = load_doc(path)
        start, end, modified_tokens, modified_tokens_dep, modified_tokens_pos = modify_sentence(doc)
        
        # clean doc
        if modify_type == 'dep':        
            mod_sentence = ' '.join(modified_tokens_dep[start:end])
            # print(mod_sentence)
            documents_dep.append(mod_sentence)
        elif modify_type == 'pos':      
            mod_sentence = ' '.join(modified_tokens_pos[start:end])
            # print(mod_sentence)
            documents_pos.append(mod_sentence)            
        else:    
            mod_sentence = ' '.join(modified_tokens[start:end])
            # print(mod_sentence)
            documents.append(mod_sentence)
            
    if modify_type == 'dep':
        return documents_dep
    elif modify_type == 'pos':
        return documents_pos        
    else:    
        return documents

In [30]:
# load the vocabulary
vocab_filename = 'data/vocab/vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
print("Total vocabulary: ", len(vocab))

Total vocabulary:  135


In [31]:
positive_docs = process_docs('data/disclosure', vocab, True)
negative_docs = process_docs('data/non_disclosure', vocab, True)
train_docs = negative_docs + positive_docs
print(train_docs, len(train_docs), '\n')

positive_docs_dep = process_docs('data/disclosure', vocab, True, 'dep')
negative_docs_dep = process_docs('data/non_disclosure', vocab, True, 'dep')
train_docs_dep = negative_docs_dep + positive_docs_dep
print(train_docs_dep, len(train_docs_dep), '\n')

positive_docs_pos = process_docs('data/disclosure', vocab, True, 'pos')
negative_docs_pos = process_docs('data/non_disclosure', vocab, True, 'pos')
train_docs_pos = negative_docs_pos + positive_docs_pos
print(train_docs_pos, len(train_docs_pos), '\n')

all_docs_in_dif_rep = train_docs + train_docs_dep + train_docs_pos
# print(all_docs_in_dif_rep, len(all_docs_in_dif_rep), '\n')

# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(all_docs_in_dif_rep)

# sequence encode
# To Do: Check Different Encoding Scope
# encoded_docs = tokenizer.texts_to_sequences(train_docs)
encoded_docs = tokenizer.texts_to_sequences(all_docs_in_dif_rep)
print(encoded_docs, len(encoded_docs))

['DATE DATE', 'MONEY MONEY MONEY', 'MONEY MONEY MONEY', 'PERSON PERSON nothing morning MONEY MONEY MONEY', 'PERSON know , bankruptcy is a legal process which happens when a client does not have enough money', 'GPE', 'Money flu GPE', 'MONEY MONEY', '', 'ORG', 'PERSON live in LOC LOC LOC', 'PERSON in ORG ORG ORG', 'PERSON office is located at ORG ORG ORG , GPE', 'PERSON will go to FAC FAC', 'PERSON works for ORG', 'PERSON will keep MONEY MONEY', 'PERSON meeting will be at TIME TIME in the ORG ORG', 'PERSON went to HEALTH DATE', 'PERSON am going to fire PERSON if PERSON does not reply within DATE', 'PERSON are planing to leave for GPE on DATE DATE in TIME TIME'] 20 

['det npadvmod', 'nmod nummod dobj', 'quantmod nummod nsubjpass', 'compound ROOT appos npadvmod nmod nummod appos', 'nsubj parataxis punct nsubj ROOT det amod attr nsubj relcl advmod det nsubj aux neg advcl amod dobj', 'nsubj', 'compound compound ROOT', 'nummod pobj', '', 'compound', 'nsubj ROOT prep compound compound pobj', 

In [32]:
# pad sequences
# get the biggest post as per its contents
# max_length = max([len(s.split()) for s in train_docs])
max_length = max([len(s.split()) for s in all_docs_in_dif_rep])

# define training data
# Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
Xtrain = pad_sequences(encoded_docs[:20], maxlen=max_length, padding='post')
print("Total number of Training Data: ", len(Xtrain))


Xtrain_dep = pad_sequences(encoded_docs[20:40], maxlen=max_length, padding='post')
print("Total number of Training Data dep: ", len(Xtrain_dep))


Xtrain_pos = pad_sequences(encoded_docs[40:60], maxlen=max_length, padding='post')
print("Total number of Training Data pos: ", len(Xtrain_pos))

# define training labels
# put 0s for the first <split_file_at> entries and 1s for last <split_file_at> entries.
# Because, in the <train_docs> list we have the public docs first
# and private docs later.
# From now on we are assuming 0 for public data, 1 for private data
ytrain = array([0 for _ in range((train_untill+1))] + [1 for _ in range((train_untill+1))])
print("Total number of Training labels: ", len(ytrain))



Total number of Training Data:  20
Total number of Training Data dep:  20
Total number of Training Data pos:  20
Total number of Training labels:  20


In [33]:
# load all test posts
positive_docs = process_docs('data/disclosure', vocab, False)
negative_docs = process_docs('data/non_disclosure', vocab, False)
test_docs = negative_docs + positive_docs
print("Total number of Test docs: ", len(test_docs))

positive_docs_dep = process_docs('data/disclosure', vocab, False, 'dep')
negative_docs_dep = process_docs('data/non_disclosure', vocab, False, 'dep')
test_docs_dep = negative_docs_dep + positive_docs_dep
print("Total number of Test docs dep: ", len(test_docs_dep))

positive_docs_pos = process_docs('data/disclosure', vocab, False, 'pos')
negative_docs_pos = process_docs('data/non_disclosure', vocab, False, 'pos')
test_docs_pos = negative_docs_pos + positive_docs_pos
print("Total number of Test docs pos: ", len(test_docs_pos))

all_test_docs_in_dif_rep = test_docs + test_docs_dep + test_docs_pos

# sequence encode
encoded_docs = tokenizer.texts_to_sequences(all_test_docs_in_dif_rep)
# pad sequences
# Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
Xtest = pad_sequences(encoded_docs[:20], maxlen=max_length, padding='post')

Xtest_dep = pad_sequences(encoded_docs[20:40], maxlen=max_length, padding='post')

Xtest_pos = pad_sequences(encoded_docs[40:60], maxlen=max_length, padding='post')

# define test labels
ytest = array([0 for _ in range((number_of_data_in_each_class - test_from))] + [1 for _ in range((number_of_data_in_each_class - test_from))])
print("Total number of Test labels: ", len(ytest))

Total number of Test docs:  20
Total number of Test docs dep:  20
Total number of Test docs pos:  20
Total number of Test labels:  20


In [34]:
# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary size (largest integer value): ", vocab_size)

Vocabulary size (largest integer value):  88


**1. Using both CNN and LSTM**   
CNN will give the knowledge of spatial features to the LSTM as sequence

In [35]:
# model = Sequential()
# model.add(Embedding(vocab_size, 100, input_length=max_length))
# model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
# model.add(MaxPooling1D(pool_size=2))
# model.add(LSTM(100))
# model.add(Dropout(0.2))
# model.add(Dense(1, activation='sigmoid'))

# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# print(model.summary())

# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# # fit network
# model.fit(Xtrain, ytrain, epochs=20, verbose=2)

# # evaluate
# loss, acc = model.evaluate(Xtest, ytest, verbose=1)
# print('Test Accuracy: %f' % (acc*100))

**2. Using CNN Only with Dropout**

In [36]:
# model = Sequential()
# model.add(Embedding(vocab_size, 32, input_length=max_length))
# model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
# model.add(MaxPooling1D(pool_size=2))
# model.add(Flatten())
# model.add(Dense(10, activation='relu'))
# model.add(Dropout(0.2))
# model.add(Dense(1, activation='sigmoid'))

# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# print(model.summary())

# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# # fit network
# model.fit(Xtrain, ytrain, epochs=20, verbose=2)

# # evaluate
# loss, acc = model.evaluate(Xtest, ytest, verbose=1)
# print('Test Accuracy: %f' % (acc*100))

**3. Using Multichannel CNN**

In [37]:
from keras.layers import Input
from keras.layers.merge import concatenate
from keras.models import Model
from keras.utils.vis_utils import plot_model

# fix random seed for reproducibility
numpy.random.seed(7)

# channel 1
inputs1 = Input(shape=(max_length,))
embedding1 = Embedding(vocab_size, 100)(inputs1)
conv1 = Conv1D(filters=32, kernel_size=2, activation='relu')(embedding1)
drop1 = Dropout(0.5)(conv1)
pool1 = MaxPooling1D(pool_size=2)(drop1)
flat1 = Flatten()(pool1)

# channel 2
inputs2 = Input(shape=(max_length,))
embedding2 = Embedding(vocab_size, 100)(inputs2)
conv2 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding2)
drop2 = Dropout(0.5)(conv2)
pool2 = MaxPooling1D(pool_size=2)(drop2)
flat2 = Flatten()(pool2)

# channel 3
inputs3 = Input(shape=(max_length,))
embedding3 = Embedding(vocab_size, 100)(inputs3)
conv3 = Conv1D(filters=32, kernel_size=2, activation='relu')(embedding3)
drop3 = Dropout(0.5)(conv3)
pool3 = MaxPooling1D(pool_size=2)(drop3)
flat3 = Flatten()(pool3)

# merge
merged = concatenate([flat1, flat2, flat3])
# interpretation
dense1 = Dense(10, activation='relu')(merged)
outputs = Dense(1, activation='sigmoid')(dense1)
model = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs)
# compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summarize
# print(model.summary())

model.fit([Xtrain,Xtrain_dep,Xtrain_pos], ytrain, epochs=20, batch_size=5)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x11719fd68>

In [38]:
# evaluate model on test dataset dataset
loss, acc = model.evaluate([Xtest,Xtest_dep,Xtest_pos], ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 85.000002


**Using Multichannel LSTM Network**

In [40]:
from keras.layers import Input
from keras.layers.merge import concatenate
from keras.models import Model

# fix random seed for reproducibility
# numpy.random.seed(7)

max_length = 18
vocab_size = 42560

# channel 1
inputs1 = Input(shape=(max_length,))
embedding1 = Embedding(vocab_size, 100)(inputs1)
lstm1 = LSTM(100, dropout=0.2)(embedding1)

# channel 2
inputs2 = Input(shape=(max_length,))
embedding2 = Embedding(vocab_size, 100)(inputs2)
lstm2 = LSTM(100, dropout=0.2)(embedding2)

# channel 3
inputs3 = Input(shape=(max_length,))
embedding3 = Embedding(vocab_size, 100)(inputs3)
lstm3 = LSTM(100, dropout=0.2)(embedding3)

# merge
merged = concatenate([lstm1, lstm2, lstm3])

# dropout
# dropped = Dropout(0.2)(merged)

# interpretation
dense = Dense(100, activation='relu')(merged)
dense = Dense(10, activation='relu')(dense)
outputs = Dense(1, activation='sigmoid')(dense)

model = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs)
# compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summarize
print(model.summary())
plot_model(model, show_shapes=True, to_file='multichannel-lstm.png')

model.fit([Xtrain,Xtrain_dep,Xtrain_pos], ytrain, epochs=20, batch_size=10)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_7 (InputLayer)             (None, 18)            0                                            
____________________________________________________________________________________________________
input_8 (InputLayer)             (None, 18)            0                                            
____________________________________________________________________________________________________
input_9 (InputLayer)             (None, 18)            0                                            
____________________________________________________________________________________________________
embedding_7 (Embedding)          (None, 18, 100)       4256000     input_7[0][0]                    
___________________________________________________________________________________________

<keras.callbacks.History at 0x132beb9e8>

In [41]:
# evaluate model on test dataset dataset
loss, acc = model.evaluate([Xtest,Xtest_dep,Xtest_pos], ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 85.000002


**Make Prediction**

In [41]:
# turn a doc into clean tokens
def clean_doc(doc):
    """
    Turn a doc into clean tokens
    """
    
    doc = modify_sentence(doc)
    print(doc)

    # split into tokens by white space
    tokens = doc.split()
    
#     # remove punctuation from each token
#     table = str.maketrans('', '', punctuation)
#     tokens = [w.translate(table) for w in tokens]
    
#     people = []
#     for w in tokens:
#         if w.upper() in PERSONS:
#             people.append(w)    
    
    # remove remaining tokens that are not alphabetic
#     tokens = [word for word in tokens if word.isalpha()]
    
    # filter out stop words
#     stop_words = set(stopwords.words('english'))
#     tokens = [w for w in tokens if not w in stop_words]
    
    # filter out short tokens
    tokens = [word for word in tokens if len(word) >= 1]
    tokens = ' '.join(tokens)

    return tokens

def process_new_doc(path):
    """
    Process a single doc with post
    """
    # define a new list
    documents = list()
    
    # load doc
    doc = load_doc(path)
    # clean doc
    tokens = clean_doc(doc)
    # add to list
    documents.append(tokens)
    
    return documents

def predict_privacy(path, max_length, tokenizer, model):
    """
    Give a new unseen doc to predict it's privacy
    """
    new_doc = process_new_doc(path)
    # sequence encode
    # To Do:
    # 1. Will check the current <tokentizer> in memory
    encoded_doc = tokenizer.texts_to_sequences(new_doc)
    
    # pad sequences
    max_length = max_length
    Xpredict = pad_sequences(encoded_doc, maxlen=max_length, padding='post')
    
    # prediction
    ypredict = model.predict(Xpredict, verbose=1)
    print("Privacy Score: {0} \nRounded To: {1}".format(ypredict, round(ypredict[0,0])))
    
    return round(ypredict[0,0])

def predict_privacy_by_text(text, max_length, tokenizer, model):
    # define a new list
    documents = list()
    # clean doc
    tokens = clean_doc(text)
    # add to list
    documents.append(tokens)
    
    # sequence encode
    # To Do:
    # 1. Will check the current <tokentizer> in memory
    encoded_doc = tokenizer.texts_to_sequences(documents)
    
    # pad sequences
    max_length = max_length
    Xpredict = pad_sequences(encoded_doc, maxlen=max_length, padding='post')
    
    # prediction
    ypredict = model.predict(Xpredict, verbose=0)
    print("Privacy Score: {0} \nRounded To: {1}".format(ypredict, round(ypredict[0,0])))
    
    return round(ypredict[0,0])


In [2]:
# import nltk.data
# sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

# new_doc = 'data/predict/content.txt'
# print("\nContent is Public" if predict_privacy(new_doc, max_length, tokenizer, model) == 0.0 else "\n\x1b[31mContent is Private\x1b[0m")


In [1]:
# with open(new_doc) as f:
#     text = f.read()
#     sentences = sent_detector.tokenize(text.strip())
        
# for sentence in sentences:
#     print(sentence+"\n")
#     print("Sentence is Public" if predict_privacy_by_text(sentence, max_length, tokenizer, model) == 0.0 else "\x1b[31mSentence is Private\x1b[0m")
#     print("-"*50)