# Doc2Vec Process

We divide this process into three steps:

- **Doc2Vec Model Training**: Using contracts, we train a doc2vec model that turns contract sentences into vector representations.

- **Processing a New Contract**: Given the trained doc2vec model, we process a new contract in two steps:

    - Norm Extraction: First, we extract the norms from the new contract;
    - Then, we create a representation for each norm using the doc2vec model.
    
- **Conflict Identification**: Using the norm representations, we can follow one of two paths:

    - T-SNE: Manual identification of modal verbs. (Experimental)
    - Norm Comparisons: Compare norms and find the most similar among them based on a threshold.

### Doc2Vec Model Training

In [1]:
# -*- coding:utf-8 -*-
import os
import sys
import pickle
import argparse
import logging
from random import shuffle
from convert_to_sentences import convert_to_sentences
from time import gmtime, strftime
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sentence_classifier.sentence_classifier import SentenceClassifier

Using TensorFlow backend.


In [17]:
# CONSTANTS.
# Switches and paths read by the driver cell further down.
TRAIN = True  # run the training branch of the driver cell
TRAIN_PATH = 'dataset/manufact_cntrcs.txt'  # training input used when TRAIN is set
PREPROCESS = False  # when True, TRAIN_PATH is first passed through convert_to_sentences
TEST = False  # only checked when TRAIN is False: load a model and infer one vector
TEST_PATH = False  # handed to Doc2Vec.load in the TEST branch; left as False here
MODEL = False  # when True, training continues from the model loaded from MODEL_PATH
MODEL_PATH = False  # handed to Doc2Vec.load when MODEL is True; left as False here

In [2]:
# Set argparse.
# NOTE: no arguments are registered on this parser in the visible cells.
parser = argparse.ArgumentParser(description='Convert sentences and paragraphs into a dense representation.')

# Set logger.
# Module-level logger writing INFO-and-above records to logs/doc2vec.log.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')

# logging.FileHandler opens its file immediately and raises if the parent
# directory is missing, so make sure 'logs' exists first.
if not os.path.isdir('logs'):
    os.makedirs('logs')

file_handler = logging.FileHandler('logs/doc2vec.log')
file_handler.setFormatter(formatter)

logger.addHandler(file_handler)

In [3]:
# Set sentence classifier.
# Load the stored classifier and its pickled names object; `sent_cls` is the
# module-level classifier used by LabeledLineSentence to filter lines.
sent_cls_path = 'sentence_classifier/classifiers/17-11-03_18:45/sentence_classifier_17-11-03_18:45.pkl'
sent_cls_names_path = 'sentence_classifier/classifiers/17-11-03_18:45/sentence_classifier_dict_17-11-03_18:45.pkl'
sent_cls = SentenceClassifier()
sent_cls.load_classifier(sent_cls_path)
# Pickles are byte streams: read them in binary mode, and let the context
# manager close the handle instead of leaving it to the garbage collector.
with open(sent_cls_names_path, 'rb') as names_file:
    sent_cls_names = pickle.load(names_file)
sent_cls.set_names(sent_cls_names)

In [4]:
class LabeledLineSentence(object):
    """Stream the lines of a text file as gensim ``TaggedDocument`` objects.

    Only lines for which the module-level ``sent_cls`` classifier returns a
    truthy first prediction element are yielded. The file is re-read, and
    every line re-classified, each time the object is iterated.
    """

    def __init__(self, filename):
        """Remember the corpus path; nothing is read until iteration.

        Args:
            filename: Path of the text file to iterate line by line.
        """
        self.filename = filename
        # Never filled by this class; see sentences_perm().
        self.sentences = []

    def __iter__(self):
        """Yield a ``TaggedDocument`` for each line accepted by ``sent_cls``.

        Each document holds the whitespace-split line as its words and a
        single tag ``'SENT_<n>'``, where ``<n>`` is the zero-based line number
        in the file. Numbers of rejected lines are skipped, not reused.
        """
        # The context manager closes the file once the generator is exhausted
        # or discarded.
        with open(self.filename) as corpus:
            for uid, line in enumerate(corpus):
                pred = sent_cls.predict_class(line)
                if pred[0]:
                    yield TaggedDocument(words=line.split(), tags=['SENT_%s' % uid])

    def sentences_perm(self):
        """Shuffle ``self.sentences`` in place and return it.

        ``self.sentences`` starts empty and is not populated anywhere in this
        class, so the result is an empty list unless a caller fills it.
        """
        shuffle(self.sentences)
        return self.sentences

In [5]:
def get_model_path():
    """Return a timestamped output path for a model under ``models/``.

    The ``models`` directory is created when it does not exist yet. The file
    name is built from the current UTC time, e.g.
    ``models/model_2017-11-03_18-45-00.doc2vec``.
    """
    logger.info('Generating output path.')

    models_dir = 'models'
    if not os.path.isdir(models_dir):
        os.makedirs(models_dir)

    stamped_name = strftime("%Y-%m-%d_%H-%M-%S.doc2vec", gmtime())
    return 'models/model_' + stamped_name

In [6]:
def train_model(sentences, model=None):
    """Train a Doc2Vec model on ``sentences`` and save it to disk.

    Args:
        sentences: Re-iterable collection of tagged documents. It is walked
            once to build the vocabulary and again on every training call.
        model: Existing model to keep training. When falsy, a new Doc2Vec
            model is created.

    Returns:
        The path produced by ``get_model_path()`` where the model was saved.
    """
    logger.info('Training model.')

    if not model:
        model = Doc2Vec(
            size=100,
            window=2,
            min_count=2,
            workers=2,
            alpha=0.025,
            min_alpha=0.025,
        )

    model.build_vocab(sentences)

    # Hand-rolled learning-rate schedule: after every train() call the rate
    # drops by 0.002 and min_alpha is pinned to it, so the following call
    # runs without decay.
    # NOTE(review): each of the 10 calls itself runs `model.iter` epochs —
    # confirm that this total number of passes is intended.
    for _ in range(10):
        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
        model.alpha = model.alpha - 0.002
        model.min_alpha = model.alpha

    saved_to = get_model_path()

    logger.info('Saving trained model.')
    model.save(saved_to)

    return saved_to

In [7]:
def create_sent_dict(sentences):
    """Map each document's first tag to its words.

    Args:
        sentences: Iterable of ``(words, tags)`` pairs, indexed positionally
            (item 0 is the words, item 1 the tag list).

    Returns:
        A dict keyed by ``tags[0]`` with ``words`` as the value; when a tag
        repeats, the last document seen wins.
    """
    return {doc[1][0]: doc[0] for doc in sentences}

In [19]:
# Driver cell: train (or continue training) a model, or run a quick inference
# check, depending on the TRAIN / TEST / MODEL constants.
if TRAIN:

    file_path = TRAIN_PATH

    logger.info('Receive training path: %s', file_path)

    # Get sentences.
    if PREPROCESS:
        logger.info('Preprocessing file.')
        file_path = convert_to_sentences(file_path)

    sentences = LabeledLineSentence(file_path)

    # Create a dict to convert a sent code into its respective sentence.
    sent_dict = create_sent_dict(sentences)

    if not MODEL:
        output_model = train_model(sentences)
    else:
        old_model = Doc2Vec.load(MODEL_PATH)
        output_model = train_model(sentences, old_model)

    base, _ = os.path.splitext(output_model)

    # Save the dict next to the model (same base name, '.pkl' extension).
    # Pickle writes bytes, so open in binary mode; the context manager makes
    # sure the file is flushed and closed before anything reads it back.
    with open(base + '.pkl', 'wb') as dict_file:
        pickle.dump(sent_dict, dict_file)

elif TEST:
    model = Doc2Vec.load(TEST_PATH)
    # print model.docvecs.most_similar(20)
    # infer_vector expects a list of tokens; a raw string would be walked
    # character by character. Split on whitespace, the same tokenisation
    # LabeledLineSentence applies to training lines.
    print(model.infer_vector('This shall be respected.'.split()))

else:
    print("Nothing to do here.")

### Processing a New Contract

In [21]:
import pickle
from nltk.tokenize import sent_tokenize

In [None]:
# Path to the pickled sentence classifier (the same file as `sent_cls_path` in
# the training section); presumably the argument for extract_norms() — confirm.
sent_classifier = 'sentence_classifier/classifiers/17-11-03_18:45/sentence_classifier_17-11-03_18:45.pkl'

In [None]:
def extract_norms(path_to_classifier):
    """Return the extracted norms (placeholder implementation).

    The extraction itself is not implemented yet: ``path_to_classifier`` is
    accepted but unused, and a fresh empty list is returned on every call.
    """
    return []

In [None]:
contract_path = ''  # placeholder: set to the contract file before running this cell

# Read contract text.
# The context manager closes the file as soon as the text has been read.
with open(contract_path, 'r') as contract_file:
    text = contract_file.read()

# Extract sentences.
sentences = sent_tokenize(text)

# Extract Norms.


# Get norm representations.
