In [520]:
import gensim
import xml.etree.ElementTree as etree 
from random import shuffle
import multiprocessing

In [521]:
def get_annotation(instance):
    """Return the label that the annotation marks with ``1``.

    The annotation is read from the ``Label`` attribute of the child at
    index 4 of ``instance`` and has the form
    ``correct(1)|correct_but_incomplete(0)|contradictory(0)|incorrect(0)``.
    The position of the first ``1`` among the digits of that string selects
    the returned label (``correct`` in the example above).

    Args:
        instance: element-like object whose child at index 4 carries a
            ``Label`` attribute.

    Returns:
        "correct", "correct_but_incomplete", "contradictory" or "incorrect";
        ``None`` if the first ``1`` sits beyond the fourth digit.

    Raises:
        ValueError: if the label string contains no digit ``1``.
    """
    labels = ("correct", "correct_but_incomplete", "contradictory", "incorrect")
    annotation_label = instance[4].attrib['Label']
    # Every digit character in the string is treated as one flag, in order.
    flags = [int(ch) for ch in annotation_label if ch.isdigit()]
    annotated_position = flags.index(1)
    # Look the label up by position instead of comparing ints with `is`,
    # which only worked through CPython's small-integer caching.
    if annotated_position < len(labels):
        return labels[annotated_position]
    return None
    
def get_refrence_answer(instance):
    """Parse the reference answers of an instance.

    The text of the child at index 5 of ``instance`` is split into lines.
    The first line is always skipped, and every remaining non-blank line is
    expected to look like ``<prefix>: <answer>``; the part after the first
    colon, stripped of surrounding whitespace, is kept.

    Args:
        instance: element-like object whose child at index 5 holds the
            reference answers as text.

    Returns:
        List of reference answer strings; empty if the child has no text.

    Raises:
        IndexError: if a non-blank line (after the first) contains no colon.
    """
    raw_text = instance[5].text
    if raw_text is None:
        # An empty element has no text at all; there is nothing to parse.
        return []
    answers = []
    # The first line is skipped unconditionally (presumably the remainder of
    # the line holding the opening tag - TODO confirm against the data file).
    for line in raw_text.splitlines()[1:]:
        if not line.strip():
            # Whitespace-only lines (e.g. indentation before the closing tag)
            # carry no answer and would otherwise fail the colon split.
            continue
        answers.append(line.split(":", 1)[1].strip())
    return answers
    
class Instance(object):
    """One corpus entry built from an element-like ``instance``.

    Attributes set by the constructor:
        id: value of the element's ``ID`` attribute.
        ProblemDescription, Question, Answer: text of the children at
            indexes 1, 2 and 3 respectively.
        Annotation: result of ``get_annotation(instance)``.
        ReferenceAnswers: result of ``get_refrence_answer(instance)``.
    """

    def __init__(self, instance):
        self.id = instance.attrib['ID']
        # Plain-text fields are copied straight from fixed child positions.
        text_fields = (
            ("ProblemDescription", 1),
            ("Question", 2),
            ("Answer", 3),
        )
        for attribute_name, child_index in text_fields:
            setattr(self, attribute_name, instance[child_index].text)
        # The remaining two fields need parsing and are delegated to helpers.
        self.Annotation = get_annotation(instance)
        self.ReferenceAnswers = get_refrence_answer(instance)

In [522]:
#read DT-Gradev1 corpus
def read_dtGrade_corpus(file_name):
    """Read the DT-Grade corpus and yield its entries one by one.

    This is a generator: the XML file is parsed when iteration starts, and
    an ``Instance`` object is yielded for every direct child of the root
    element that matches the namespaced ``Instance`` tag.

    Args:
        file_name: path (or file object) handed to ``etree.parse``.

    Yields:
        ``Instance`` objects, in document order.
    """
    instance_tag = '{http://www.w3.org/2005/Atom}Instance'
    # Load the whole document, then look only at the root's direct children.
    document_root = etree.parse(file_name).getroot()
    for node in document_root.findall(instance_tag):
        yield Instance(node)

In [523]:
# Get training data for doc2vec model
def get_all_paragraphs(instances):
    """Collect every text field of the given instances into one flat list.

    The result is grouped by field rather than by instance: all problem
    descriptions first, then all questions, then all answers, and finally
    all reference answers (each instance contributes its whole list).

    Args:
        instances: iterable of objects exposing ``ProblemDescription``,
            ``Question``, ``Answer`` and ``ReferenceAnswers``.

    Returns:
        A new list containing the collected paragraphs.
    """
    # Materialize once so a one-shot iterable can feed all four passes.
    records = list(instances)
    descriptions = [record.ProblemDescription for record in records]
    questions = [record.Question for record in records]
    student_answers = [record.Answer for record in records]
    reference_answers = [
        reference
        for record in records
        for reference in record.ReferenceAnswers
    ]
    return descriptions + questions + student_answers + reference_answers

In [524]:
# Prepare paragraphs so they can be fed to the gensim doc2vec model
def get_taggeDocument(paragraphs, tokens_only=False):
    """Tokenize paragraphs and yield them in the form doc2vec consumes.

    Each paragraph is passed through ``gensim.utils.simple_preprocess``.

    Args:
        paragraphs: iterable of paragraph strings.
        tokens_only: when true, yield the bare token lists; otherwise wrap
            each token list in a ``TaggedDocument`` whose single tag is the
            paragraph's position in ``paragraphs``.

    Yields:
        A token list or a ``TaggedDocument`` per paragraph, in input order.
    """
    for tag, text in enumerate(paragraphs):
        tokens = gensim.utils.simple_preprocess(text)
        if not tokens_only:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [tag])
        else:
            yield tokens

In [525]:
# Load the dataset: parse the corpus XML into Instance objects.
file_name ="DT-Gradev1.0_data/DT-Grade_v1.0_dataset.xml"
corpus = list(read_dtGrade_corpus(file_name)) # materialize the generator into a list of Instance objects (the corpus itself is NOT shuffled)

# Form the training data: flatten every text field of every instance into one
# list, shuffle that list in place, then tokenize and tag each paragraph.
# NOTE(review): no random seed is set in the visible cells, so the paragraph
# order (and therefore each paragraph's integer tag) differs between runs.
all_paragraphs = get_all_paragraphs(corpus)
shuffle(all_paragraphs)
training_data = list(get_taggeDocument(all_paragraphs))

# Divide the data set into training, validation and test datasets
# (currently disabled; every paragraph above is used for training).
#train_corpus = corpus[:630]
#validation_corpus = corpus[630:765]
#test_corpus = corpus[765:]

In [529]:
# Build models and save them
cores = multiprocessing.cpu_count() # count number of processor 
model_DM = gensim.models.doc2vec.Doc2Vec(dbow_words=1, vector_size=300, window=8, min_count=1, sample=1e-5, negative=5, workers=cores,  dm=1, dm_concat=1, epochs=1000, alpha=0.025, min_alpha=0.0001)
model_DBOW = gensim.models.doc2vec.Doc2Vec(dbow_words=1, vector_size=300, window=5, min_count=1, sample=1e-5, negative=5, workers=cores, dm=0, dm_concat=1, epochs=400, alpha=0.025, min_alpha=0.0001)
model_DM.build_vocab(training_data) # build vocab
model_DBOW.build_vocab(training_data) #build vocab
%time model_DM.train(training_data, total_examples=model.corpus_count, epochs=model.epochs)
model_DM.save('models/model_DM.doc2vec') #save model
%time model_DBOW.train(training_data, total_examples=model.corpus_count, epochs=model.epochs)
model_DBOW.save('models/model_DBOW.doc2vec') #save model
