In [1]:
from __future__ import print_function
from __future__ import unicode_literals

import itertools
import codecs
from os import path
import csv
from copy import deepcopy

from SemanticCorruption import *

In [2]:
def load_MSRP(filename):
    """Yield the paraphrase pairs found in an MSRP tab-separated corpus file.

    The first line of the file is treated as a header and skipped. Every other
    non-blank line must hold five tab-separated fields, in this order:
    quality flag, id of sentence 1, id of sentence 2, sentence 1, sentence 2.
    The quality flag is 1 for paraphrases and 0 otherwise; only rows flagged 1
    are yielded.

    Args:
        filename: path of the corpus file; it is decoded as UTF-8.

    Yields:
        ``((id1, sentence1), (id2, sentence2))`` where the ids are ints and the
        sentences have surrounding whitespace stripped.

    Raises:
        ValueError: if a non-blank data line does not split into exactly five
            fields, or if the flag / id fields are not integers.
    """
    # The encoding name is passed as text: a bytes name (b"utf-8") makes the
    # codec lookup inside codecs.open raise TypeError on Python 3.
    with codecs.open(filename, 'r', "utf-8") as fh:
        # Iterate over the handle rather than readlines() so the corpus is
        # streamed instead of being loaded into memory in one go.
        for line_number, line in enumerate(fh, 1):
            if line_number == 1:
                continue  # header row
            if not line.strip():
                continue  # tolerate blank lines, e.g. an empty line at EOF
            isparaphrase, id1, id2, str1, str2 = line.split("\t")
            if int(isparaphrase) == 1:
                yield ((int(id1), str1.strip()), (int(id2), str2.strip()))

            
            
                

In [None]:
def create_eval_corpora(base_paraphrases, folder, max_corruption_level=10):
    """Write the evaluation corpus and its metadata files into ``folder``.

    The baseline corpus (``phrases.txt``) is written as plain newline-separated
    sentences so that it works well with Socher's URAE system. All the metadata
    is therefore stored in separate CSV files that reference (1-based) line
    numbers of ``phrases.txt``.

    Files written (``folder`` must already exist; existing files are
    overwritten):

    * ``phrases.txt`` -- each base phrase (its tokens joined by single spaces)
      immediately followed by its noun-synonym corruptions and then its
      verb-antonym corruptions, one sentence per line.
    * ``microsoft_ids.txt`` -- phrase line number and the id that came with the
      phrase in ``base_paraphrases``.
    * ``paraphrases.txt`` -- line numbers of each paraphrase pair, recorded in
      both directions.
    * ``<level>noun_sym_semantic_corruptions.txt`` and
      ``<level>verb_anto_semantic_corruptions.txt`` for every level in
      ``1..max_corruption_level`` -- line number of the uncorrupted phrase and
      line number of its corruption at that level.

    Args:
        base_paraphrases: iterable of ``((id1, phrase1), (id2, phrase2))``.
        folder: directory the files are written into.
        max_corruption_level: number of per-level files per corruption kind.
            The i-th corruption returned for a phrase is recorded in the
            level-i file; corruptions beyond this level are not written.
    """
    corruption_header = ["uncorrupt_phrase_line_num", "corrupt_phrase_line_num"]
    openned_filehandles = []

    # Running count of lines written to phrases.txt. It lives in a one-element
    # list so the nested helpers can update it without a module-level
    # ``global`` (Python 2 has no ``nonlocal``); line numbers are always read
    # after incrementing, so the first line is number 1.
    line_count = [0]

    def open_csv(filename, header):
        """Open a CSV file in ``folder``, register it for closing, write its header."""
        fh = open(path.join(folder, filename), "w")
        openned_filehandles.append(fh)
        writer = csv.writer(fh)
        writer.writerow(header)
        return writer

    def open_leveled_csvs(suffix):
        """Open one corruption CSV per level (1..max_corruption_level), in level order."""
        return [open_csv(str(level) + suffix, corruption_header)
                for level in range(1, max_corruption_level + 1)]

    try:
        # Text encoding name, not bytes: a bytes name raises TypeError on Python 3.
        phrases_fh = codecs.open(path.join(folder, "phrases.txt"), 'w', "utf-8")
        openned_filehandles.append(phrases_fh)

        microsoft_ids_csv = open_csv("microsoft_ids.txt",
                                     ["phrase_line_number", "microsoft_id"])
        paraphrases_csv = open_csv("paraphrases.txt",
                                   ["phrase_line_num", "paraphrase_line_num"])

        # These stay open for the whole run: every phrase may add a row to each.
        noun_sym_semantic_corruption_csvs = open_leveled_csvs(
            "noun_sym_semantic_corruptions.txt")
        verb_anto_semantic_corruption_csvs = open_leveled_csvs(
            "verb_anto_semantic_corruptions.txt")

        def write_phrase_line(text):
            """Append one line to phrases.txt and return its line number."""
            phrases_fh.write(text)
            phrases_fh.write("\n")
            line_count[0] += 1
            return line_count[0]

        def add_phrase(phrase):
            """Tokenize/tag ``phrase``, record it, return (words, tagged_words, line number)."""
            words, tagged_words = tokenize_and_tag(phrase)
            return words, tagged_words, write_phrase_line(' '.join(words))

        def record_corruptions(corruptions, level_csvs, phrase_ln):
            """Write each corruption and link it to ``phrase_ln`` in its level's CSV."""
            for corruption, level_csv in zip(corruptions, level_csvs):
                level_csv.writerow([phrase_ln, write_phrase_line(corruption)])

        def add_corruptions(words, tagged_words, phrase_ln):
            """Record the noun-synonym, then the verb-antonym corruptions of a phrase."""
            short_phrase_indexes = get_phrases_indexes(tagged_words, 3)

            noun_corrupted_phases = leveled_semantic_corrupt_sentences_from_pretagged(
                words, tagged_words, get_noun_synonyms_of_most_common_, short_phrase_indexes)
            record_corruptions(noun_corrupted_phases,
                               noun_sym_semantic_corruption_csvs, phrase_ln)

            verb_corrupted_phases = leveled_semantic_corrupt_sentences_from_pretagged(
                words, tagged_words, get_verb_antos_of_most_common_, short_phrase_indexes)
            record_corruptions(verb_corrupted_phases,
                               verb_anto_semantic_corruption_csvs, phrase_ln)

        for ((m_id1, phrase1), (m_id2, phrase2)) in base_paraphrases:
            # Add the phrases and their corruptions
            words1, tagged_words1, ln1 = add_phrase(phrase1)
            add_corruptions(words1, tagged_words1, ln1)

            words2, tagged_words2, ln2 = add_phrase(phrase2)
            add_corruptions(words2, tagged_words2, ln2)

            # Add to the record of microsoft ids
            microsoft_ids_csv.writerow([ln1, m_id1])
            microsoft_ids_csv.writerow([ln2, m_id2])

            # Add the paraphrases, in both directions
            paraphrases_csv.writerow([ln1, ln2])
            paraphrases_csv.writerow([ln2, ln1])
    finally:
        # Close whatever was opened, even if an error interrupted the run.
        for fh in openned_filehandles:
            fh.close()

In [None]:
# Paraphrase pairs from the MSRP test split followed by the train split.
# load_MSRP returns generators, so nothing is read until the chain is consumed.
msrp_test_pairs = load_MSRP("corpora/MSRP/msr_paraphrase_test.txt")
msrp_train_pairs = load_MSRP("corpora/MSRP/msr_paraphrase_train.txt")
corp_gen = itertools.chain(msrp_test_pairs, msrp_train_pairs)

# Build the evaluation corpus (default number of corruption levels) from the
# combined pairs.
create_eval_corpora(
    base_paraphrases=corp_gen,
    folder="prepared_corpora/msrp_ns_va_nophrase_mfcwsd",
)