In [1]:
from __future__ import print_function
from __future__ import unicode_literals

import itertools
import codecs
from os import path
import csv as csv_module
from copy import deepcopy

from SemanticCorruption import *

In [2]:
def load_MSRP(filename):
    """Yield the paraphrase pairs stored in an MSRP corpus file.

    The file is read as UTF-8, tab-separated text. The first line is treated
    as a header and skipped, and blank lines are ignored. Every other line
    must hold exactly five fields: the quality flag, the id of the first
    sentence, the id of the second sentence, the first sentence and the
    second sentence.

    Only rows whose quality flag is 1 (the pair is a paraphrase) are yielded;
    rows flagged 0 are dropped.

    Args:
        filename: path of the MSRP file to read.

    Yields:
        ``((id1, sentence1), (id2, sentence2))`` where the ids are ints and
        the sentences have surrounding whitespace stripped.

    Raises:
        ValueError: if a non-blank data line does not split into exactly five
            fields, or if its flag or ids are not integers.
    """
    # str(...) yields the native string type on Python 2 (this file enables
    # unicode_literals) and on Python 3; Python 3 rejects a bytes codec name.
    with codecs.open(filename, 'r', str("utf-8")) as fh:
        # Iterate the handle directly so the corpus is streamed rather than
        # loaded in full before the first pair is produced.
        for line_index, line in enumerate(fh):
            if line_index == 0:
                continue  # header row
            if not line.strip():
                continue  # blank line (e.g. at end of file) carries no record
            # The quality field is 1 for paraphrases and 0 for non-paraphrases.
            isparaphrase, id1, id2, str1, str2 = line.split("\t")
            if int(isparaphrase) == 1:
                yield ((int(id1), str1.strip()), (int(id2), str2.strip()))

            
            
                

In [None]:
def create_eval_corpora(base_paraphrases, folder, max_corruption_level=10):
    """Write the evaluation corpus and its metadata files into `folder`.

    The baseline corpus is written as newline-separated sentences in
    ``phrases.txt`` so that it works well with Socher's URAE system. All
    metadata is therefore kept in separate CSV files whose rows refer to
    1-based line numbers of ``phrases.txt``:

    * ``microsoft_ids.txt``: phrase line number, Microsoft id.
    * ``paraphrases.txt``: phrase line number, paraphrase line number
      (each pair is recorded in both directions).
    * ``<level><kind>_semantic_corruptions.txt``: uncorrupted phrase line
      number, corrupted phrase line number. One file is opened per level in
      ``1..max_corruption_level`` for each of the five corruption kinds
      (noun/verb random, noun/verb synonym, verb antonym).

    For every base phrase the tokenised phrase is written first, immediately
    followed by its corrupted variants, so the order of the corruption calls
    below determines the line layout of ``phrases.txt``.

    Args:
        base_paraphrases: iterable of ``((id1, phrase1), (id2, phrase2))``
            pairs, e.g. the output of ``load_MSRP``.
        folder: directory the files are written into; it is not created here.
        max_corruption_level: number of per-level CSV files opened for each
            corruption kind.

    Returns:
        None. All results are written to disk; every opened file is closed
        even if an error occurs part-way through.
    """
    # Line numbers start at 1 and are only drawn after the line is written.
    # The counter lives in this scope (not in a module global) so calls do
    # not leak state into, or share state through, the notebook namespace.
    line_numbers = itertools.count(1)
    opened_filehandles = []
    try:
        # str(...) yields the native string type on Python 2 (with
        # unicode_literals) and Python 3; Python 3 rejects a bytes codec name.
        phrases_fh = codecs.open(path.join(folder, "phrases.txt"), 'w', str("utf-8"))
        opened_filehandles.append(phrases_fh)

        def open_csv(filename, *headings):
            """Open `filename` in `folder` for writing and return a csv writer.

            `headings` name the columns for the reader of this code only;
            no header row is written to the file.
            """
            fh = open(path.join(folder, filename), 'w')
            opened_filehandles.append(fh)
            return csv_module.writer(fh)

        microsoft_ids_csv = open_csv("microsoft_ids.txt", "phrase_line_number", "microsoft_id")
        paraphrases_csv = open_csv("paraphrases.txt", "phrase_line_num", "paraphrase_line_num")

        def open_series_of_csvs(base_filename, max_level, *headings):
            """Open one csv per level, named ``<level><base_filename>``."""
            return [open_csv(str(level) + base_filename, *headings)
                    for level in range(1, max_level + 1)]

        def open_series_of_corruption_csvs(base_filename):
            """Open the per-level csvs mapping a phrase to its corruption."""
            return open_series_of_csvs(base_filename,
                                       max_corruption_level,
                                       "uncorrupt_phrase_line_num",
                                       "corrupt_phrase_line_num")

        noun_random_semantic_corruption_csvs = open_series_of_corruption_csvs("noun_random_semantic_corruptions.txt")
        noun_sym_semantic_corruption_csvs = open_series_of_corruption_csvs("noun_sym_semantic_corruptions.txt")
        verb_sym_semantic_corruption_csvs = open_series_of_corruption_csvs("verb_sym_semantic_corruptions.txt")
        verb_random_semantic_corruption_csvs = open_series_of_corruption_csvs("verb_random_semantic_corruptions.txt")
        verb_anto_semantic_corruption_csvs = open_series_of_corruption_csvs("verb_anto_semantic_corruptions.txt")
        # Adjective corruptions are currently disabled:
        #adj_anto_semantic_corruption_csvs= open_series_of_corruption_csvs("adj_anto_semantic_corruptions.txt")
        #adj_sym_semantic_corruption_csvs= open_series_of_corruption_csvs("adj_sym_semantic_corruptions.txt")

        def write_phrase(phrase):
            """Append `phrase` to phrases.txt and return its 1-based line number."""
            phrases_fh.write(phrase)
            phrases_fh.write("\n")
            return next(line_numbers)

        ## Recorder functions
        def add_phrase(phrase):
            """Tokenise and tag `phrase`, write it, and return (words, tags, line number)."""
            words, tagged_words = tokenize_and_tag(phrase)
            phrase_line_num = write_phrase(' '.join(words))
            return words, tagged_words, phrase_line_num

        def add_corruptions(words, tagged_words, base_phrase_ln):
            """Write every corruption of one base phrase and record the links."""
            short_phrase_indexes = get_phrases_indexes(tagged_words, 3)

            def add_corruption(csvs, corrupting_method, skip_indexes=short_phrase_indexes):
                # Presumably one corrupted sentence is returned per level, in
                # increasing level order - TODO confirm in SemanticCorruption.
                corrupt_phrases = leveled_semantic_corrupt_sentences_from_pretagged(
                    words,
                    tagged_words,
                    corrupting_method,
                    skip_indexes)

                # zip stops at the shorter sequence, so levels without a
                # corrupted sentence (or without a csv) are skipped.
                for corrupt_phrase, csv in zip(corrupt_phrases, csvs):
                    corrupt_phrase_ln = write_phrase(corrupt_phrase)
                    csv.writerow([base_phrase_ln, corrupt_phrase_ln])

            # The call order fixes the line order in phrases.txt.
            add_corruption(noun_sym_semantic_corruption_csvs, get_noun_synonyms_of_most_common)
            add_corruption(verb_sym_semantic_corruption_csvs, get_verb_synonyms_of_most_common)
            add_corruption(verb_anto_semantic_corruption_csvs, get_verb_antos_of_most_common)
            # Random corruptions pass an empty skip list instead of the default.
            add_corruption(noun_random_semantic_corruption_csvs, get_noun_randoms, [])
            add_corruption(verb_random_semantic_corruption_csvs, get_verb_randoms, [])

        for ((m_id1, phrase1), (m_id2, phrase2)) in base_paraphrases:
            # Add the phrases, each followed by its corruptions.
            words1, tagged_words1, ln1 = add_phrase(phrase1)
            add_corruptions(words1, tagged_words1, ln1)

            words2, tagged_words2, ln2 = add_phrase(phrase2)
            add_corruptions(words2, tagged_words2, ln2)

            # Add to the record of Microsoft ids.
            microsoft_ids_csv.writerow([ln1, m_id1])
            microsoft_ids_csv.writerow([ln2, m_id2])

            # Add the paraphrases, in both directions.
            paraphrases_csv.writerow([ln1, ln2])
            paraphrases_csv.writerow([ln2, ln1])

    finally:
        for fh in opened_filehandles:
            fh.close()

In [None]:
# Stream the MSRP test split followed by the train split as one sequence of
# paraphrase pairs, then build the evaluation corpus from it.
corp_gen = itertools.chain.from_iterable(
    load_MSRP(msrp_path)
    for msrp_path in (
        "corpora/MSRP/msr_paraphrase_test.txt",
        "corpora/MSRP/msr_paraphrase_train.txt",
    )
)

create_eval_corpora(corp_gen, "prepared_corpora/msrp_4")