In [63]:
import pickle
import  string
import re
import os
import unicodedata
from collections import Counter

In [64]:
class LoadData(object):
    def __init__(self, filename, filename_2=None, pair=False):
        self.filename = filename
        if filename_2:
            self.filename_2 = filename_2
        self.pair = pair
        if pair:
            self.text = None
        else:
            self.source_text = None
            self.target_text = None
        # store the source sentences and the target sentences
        self.source_sentences = []
        self.target_sentences = []
        # counters for  source sentences and target sentences
        self._src_counter = Counter()
        self._target_counter = Counter()
        self._special_tokens = ['<PAD>', '<GO>', '<EOS>']
        
    # load the data
    def load_text(self, encode='utf-8'):
        f = open(self.filename, 'r', encoding=encode)
        self.text = f.read()
        if self.pair:  # if source and target are within the same file
            f.close()
            return self.text
        self.source_text = self.text
        f_2 = open(self.filename_2, 'r')
        self.target_text = f_2.read()
        f_2.close()
        return self.source_text, self.target_text
        
    # split the data
    def _split_data(self):
        if self.pair:
            lines = self.text.split('\n')
            pair_sentences = [line.split('\t') for line in lines if line]
            self.source_sentences = [line[0] for line in pair_sentences]
            self.target_sentences = [line[1] for line in pair_sentences]
            return self.source_sentences, self.target_sentences
        else:
            source_lines = self.source_text.split('\n')
            self.source_sentences = [line for line in source_lines ]
            target_lines = self.target_text.split('\n')
            self.target_sentences = [line for line in target_lines ]
            return self.source_sentences, self.target_sentences
    # clean the data
    def clean_data(self):
        source_data, target_data = self._split_data()
        # surround  punctuations by whitespace
        punc_suround = re.compile('[?.!,¿]')
        source_data = [re.sub(r"([?.!,¿])", r" \1 ", phrase) for phrase in source_data]
        target_data = [re.sub(r"([?.!,¿])", r" \1 ", phrase) for phrase in target_data]
        source_data = [re.sub(r'[" "]+', " ", phrase) for phrase in source_data]
        target_data = [re.sub(r'[" "]+', ' ', phrase)for phrase in target_data]
        
        # transform all letters to latin ascii
        source_data = [unicodedata.normalize('NFD', phrase).encode('ascii', 'ignore').decode('UTF-8') for phrase in source_data]
        target_data = [unicodedata.normalize('NFD', phrase).encode('ascii', 'ignore').decode('UTF-8') for phrase in target_data]
#         source_data = [phrase.decode('UTF-8') for phrase in source_data]
#         target_data = [phrase.decode('UTF-8') for phrase in target_data]
        #  convert all sentences to lowercase
        source_data = [phrase.lower() for  phrase in source_data]
        target_data = [phrase.lower() for phrase in target_data]
        # delete everything except a-z ,.!?
        source_data = [re.sub('[^a-z?,!.¿]', ' ',phrase) for  phrase in source_data]
        target_data = [re.sub('[^a-z?,!.¿]', ' ',phrase) for phrase in target_data]
        
        # add special end token for target sentences #TODO ADDED NEW !!
        end_token = '{} '.format(self._special_tokens[-1])
        target_data = [phrase + end_token for phrase in target_data]
        
        return source_data, target_data
        
        
    def save_txt(self,list_phrases, save_path):
            
#         f = open(save_path, 'w')
#         for phrase in list_phrases:
#             f.write(phrase)
#         f.close()
        split_path = save_path.split("/")
        print(len(split_path))
        if(os.path.exists("/".join(split_path[:- 1])) == False and len(split_path) >= 2):
            os.makedirs("/".join(split_path[:- 1]))
        with open(save_path, 'w') as saved_file:
            for phrase in list_phrases:
                print(phrase, file=saved_file)
        print('all sentences are saved !')
            
        
    def save_binary(self, list_phrases_src, list_phrases_target, save_path):
        
        split_path = save_path.split("/")
        print(len(split_path))
        if(os.path.exists("/".join(split_path[:- 1])) == False and len(split_path) >= 2):
            os.makedirs("/".join(split_path[:- 1]))
        with open(save_path, 'wb') as file_:
            pickle.dump([list_phrases_src, list_phrases_target], file_)

        print('file saved')
    
    def load_binary(self, file_path):
        with open(file_path, 'rb') as read_file:
            get_data = pickle.load(read_file)
        return get_data
        
    # count the unique tokens
    def count_tokens(self, sentences_list, src=True):
        for sentence in sentences_list:
            if src:
                self._src_counter.update(sentence.split())
            else:
                self._target_counter.update(sentence.split())
        print('tokens are counted !')
    
    # save tokens in a file
    def save_tokens(self, file_name, common_words=None, src_counter=True):
        if src_counter:
            counter = self._src_counter
            special_tokens = self._special_tokens
        else:
            counter = self._target_counter
            special_tokens = [self._special_tokens[0]]
        
        split_path = file_name.split("/")
        print(len(split_path))
        if(os.path.exists("/".join(split_path[:- 1])) == False and len(split_path) >= 2):
            os.makedirs("/".join(split_path[:- 1]))
        with open(file_name, 'w') as file_:
            for special in special_tokens:
                print(special, file=file_)
            
            if not common_words:
                for word in counter.keys():
                    print(word, file=file_)
                print('all {} tokens are saved'.format(len(counter) + len(special_tokens)))
                    
            else:
                for word, _ in counter.most_common(common_words):
                    print(word, file=file_)
                print('the top {} tokens are saved'.format(common_words + len(special_tokens)))
            
            
        



In [65]:
path = 'deu.txt'
load_file = LoadData(path, pair=True)
pair_language = load_file.load_text()
english_sentences, german_sentences = load_file.clean_data()
print(english_sentences[: 10])
print(german_sentences[: 10])

['hi . ', 'hi . ', 'run ! ', 'wow ! ', 'wow ! ', 'fire ! ', 'help ! ', 'help ! ', 'stop ! ', 'wait ! ']
['hallo ! <EOS> ', 'gru gott ! <EOS> ', 'lauf ! <EOS> ', 'potzdonner ! <EOS> ', 'donnerwetter ! <EOS> ', 'feuer ! <EOS> ', 'hilfe ! <EOS> ', 'zu hulf ! <EOS> ', 'stopp ! <EOS> ', 'warte ! <EOS> ']


In [66]:
load_file.save_txt(english_sentences, 'words/eng_sentences.txt')
load_file.save_txt(german_sentences, 'words/german_sentences.txt')
load_file.save_binary(english_sentences, german_sentences, 'words/ger_eng.p')

2
all sentences are saved !
2
all sentences are saved !
2
file saved


In [67]:
eng, ger = load_file.load_binary('words/ger_eng.p')

print(eng[: 10])
print(ger[: 10])

['hi . ', 'hi . ', 'run ! ', 'wow ! ', 'wow ! ', 'fire ! ', 'help ! ', 'help ! ', 'stop ! ', 'wait ! ']
['hallo ! <EOS> ', 'gru gott ! <EOS> ', 'lauf ! <EOS> ', 'potzdonner ! <EOS> ', 'donnerwetter ! <EOS> ', 'feuer ! <EOS> ', 'hilfe ! <EOS> ', 'zu hulf ! <EOS> ', 'stopp ! <EOS> ', 'warte ! <EOS> ']


In [68]:
# count tokens
load_file.count_tokens(english_sentences)
load_file.count_tokens(german_sentences, src=False)
# save tokens in a file
print(len(load_file._src_counter))
load_file.save_tokens('words/eng_words.txt')
print(len(load_file._target_counter))
load_file.save_tokens('words/ger_words.txt', src_counter=False)

tokens are counted !
tokens are counted !
15104
2
all 15107 tokens are saved
33073
2
all 33074 tokens are saved


In [30]:
list_phrases = ["hello", "go"]
path_test ="john/lim/kim.txt"
split_path = path_test.split("/")
print(split_path)
if(os.path.exists("/".join(split_path[:- 1])) == False):
    os.makedirs("/".join(split_path[:- 1]))
with open(path_test, 'w') as saved_file:
    
            for phrase in list_phrases:
                print(phrase, file=saved_file)

['john', 'lim', 'kim.txt']


In [69]:
  
with open("all_data.p", 'rb') as file:
    source_data, target_data = pickle.load(file)

print(source_data[: 10])
print(len(source_data))

print("**" * 20)
print(target_data[: 10])
print(len(target_data))
    

['hi . ', 'hi . ', 'run ! ', 'wow ! ', 'wow ! ', 'fire ! ', 'help ! ', 'help ! ', 'stop ! ', 'wait ! ']
192881
****************************************
['hallo ! <EOS> ', 'gru gott ! <EOS> ', 'lauf ! <EOS> ', 'potzdonner ! <EOS> ', 'donnerwetter ! <EOS> ', 'feuer ! <EOS> ', 'hilfe ! <EOS> ', 'zu hulf ! <EOS> ', 'stopp ! <EOS> ', 'warte ! <EOS> ']
192881


In [70]:
with open('words/ger_eng.p', 'rb') as file:
    ssource_data, ttarget_data = pickle.load(file)

print(ssource_data[: 10])
print(len(ssource_data))
print("**" * 20)
print(ttarget_data[: 10])
print(len(ttarget_data))

['hi . ', 'hi . ', 'run ! ', 'wow ! ', 'wow ! ', 'fire ! ', 'help ! ', 'help ! ', 'stop ! ', 'wait ! ']
192881
****************************************
['hallo ! <EOS> ', 'gru gott ! <EOS> ', 'lauf ! <EOS> ', 'potzdonner ! <EOS> ', 'donnerwetter ! <EOS> ', 'feuer ! <EOS> ', 'hilfe ! <EOS> ', 'zu hulf ! <EOS> ', 'stopp ! <EOS> ', 'warte ! <EOS> ']
192881
