In [1]:
import glob
import gzip
import re
import pandas as pd 
import numpy as np

## DATA LOAD FOR TRAINING - from WMT 2019 task

- __Train sets__
1. Gujarati-English
2. Hindi-English - large corpus 

### - loading Training set 

- 5 tsv.gz files
- __Additional corpus__ : gu-en parallel corpus that contains 65K sentences by Uka Tarsadia University, Gujarat, India 

In [2]:
parallel_data = []
FILE_PATH = './dataset/train/'

#extracting and loading Five tsv.gz files  into the dataframe
file_list = glob.glob(FILE_PATH +'*.tsv.gz')
for filename in file_list:
    gujarat_eng = pd.read_csv(filename, delimiter='\t', header=None, error_bad_lines=False, names=['source', 'target'])
    parallel_data.append(gujarat_eng)

#gujarati additional corpus 
with open(FILE_PATH + 'train.en.txt') as f:
    en = f.read().split('\n')
    
with open(FILE_PATH + 'train.gu.txt') as f:
    gu = f.read().split('\n')

corp =  pd.DataFrame({'source': gu, 'target':en})   
frame = pd.concat(parallel_data, axis=0, ignore_index=True)
corpus = [corp, frame]
corpus = pd.concat(corpus, axis=0)

# adding column that marks the original language for future transliteration to Hindi
corpus['origin_lang'] = 'gujarati'



  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
corpus.head()

Unnamed: 0,source,target,origin_lang
0,ફ્રન્ટ વ્હીલ તરીકે ઘડિયાળ સાથે સાયકલ પ્રતિકૃતિ.,A bicycle replica with a clock as the front wh...,gujarati
1,ગેરેજની સામે પાર્ક કરેલી બ્લેક હોન્ડા મોટરસાયકલ,A black Honda motorcycle parked in front of a ...,gujarati
2,વાદળી દિવાલો અને સફેદ સિંક અને બારણું ધરાવતી ખંડ,A room with blue walls and a white sink and door.,gujarati
3,એક કાર કે જે કાનૂની રીતે પાર્ક કરેલી કારની પાછ...,A car that seems to be parked illegally behind...,gujarati
4,હવામાં ઉડતી મોટી પેસેન્જર વિમાન.,A large passenger airplane flying through the ...,gujarati


- __Hindi-English corpus__ 

In [4]:
import codecs, string 

with open(FILE_PATH + '/training.txt') as f:
    hin_en = f.read().split('\n')
    
hin_en_parallel = []

for sent in hin_en:
    try:
        hin, en = sent.split('\t')
        a = (hin, en)
        if u'\u0900' <= hin <= u'\u097f': # to check if the sentence is in Hindi
            if len(a) == 2 and a[0] != 0 and a[1]!=0 : #to get rid of dirty data that contains other languages or empty strings
                hin_en_parallel.append(a)
        
        
    except ValueError: 
        pass

# to dataframe
hin_en_df = pd.DataFrame(hin_en_parallel, columns = ['source', 'target'])
hin_en_df['origin_lang'] = 'hindi' 

In [5]:
hin_en_df.head()

Unnamed: 0,source,target,origin_lang
0,सदस्य hi-2,User hi-2,hindi
1,राष्ट्रीय विज्ञान दिवस,National Science Day,hindi
2,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout,hindi
3,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer,hindi
4,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel,hindi


In [6]:
# Gujarati-English + Hindi-English corpus 
corpus = [corpus, hin_en_df]
corpus = pd.concat(corpus, ignore_index=True)

In [7]:
corpus

Unnamed: 0,source,target,origin_lang
0,ફ્રન્ટ વ્હીલ તરીકે ઘડિયાળ સાથે સાયકલ પ્રતિકૃતિ.,A bicycle replica with a clock as the front wh...,gujarati
1,ગેરેજની સામે પાર્ક કરેલી બ્લેક હોન્ડા મોટરસાયકલ,A black Honda motorcycle parked in front of a ...,gujarati
2,વાદળી દિવાલો અને સફેદ સિંક અને બારણું ધરાવતી ખંડ,A room with blue walls and a white sink and door.,gujarati
3,એક કાર કે જે કાનૂની રીતે પાર્ક કરેલી કારની પાછ...,A car that seems to be parked illegally behind...,gujarati
4,હવામાં ઉડતી મોટી પેસેન્જર વિમાન.,A large passenger airplane flying through the ...,gujarati
...,...,...,...
1725228,कार्यक्रम को इनके जरिए लाइव स्ट्रीम किया जाएगा:,The programme will be streamed live via:,hindi
1725229,मानव संसाधन विकास मंत्रालय का फेसबुक पेज: http...,Ministry of Education Facebook Page: https://w...,hindi
1725230,"यूजीसी यूट्यूब चैनल, पीआईबी यूट्यूब चैनल,","UGC YouTube Channel, PIB YouTube Channel,",hindi
1725231,यूजीसी ट्विटर हैंडल (@ugc_india) : https://twi...,UGC Twitter Handle (@ugc_india) : https://twit...,hindi


### - Checking the features of the corpus

In [8]:
print(f'The size of the 5 Gujarati-English parallel corpus: {len(corpus)}')
print(f'The size of the Hindi-English parallel corpus: {len(hin_en_df)}')
print(f'The total size of the parallel corpus: {len(hin_en_df)+len(corpus)}')

The size of the 5 Gujarati-English parallel corpus: 1725233
The size of the Hindi-English parallel corpus: 1504696
The total size of the parallel corpus: 3229929


## 2. Preprocessing/Denoising corpus

1. Deletion 
- NULL data
- foreign character ratio over 50% for better performance 
- special characters in the sentences


In [9]:
# General deletion for both languages 
def null_preprocess(corpus):
    corpus.replace('', np.nan, inplace=True)
    corpus.dropna(inplace=True)
    
    return corpus

def character_ratio(data):
    len_string = len(data)
    len_non_latin = len("".join(re.findall(r'[^a-zA-Z\s]+', data)))
    ratio = len_non_latin/len_string
    
    if ratio > 0.5:
        return True 
    
    else:
        return False 

# method for target language preprocessing - English
def eng_preprocess(corpus):
    print(f'The size of corpus before preprocessing: {len(corpus)}')
    
    #null deletion 
    corpus = null_preprocess(corpus)
    print(f'The size of corpus after preprocessing (null deletion): {len(corpus)}')

    # special character deletion
    corpus['target'] = corpus['target'].str.replace(pat=r'[^\w]', repl=r' ', regex=True)
    corpus['target'] = corpus['target'].str.replace(pat=r'\s{2,}', repl=r' ', regex=True)
    
    #foreign char ratio deletion (to get rid of the english sentences that contain a lot of foreign characters)
    idx_foreign = corpus[(corpus['target'].apply(character_ratio) == True)].index  
    corpus = corpus.drop(idx_foreign)
    print(f'The size of corpus after preprocessing (foreign character ratio deletion): {len(corpus)}')
    
    #delete row sentence length over 120
    corpus['target_sent_len'] = [len(text) for text in corpus.target]
    idx_sent_over_120 = corpus[(corpus['target_sent_len'] > 120)].index
    corpus = corpus.drop(idx_sent_over_120)
    print(f'The size of corpus after preprocessing (sentence length over 120 deletion): {len(corpus)}')

    return corpus


# method for source language preprocessing - Gujarati
def guj_preprocess(corpus):

    from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
    from indicnlp.tokenize import indic_tokenize 
    from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator

    #remove_nuktas=False
    factory = IndicNormalizerFactory()
    gu_normalizer = factory.get_normalizer("gu", remove_nuktas=False)
    hi_normalizer = factory.get_normalizer("hi", remove_nuktas=False)
    
    corpus['source_normalized'] = np.where(corpus['origin_lang']=='gujarati', corpus.source.apply(lambda x: gu_normalizer.normalize(x)), corpus.source.apply(lambda x: hi_normalizer.normalize(x)))
    
    corpus['source_normalized_sent_len'] = [len(text) for text in corpus.source_normalized]
    
    #tokenize
    
    corpus['source_tokenized'] = np.where(corpus['origin_lang']=='gujarati', corpus.source_normalized.apply(lambda x: indic_tokenize.trivial_tokenize(x, lang='gu')), corpus.source_normalized.apply(lambda x: indic_tokenize.trivial_tokenize(x, lang='hi')))
    corpus['source_tokenized'] = corpus.source_tokenized.apply((lambda x: " ".join(x)))
    
    #script conversion
    corpus['guja_2_hindi'] = np.where(corpus['origin_lang']=='gujarati', 
                                      corpus.source_tokenized.apply(lambda x: UnicodeIndicTransliterator.transliterate(x, 'gu', 'hi')), 
                                      corpus.source_tokenized.apply(lambda x : x))

    return corpus

In [10]:
corpus = eng_preprocess(corpus)

The size of corpus before preprocessing: 1725233
The size of corpus after preprocessing (null deletion): 1725194
The size of corpus after preprocessing (foreign character ratio deletion): 1723261
The size of corpus after preprocessing (sentence length over 120 deletion): 1421614


In [11]:
corpus = guj_preprocess(corpus)

In [12]:
corpus.head()

Unnamed: 0,source,target,origin_lang,target_sent_len,source_normalized,source_normalized_sent_len,source_tokenized,guja_2_hindi
0,ફ્રન્ટ વ્હીલ તરીકે ઘડિયાળ સાથે સાયકલ પ્રતિકૃતિ.,A bicycle replica with a clock as the front wh...,gujarati,50,ફ્રન્ટ વ્હીલ તરીકે ઘડિયાળ સાથે સાયકલ પ્રતિકૃતિ.,47,ફ્રન્ટ વ્હીલ તરીકે ઘડિયાળ સાથે સાયકલ પ્રતિકૃતિ .,फ्रन्ट व्हील तरीके घडियाळ साथे सायकल प्रतिकृति .
1,ગેરેજની સામે પાર્ક કરેલી બ્લેક હોન્ડા મોટરસાયકલ,A black Honda motorcycle parked in front of a ...,gujarati,53,ગેરેજની સામે પાર્ક કરેલી બ્લેક હોન્ડા મોટરસાયકલ,47,ગેરેજની સામે પાર્ક કરેલી બ્લેક હોન્ડા મોટરસાયકલ,गेरेजनी सामे पार्क करेली ब्लेक होन्डा मोटरसायकल
2,વાદળી દિવાલો અને સફેદ સિંક અને બારણું ધરાવતી ખંડ,A room with blue walls and a white sink and door,gujarati,49,વાદળી દિવાલો અને સફેદ સિંક અને બારણું ધરાવતી ખંડ,48,વાદળી દિવાલો અને સફેદ સિંક અને બારણું ધરાવતી ખંડ,वादळी दिवालो अने सफेद सिंक अने बारणुं धरावती खंड
3,એક કાર કે જે કાનૂની રીતે પાર્ક કરેલી કારની પાછ...,A car that seems to be parked illegally behind...,gujarati,67,એક કાર કે જે કાનૂની રીતે પાર્ક કરેલી કારની પાછ...,95,એક કાર કે જે કાનૂની રીતે પાર્ક કરેલી કારની પાછ...,एक कार के जे कानूनी रीते पार्क करेली कारनी पाछ...
4,હવામાં ઉડતી મોટી પેસેન્જર વિમાન.,A large passenger airplane flying through the ...,gujarati,50,હવામાં ઉડતી મોટી પેસેન્જર વિમાન.,32,હવામાં ઉડતી મોટી પેસેન્જર વિમાન .,हवामां उडती मोटी पेसेन्जर विमान .


In [13]:
# Checking the transliteration from Gujarati to Hindi
print(corpus.source[2],'\t' ,corpus.guja_2_hindi[2])

વાદળી દિવાલો અને સફેદ સિંક અને બારણું ધરાવતી ખંડ 	 वादळी दिवालो अने सफेद सिंक अने बारणुं धरावती खंड


## 3. Subword level tokenization using Sentencepiece -(BPE)

__1. Baseline__
- Based on `Hindi-English` corpus

__2. Multilingual__ 
- Based on `Hindi-English` + `Gujarati - English` (translated to Hindi script) corpus 

In [None]:
# dataframe to store hindi-english model   
baseline = pd.DataFrame()
multilingual = pd.DataFrame()

In [None]:
hi = corpus_prac.source[(corpus_prac.origin_lang=='hindi')]

with open('./dataset/train/sentencepiece/hi/hi_train.txt', 'w', encoding='utf-8') as f:
    for line in hi:
        f.write(line+'\n')

In [15]:
import sentencepiece as spm

In [14]:
baseline = pd.DataFrame()
multilingual = pd.DataFrame()

## Baseline
### 3.1. Hindi subword tokenization

In [15]:
SENT_FILE_PATH = './dataset/train/sentencepiece/'
hi = corpus.source_tokenized[(corpus.origin_lang=='hindi')]
with open(SENT_FILE_PATH + 'hi/hi_train.txt', 'w', encoding='utf-8') as f:
    for line in hi:
        f.write(line+'\n')

In [17]:
import os
import sentencepiece as spm

input_file = SENT_FILE_PATH + 'hi/hi_train.txt'
sp_model_root = SENT_FILE_PATH + 'hi/'

if not os.path.isdir(sp_model_root):
    os.mkdir(sp_model_root)
    print('directory created.')
    
vocab_size = 32000    
sp_model_name = f'tokenizer_{vocab_size}'
sp_model_path = os.path.join(sp_model_root, sp_model_name)
model_type = 'bpe'
character_coverage = 1.0
user_defined_symbols = '[PAD],[UNK],[CLS],[SEP],[MASK],[BOS],[EOS],[UNK0],[UNK1],[UNK2],[UNK3],[UNK4],[UNK5],[UNK6],[UNK7],[UNK8],[UNK9],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],[unused9],[unused10],[unused11],[unused12],[unused13],[unused14],[unused15],[unused16],[unused17],[unused18],[unused19],[unused20],[unused21],[unused22],[unused23],[unused24],[unused25],[unused26],[unused27],[unused28],[unused29],[unused30],[unused31],[unused32],[unused33],[unused34],[unused35],[unused36],[unused37],[unused38],[unused39],[unused40],[unused41],[unused42],[unused43],[unused44],[unused45],[unused46],[unused47],[unused48],[unused49],[unused50],[unused51],[unused52],[unused53],[unused54],[unused55],[unused56],[unused57],[unused58],[unused59],[unused60],[unused61],[unused62],[unused63],[unused64],[unused65],[unused66],[unused67],[unused68],[unused69],[unused70],[unused71],[unused72],[unused73],[unused74],[unused75],[unused76],[unused77],[unused78],[unused79],[unused80],[unused81],[unused82],[unused83],[unused84],[unused85],[unused86],[unused87],[unused88],[unused89],[unused90],[unused91],[unused92],[unused93],[unused94],[unused95],[unused96],[unused97],[unused98],[unused99]'
input_argument = '--input=%s --model_prefix=%s --vocab_size=%s --user_defined_symbols=%s --model_type=%s --character_coverage=%s'
cmd = input_argument%(input_file, sp_model_path, vocab_size,user_defined_symbols, model_type, character_coverage)

spm.SentencePieceTrainer.Train(cmd)
print('train done')

train done


In [18]:
sp = spm.SentencePieceProcessor()
sp.Load(f'{sp_model_path}.model')
tokens = sp.encode_as_pieces(hi[220560])
ids = sp.encode_as_ids(hi[220560])
print(hi[220560])
print(tokens)
print(ids)

चुनाव को हटाएं ( C _ )
['▁चुनाव', '▁को', '▁हटाएं', '▁(', '▁C', '▁_', '▁)']
[3615, 157, 10529, 178, 1312, 355, 176]


In [20]:
baseline['source'] = [sp.encode_as_ids(text) for text in corpus.guja_2_hindi]

In [21]:
baseline.head()

Unnamed: 0,source
0,"[4429, 1395, 25708, 2698, 309, 2228, 31822, 34..."
1,"[164, 5150, 12829, 410, 31655, 5139, 943, 296,..."
2,"[2494, 21488, 368, 26313, 1217, 3699, 11718, 1..."
3,"[186, 261, 136, 1883, 3758, 2287, 187, 5139, 9..."
4,"[122, 31669, 3829, 31662, 18959, 206, 9504, 12..."


### 3.2. English subword tokenization (Hindi)

In [22]:
en_hi = corpus.target[(corpus.origin_lang=='hindi')]

with open(SENT_FILE_PATH + 'en_hi/en_hi_train.txt', 'w', encoding='utf-8') as f:
    for line in en_hi:
        f.write(line+'\n')

In [23]:
input_file = SENT_FILE_PATH + 'en_hi/en_hi_train.txt'
sp_model_root = SENT_FILE_PATH + 'en_hi/'

if not os.path.isdir(sp_model_root):
    os.mkdir(sp_model_root)
    print('directory created!')

    
vocab_size = 32000    
sp_model_name = f'tokenizer_{vocab_size}'
sp_model_path = os.path.join(sp_model_root, sp_model_name)
model_type = 'bpe'
character_coverage = 1.0
user_defined_symbols = '[PAD],[UNK],[CLS],[SEP],[MASK],[BOS],[EOS],[UNK0],[UNK1],[UNK2],[UNK3],[UNK4],[UNK5],[UNK6],[UNK7],[UNK8],[UNK9],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],[unused9],[unused10],[unused11],[unused12],[unused13],[unused14],[unused15],[unused16],[unused17],[unused18],[unused19],[unused20],[unused21],[unused22],[unused23],[unused24],[unused25],[unused26],[unused27],[unused28],[unused29],[unused30],[unused31],[unused32],[unused33],[unused34],[unused35],[unused36],[unused37],[unused38],[unused39],[unused40],[unused41],[unused42],[unused43],[unused44],[unused45],[unused46],[unused47],[unused48],[unused49],[unused50],[unused51],[unused52],[unused53],[unused54],[unused55],[unused56],[unused57],[unused58],[unused59],[unused60],[unused61],[unused62],[unused63],[unused64],[unused65],[unused66],[unused67],[unused68],[unused69],[unused70],[unused71],[unused72],[unused73],[unused74],[unused75],[unused76],[unused77],[unused78],[unused79],[unused80],[unused81],[unused82],[unused83],[unused84],[unused85],[unused86],[unused87],[unused88],[unused89],[unused90],[unused91],[unused92],[unused93],[unused94],[unused95],[unused96],[unused97],[unused98],[unused99]'
input_argument = '--input=%s --model_prefix=%s --vocab_size=%s --user_defined_symbols=%s --model_type=%s --character_coverage=%s'
cmd = input_argument%(input_file, sp_model_path, vocab_size,user_defined_symbols, model_type, character_coverage)

spm.SentencePieceTrainer.Train(cmd)
print('train done')

train done


In [28]:
sp = spm.SentencePieceProcessor()
sp.Load(f'{sp_model_path}.model')
tokens = sp.encode_as_pieces(en_hi[220538])
ids = sp.encode_as_ids(en_hi[220538])
print(en_hi[220538])
print(tokens)
print(ids)

National Science Day
['▁National', '▁Science', '▁Day']
[1274, 4407, 1023]


In [29]:
baseline['target'] = [sp.encode_as_ids(text) for text in corpus.target]

### 3.3 Gujarati + Hindi in Hindi script

In [30]:
with open(SENT_FILE_PATH + 'gu_hi/gu_hi_train.txt', 'w', encoding='utf-8') as f:
    for line in corpus.guja_2_hindi:
        f.write(line+'\n')

In [31]:
input_file = SENT_FILE_PATH + 'gu_hi/gu_hi_train.txt'
sp_model_root = 'gu_hi/'

if not os.path.isdir(sp_model_root):
    os.mkdir(sp_model_root)
    print('directory created!')
    
vocab_size = 32000    
sp_model_name = f'tokenizer_{vocab_size}'
sp_model_path = os.path.join(sp_model_root, sp_model_name)
model_type = 'bpe'
character_coverage = 1.0
user_defined_symbols = '[PAD],[UNK],[CLS],[SEP],[MASK],[BOS],[EOS],[UNK0],[UNK1],[UNK2],[UNK3],[UNK4],[UNK5],[UNK6],[UNK7],[UNK8],[UNK9],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],[unused9],[unused10],[unused11],[unused12],[unused13],[unused14],[unused15],[unused16],[unused17],[unused18],[unused19],[unused20],[unused21],[unused22],[unused23],[unused24],[unused25],[unused26],[unused27],[unused28],[unused29],[unused30],[unused31],[unused32],[unused33],[unused34],[unused35],[unused36],[unused37],[unused38],[unused39],[unused40],[unused41],[unused42],[unused43],[unused44],[unused45],[unused46],[unused47],[unused48],[unused49],[unused50],[unused51],[unused52],[unused53],[unused54],[unused55],[unused56],[unused57],[unused58],[unused59],[unused60],[unused61],[unused62],[unused63],[unused64],[unused65],[unused66],[unused67],[unused68],[unused69],[unused70],[unused71],[unused72],[unused73],[unused74],[unused75],[unused76],[unused77],[unused78],[unused79],[unused80],[unused81],[unused82],[unused83],[unused84],[unused85],[unused86],[unused87],[unused88],[unused89],[unused90],[unused91],[unused92],[unused93],[unused94],[unused95],[unused96],[unused97],[unused98],[unused99]'
input_argument = '--input=%s --model_prefix=%s --vocab_size=%s --user_defined_symbols=%s --model_type=%s --character_coverage=%s'
cmd = input_argument%(input_file, sp_model_path, vocab_size,user_defined_symbols, model_type, character_coverage)

spm.SentencePieceTrainer.Train(cmd)
print('train done')

directory created!
train done


In [32]:
sp = spm.SentencePieceProcessor()
sp.Load(f'{sp_model_path}.model')
tokens = sp.encode_as_pieces(corpus.guja_2_hindi[200000])
ids = sp.encode_as_ids(corpus.guja_2_hindi[200000])
print(corpus.guja_2_hindi[200000])
print(tokens)
print(ids)

1850मां शहेरनी दरेक वाणिज्यिक अने आर्थिक बाबतो कपास साथे संबंधित हती .
['▁18', '50', 'मां', '▁शहेरनी', '▁दरेक', '▁वाणिज्यिक', '▁अने', '▁आर्थिक', '▁बाबतो', '▁कपास', '▁साथे', '▁संबंधित', '▁हती', '▁.']
[2068, 6314, 227, 3014, 4163, 6354, 318, 1804, 6564, 12094, 450, 1261, 2193, 150]


In [33]:
multilingual['source'] = [sp.encode_as_ids(text) for text in corpus.guja_2_hindi]

In [34]:
multilingual

Unnamed: 0,source
0,"[12260, 11492, 1498, 4647, 450, 3874, 18543, 150]"
1,"[19598, 199, 1890, 1383, 4096, 8879, 175, 1538..."
2,"[2291, 8034, 318, 1057, 1797, 318, 6453, 4714,..."
3,"[172, 253, 135, 564, 4348, 1650, 1383, 4096, 5..."
4,"[8630, 5665, 2364, 5476, 1499, 150]"
...,...
1421609,"[1086, 161, 3698, 6302, 19302, 5492, 262, 28719]"
1421610,"[1865, 2985, 853, 3066, 163, 19634, 6240, 3157..."
1421611,"[856, 341, 243, 21548, 4571, 152, 14499, 21548..."
1421612,"[856, 341, 243, 15088, 12314, 15944, 185, 1951..."


### 3.4 English subword tokenization (Hindi+Gujarati)

In [37]:
with open(SENT_FILE_PATH + 'en_gu_hi/en_gu_hi_train.txt', 'w', encoding='utf-8') as f:
    for line in corpus.target:
        f.write(line+'\n')

In [39]:
input_file = SENT_FILE_PATH + 'en_gu_hi/en_gu_hi_train.txt'
sp_model_root = SENT_FILE_PATH + 'en_gu_hi/'
if not os.path.isdir(sp_model_root):
    os.mkdir(sp_model_root)
    print('directory created!')

    
vocab_size = 32000    
sp_model_name = f'tokenizer_{vocab_size}'
sp_model_path = os.path.join(sp_model_root, sp_model_name)
model_type = 'bpe'
character_coverage = 1.0
user_defined_symbols = '[PAD],[UNK],[CLS],[SEP],[MASK],[BOS],[EOS],[UNK0],[UNK1],[UNK2],[UNK3],[UNK4],[UNK5],[UNK6],[UNK7],[UNK8],[UNK9],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],[unused9],[unused10],[unused11],[unused12],[unused13],[unused14],[unused15],[unused16],[unused17],[unused18],[unused19],[unused20],[unused21],[unused22],[unused23],[unused24],[unused25],[unused26],[unused27],[unused28],[unused29],[unused30],[unused31],[unused32],[unused33],[unused34],[unused35],[unused36],[unused37],[unused38],[unused39],[unused40],[unused41],[unused42],[unused43],[unused44],[unused45],[unused46],[unused47],[unused48],[unused49],[unused50],[unused51],[unused52],[unused53],[unused54],[unused55],[unused56],[unused57],[unused58],[unused59],[unused60],[unused61],[unused62],[unused63],[unused64],[unused65],[unused66],[unused67],[unused68],[unused69],[unused70],[unused71],[unused72],[unused73],[unused74],[unused75],[unused76],[unused77],[unused78],[unused79],[unused80],[unused81],[unused82],[unused83],[unused84],[unused85],[unused86],[unused87],[unused88],[unused89],[unused90],[unused91],[unused92],[unused93],[unused94],[unused95],[unused96],[unused97],[unused98],[unused99]'
input_argument = '--input=%s --model_prefix=%s --vocab_size=%s --user_defined_symbols=%s --model_type=%s --character_coverage=%s'
cmd = input_argument%(input_file, sp_model_path, vocab_size,user_defined_symbols, model_type, character_coverage)

spm.SentencePieceTrainer.Train(cmd)
print('train done')

train done


In [40]:
sp = spm.SentencePieceProcessor()
sp.Load(f'{sp_model_path}.model')
tokens = sp.encode_as_pieces(corpus.target[1])
ids = sp.encode_as_ids(corpus.target[1])
print(corpus.target[1])
print(tokens)
print(ids)

A black Honda motorcycle parked in front of a garage 
['▁A', '▁black', '▁H', 'onda', '▁motorcycle', '▁parked', '▁in', '▁front', '▁of', '▁a', '▁garage']
[155, 1392, 207, 16673, 1734, 1443, 151, 1283, 143, 122, 13976]


In [41]:
multilingual['target'] = [sp.encode_as_ids(text) for text in corpus.target]

## TEST DATASET LOAD

In [42]:
# tgz extract 

import tarfile
from bs4 import BeautifulSoup, SoupStrainer

In [43]:
# dev = tarfile.open('./dataset/dev.tgz', 'r:gz')
# for item in dev:
#     dev.extract(item, './dataset/')

TEST_FILE_PATH = './dataset/test/'        
test = tarfile.open(TEST_FILE_PATH + 'test.tgz', 'r:gz')
for item in test:
    test.extract(item, TEST_FILE_PATH)