In [289]:
import numpy as np
import os

In [288]:
# Project root; create_train_val_test builds its output directories from it.
# NOTE(review): hardcoded absolute local path -- must be adjusted per machine.
MAINPATH = '/Users/marcototolo/Projects/dantegen/'

In [129]:
# Raw text file that the cells below pass to get_text / create_train_val_test.
# NOTE(review): hardcoded absolute local path -- must be adjusted per machine.
PATH = '/Users/marcototolo/Projects/dantegen/data/raw/divcomm.txt'

In [364]:
def get_indexes(text):
    '''
    Locate the three books inside text and return their boundaries.

    text: the whole poem as a single string.

    Returns a dict with the keys 'start_inf', 'end_inf', 'start_pur',
    'end_pur', 'start_par' and 'end_par'. Each value is a character offset
    into text, so a book can be extracted with text[start:end].

    Markers are searched with str.find, which returns -1 when the marker is
    missing; a start of -1 (or an end of -8) therefore means "not found".
    '''
    # NOTE(review): the end of a book is taken 7 characters before the title
    # of the next one -- presumably to drop the blank lines preceding the
    # title; confirm against the raw file.
    title_back_off = 7
    return {
        'start_inf': text.find('Inferno • Canto I'),
        'end_inf': text.find('PURGATORIO') - title_back_off,
        'start_pur': text.find('Purgatorio • Canto I'),
        'end_pur': text.find('PARADISO') - title_back_off,
        'start_par': text.find('Paradiso • Canto I'),
        # Paradiso is the last book, so it runs to the end of the text.
        # (The previous version read an undefined name `end` here.)
        'end_par': len(text),
    }

In [71]:
def to_roman(n):
    '''
    Convert a positive integer to its roman numeral.

    n: integer to convert.

    Returns the numeral as an upper-case string (e.g. 34 -> 'XXXIV').
    Returns '' when n is 0 or negative, since roman numerals cannot express
    those values.

    The previous implementation only handled units and repeated 'X' for the
    tens, so it was correct for 1-39 only (40 came out as 'XXXX'). Results
    for 0-39 are unchanged.
    '''
    # Ordered from largest to smallest; subtractive forms (IV, IX, XL, ...)
    # are listed explicitly so a simple greedy pass produces the numeral.
    symbols = (
        (1000, 'M'), (900, 'CM'), (500, 'D'), (400, 'CD'),
        (100, 'C'), (90, 'XC'), (50, 'L'), (40, 'XL'),
        (10, 'X'), (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I'),
    )
    pieces = []
    remaining = n
    for value, symbol in symbols:
        while remaining >= value:
            pieces.append(symbol)
            remaining -= value
    return ''.join(pieces)

In [136]:
def get_chapter(text, ch):
    '''
    Return the body of canto number ch found in text.

    text: a string containing canto headings of the form 'Canto <roman>'.
    ch: canto number (converted with to_roman).

    Returns the text that follows the heading, up to and including the first
    newline of the next run of four newlines. If no such run follows (the
    canto is the last one in text) everything up to the end of text is
    returned.

    Raises ValueError if the heading is not present in text.
    '''
    heading = 'Canto ' + to_roman(ch)
    # str.find returns the FIRST occurrence. 'Canto I' is also a prefix of
    # 'Canto II', 'Canto IV', ..., so this relies on the requested canto
    # appearing in text before any canto whose numeral it prefixes.
    heading_at = text.find(heading)
    if heading_at == -1:
        raise ValueError('{!r} not found in text'.format(heading))
    # NOTE(review): the extra 3 characters presumably skip the separator
    # that follows the heading -- confirm against the raw file.
    body = text[heading_at + len(heading) + 3:]
    gap_at = body.find('\n\n\n\n')
    if gap_at == -1:
        # Last canto of the slice: there is no trailing gap to cut at.
        # (Previously this case produced an empty string.)
        return body
    # Keep the newline that terminates the last verse.
    return body[:gap_at + 1]

In [321]:
def get_tercet(text,n):
    '''
    Return text starting at the n-th tercet.

    text: a string in which tercets are separated by a blank line ('\\n\\n').
    n: 1-based tercet number; values below 1 are treated as 1.

    Returns the tail of text beginning at the first character of the
    requested tercet. If text holds fewer than n tercets, the tail starting
    at the last tercet is returned (the whole text when it has no separator
    at all -- previously that case dropped the first character).
    '''
    n = max(n,1) # first tercet is indexed 1
    search_from = 0
    last_sep = None
    for _ in range(n-1):
        # Search in place instead of re-slicing text on every iteration.
        sep = text.find('\n\n', search_from)
        if sep == -1:
            break  # fewer tercets than requested: stay on the last one
        last_sep = sep
        # Resume from the second newline of the separator, as before, so
        # runs of more than two newlines are counted the same way.
        search_from = sep + 1
    if last_sep is None:
        return text
    return text[last_sep + 2:]

In [365]:
def get_text(path, book=0, chapter=0, tercet=0, length = 200):
    '''
    Open the text file and get a snippet of it.
    path: location of the raw text file (read as UTF-8).
    book:
        0: random book (the snippet starts at a random offset in the poem)
        1: inferno
        2: purgatorio
        3: paradiso
    chapter and tercet: 0 for random, >0 for specific chapter/tercet
    length: snippet's length. All of it if length = 0

    Returns the selected snippet as a string. When a random window is
    requested from a chunk that is not longer than length, the whole chunk
    is returned.

    Raises ValueError if the opening title or the closing verse of the poem
    cannot be found in the file.
    '''
    def _random_window(chunk):
        # Random slice of `length` characters; the whole chunk when
        # length == 0 or when the chunk is too short to choose a window.
        if length == 0 or len(chunk) <= length:
            return chunk
        # choice(k) draws from range(k), same as choice(np.arange(k)).
        offset = np.random.choice(len(chunk) - length)
        return chunk[offset:offset + length]

    closing_verse = 'l’amor che move il sole e l’altre stelle.\n'
    # The poem contains non-ASCII characters; do not depend on the locale.
    with open(path, encoding='utf-8') as f:
        raw = f.read()
    poem_start = raw.find('LA DIVINA COMMEDIA\n')
    poem_end = raw.find(closing_verse)
    if poem_start == -1 or poem_end == -1:
        raise ValueError('could not locate the poem inside ' + str(path))
    text = raw[poem_start:poem_end + len(closing_verse)]
    text = text.replace('\n  ', '\n') # get rid of spaces after newlines

    indexes = get_indexes(text)
    # Position 0 is the "random book" slot; its start is drawn lazily below
    # so no random number is consumed when a specific book is requested.
    starts = [None, indexes['start_inf'], indexes['start_pur'], indexes['start_par']]
    ends = [len(text), indexes['end_inf'], indexes['end_pur'], indexes['end_par']]
    book_start, book_end = starts[book], ends[book]
    if book_start is None:
        if len(text) > length:
            book_start = np.random.choice(len(text) - length)
        else:
            book_start = 0
    text = text[book_start:book_end]

    if chapter==0:
        return _random_window(text)
    text = get_chapter(text, chapter)
    if tercet==0:
        return _random_window(text)
    text = get_tercet(text, tercet)
    if length==0: return text #if length=0 return whole chunk
    return text[:length]

In [265]:
# Scratch check of len(text).
# NOTE(review): no top-level `text` is assigned in the visible cells, so this
# presumably relies on leftover kernel state -- confirm or remove.
len(text)

561095

In [247]:
# Scratch string used by the next two cells to check slice semantics.
a = 'ciaociao'

In [249]:
# Characters at indexes 0..2 (the end index is exclusive).
a[:3]

'cia'

In [248]:
# Everything from index 3 onward; a[:3] + a[3:] gives back the full string.
a[3:]

'ociao'

In [370]:
def create_train_val_test(path, train=0.7, val=0.2, out_dir=None):
    '''
    Splits the text into 3 parts according to the given parameters.
    Text chunks are taken evenly from the 3 books and sequentially (train/val/test)

    path: location of the raw text file, forwarded to get_text.
    train: fraction of each book that goes to the training split.
    val: fraction of each book that goes to the validation split; whatever
        remains after train + val goes to the test split.
    out_dir: directory receiving the 'trn', 'val' and 'tst' sub-directories.
        Defaults to <MAINPATH>/data/processed.

    Writes <out_dir>/trn/trn.txt, <out_dir>/val/val.txt and
    <out_dir>/tst/tst.txt (UTF-8), creating directories as needed. In each
    file the pieces of the three books are joined by four newlines.
    Returns None.
    '''
    if out_dir is None:
        out_dir = os.path.join(MAINPATH, 'data', 'processed')

    # Use the `path` argument (the previous version ignored it and always
    # read the global PATH).
    books = [get_text(path, book=number, length=0) for number in (1, 2, 3)]

    pieces = {'trn': [], 'val': [], 'tst': []}
    for book_text in books:
        # Cut points are computed once per book and shared by all splits so
        # the three pieces are contiguous and non-overlapping.
        train_end = int(len(book_text)*train)
        val_end = int(len(book_text)*(train + val))
        pieces['trn'].append(book_text[:train_end])
        pieces['val'].append(book_text[train_end:val_end])
        pieces['tst'].append(book_text[val_end:])

    for split in ('trn', 'val', 'tst'):
        split_dir = os.path.join(out_dir, split)
        os.makedirs(split_dir, exist_ok=True)
        # The poem contains non-ASCII characters; do not depend on the locale.
        with open(os.path.join(split_dir, split + '.txt'), 'w', encoding='utf-8') as f:
            f.write('\n\n\n\n'.join(pieces[split]))

In [371]:
# Write the train/val/test split files to disk using the default 0.7/0.2 split.
create_train_val_test(PATH)

In [356]:
# Load the Inferno chunk: book=1 selects Inferno, length=0 returns all of it.
inf = get_text(PATH, book=1, length=0)

In [357]:
# Length of the Inferno chunk loaded in the previous cell.
len(inf)

532366

In [358]:
# Scratch check of len(text), for comparison with len(inf) above.
# NOTE(review): no top-level `text` is assigned in the visible cells, so this
# presumably relies on leftover kernel state -- confirm or remove.
len(text)

561095