In [1]:
import os
import re
import time
import unicodedata
from pathlib import Path

from cltk.lemmatize.lat import LatinBackoffLemmatizer
from cltk.lemmatize.lat import RomanNumeralLemmatizer
from cltk.stops.lat import STOPS
from cltk.tokenizers.lat.lat import LatinWordTokenizer

# Shared CLTK Latin resources: built once here and reused by the functions
# defined in the following cells.
stops = STOPS
num_lemmatizer = RomanNumeralLemmatizer()
lemmatizer = LatinBackoffLemmatizer()
tokenizer = LatinWordTokenizer()

In [2]:
def preprocess(text):
    """Normalise raw text to lowercase ASCII letters and whitespace.

    Steps, in order:

    1. Lowercase the text.
    2. Expand the ligature letters ae/oe and strip diacritics (macrons,
       breves, accents, ...) via NFKD decomposition, so a marked letter
       keeps its base letter instead of being blanked out by step 3.
    3. Replace every remaining character that is not ``a-z`` or whitespace
       with a single space (one space per removed character).
    4. Fold the orthographic variants ``j`` -> ``i`` and ``v`` -> ``u``.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The normalised text; contains only ``a-z`` and whitespace.
    """
    text = text.lower()
    # U+00E6 (ae) and U+0153 (oe) have no Unicode decomposition, so they are
    # expanded by hand before normalisation.
    text = text.replace('\u00e6', 'ae').replace('\u0153', 'oe')
    # NFKD splits precomposed letters into base letter + combining mark(s);
    # dropping the combining marks leaves the bare base letter.
    decomposed = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    text = re.sub(r'[^a-z\s]', ' ', text)
    return text.replace('j', 'i').replace('v', 'u')

In [3]:
def lemmata(text):
    """Lemmatise text and return the surviving lemmas as one string.

    Pipeline (all helpers are the module-level CLTK objects):

    1. ``tokenizer.tokenize`` splits ``text`` into tokens.
    2. ``num_lemmatizer.lemmatize`` tags the tokens; every token that receives
       a truthy tag is discarded (presumably these are Roman numerals --
       confirm against the CLTK ``RomanNumeralLemmatizer`` behaviour).
    3. ``lemmatizer.lemmatize`` maps the remaining tokens to lemmas.
    4. Each lemma is reduced to its ``a-z`` characters; lemmas that end up
       empty or that appear in ``stops`` are dropped.

    Parameters
    ----------
    text : str
        Text to lemmatise; expected to be the output of ``preprocess``.

    Returns
    -------
    str
        The kept lemmas joined by single spaces.
    """
    tokens = tokenizer.tokenize(text)
    non_numeral_tokens = [
        token
        for token, numeral_tag in num_lemmatizer.lemmatize(tokens)
        if not numeral_tag
    ]

    # Set lookup instead of scanning the stop list once per lemma.
    stop_words = set(stops)

    kept = []
    for _token, lemma in lemmatizer.lemmatize(non_numeral_tokens):
        # Clean each lemma once; the cleaned form is both tested and emitted.
        cleaned = re.sub(r'[^a-z]', '', lemma)
        # Skip lemmas with no letters left, which would otherwise show up as
        # runs of consecutive spaces in the joined result.
        if cleaned and cleaned not in stop_words:
            kept.append(cleaned)
    return ' '.join(kept)

In [4]:
base_dir = 'corpus'
base_dir_n = 'corpus_preprocessed'

# Mirror <base_dir>/<target>/<author>/*.txt into <base_dir_n>/ with every text
# passed through preprocess() and then lemmata().
for target in ['aimait', 'terribles']:
    target_dir = os.path.join(base_dir, target)
    target_dir_n = os.path.join(base_dir_n, target)

    # os.listdir() returns entries in arbitrary order; sorting makes runs
    # reproducible.
    for author in sorted(os.listdir(target_dir)):
        author_dir = os.path.join(target_dir, author)
        # Check the input first so stray files (e.g. .DS_Store) do not get an
        # output directory created for them.
        if not os.path.isdir(author_dir):
            continue

        author_dir_n = os.path.join(target_dir_n, author)
        os.makedirs(author_dir_n, exist_ok=True)

        for filename in sorted(os.listdir(author_dir)):
            if not filename.endswith('.txt'):
                continue

            file_path = os.path.join(author_dir, filename)
            file_path_n = os.path.join(author_dir_n, filename)

            # Existing outputs are left untouched, so an interrupted run can
            # be resumed without redoing finished files.
            if os.path.exists(file_path_n):
                continue

            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
            text = preprocess(text)
            text = lemmata(text)
            # Explicit encoding: do not depend on the platform default.
            with open(file_path_n, 'w', encoding='utf-8') as file:
                file.write(text)
