In [25]:
from sudachipy import tokenizer
from sudachipy import dictionary

tokenizer_obj = dictionary.Dictionary().create()

In [26]:
def tokenize(text):
    # Set the tokenizer mode as B which might stand for a specific mode according to your tokenizer settings
    mode = tokenizer.Tokenizer.SplitMode.C

    # Step 1: Split input text into lines
    lines = text.split('\n')

    # Step 2: Remove any lines that are empty strings
    # The list comprehension iterates over all lines and keeps only the ones that are non-empty
    non_empty_lines = [line for line in lines if line]

    # Step 3: Remove leading and trailing whitespace from each line
    # The list comprehension iterates over all non-empty lines and strips the whitespace
    trimmed_lines = [line.strip() for line in non_empty_lines]

    # Initialize an empty list to store the tokenized lines
    tokens = []

    # Step 4: Tokenize each line
    # For each line, we tokenize it into spans (words or phrases),
    # replace spaces with '_', and append the tokenized line to 'tokens'
    for line in trimmed_lines:
        tokenized_line = ["_".join([m.surface(), m.part_of_speech()[0], m.dictionary_form()])
                          for m in tokenizer_obj.tokenize(line, mode)]
        tokens.append(" ".join(tokenized_line))

    # Step 5: Join the tokenized lines back together with line breaks between them
    tokenized_text = "\n".join(tokens)

    return tokenized_text


In [27]:
import os
import glob

# Define the directory that contains the text files
input_dir = '/Users/luke/Projects/work/textbook-corpus/utils/nonfiction/'

# Define the directory to save the tokenized files
output_dir = os.path.join(input_dir, 'tagged')

# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Get a list of all .txt files in the input directory
txt_files = glob.glob(os.path.join(input_dir, '*.txt'))

# Iterate over each text file
for txt_file in txt_files:
    # Open the text file and read its content
    with open(txt_file, 'r') as file:
        content = file.read()
    
    print(f'Tokenising {txt_file}')
    # Tokenize the content of the file
    tokenized_content = tokenize(content)

    # Create the output file path
    # os.path.splitext(txt_file)[0] gets the file name without the extension
    # os.path.basename gets the base name of the file (without directories)
    output_file = os.path.join(output_dir, os.path.basename(os.path.splitext(txt_file)[0]) + '_tagged.txt')

    # Write the tokenized content to the output file
    with open(output_file, 'w') as file:
        file.write(tokenized_content)

print("Tokenization process completed!")


Tokenising /Users/luke/Projects/work/textbook-corpus/utils/nonfiction/060.txt
Tokenising /Users/luke/Projects/work/textbook-corpus/utils/nonfiction/074.txt
Tokenising /Users/luke/Projects/work/textbook-corpus/utils/nonfiction/048.txt
Tokenising /Users/luke/Projects/work/textbook-corpus/utils/nonfiction/049.txt
Tokenising /Users/luke/Projects/work/textbook-corpus/utils/nonfiction/075.txt
Tokenising /Users/luke/Projects/work/textbook-corpus/utils/nonfiction/061.txt
Tokenising /Users/luke/Projects/work/textbook-corpus/utils/nonfiction/088.txt
Tokenising /Users/luke/Projects/work/textbook-corpus/utils/nonfiction/077.txt
Tokenising /Users/luke/Projects/work/textbook-corpus/utils/nonfiction/063.txt
Tokenising /Users/luke/Projects/work/textbook-corpus/utils/nonfiction/062.txt
Tokenising /Users/luke/Projects/work/textbook-corpus/utils/nonfiction/076.txt
Tokenising /Users/luke/Projects/work/textbook-corpus/utils/nonfiction/089.txt
Tokenising /Users/luke/Projects/work/textbook-corpus/utils/nonfi