In [1]:
# Import required modules

import os
import re
import csv
import json

# Put the directory two levels up at the front of the module search path.
# Presumably this is where the project-local `tokenizer` module lives — TODO confirm.
# The entry is a relative path, so it depends on the kernel's working directory.
import sys
sys.path.insert(0, '../../')

import tokenizer

In [2]:
# Add all texts to a list
#
# Each non-blank line of the JSON Lines file is parsed as one JSON object and
# stored in `text_collection` as a single "title - body" string.

text_collection = []
texts_file = 'texts/japan_elaws.jsonl'

# Decode explicitly as UTF-8 so reading does not depend on the platform's
# default encoding.
with open(texts_file, 'r', encoding='utf-8') as f:
    for ln in f:
        if not ln.strip():                  # skip blank lines (e.g. a stray newline at EOF)
            continue

        data = json.loads(ln)
        title = data.get('title', '')       # returns '' if 'title' key is not present
        body = data.get('body', '')         # returns '' if 'body' key is not present

        # Text cleanup functions
        body = re.sub(r'\n+', '\n', body)   # replace multiple newlines with only one

        text_collection.append(title + " - " + body)

In [3]:
# Set the output location
#
# The corpus root can be overridden with the LAWS_CORPUS_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset or empty, the original hard-coded location is used.

input_dir = os.environ.get('LAWS_CORPUS_DIR') or '/Users/luke/Projects/corpora/corpora/laws/'
output_dir = os.path.join(input_dir, 'tagged')
os.makedirs(output_dir, exist_ok=True)  # create the directory if missing; no error if it exists

In [4]:
# Set variables for file name incrementation

strings_per_file = 500                  # Number of strings per file
file_number = 0                         # The current file number
line_number = 0                         # The current line within the file

# Build the word list and tagged corpus
word_list = []
word_index = {}

# Every output file goes to `output_dir` (set in the output-location cell), so
# the first file lands next to the others instead of under 'texts/'.
# A file is opened lazily, right before its first line is written, so no empty
# file is produced when the number of texts is zero or an exact multiple of
# `strings_per_file`.
output_file = None

try:
    for text in text_collection:
        if output_file is None:
            output_path = os.path.join(output_dir, f'laws-{file_number}.txt')
            # Explicit UTF-8 so the output does not depend on the platform default.
            output_file = open(output_path, 'w', encoding='utf-8')

        tokens = tokenizer.tokenize(text)
        # Presumably updates word_list / word_index in place — verify against tokenizer.
        tokenizer.build_word_list(tokens, word_list, word_index)
        output_file.write(tokenizer.tag(tokens) + '\n')

        line_number += 1                    # Increment the line number

        if line_number == strings_per_file: # Limit reached: finish this file
            output_file.close()             # Close the current file
            output_file = None              # The next text opens the next file
            file_number += 1                # Move to the next file number
            line_number = 0                 # Reset the line number
finally:
    # Close the last (possibly partial) file even if tokenizing raised.
    if output_file is not None:
        output_file.close()

In [6]:
# Write the word list to file
#
# Each element of `word_list` is written as one CSV row.

# newline='' lets the csv module control line endings itself; the explicit
# UTF-8 encoding keeps the output independent of the platform default.
with open('wordlist.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows(word_list)