In [1]:
import string
import pandas as pd
import os

In [11]:
target_directory = os.path.expanduser("~/Downloads/bertopic_research/bertopic_test_data_models/Dickens")
os.chdir(target_directory)

In [19]:
input_file_path = "./expectations.txt"
output_dir_path = "./expectations_chunked"

print(f"Current working directory: {os.getcwd()}")

if not os.path.exists(input_file_path):
    print(f"File does not exist: {input_file_path}")
else:
    print(f"File exists: {input_file_path}")

# create the output directory if it does not exist
if not os.path.exists(output_dir_path):
    os.mkdir(output_dir_path)

Current working directory: /Users/magnushome/Downloads/bertopic_research/bertopic_test_data_models/Dickens
File exists: ./expectations.txt


In [20]:
# define a function to clean the text
def clean_text(text):
    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # lower case
    text = text.lower()
    # remove new lines and line breaks
    text = text.replace("\n", " ").replace("\r", "")
    return text

In [21]:
# define the default chunk size
default_chunk_size = 50
default_overlap_size = 0

In [22]:
# define a function to divide the text into chunks and write them to files
def divide_text_into_chunks(tokens, chunk_size=default_chunk_size, overlap_size=default_overlap_size):
    # calculate the number of chunks
    num_chunks = (len(tokens) - chunk_size) // (chunk_size - overlap_size) + 1
    # create a data frame to store the chunks
    data = pd.DataFrame(columns=["text"])
    # iterate over the chunks
    for i in range(num_chunks):
        # get the chunk
        start = i * (chunk_size - overlap_size)
        end = start + chunk_size
        chunk = tokens[start:end]
        # create the file name
        file_name = "expectations_{:05d}.txt".format(i+1)
        # create the file path
        file_path = os.path.join(output_dir_path, file_name)
        # write the chunk to the file
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(" ".join(chunk))
        # add the chunk to the data frame
        data.loc[file_name] = [" ".join(chunk)]
    return data

In [23]:
# read the input file
with open(input_file_path, "r", encoding="utf-8") as f:
    text = f.read()

# clean the text
text = clean_text(text)

# tokenize the text
tokens = text.split()

In [24]:
# divide the text into chunks and write them to files
data = divide_text_into_chunks(tokens)

# print the data frame
print(data)

                                                                     text
expectations_00001.txt  great expectations 1867 edition by charles dic...
expectations_00002.txt  explicit than pip so i called myself pip and c...
expectations_00003.txt  likeness of either of them for their days were...
expectations_00004.txt  with curly black hair from the character and t...
expectations_00005.txt  beside their grave and were sacred to the memo...
...                                                                   ...
expectations_03684.txt  of you said estella have you of late very ofte...
expectations_03685.txt  that remembrance i have given it a place in my...
expectations_03686.txt  very glad to do so glad to part again estella ...
expectations_03687.txt  say that to me then you will not hesitate to s...
expectations_03688.txt  and good to me as you were and tell me we are ...

[3688 rows x 1 columns]


In [25]:
# save the data frame to a CSV file
csv_file_path = "expectations_chunked.csv"
data.to_csv(csv_file_path)