# Chunking Strings v2

v2 is the same as v1, except that it takes a directory of input text files (or, optionally, a single file) and compiles their chunks into a single CSV output file.

In [97]:
import re

def to_camel_case(input_string):
    """
    Convert `input_string` to camelCase.

    Every character that is not an ASCII letter, a digit, or whitespace is
    dropped, the remainder is split on whitespace, the first word is
    lower-cased and each following word is capitalized (first character
    upper-case, the rest lower-case). Returns "" when no words remain.
    """
    # Strip punctuation/special characters, then break into whitespace-separated words
    tokens = re.sub(r'[^a-zA-Z0-9\s]', '', input_string).split()

    if not tokens:
        return ""

    head, *tail = tokens
    return head.lower() + ''.join(map(str.capitalize, tail))

# Example usage: punctuation is stripped and every word after the first is capitalized
input_string = "Hello, World! How are you?"
output_string = to_camel_case(input_string)
print(output_string)  # Output: helloWorldHowAreYou

helloWorldHowAreYou


In [98]:
#Inputs
import os
from pathlib import PureWindowsPath


def title_from_path(path):
    """
    Return the last component of `path` with its final extension removed.

    PureWindowsPath is used because it accepts both backslash and forward
    slash separators on any host OS and ignores a trailing separator. Only
    the last extension is dropped, so 'St. Thomas.txt' yields 'St. Thomas'
    instead of being cut at the first dot.
    """
    return PureWindowsPath(path).stem


# Directory of .txt files to process when no single file is given
readPath = r"C:\Users\srobi\OneDrive\Documents\Data\eBooks\summaTheologica"
# Set to None to process every .txt file found in readPath instead
readPathSingle = r"C:\Users\srobi\OneDrive\Documents\BibleStudy\eBooks\theCatechismOfTheCatholicChurch.txt" #None

# Folder that receives the compiled csv
savePath0 = r"C:\Users\srobi\OneDrive\Documents\Data\textFiles"

# A single file, when given, takes precedence over the directory
if readPathSingle:
    readPath = readPathSingle

title = title_from_path(readPath)

# Multi-word names are collapsed to camelCase so the csv file name has no spaces
if ' ' in title:
    title = to_camel_case(title)

print(f'{title}\n')

csvFileName = title
# os.path.join uses the platform separator instead of mixing '\' and '/'
savePath = os.path.join(savePath0, f'{csvFileName}.csv')
print(savePath)

theCatechismOfTheCatholicChurch

C:\Users\srobi\OneDrive\Documents\Data\textFiles/theCatechismOfTheCatholicChurch.csv


In [99]:
# Imported unconditionally so later cells can rely on them in either mode
import glob, os

if not readPathSingle:

    # Directory mode: collect every .txt file directly inside readPath.
    # glob returns matches in arbitrary order, so sort them to keep the
    # row order of the output reproducible between runs.
    list_of_files = sorted(glob.glob(os.path.join(readPath, '*.txt')))

    # Fail with a clear message instead of max() raising on an empty list
    if not list_of_files:
        raise FileNotFoundError(f'No .txt files found in {readPath}')

    # File with the greatest os.path.getctime value; printed for reference only
    latest_file = max(list_of_files, key=os.path.getctime)
    print(latest_file)

else:
    # Single-file mode: process only the file named in readPathSingle
    list_of_files = [readPathSingle]

In [100]:
# Preview at most the first three input paths
list_of_files[:3]

['C:\\Users\\srobi\\OneDrive\\Documents\\BibleStudy\\eBooks\\theCatechismOfTheCatholicChurch.txt']

In [101]:
def create_overlapping_chunks(text, chunk_size, overlap):
    """
    Breaks `text` into overlapping chunks of length `chunk_size`.
    Consecutive chunks overlap by `overlap` characters.

    Chunk starts advance by `chunk_size - overlap` characters from 0 until
    the end of `text` is reached. The last chunks may be shorter than
    `chunk_size`, and the final one can consist solely of characters already
    present in the previous chunk (e.g. a 10-character text with
    chunk_size=10 and overlap=5 yields text[0:10] and text[5:10]).

    Args:
        text: The string to split.
        chunk_size: Number of characters per chunk.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than `chunk_size`.

    Returns:
        A list of substrings of `text`, in order; empty when `text` is empty.

    Raises:
        ValueError: If `chunk_size - overlap` is not positive. The start
            position would never advance, which previously caused an
            endless loop on non-empty text.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"chunk_size ({chunk_size}) must be greater than overlap ({overlap})"
        )

    # range() stops once the start position reaches len(text), so no extra
    # break is needed.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

In [102]:
from collections import defaultdict
from pathlib import PureWindowsPath

import chardet

# Customize your desired chunk_size and overlap.
# They do not depend on the file, so they are set once before the loop.
chunk_size = 1000
overlap = chunk_size//2

# Column name -> list of values, one entry per chunk across all files
mydict = defaultdict(list)

for file in list_of_files:

    # PureWindowsPath accepts both '\' and '/' separators, and .stem drops
    # only the final extension, so dotted names such as 'Part 1.2.txt' keep
    # distinct titles instead of being cut at the first dot.
    # NOTE(review): spaces are simply removed here, whereas the Inputs cell
    # camel-cases multi-word titles — confirm which is intended.
    title = PureWindowsPath(file).stem.replace(' ', '')
    print(title)

    # Detect file encoding from the raw bytes
    with open(file, 'rb') as f:
        result = chardet.detect(f.read())
        print(result)

    # chardet reports an encoding of None when it cannot decide; fall back to
    # utf-8 explicitly rather than letting open() use the platform default.
    encoding = result['encoding'] or 'utf-8'
    with open(file, 'r', encoding=encoding) as f:
        text = f.read()

    chunks = create_overlapping_chunks(text, chunk_size, overlap)

    print(len(chunks))

    # 'index' is the 1-based position of the chunk within its file and
    # 'len' is the total number of chunks produced for that file.
    for i, x in enumerate(chunks, start=1):
        mydict['title'].append(title)
        mydict['index'].append(i)
        mydict['len'].append(len(chunks))
        mydict['text'].append(x)
        mydict['filePath'].append(file)

theCatechismOfTheCatholicChurch
{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}
3019


In [103]:
# pandas is not imported anywhere earlier in the notebook, so import it here
# to keep this cell working after Restart Kernel -> Run All.
import pandas as pd

# One row per chunk; the columns are the keys filled into mydict
# (title, index, len, text, filePath).
df = pd.DataFrame(mydict)
# index=False keeps the DataFrame's row index out of the csv
df.to_csv(savePath, index=False)
display(df)

<IPython.core.display.Javascript object>

Unnamed: 0,title,index,len,text,filePath
0,theCatechismOfTheCatholicChurch,1,3019,--- Page 1 ---\nCATECHISM OF THE CATHOLIC CHUR...,C:\Users\srobi\OneDrive\Documents\BibleStudy\e...
1,theCatechismOfTheCatholicChurch,2,3019,n. 27-49\nI. The Desire for God nn. 27-30\n\nI...,C:\Users\srobi\OneDrive\Documents\BibleStudy\e...
2,theCatechismOfTheCatholicChurch,3,3019,ANSMISSION OF DIVINE REVELATION n. 74\nI. The ...,C:\Users\srobi\OneDrive\Documents\BibleStudy\e...
3,theCatechismOfTheCatholicChurch,4,3019,ife of the Church nn. 131-133\n\nIN BRIEF nn. ...,C:\Users\srobi\OneDrive\Documents\BibleStudy\e...
4,theCatechismOfTheCatholicChurch,5,3019,97\n\nCHAPTER ONE I BELIEVE IN GOD THE FATHER ...,C:\Users\srobi\OneDrive\Documents\BibleStudy\e...
...,...,...,...,...,...
3014,theCatechismOfTheCatholicChurch,3015,3019,e petitions is the\nglory of the Father: the s...,C:\Users\srobi\OneDrive\Documents\BibleStudy\e...
3015,theCatechismOfTheCatholicChurch,3016,3019,"on, the Church looks first to Christ's return\...",C:\Users\srobi\OneDrive\Documents\BibleStudy\e...
3016,theCatechismOfTheCatholicChurch,3017,3019,ishment necessary to\neveryone for subsistence...,C:\Users\srobi\OneDrive\Documents\BibleStudy\e...
3017,theCatechismOfTheCatholicChurch,3018,3019,"t into temptation"" we are asking God\nnot to a...",C:\Users\srobi\OneDrive\Documents\BibleStudy\e...


In [104]:
# print (rather than a bare expression) so the newlines in the first chunk are rendered
print(df['text'].iloc[0])

--- Page 1 ---
CATECHISM OF THE CATHOLIC CHURCH

Table of Contents

PROLOGUE

I. The life of man - to know and love God _ nn. 1-3

II. Handing on the Faith: Catechesis nn. 4-10

III. The Aim and Intended Readership of the Catechism
nn. 11-12

IV. Structure of this Catechism nn. 13-17

V. Practical Directions for Using this Catechism nn. 18-22

VI. Necessary Adaptations nn. 23-25

PART ONE: THE PROFESSION OF FAITH

SECTION ONE "I BELIEVE" - "WE BELIEVE" n. 26

CHAPTER ONE MAN'S CAPACITY FOR GOD nn. 27-49
I. The Desire for God nn. 27-30

II. Ways of Coming to Know God non. 31-35

II. The Knowledge of God According to the Church nn. 36-38
IV. How Can We Speak about God? nn.39-43

IN BRIEF nn. 44-49

--- Page 2 ---
CHAPTER TWO GOD COMES TO MEET MAN 1. 50
Article 1 THE REVELATION OF GOD

I. God Reveals His "Plan of Loving Goodness" nn. 51-53
II. The Stages of Revelation nn. 54-64

III. Christ Jesus -- "Mediator and Fullness of All Revelation" nn. 65-
67

IN BRIEF nn. 68-73

Article 2 THE TR