In [1]:
import os
from tqdm import tqdm
# !pip install python-docx
from docx import Document

In [2]:
# Text encoding that get_txt passes to open() when reading files.
enc = 'iso-8859-15'

def get_txt_from_docx(filename):
    """Return the paragraph text of a .docx file as one newline-joined string.

    Opens ``filename`` with ``Document`` and joins the ``text`` attribute of
    every item in ``doc.paragraphs`` with ``'\\n'``.
    """
    paragraphs = Document(filename).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

def get_txt(filename, encoding=None):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.
        encoding: Codec used to decode the file. When omitted (``None``), the
            notebook-level ``enc`` setting is used, so existing
            ``get_txt(path)`` calls behave exactly as before.

    Returns:
        The decoded file contents as a single ``str``.
    """
    if encoding is None:
        # Looked up at call time so the notebook configuration stays the default.
        encoding = enc
    with open(filename, 'r', encoding=encoding) as fp:
        return fp.read()

In [19]:
import os
import collections
import json

# Root folder of the MC1 corpus.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
data_directory = "/work/pi_dhruveshpate_umass_edu/project_19/Data/MC1"

# documents maps a running integer index to a one-element list holding
# {"doc_text": <stripped text>}.
doc_index = 0
documents = {}

for folder in ["HistoricalDocuments", "News Articles"]:
    folder_path = os.path.join(data_directory, folder)
    # os.listdir returns entries in arbitrary order; sorting makes the
    # doc_index assigned to each file reproducible from run to run.
    for name in sorted(os.listdir(folder_path)):
        if name in [".DS_Store"]:
            continue

        entry_path = os.path.join(folder_path, name)
        if name.endswith(".docx"):
            file_path = entry_path
            document = get_txt_from_docx(file_path)
            documents[doc_index] = [{
                "doc_text": document.strip(),
            }]
            doc_index += 1
        elif os.path.isdir(entry_path):
            sub_folder_path = entry_path
            # Extract the news channel name (currently not stored in documents).
            news_channel = name.split('.')[0]
            for txtname in sorted(os.listdir(sub_folder_path)):
                file_path = os.path.join(sub_folder_path, txtname)
                if txtname.endswith(".txt"):
                    txtdata = get_txt(file_path)
                    documents[doc_index] = [{
                        "doc_text": txtdata.strip(),
                    }]
                    doc_index += 1
                else:
                    print("*" * 20)
                    print(f"Ignoring {file_path}")
                    print("*" * 20)
        else:
            # A stray regular file that is neither .docx nor a folder: report it
            # the same way as other skipped files instead of letting
            # os.listdir raise NotADirectoryError.
            print("*" * 20)
            print(f"Ignoring {entry_path}")
            print("*" * 20)

# Convert to JSON format
json_data = json.dumps({"Documents": documents}, indent=4)
print(json_data)

{
    "Documents": {
        "0": [
            {
                "doc_text": "<EXCERPTS from>The Application and Validation of a Social Movement Model in Understanding the Evolution and State of One Grassroots Social Movement in Kronos: Protectors of Kronos\nA Thesis Presented for the Degree of Master of Humanities from the University of Abila, Kronos\nOctober 2005\nKyrla Halford\nKyrla.halford@uabila.ac.kronos\n\nThis dissertation was submitted in partial fulfillment of the requirements for the Degree of Master of Arts in Humanities at the University of Abila, Kronos, and the author is solely responsible for the content.\n\nThis project was sponsored in part by the One World Research Institute for Social Development and the Center for Change Now.\n\nRecommended Citation\nHalford, Kyrla, \u201cThe Application and Validation of Social Movement Models in Understanding the Evolution and State of One Grassroots Social Movement in Kronos: Protectors of Kronos\u201d (2005). Master\u2019s Th

In [6]:
# ! pip install --upgrade langchain
# ! pip install -qU langchain-text-splitters
# ! pip install nltk

In [20]:
from langchain_community.document_loaders import JSONLoader
from pprint import pprint
# When the cells run top to bottom, json_data is still the JSON *string*
# produced by json.dumps, so pprint shows the wrapped repr of that string
# rather than a nested structure.
data = json_data
pprint(data)


('{\n'
 '    "Documents": {\n'
 '        "0": [\n'
 '            {\n'
 '                "doc_text": "<EXCERPTS from>The Application and Validation '
 'of a Social Movement Model in Understanding the Evolution and State of One '
 'Grassroots Social Movement in Kronos: Protectors of Kronos\\nA Thesis '
 'Presented for the Degree of Master of Humanities from the University of '
 'Abila, Kronos\\nOctober 2005\\nKyrla '
 'Halford\\nKyrla.halford@uabila.ac.kronos\\n\\nThis dissertation was '
 'submitted in partial fulfillment of the requirements for the Degree of '
 'Master of Arts in Humanities at the University of Abila, Kronos, and the '
 'author is solely responsible for the content.\\n\\nThis project was '
 'sponsored in part by the One World Research Institute for Social Development '
 'and the Center for Change Now.\\n\\nRecommended Citation\\nHalford, Kyrla, '
 '\\u201cThe Application and Validation of Social Movement Models in '
 'Understanding the Evolution and State of One Grassro

In [21]:
import json
import nltk
from nltk.tokenize import word_tokenize
from langchain.text_splitter import NLTKTextSplitter

# Parse the JSON string built by the loading cell back into a dict. The
# isinstance guard keeps this cell re-runnable: once json_data has been rebound
# to a dict, a second json.loads call would raise TypeError.
if isinstance(json_data, (str, bytes, bytearray)):
    json_data = json.loads(json_data)

# Chunking parameters handed to the text splitter in split_into_chunks.
chunk_size = 1000
overlap_ratio = 0.1  # chunk_overlap is this fraction of chunk_size
chunk_overlap = int(overlap_ratio * chunk_size)

def split_into_chunks(text):
    """Split ``text`` with NLTKTextSplitter and describe each resulting chunk.

    The splitter is configured from the module-level ``chunk_size`` and
    ``chunk_overlap`` values and measures length with ``len``.

    Returns:
        A list of dicts, one per chunk in splitter order, with the keys
        ``chunk_index`` (0-based position), ``chunk_text`` (the chunk itself)
        and ``chunk_tokens`` (number of tokens ``word_tokenize`` yields for it).
    """
    splitter = NLTKTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return [
        {
            "chunk_index": position,
            "chunk_text": piece,
            "chunk_tokens": len(word_tokenize(piece)),
        }
        for position, piece in enumerate(splitter.split_text(text))
    ]

def chunk_text_data(data):
    """Chunk the text of every document in ``data``.

    Args:
        data: Mapping with an optional ``"Documents"`` key whose value maps a
            document key to a list of dicts; each dict's ``"doc_text"``
            (default ``""``) is the text to split.

    Returns:
        ``{"Documents": {key: [chunk, ...]}}`` with the same keys in the same
        order. The chunk lists of all entries under one key are concatenated
        in order, each produced by ``split_into_chunks``.
    """
    chunked_documents = {}

    for key, entries in data.get("Documents", {}).items():
        collected = []
        for entry in entries:
            collected.extend(split_into_chunks(entry.get("doc_text", "")))
        chunked_documents[key] = collected

    return {"Documents": chunked_documents}

# Chunk every loaded document; json_data must already be the parsed dict,
# because chunk_text_data calls .get() on it.
chunked_json_data = chunk_text_data(json_data)

# Serialise the chunked structure to an indented JSON string and show it.
chunked_json_str = json.dumps(chunked_json_data, indent=4)
print(chunked_json_str)

{
    "Documents": {
        "0": [
            {
                "chunk_index": 0,
                "chunk_text": "<EXCERPTS from>The Application and Validation of a Social Movement Model in Understanding the Evolution and State of One Grassroots Social Movement in Kronos: Protectors of Kronos\nA Thesis Presented for the Degree of Master of Humanities from the University of Abila, Kronos\nOctober 2005\nKyrla Halford\nKyrla.halford@uabila.ac.kronos\n\nThis dissertation was submitted in partial fulfillment of the requirements for the Degree of Master of Arts in Humanities at the University of Abila, Kronos, and the author is solely responsible for the content.\n\nThis project was sponsored in part by the One World Research Institute for Social Development and the Center for Change Now.\n\nRecommended Citation\nHalford, Kyrla, \u201cThe Application and Validation of Social Movement Models in Understanding the Evolution and State of One Grassroots Social Movement in Kronos: Protectors of K

In [22]:
# chunked_json_str was already built by the previous cell (the one that defines
# chunk_text_data and must therefore run first), so it is reused here instead
# of repeating the chunking pass.
# json.dumps escapes non-ASCII characters by default, so the text is plain
# ASCII; the explicit encoding only removes any dependence on the locale.
with open('mc1_chunked_data.json', 'w', encoding='utf-8') as json_file:
    json_file.write(chunked_json_str)
print("Writing to JSON file is complete.")

Writing to JSON file is complete.
