In [13]:
import json

# Assuming you have a Document class
class Document:
    def __init__(self, metadata, page_content):
        self.metadata = metadata
        self.page_content = page_content

    def __repr__(self):
        # Customize the representation to match the output format you want
        return f"Document(metadata={self.metadata}, page_content={self.page_content})"

# Function to read the JSON file and load it as a list of Document objects
def load_documents_from_json(filename='documents.json'):
    with open(filename, 'r') as f:
        document_dicts = json.load(f)
    
    # Convert each dictionary back into a Document object
    document_list = [Document(d['metadata'], d['page_content']) for d in document_dicts]
    return document_list

# Example usage
loaded_data = load_documents_from_json()

# Now you can access the loaded Document objects
# for doc in loaded_data:
    # print(doc.metadata)
    # print(doc.page_content[:100])  # Print first 100 characters of page content


In [18]:
loaded_data[0].page_content

'806Peptic Ulcer DiseaseFrancis K.L. Chan, James Y.W. Lau53An ulcer in the GI tract can be defined as a 5 mm or larger break in the lining of the mucosa, with appreciable depth at endoscopy or with histologic evidence of submucosal extension. An erosion is a break less than 5 mm. The distinction between an ulcer and an ero\xadsion is somewhat arbitrary. The term PUD is used to include ulcer\xadations and erosions in the stomach and duodenum from a number of causes. These lesions are called “peptic” because the enzyme pepsin, proteolytic at an acidic pH (see Chapter 51), plays a major role in causing the mucosal breaks, regardless of the inciting agent.Decades of research focused on the role of gastric acid secretion and the effects of stress, personality type, and genetics in the pathogenesis of PUD. The discovery of the histamine-2 (H2) receptor and development of H2RAs,1 and subsequently PPIs, led to major changes in the management of PUD. The discovery of Hp and its role in PUD (see

In [19]:
loaded_data[0].metadata

{'source': 'Gastrointestinal and Liver Disease.pdf',
 'file_path': 'Gastrointestinal and Liver Disease.pdf',
 'page': 0,
 'total_pages': 272,
 'format': 'PDF 1.7',
 'title': 'Management of Nonvariceal Upper Gastrointestinal Bleeding: Guideline Recommendations From the International Consensus Group',
 'author': 'Alan N.\xa0Barkun, Majid\xa0Almadi, Ernst J.\xa0Kuipers, Loren\xa0Laine, Joseph\xa0Sung, Frances\xa0Tse, Grigorios I.\xa0Leontiadis, Neena S.\xa0Abraham, Xavier\xa0Calvet, Francis K.L.\xa0Chan, James\xa0Douketis, Robert\xa0Enns, Ian M.\xa0Gralnek, Vipul\xa0Jairath, Dennis\xa0Jensen, James\xa0Lau, Gregory Y.H.\xa0Lip, Romaric\xa0Loffroy, Fauze\xa0Maluf-Filho, Andrew C.\xa0Meltzer, Nageshwar\xa0Reddy, John R.\xa0Saltzman, John K.\xa0Marshall, and Marc\xa0Bardou',
 'subject': 'Ann Intern Med 2019.171:805-822',
 'keywords': '',
 'creator': 'XPP',
 'producer': 'Adobe LiveCycle PDF Generator; modified using iText 4.2.0 by 1T3XT',
 'creationDate': '13th September 2020',
 'modDate': 'D:

In [27]:
import re

# Function to remove non-printable characters
def clean_text(text):
    # Use regex to remove any non-printable characters (including \xad)
    return re.sub(r'[^\x20-\x7E]', '', text)

# Function to clean the metadata by removing non-printable characters in string values
def clean_metadata(metadata):
    cleaned_metadata = {}
    for key, value in metadata.items():
        if isinstance(value, str):
            # Clean string values in metadata
            cleaned_metadata[key] = clean_text(value)
        else:
            # Keep non-string values as they are
            cleaned_metadata[key] = value
    return cleaned_metadata

# Process the list of Document objects
cleaned_documents = []

for doc in loaded_data:
    # Clean the page_content by removing non-printable characters
    cleaned_content = clean_text(doc.page_content)
    
    # Clean the metadata by removing non-printable characters in string values
    cleaned_meta = clean_metadata(doc.metadata)
    
    # Create a new Document object with cleaned content and cleaned metadata
    cleaned_document = Document(metadata=cleaned_meta, page_content=cleaned_content)
    
    # Append the cleaned document to the new list
    cleaned_documents.append(cleaned_document)

# Now cleaned_documents contains documents with cleaned metadata and page_content

In [28]:
cleaned_documents[0].page_content

'806Peptic Ulcer DiseaseFrancis K.L. Chan, James Y.W. Lau53An ulcer in the GI tract can be defined as a 5 mm or larger break in the lining of the mucosa, with appreciable depth at endoscopy or with histologic evidence of submucosal extension. An erosion is a break less than 5 mm. The distinction between an ulcer and an erosion is somewhat arbitrary. The term PUD is used to include ulcerations and erosions in the stomach and duodenum from a number of causes. These lesions are called peptic because the enzyme pepsin, proteolytic at an acidic pH (see Chapter 51), plays a major role in causing the mucosal breaks, regardless of the inciting agent.Decades of research focused on the role of gastric acid secretion and the effects of stress, personality type, and genetics in the pathogenesis of PUD. The discovery of the histamine-2 (H2) receptor and development of H2RAs,1 and subsequently PPIs, led to major changes in the management of PUD. The discovery of Hp and its role in PUD (see Chapter 5

In [32]:
cleaned_documents[0].metadata

{'source': 'Gastrointestinal and Liver Disease.pdf',
 'file_path': 'Gastrointestinal and Liver Disease.pdf',
 'page': 0,
 'total_pages': 272,
 'format': 'PDF 1.7',
 'title': 'Management of Nonvariceal Upper Gastrointestinal Bleeding: Guideline Recommendations From the International Consensus Group',
 'author': 'Alan N.Barkun, MajidAlmadi, Ernst J.Kuipers, LorenLaine, JosephSung, FrancesTse, Grigorios I.Leontiadis, Neena S.Abraham, XavierCalvet, Francis K.L.Chan, JamesDouketis, RobertEnns, Ian M.Gralnek, VipulJairath, DennisJensen, JamesLau, Gregory Y.H.Lip, RomaricLoffroy, FauzeMaluf-Filho, Andrew C.Meltzer, NageshwarReddy, John R.Saltzman, John K.Marshall, and MarcBardou',
 'subject': 'Ann Intern Med 2019.171:805-822',
 'keywords': '',
 'creator': 'XPP',
 'producer': 'Adobe LiveCycle PDF Generator; modified using iText 4.2.0 by 1T3XT',
 'creationDate': '13th September 2020',
 'modDate': 'D:20240611221533Z',
 'trapped': ''}

In [33]:
import json

# Assuming Document class has 'metadata' and 'page_content' attributes
# Define a function to convert the Document object into a dictionary
def document_to_dict(doc):
    return {
        'metadata': doc.metadata,   # assuming doc.metadata is already a dictionary
        'page_content': doc.page_content  # assuming doc.page_content is a string
    }

# Function to save list of Document objects to JSON
def save_documents_to_json(document_list, filename='cleaned_documents.json'):
    # Convert the list of Document objects to a list of dictionaries
    document_dicts = [document_to_dict(doc) for doc in document_list]
    
    # Write the list of dictionaries to a JSON file
    with open(filename, 'w') as f:
        json.dump(document_dicts, f, indent=4)

# Example usage with your data list
# Assuming 'data' is a list of Document objects
save_documents_to_json(cleaned_documents)

In [34]:
import json

# Assuming you have a Document class
class Document:
    def __init__(self, metadata, page_content):
        self.metadata = metadata
        self.page_content = page_content

    def __repr__(self):
        # Customize the representation to match the output format you want
        return f"Document(metadata={self.metadata}, page_content={self.page_content})"

# Function to read the JSON file and load it as a list of Document objects
def load_documents_from_json(filename='cleaned_documents.json'):
    with open(filename, 'r') as f:
        document_dicts = json.load(f)
    
    # Convert each dictionary back into a Document object
    document_list = [Document(d['metadata'], d['page_content']) for d in document_dicts]
    return document_list

# Example usage
test_load_data = load_documents_from_json()

# Now you can access the loaded Document objects
# for doc in loaded_data:
    # print(doc.metadata)
    # print(doc.page_content[:100])  # Print first 100 characters of page content


In [41]:
type(raw_documents[0])

langchain_core.documents.base.Document

In [43]:
type(loaded_data[0])

__main__.Document

In [44]:
# Import the Document class from langchain
from langchain_core.documents.base import Document

# Function to remove non-printable characters
def clean_text(text):
    # Use regex to remove any non-printable characters (including \xad)
    return re.sub(r'[^\x20-\x7E]', '', text)

# Function to clean the metadata by removing non-printable characters in string values
def clean_metadata(metadata):
    cleaned_metadata = {}
    for key, value in metadata.items():
        if isinstance(value, str):
            # Clean string values in metadata
            cleaned_metadata[key] = clean_text(value)
        else:
            # Keep non-string values as they are
            cleaned_metadata[key] = value
    return cleaned_metadata

# Process the list of __main__.Document objects and convert them to langchain Document objects
cleaned_documents = []

for doc in loaded_data:
    # Clean the page_content by removing non-printable characters
    cleaned_content = clean_text(doc.page_content)
    
    # Clean the metadata by removing non-printable characters in string values
    cleaned_meta = clean_metadata(doc.metadata)
    
    # Create a new langchain Document object with cleaned content and cleaned metadata
    cleaned_document = Document(metadata=cleaned_meta, page_content=cleaned_content)
    
    # Append the cleaned langchain Document object to the new list
    cleaned_documents.append(cleaned_document)

# Now cleaned_documents contains langchain_core Document objects


In [47]:
type(cleaned_documents[0])

langchain_core.documents.base.Document

In [48]:
import pickle

# Save the cleaned_documents list to a file
with open('cleaned_documents.pkl', 'wb') as file:
    pickle.dump(cleaned_documents, file)

print("Documents saved successfully!")

Documents saved successfully!


In [49]:
import pickle

# Load the cleaned_documents list from the file
with open('cleaned_documents.pkl', 'rb') as file:
    data = pickle.load(file)

print("Documents loaded successfully!")


Documents loaded successfully!


In [51]:
type(data[0])

langchain_core.documents.base.Document