# Dependencies

In [1]:
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader, CSVLoader, Docx2txtLoader
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import re
import pickle

# Directory with documents

In [2]:
# Folder (relative to the notebook's working directory) holding the source documents
data_dir = '../data/docs'

# Absolute location for vector stores: the parent of the resolved ../data path,
# then back down into data/vector_stores
LOCAL_VECTOR_STORE_DIR = Path("../data").resolve().parent / "data" / "vector_stores"

# Clean text

In [3]:
def clean_text(text):
    """Collapse every run of whitespace in ``text`` into one space and trim both ends."""
    # Newlines, tabs and repeated spaces all become a single space
    collapsed = re.sub(r'\s+', ' ', text)
    # At most one space can remain at each end after collapsing; remove it
    return collapsed.strip()


# Function to load documents

In [4]:
def load_documents(data_dir):
    """
    Gather every supported file under ``data_dir`` and return the cleaned documents.

    The directory is searched once per format, in the order txt, pdf, csv, docx,
    using the ``**/*.<ext>`` glob for each. After loading, every document's
    ``page_content`` is replaced by the result of ``clean_text``.
    """
    root = Path(data_dir).as_posix()

    # One entry per format: (glob pattern, loader class, extra DirectoryLoader arguments).
    # Only the CSV loader is given loader_kwargs (an explicit utf8 encoding).
    loader_specs = (
        ("**/*.txt", TextLoader, {}),
        ("**/*.pdf", PyPDFLoader, {}),
        ("**/*.csv", CSVLoader, {"loader_kwargs": {"encoding": "utf8"}}),
        ("**/*.docx", Docx2txtLoader, {}),
    )

    documents = []
    for pattern, loader_cls, extra in loader_specs:
        loader = DirectoryLoader(
            root, glob=pattern, loader_cls=loader_cls, show_progress=True, **extra
        )
        documents += loader.load()

    # Normalise whitespace in place on each loaded document
    for doc in documents:
        doc.page_content = clean_text(doc.page_content)

    return documents

# Test the function
data_directory = "../data/docs"
loaded_documents = load_documents(data_directory)

# Print the number of documents loaded and a preview of the cleaned text
print(f"Loaded {len(loaded_documents)} documents.")
if loaded_documents:
    # Cap the preview at 500 characters so the cell output stays readable
    print(f"First cleaned document:\n{loaded_documents[0].page_content[:500]}")
else:
    # Nothing was loaded (e.g. empty or mistyped directory); indexing [0] here
    # would raise IndexError, so report the situation instead
    print(f"No supported documents found under {data_directory!r}.")


0it [00:00, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.74it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]

Loaded 5 documents.
First cleaned document:
Course 1: Blockchain Fundamentals What You'll Learn ● Understand the basic concepts of blockchain, its history, and its importance in the digital world. ● Explore the underlying technology, including cryptography and consensus mechanisms. ● Identify different types of blockchain (public, private, and hybrid). ● Discover the benefits of blockchain, such as security, transparency, and decentralization. Prerequisites ● None. This course is introductory and perfect for beginners. Duration ● 5 weeks 





# Split documents into fragments

In [5]:
def split_into_fragments(documents, chunk_size=1200, chunk_overlap=100):
    """
    Break the given documents into fragments with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split; handed to the splitter's ``split_documents``.
        chunk_size: Passed through as the splitter's ``chunk_size``.
        chunk_overlap: Passed through as the splitter's ``chunk_overlap``.

    Returns:
        Whatever ``split_documents`` produces for ``documents``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(documents)

# Chunk the loaded documents using the default size and overlap
fragments = split_into_fragments(loaded_documents)

# Report the fragment count, then preview up to 500 characters of the first three
print(
    f"Number of fragments created: {len(fragments)}",
    f"First fragments:\n{[frag.page_content[:500] for frag in fragments[:3]]}",
    sep="\n",
)


Number of fragments created: 9
First fragments:
["Course 1: Blockchain Fundamentals What You'll Learn ● Understand the basic concepts of blockchain, its history, and its importance in the digital world. ● Explore the underlying technology, including cryptography and consensus mechanisms. ● Identify different types of blockchain (public, private, and hybrid). ● Discover the benefits of blockchain, such as security, transparency, and decentralization. Prerequisites ● None. This course is introductory and perfect for beginners. Duration ● 5 weeks ", "Included Materials ● Detailed video tutorials. ● Pre-configured contract templates for XRPL. ● Access to simulation tools. Certification ● Certificate awarded after completing a practical project, such as creating a token. Course 3: Introduction to Cryptocurrency Trading What You'll Learn ● Understand the basic concepts of trading in the cryptocurrency market. ● Analyze charts and conduct technical and fundamental analysis. ● Explore differen

# Save fragments for the next step

In [6]:
# Serialise the fragment list with pickle so it can be reloaded later
with Path("../data/fragments.pkl").open("wb") as f:
    pickle.dump(fragments, f)

# Confirm how many fragments were written
print(f"Fragments created: {len(fragments)}")

Fragments created: 9


In [7]:
print(loaded_documents[0])  # Display the content of the first document


page_content="Course 1: Blockchain Fundamentals What You'll Learn ● Understand the basic concepts of blockchain, its history, and its importance in the digital world. ● Explore the underlying technology, including cryptography and consensus mechanisms. ● Identify different types of blockchain (public, private, and hybrid). ● Discover the benefits of blockchain, such as security, transparency, and decentralization. Prerequisites ● None. This course is introductory and perfect for beginners. Duration ● 5 weeks (3 hours per week). Included Materials ● Recorded and live classes. ● PDF guides and step-by-step tutorials. ● Quizzes to test your knowledge. Certification ● Certificate of completion available after completing the quizzes and a final project. Course 2: Blockchain in Practice with XRPL What You'll Learn ● Create and configure a wallet on the XRP Ledger (XRPL). ● Execute transactions using XRPL. ● Issue and trade digital tokens. ● Explore practical use cases such as NFT issuance an

In [8]:
print(fragments[:3])  # Display the first 3 fragments


[Document(page_content="Course 1: Blockchain Fundamentals What You'll Learn ● Understand the basic concepts of blockchain, its history, and its importance in the digital world. ● Explore the underlying technology, including cryptography and consensus mechanisms. ● Identify different types of blockchain (public, private, and hybrid). ● Discover the benefits of blockchain, such as security, transparency, and decentralization. Prerequisites ● None. This course is introductory and perfect for beginners. Duration ● 5 weeks (3 hours per week). Included Materials ● Recorded and live classes. ● PDF guides and step-by-step tutorials. ● Quizzes to test your knowledge. Certification ● Certificate of completion available after completing the quizzes and a final project. Course 2: Blockchain in Practice with XRPL What You'll Learn ● Create and configure a wallet on the XRP Ledger (XRPL). ● Execute transactions using XRPL. ● Issue and trade digital tokens. ● Explore practical use cases such as NFT i

In [9]:
# Show the full cleaned text of the last loaded document
print(f"Last page content:\n{loaded_documents[-1].page_content}")


Last page content:
Question: How does payment in XRP work? ● Answer: Payment is made directly through the chatbot interface. Here's the step-by-step process: 1. Select the Course: ○ The user tells the chatbot which course they wish to purchase. ○ The chatbot confirms the price in XRP, including the discount. 2. Wallet Address: ○ The chatbot provides the wallet address to send the payment. ○ If required, a destination tag will also be provided to identify the transaction. 3. Make the Payment: ○ The user transfers XRP from their wallet to the provided address. ○ If the user does not have a wallet, the chatbot can guide them on creating one on XRPL. 4. Transaction Confirmation: ○ The chatbot automatically checks the blockchain to confirm receipt of payment. ○ The transaction status (confirmed or pending) is shared with the user. 5. Course Access: ○ Upon payment confirmation, the chatbot sends immediate access to the course. ○ A confirmation email with course details is also sent. Discount