### Get Raw Text

In [11]:
import os
from urllib import request
import nltk
import re

def get_raw_text(title,url,path):
    """Return the text of a book, downloading it once and caching it on disk.

    The cache file is ``<path>/<title>.txt``. When that file exists and is
    non-empty it is read back; otherwise the text is fetched from ``url``,
    decoded as UTF-8 (dropping a leading byte-order mark), and saved.

    Args:
        title: Book title; used as the base name of the cache file.
        url: Location of the plain-text file, opened with
            ``urllib.request.urlopen``.
        path: Directory that holds the cached text files. It is created
            when it does not exist yet.

    Returns:
        tuple: ``(filename, raw)`` -- the cache file path and the decoded text.

    Raises:
        urllib.error.URLError: If the download fails.
        OSError: If the cache file cannot be read or written.
    """
    # os.path.join tolerates a `path` with or without a trailing separator.
    filename = os.path.join(path, title + '.txt')

    # Check if the file is stored locally
    if os.path.isfile(filename) and os.stat(filename).st_size != 0:
        print("{title} file already exists".format(title=title))
        # Explicit encoding + newline='' so the cached text is identical to
        # what a fresh download returns (no platform-dependent decoding and
        # no "\r\n" -> "\n" translation that would shift chunk boundaries).
        with open(filename, 'r', encoding='utf-8-sig', newline='') as f:
            raw = f.read()

    else:
        print("{title} file does not already exist. Grabbing from Project Gutenberg".format(title=title))
        # Context manager closes the connection even if read/decode fails.
        with request.urlopen(url) as response:
            raw = response.read().decode('utf-8-sig')
        print("Saving {title} file".format(title=title))
        cache_dir = os.path.dirname(filename)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(filename, 'w', encoding='utf-8', newline='') as outfile:
            outfile.write(raw)
    return filename, raw

In [13]:
# Where the text comes from and where the cached copy is stored.
path = 'text_files/'
url = 'https://www.gutenberg.org/files/893/893-0.txt'
title = 'The History Of The Decline And Fall Of The Roman Empire, Complete'

# Fetch the book (or reuse the cached copy) and keep both path and text.
filename, raw = get_raw_text(title=title, url=url, path=path)

The History Of The Decline And Fall Of The Roman Empire, Complete file does not already exist. Grabbing from Project Gutenberg


Saving The History Of The Decline And Fall Of The Roman Empire, Complete file


In [14]:
# Show only the beginning of the text as a sanity check; printing the whole
# book would flood the notebook output.
print(raw[:1000])

The Project Gutenberg EBook of The History of The Decline and Fall of the Roman Empire, by Edward Gibbon

This eBook is for the use of anyone anywhere in the United States and most
other parts of the world at no cost and with almost no restrictions
whatsoever.  You may copy it, give it away or re-use it under the terms of
the Project Gutenberg License included with this eBook or online at
www.gutenberg.org.  If you are not located in the United States, you'll have
to check the laws of the country where you are located before using this ebook.

Title: The History of The Decline and Fall of the Roman Empire
       Volume 4

Author: Edward Gibbon

Commentator: H. H. Milman

Release Date: April, 1997 [EBook #893]
[Most recently updated: March 23, 2020]

Language: English

Character set encoding: UTF-8

*** START OF THIS PROJECT GUTENBERG EBOOK DECLINE AND FALL OF THE ROMAN EMPIRE ***




Produced by David Reed, Dale R. Fredrickson and David Widger





HISTORY OF THE DECLINE AND FALL OF TH

### Split Text and Upload Chunks to Blob Storage

In [32]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv
# Load environment variables via python-dotenv, then read the storage
# account name and its credential from the process environment.
# Either value is None when the corresponding variable is not set.
load_dotenv()

storageaccount = os.environ.get('STORAGE_ACCOUNT')
storage_creds = os.environ.get('SAS_TOKEN')

def split_text(raw, chunk_size=1000, chunk_overlap=0):
    """Split raw text into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        raw: The full text to split.
        chunk_size: Upper bound on the length of each chunk, measured with
            ``len``. Defaults to 1000.
        chunk_overlap: Overlap between consecutive chunks, in the same
            unit as ``chunk_size``. Defaults to 0.

    Returns:
        list: The documents returned by ``create_documents``; each one
        exposes its text as ``page_content``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

    texts = text_splitter.create_documents([raw])
    # Preview at most the first two chunks. Slicing (unlike indexing with
    # texts[0] / texts[1]) is safe when the input yields fewer than two.
    for doc in texts[:2]:
        print(doc)
    return texts

def upload_blobs(texts, storageaccount, book_title, max_chunks=40,
                 container_name='silver', credential=None):
    """Upload text chunks to Azure Blob Storage, one blob per chunk.

    Chunk ``i`` is written to ``<book_title>/<i>.txt`` in the target
    container, overwriting any existing blob with that name.

    Args:
        texts: Sequence of documents exposing their text as ``page_content``.
        storageaccount: Storage account name, used to build the account URL.
        book_title: Prefix ("folder") for the blob names.
        max_chunks: Upload at most this many chunks from the start of
            ``texts``. Defaults to 40; pass ``None`` to upload every chunk.
        container_name: Destination container. Defaults to ``'silver'``.
        credential: Credential handed to ``BlobServiceClient``. When
            ``None``, the module-level ``storage_creds`` is used.

    Returns:
        None
    """
    if credential is None:
        credential = storage_creds

    blob_service = BlobServiceClient(
        account_url=f"https://{storageaccount}.blob.core.windows.net", credential=credential
    )
    blob_container = blob_service.get_container_client(container_name)

    # Slice instead of indexing over a fixed range so that a text with fewer
    # chunks than the cap does not raise IndexError part-way through.
    selected = texts if max_chunks is None else texts[:max_chunks]
    for i, doc in enumerate(selected):
        blob_name = f'{book_title}/{i}.txt'

        # Log only the blob name; echoing every chunk floods the output.
        print(blob_name)
        blob_container.upload_blob(blob_name, doc.page_content, overwrite=True)
    return

# Chunk the downloaded book, then push the chunks to blob storage under a
# prefix named after the book title.
texts = split_text(raw=raw)
upload_blobs(texts=texts, storageaccount=storageaccount, book_title=title)

page_content="The Project Gutenberg EBook of The History of The Decline and Fall of the Roman Empire, by Edward Gibbon\r\n\r\nThis eBook is for the use of anyone anywhere in the United States and most\r\nother parts of the world at no cost and with almost no restrictions\r\nwhatsoever.  You may copy it, give it away or re-use it under the terms of\r\nthe Project Gutenberg License included with this eBook or online at\r\nwww.gutenberg.org.  If you are not located in the United States, you'll have\r\nto check the laws of the country where you are located before using this ebook.\r\n\r\nTitle: The History of The Decline and Fall of the Roman Empire\r\n       Volume 4\r\n\r\nAuthor: Edward Gibbon\r\n\r\nCommentator: H. H. Milman\r\n\r\nRelease Date: April, 1997 [EBook #893]\r\n[Most recently updated: March 23, 2020]\r\n\r\nLanguage: English\r\n\r\nCharacter set encoding: UTF-8\r\n\r\n*** START OF THIS PROJECT GUTENBERG EBOOK DECLINE AND FALL OF THE ROMAN EMPIRE ***\r\n\r\n\r\n\r\n\r\nProdu