In [1]:
from bs4 import BeautifulSoup
import future
import pandas as pd
import csv
import itertools
import uuid
import requests
import re

## A. Scrape texts of the entire work

In [2]:
# function for scraping entire text from ToposText with given html

def topostext(url):
    response = requests.get(url)
    if response.status_code != 200:
        raise FileNotFoundError("Failed to retrieve HTML content: " + url)
    
    data = []
    soup = BeautifulSoup(response.content, features="lxml")
    paragraphs = soup.find_all("p")  # Find all <p> tags instead of filtering by class

    for paragraph in paragraphs:
        match = re.search(r'§\s+(\d+\.\d+\.\d*)\s+(.*)$', paragraph.text)
        if match:
            Chapternparagraph = match.group(1)  # Extract the reference from the pattern
            Text = match.group(2)  # Extract the text from the pattern
            Reference = paragraph.get("id")  # Indicate book, chapter, paragraph
            UUID4 = uuid.uuid4()  # Create a unique ID

            data.append({
                'UUID4': UUID4,
                'Reference': Reference,
                'Chapter&Paragraph': Chapternparagraph,     
                'Text': Text
            })

    df = pd.DataFrame(data)
    return df

In [3]:
# link for digitized text of Natural History_book1-11
url1 = "https://topostext.org/work/148"

# link for digitized text of Natural History_book12-37
url2 = "https://topostext.org/work/153"

# construct the dataframe for two parts of the digitized text with the topostext function
df1 = topostext(url1)
df2 = topostext(url2)

In [4]:
df1.shape

(1158, 4)

In [5]:
df2.shape

(2335, 4)

In [6]:
df1.head()

Unnamed: 0,UUID4,Reference,Chapter&Paragraph,Text
0,fc9cc16f-b091-4e6f-8481-39b43631b0c0,urn:cts:latinLit:phi0978.phi001:1.1.1,1.1.1,PREFACE IN THE FORM OF A LETTER: PLINIUS SECUN...
1,ab597ded-5b09-4b65-9f5e-82591c341690,urn:cts:latinLit:phi0978.phi001:1.2.1,1.2.1,But who could judge the value of these composi...
2,fb4d5e35-9f3a-47b6-8d50-2e81671f105e,urn:cts:latinLit:phi0978.phi001:1.3.1,1.3.1,"But if Lucilius, the originator of critical sn..."
3,be4d218d-80fb-4f94-8007-5e9b4cf1cac8,urn:cts:latinLit:phi0978.phi001:1.4.1,1.4.1,"My own presumption has indeed gone further, in..."
4,b5d9b2c7-0b5d-4f0b-a50f-b6b95fecdfef,urn:cts:latinLit:phi0978.phi001:1.5.1,1.5.1,For my own part I am of opinion that a special...


In [7]:
# combine the two parts of scraped text

wholebook = pd.concat([df1, df2], ignore_index=True)
wholebook.shape

(3493, 4)

In [8]:
# store the sparsed text into csv file

wholebook.to_csv('wholebooktext.csv')

In [9]:
# construct a corpus for the entire text

wholebook_corpus = wholebook[['Chapter&Paragraph', 'Text']]
wholebook_corpus['Text'] = wholebook_corpus['Text'].str.lower()
wholebook_corpus.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wholebook_corpus['Text'] = wholebook_corpus['Text'].str.lower()


Unnamed: 0,Chapter&Paragraph,Text
0,1.1.1,preface in the form of a letter: plinius secun...
1,1.2.1,but who could judge the value of these composi...
2,1.3.1,"but if lucilius, the originator of critical sn..."
3,1.4.1,"my own presumption has indeed gone further, in..."
4,1.5.1,for my own part i am of opinion that a special...


In [13]:
import os

output_folder = "NH_wholetext"

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

for _, row in wholebook_corpus.iterrows():
    chapter_paragraph = row['Chapter&Paragraph']
    text = row['Text']
    filename = os.path.join(output_folder, f"{chapter_paragraph}_text.txt")
    
    with open(filename, 'w', encoding = 'utf-8') as file:
        file.write(text)

# print the last exported filename for check     
print(f"Exported {filename}")

Exported NH_wholetext\37.78.1_text.txt


## B. Create a dataset for geographical-related texts

In [14]:
# check the place names with annotation in the first part of the digitized book

response = requests.get(url1)
soup = BeautifulSoup(response.content, features="lxml")

links = soup.find_all("a", {"class": "place"})

for link in links[:5]:
    print(link)

<a about="https://topostext.org/place/380237SAca" class="place" lat="37.992" long="23.707">Academy</a>
<a about="https://topostext.org/place/419125LPal" class="place" lat="41.8896" long="12.4884">Palatine</a>
<a about="https://topostext.org/place/419125LEsq" class="place" lat="41.895" long="12.496">Esquiline</a>
<a about="https://topostext.org/place/419125SCap" class="place" lat="41.8933" long="12.483">Capitol</a>
<a about="https://topostext.org/place/419125PRom" class="place" lat="41.891" long="12.486">Rome</a>


In [15]:
# number of class=place detected

len(links)

5595

In [16]:
# function for text with geographical name annotation scraping

def toposgeotext(url):
    response = requests.get(url)
    if response.status_code != 200:
        raise FileNotFoundError("Failed to retrieve HTML content: " + url)

    data = []
    soup = BeautifulSoup(response.content, features="lxml")
    links = soup.find_all("a", {"class": "place"})

    for link in links:
        Place_Name = link.contents[0]  # Place name
        ToposText_ID = link.get('about')  # ToposText ID
        Lat = link.get('lat')
        Long = link.get('long')
        Parent = link.find_parent("p")
        Text = Parent.text  # Extract related text
        Reference = Parent.get("id")  # Indicate book, chapter, paragraph

        # Separate the information in Text using the regular expression pattern
        match = re.search(r'§\s+(\d+\.\d+\.\d*)\s+(.*)$', Text)
        if match:
            Chapternparagraph = match.group(1)  # Extract the reference from the pattern
            Text = match.group(2)  # Extract the remaining text from the pattern
            UUID4 = uuid.uuid4()  # Create a unique ID

            data.append({
                'UUID4': UUID4,
                'ToposText_ID': ToposText_ID,
                'Place_Name': Place_Name,
                'Reference': Reference,
                'Lat': Lat,
                'Long': Long,
                'Chapter&Paragraph': Chapternparagraph,
                'Text': Text
            })

    df = pd.DataFrame(data)
    return df

In [17]:
# construct the dataframe for two parts of the digitized text with geographical annotations with the toposgeotext function

geodf1 = toposgeotext(url1)
geodf2 = toposgeotext(url2)

In [18]:
geodf1.head()

Unnamed: 0,UUID4,ToposText_ID,Place_Name,Reference,Lat,Long,Chapter&Paragraph,Text
0,73103f96-0ed2-4b9c-855f-559f70ad0a36,https://topostext.org/place/380237SAca,Academy,urn:cts:latinLit:phi0978.phi001:1.8.1,37.992,23.707,1.8.1,For my own part I frankly confess that my work...
1,584b0137-6b50-4a94-a63e-78d64c4adee8,https://topostext.org/place/419125LPal,Palatine,urn:cts:latinLit:phi0978.phi001:2.5.1,41.8896,12.4884,2.5.1,For this reason I deem it a mark of human weak...
2,f88c1b62-043f-4b91-ae1a-2679cad7db3a,https://topostext.org/place/419125LEsq,Esquiline,urn:cts:latinLit:phi0978.phi001:2.5.1,41.895,12.496,2.5.1,For this reason I deem it a mark of human weak...
3,b9fd1510-8525-4d05-b72d-e0cf6423ac0c,https://topostext.org/place/419125SCap,Capitol,urn:cts:latinLit:phi0978.phi001:2.5.1,41.8933,12.483,2.5.1,For this reason I deem it a mark of human weak...
4,ba5a51f6-02fb-4713-bfac-34427f40d7ef,https://topostext.org/place/419125PRom,Rome,urn:cts:latinLit:phi0978.phi001:2.6.3,41.891,12.486,2.6.3,Below the sun revolves a very large star named...


In [19]:
geodf1.shape

(5595, 8)

In [20]:
geodf2.shape

(3281, 8)

In [21]:
geotext_whole = pd.concat([geodf1, geodf2], ignore_index=True)
geotext_whole.head()

Unnamed: 0,UUID4,ToposText_ID,Place_Name,Reference,Lat,Long,Chapter&Paragraph,Text
0,73103f96-0ed2-4b9c-855f-559f70ad0a36,https://topostext.org/place/380237SAca,Academy,urn:cts:latinLit:phi0978.phi001:1.8.1,37.992,23.707,1.8.1,For my own part I frankly confess that my work...
1,584b0137-6b50-4a94-a63e-78d64c4adee8,https://topostext.org/place/419125LPal,Palatine,urn:cts:latinLit:phi0978.phi001:2.5.1,41.8896,12.4884,2.5.1,For this reason I deem it a mark of human weak...
2,f88c1b62-043f-4b91-ae1a-2679cad7db3a,https://topostext.org/place/419125LEsq,Esquiline,urn:cts:latinLit:phi0978.phi001:2.5.1,41.895,12.496,2.5.1,For this reason I deem it a mark of human weak...
3,b9fd1510-8525-4d05-b72d-e0cf6423ac0c,https://topostext.org/place/419125SCap,Capitol,urn:cts:latinLit:phi0978.phi001:2.5.1,41.8933,12.483,2.5.1,For this reason I deem it a mark of human weak...
4,ba5a51f6-02fb-4713-bfac-34427f40d7ef,https://topostext.org/place/419125PRom,Rome,urn:cts:latinLit:phi0978.phi001:2.6.3,41.891,12.486,2.6.3,Below the sun revolves a very large star named...


In [22]:
geotext_whole.shape

(8876, 8)

In [23]:
geotext_whole.to_csv('geotext_whole.csv')

## C. Create dataset and corpus files of India-related content

In [24]:
# transfer the data type of corresponding coordinates to numbers

geotext_whole['Lat'] = geotext_whole['Lat'].astype(float)
geotext_whole['Long'] = geotext_whole['Long'].astype(float)

In [25]:
# define the latitude and longitude ranges for Indian region in the context

lat_range = (5, 35)
long_range = (65, 95)

# create a boolean mask for filtering
mask = (geotext_whole['Lat'].between(*lat_range)) & (geotext_whole['Long'].between(*long_range))

# apply the mask to filter the dataframe
geotext_india = geotext_whole[mask]

geotext_india.head()

Unnamed: 0,UUID4,ToposText_ID,Place_Name,Reference,Lat,Long,Chapter&Paragraph,Text
85,f102882f-623f-4141-bda4-5cd63b62902b,https://topostext.org/place/300740RInd,India,urn:cts:latinLit:phi0978.phi001:2.75.1,30.0,74.0,2.75.1,Similarly it is reported that at the town of S...
92,5ccc9549-f64e-4e76-88c4-c42f42aebbf6,https://topostext.org/place/300740RInd,India,urn:cts:latinLit:phi0978.phi001:2.75.1,30.0,74.0,2.75.1,Similarly it is reported that at the town of S...
93,c726650a-0ca7-427d-9f55-27408a7b3bab,https://topostext.org/place/300740RInd,India,urn:cts:latinLit:phi0978.phi001:2.75.1,30.0,74.0,2.75.1,Similarly it is reported that at the town of S...
218,b0010e6e-d90a-4b3d-9f9e-9992901efd6c,https://topostext.org/place/254683WInd,Indus,urn:cts:latinLit:phi0978.phi001:2.98.1,25.4487,68.3192,2.98.1,Near the town of Harpasa in Asia stands a jagg...
343,6ef0c7fc-8d85-4565-b95f-9983b60897c7,https://topostext.org/place/300740RInd,India,urn:cts:latinLit:phi0978.phi001:2.112.1,30.0,74.0,2.112.1,"Our own portion of the earth, which is my subj..."


In [26]:
geotext_india.shape

(229, 8)

In [27]:
geotext_india.to_csv('geotext_indianregion.csv')

In [28]:
# construct a corpus for the text mentioned place names in Indian subcontinent

india_corpus = geotext_india[['Chapter&Paragraph', 'Text']].drop_duplicates(subset='Text').reset_index(drop=True)
india_corpus['Text'] = india_corpus['Text'].str.lower()
india_corpus[:5]

Unnamed: 0,Chapter&Paragraph,Text
0,2.75.1,similarly it is reported that at the town of s...
1,2.98.1,near the town of harpasa in asia stands a jagg...
2,2.112.1,"our own portion of the earth, which is my subj..."
3,4.17.4,"such is macedonia, which was once the mistress..."
4,4.26.2,"at this spot begins a well-wooded district, wh..."


In [29]:
output_folder = "NH_geotext_india"

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

for _, row in india_corpus.iterrows():
    chapter_paragraph = row['Chapter&Paragraph']
    text = row['Text']
    filename = os.path.join(output_folder, f"{chapter_paragraph}_text.txt")
    
    with open(filename, 'w', encoding = 'utf-8') as file:
        file.write(text)

# print the last exported filename for check     
print(f"Exported {filename}")

Exported NH_geotext_india\37.77.1_text.txt


In [30]:
# check the distinct place names in the selected region

distinct_places = geotext_india['Place_Name'].unique()
distinct_places

array(['India', 'Indus', 'Ganges', 'Acesinus', 'Hydaspes', 'Taprobane',
       'Arachosia', 'Muziris', 'Baragaza', 'Ceylon'], dtype=object)