## A. Scraping texts

In [1]:
from bs4 import BeautifulSoup
import future
import pandas as pd
import csv
import itertools
import uuid
import requests
import re

In [2]:
import os

def topostext(html):
    if not os.path.isfile(html):
        raise FileNotFoundError("File not found: " + html)
    
    data = []
    with open(html, encoding="latin-1") as file:
        soup = BeautifulSoup(file, features="lxml")
        paragraphs = soup.find_all("p")  # Find all <p> tags instead of filtering by class
        
        for paragraph in paragraphs:
            match = re.search(r'§\s+(\d+\.\d+\.\d*)\s+(.*)$', paragraph.text)
            if match:
                Chapternparagraph = match.group(1)  # Extract the reference from the pattern
                Text = match.group(2)  # Extract the text from the pattern
                Reference = paragraph.get("id")  # Indicate book, chapter, paragraph
                UUID4 = uuid.uuid4()  # Create a unique ID
                
                data.append({
                'UUID4': UUID4,
                'Reference': Reference,
                'Chapter&Paragraph': Chapternparagraph,     
                'Text': Text
            })
    
    df = pd.DataFrame(data)
    return df

In [3]:
df11 = topostext('/Users/dawn/Desktop/KUL/readings/ma thesis/code&data/NH_Eng_1-11.html')
df22 = topostext('/Users/dawn/Desktop/KUL/readings/ma thesis/code&data/NH_Eng_12-37.html')

In [4]:
df11.head()

Unnamed: 0,UUID4,Reference,Chapter&Paragraph,Text
0,7c54e707-2b72-4cca-ba99-cd5365605719,urn:cts:latinLit:phi0978.phi001:1.1.1,1.1.1,PREFACE IN THE FORM OF A LETTER: PLINIUS SECUN...
1,f6ca31f2-c2d6-494d-8194-0dc3a79acbe4,urn:cts:latinLit:phi0978.phi001:1.2.1,1.2.1,But who could judge the value of these composi...
2,61ee2536-dc3a-4ae4-95ee-921743841d17,urn:cts:latinLit:phi0978.phi001:1.3.1,1.3.1,"But if Lucilius, the originator of critical sn..."
3,3a137fc7-afdd-433e-9c15-47bd2c500ccd,urn:cts:latinLit:phi0978.phi001:1.4.1,1.4.1,"My own presumption has indeed gone further, in..."
4,488dafe0-e6ce-45d7-aba2-7722c615c2e7,urn:cts:latinLit:phi0978.phi001:1.5.1,1.5.1,For my own part I am of opinion that a special...


In [5]:
df11.shape

(1158, 4)

In [6]:
df22.head()

Unnamed: 0,UUID4,Reference,Chapter&Paragraph,Text
0,8e1f84ef-f60f-4630-b0fc-54352ca7a431,urn:cts:latinLit:phi0978.phi001:12.1.1,12.1.1,SUCH are the generic and specific characterist...
1,80916ae7-d538-4c66-9622-caa74a0b103a,urn:cts:latinLit:phi0978.phi001:12.1.2,12.1.2,The riches of earth's bounty were for a long t...
2,273653b5-2daf-4ae4-89d4-cec9e28cea25,urn:cts:latinLit:phi0978.phi001:12.2.1,12.2.1,Once upon a time trees were the temples of the...
3,fcacf3d9-2b1f-4a78-92d4-b3ee33ab0188,urn:cts:latinLit:phi0978.phi001:12.3.1,12.3.1,But who would not be justifiably surprised to ...
4,7e1c966b-d371-405a-a4c0-1c0395ce526d,urn:cts:latinLit:phi0978.phi001:12.4.1,12.4.1,This took place at about the period of the cap...


In [7]:
df22.shape

(2335, 4)

In [15]:
wholebook = pd.concat([df11, df22], ignore_index=True)
wholebook.shape

(3493, 4)

In [16]:
wholebook['Text'] = wholebook['Text'].str.replace("â\x80\x94", "-")

In [10]:
wholebook.to_csv('wholebooktext.csv')

In [17]:
wholebook_corpus = wholebook[['Chapter&Paragraph', 'Text']]
wholebook_corpus['Text'] = wholebook_corpus['Text'].str.lower()
wholebook_corpus

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wholebook_corpus['Text'] = wholebook_corpus['Text'].str.lower()


Unnamed: 0,Chapter&Paragraph,Text
0,1.1.1,preface in the form of a letter: plinius secun...
1,1.2.1,but who could judge the value of these composi...
2,1.3.1,"but if lucilius, the originator of critical sn..."
3,1.4.1,"my own presumption has indeed gone further, in..."
4,1.5.1,for my own part i am of opinion that a special...
...,...,...
3488,37.75.2,concave or convex stones are considered less v...
3489,37.75.3,to distinguish genuine and false gemstones is ...
3490,37.76.1,"i, on the other hand, am prepared to explain t..."
3491,37.77.1,for now that i have completed my survey of nat...


In [18]:
for _, row in wholebook_corpus.iterrows():
    chapter_paragraph = row['Chapter&Paragraph']
    text = row['Text']
    filename = f"{chapter_paragraph}_text.txt"
    
    with open(filename, 'w', encoding = 'latin-1') as file:
        file.write(text)
    
    print(f"Exported {filename}")

Exported 1.1.1_text.txt
Exported 1.2.1_text.txt
Exported 1.3.1_text.txt
Exported 1.4.1_text.txt
Exported 1.5.1_text.txt
Exported 1.6.1_text.txt
Exported 1.7.1_text.txt
Exported 1.8.1_text.txt
Exported 2.1.1_text.txt
Exported 2.1.2_text.txt
Exported 2.2.1_text.txt
Exported 2.3.1_text.txt
Exported 2.3.2_text.txt
Exported 2.4.1_text.txt
Exported 2.5.1_text.txt
Exported 2.5.2_text.txt
Exported 2.5.3_text.txt
Exported 2.6.1_text.txt
Exported 2.6.2_text.txt
Exported 2.6.3_text.txt
Exported 2.6.4_text.txt
Exported 2.6.5_text.txt
Exported 2.7.1_text.txt
Exported 2.8.1_text.txt
Exported 2.8.2_text.txt
Exported 2.9.1_text.txt
Exported 2.9.2_text.txt
Exported 2.10.1_text.txt
Exported 2.11.1_text.txt
Exported 2.12.1_text.txt
Exported 2.13.1_text.txt
Exported 2.13.2_text.txt
Exported 2.13.3_text.txt
Exported 2.13.4_text.txt
Exported 2.13.5_text.txt
Exported 2.13.6_text.txt
Exported 2.13.7_text.txt
Exported 2.13.8_text.txt
Exported 2.14.1_text.txt
Exported 2.14.2_text.txt
Exported 2.15.1_text.txt
Ex

Exported 5.10.3_text.txt
Exported 5.10.4_text.txt
Exported 5.11.1_text.txt
Exported 5.11.2_text.txt
Exported 5.11.3_text.txt
Exported 5.11.4_text.txt
Exported 5.12.1_text.txt
Exported 5.13.1_text.txt
Exported 5.14.1_text.txt
Exported 5.15.1_text.txt
Exported 5.15.2_text.txt
Exported 5.15.3_text.txt
Exported 5.15.4_text.txt
Exported 5.16.1_text.txt
Exported 5.17.1_text.txt
Exported 5.17.2_text.txt
Exported 5.18.1_text.txt
Exported 5.18.2_text.txt
Exported 5.19.1_text.txt
Exported 5.20.1_text.txt
Exported 5.21.1_text.txt
Exported 5.21.2_text.txt
Exported 5.21.3_text.txt
Exported 5.22.1_text.txt
Exported 5.23.1_text.txt
Exported 5.24.1_text.txt
Exported 5.25.1_text.txt
Exported 5.26.1_text.txt
Exported 5.27.1_text.txt
Exported 5.28.1_text.txt
Exported 5.28.2_text.txt
Exported 5.28.3_text.txt
Exported 5.29.1_text.txt
Exported 5.29.2_text.txt
Exported 5.29.3_text.txt
Exported 5.29.4_text.txt
Exported 5.29.5_text.txt
Exported 5.29.6_text.txt
Exported 5.30.1_text.txt
Exported 5.31.1_text.txt


Exported 8.67.1_text.txt
Exported 8.68.1_text.txt
Exported 8.69.1_text.txt
Exported 8.70.1_text.txt
Exported 8.70.2_text.txt
Exported 8.71.1_text.txt
Exported 8.72.1_text.txt
Exported 8.72.2_text.txt
Exported 8.73.1_text.txt
Exported 8.74.1_text.txt
Exported 8.75.1_text.txt
Exported 8.75.2_text.txt
Exported 8.76.1_text.txt
Exported 8.77.1_text.txt
Exported 8.78.1_text.txt
Exported 8.78.2_text.txt
Exported 8.78.3_text.txt
Exported 8.79.1_text.txt
Exported 8.80.1_text.txt
Exported 8.81.1_text.txt
Exported 8.82.1_text.txt
Exported 8.83.1_text.txt
Exported 8.84.1_text.txt
Exported 9.1.1_text.txt
Exported 9.1.2_text.txt
Exported 9.2.1_text.txt
Exported 9.3.1_text.txt
Exported 9.4.1_text.txt
Exported 9.5.1_text.txt
Exported 9.6.1_text.txt
Exported 9.7.1_text.txt
Exported 9.8.1_text.txt
Exported 9.9.1_text.txt
Exported 9.10.1_text.txt
Exported 9.11.1_text.txt
Exported 9.12.1_text.txt
Exported 9.12.2_text.txt
Exported 9.13.1_text.txt
Exported 9.14.1_text.txt
Exported 9.15.1_text.txt
Exported 9

Exported 11.96.1_text.txt
Exported 11.96.2_text.txt
Exported 11.97.1_text.txt
Exported 11.98.1_text.txt
Exported 11.99.1_text.txt
Exported 11.99.2_text.txt
Exported 11.101.1_text.txt
Exported 11.102.1_text.txt
Exported 11.103.1_text.txt
Exported 11.104.1_text.txt
Exported 11.105.1_text.txt
Exported 11.106.1_text.txt
Exported 11.107.1_text.txt
Exported 11.108.1_text.txt
Exported 11.108.2_text.txt
Exported 11.109.1_text.txt
Exported 11.110.1_text.txt
Exported 11.111.1_text.txt
Exported 11.112.1_text.txt
Exported 11.112.2_text.txt
Exported 11.113.1_text.txt
Exported 11.114.1_text.txt
Exported 11.115.1_text.txt
Exported 11.115.2_text.txt
Exported 11.116.1_text.txt
Exported 11.117.1_text.txt
Exported 11.117.2_text.txt
Exported 11.118.1_text.txt
Exported 11.119.1_text.txt
Exported 12.1.1_text.txt
Exported 12.1.2_text.txt
Exported 12.2.1_text.txt
Exported 12.3.1_text.txt
Exported 12.4.1_text.txt
Exported 12.5.1_text.txt
Exported 12.6.1_text.txt
Exported 12.7.1_text.txt
Exported 12.7.2_text.tx

Exported 16.21.1_text.txt
Exported 16.22.1_text.txt
Exported 16.23.1_text.txt
Exported 16.23.2_text.txt
Exported 16.23.3_text.txt
Exported 16.24.1_text.txt
Exported 16.25.1_text.txt
Exported 16.26.1_text.txt
Exported 16.27.1_text.txt
Exported 16.28.1_text.txt
Exported 16.29.1_text.txt
Exported 16.30.1_text.txt
Exported 16.31.1_text.txt
Exported 16.32.1_text.txt
Exported 16.33.1_text.txt
Exported 16.33.2_text.txt
Exported 16.34.1_text.txt
Exported 16.35.1_text.txt
Exported 16.36.1_text.txt
Exported 16.37.1_text.txt
Exported 16.38.1_text.txt
Exported 16.39.1_text.txt
Exported 16.40.1_text.txt
Exported 16.41.1_text.txt
Exported 16.42.1_text.txt
Exported 16.43.1_text.txt
Exported 16.44.1_text.txt
Exported 16.45.1_text.txt
Exported 16.46.1_text.txt
Exported 16.47.1_text.txt
Exported 16.48.1_text.txt
Exported 16.49.1_text.txt
Exported 16.50.1_text.txt
Exported 16.51.1_text.txt
Exported 16.52.1_text.txt
Exported 16.53.1_text.txt
Exported 16.54.1_text.txt
Exported 16.55.1_text.txt
Exported 16.

Exported 18.72.2_text.txt
Exported 18.73.1_text.txt
Exported 18.73.2_text.txt
Exported 18.74.1_text.txt
Exported 18.74.2_text.txt
Exported 18.74.3_text.txt
Exported 18.75.1_text.txt
Exported 18.75.2_text.txt
Exported 18.76.1_text.txt
Exported 18.77.1_text.txt
Exported 18.77.2_text.txt
Exported 18.77.3_text.txt
Exported 18.78.1_text.txt
Exported 18.78.2_text.txt
Exported 18.79.1_text.txt
Exported 18.79.2_text.txt
Exported 18.80.1_text.txt
Exported 18.81.1_text.txt
Exported 18.82.1_text.txt
Exported 18.83.1_text.txt
Exported 18.84.1_text.txt
Exported 18.85.1_text.txt
Exported 18.86.1_text.txt
Exported 18.87.1_text.txt
Exported 18.88.1_text.txt
Exported 18.89.1_text.txt
Exported 18.90.1_text.txt
Exported 19.1.1_text.txt
Exported 19.2.1_text.txt
Exported 19.2.2_text.txt
Exported 19.3.1_text.txt
Exported 19.4.1_text.txt
Exported 19.5.1_text.txt
Exported 19.6.1_text.txt
Exported 19.7.1_text.txt
Exported 19.8.1_text.txt
Exported 19.9.1_text.txt
Exported 19.10.1_text.txt
Exported 19.11.1_text.

Exported 22.34.1_text.txt
Exported 22.35.1_text.txt
Exported 22.36.1_text.txt
Exported 22.37.1_text.txt
Exported 22.38.1_text.txt
Exported 22.39.1_text.txt
Exported 22.40.1_text.txt
Exported 22.41.1_text.txt
Exported 22.42.1_text.txt
Exported 22.43.1_text.txt
Exported 22.44.1_text.txt
Exported 22.45.1_text.txt
Exported 22.46.1_text.txt
Exported 22.47.1_text.txt
Exported 22.48.1_text.txt
Exported 22.49.1_text.txt
Exported 22.50.1_text.txt
Exported 22.51.1_text.txt
Exported 22.52.1_text.txt
Exported 22.53.1_text.txt
Exported 22.54.1_text.txt
Exported 22.55.1_text.txt
Exported 22.56.1_text.txt
Exported 22.57.1_text.txt
Exported 22.58.1_text.txt
Exported 22.59.1_text.txt
Exported 22.60.1_text.txt
Exported 22.61.1_text.txt
Exported 22.62.1_text.txt
Exported 22.63.1_text.txt
Exported 22.64.1_text.txt
Exported 22.64.2_text.txt
Exported 22.65.1_text.txt
Exported 22.66.1_text.txt
Exported 22.67.1_text.txt
Exported 22.68.1_text.txt
Exported 22.69.1_text.txt
Exported 22.70.1_text.txt
Exported 22.

Exported 25.92.1_text.txt
Exported 25.93.1_text.txt
Exported 25.94.1_text.txt
Exported 25.95.1_text.txt
Exported 25.96.1_text.txt
Exported 25.97.1_text.txt
Exported 25.98.1_text.txt
Exported 25.99.1_text.txt
Exported 25.100.1_text.txt
Exported 25.101.1_text.txt
Exported 25.102.1_text.txt
Exported 25.103.1_text.txt
Exported 25.104.1_text.txt
Exported 25.105.1_text.txt
Exported 25.106.1_text.txt
Exported 25.107.1_text.txt
Exported 25.108.1_text.txt
Exported 25.109.1_text.txt
Exported 25.110.1_text.txt
Exported 26.1.1_text.txt
Exported 26.2.1_text.txt
Exported 26.3.1_text.txt
Exported 26.4.1_text.txt
Exported 26.5.1_text.txt
Exported 26.6.1_text.txt
Exported 26.7.1_text.txt
Exported 26.8.1_text.txt
Exported 26.9.1_text.txt
Exported 26.10.1_text.txt
Exported 26.11.1_text.txt
Exported 26.12.1_text.txt
Exported 26.13.1_text.txt
Exported 26.14.1_text.txt
Exported 26.15.1_text.txt
Exported 26.16.1_text.txt
Exported 26.17.1_text.txt
Exported 26.18.1_text.txt
Exported 26.19.1_text.txt
Exported 2

Exported 29.16.1_text.txt
Exported 29.17.1_text.txt
Exported 29.18.1_text.txt
Exported 29.19.1_text.txt
Exported 29.20.1_text.txt
Exported 29.21.1_text.txt
Exported 29.22.1_text.txt
Exported 29.23.1_text.txt
Exported 29.24.1_text.txt
Exported 29.25.1_text.txt
Exported 29.26.1_text.txt
Exported 29.27.1_text.txt
Exported 29.27.2_text.txt
Exported 29.28.1_text.txt
Exported 29.29.1_text.txt
Exported 29.30.1_text.txt
Exported 29.31.1_text.txt
Exported 29.32.1_text.txt
Exported 29.33.1_text.txt
Exported 29.34.1_text.txt
Exported 29.35.1_text.txt
Exported 29.36.1_text.txt
Exported 29.37.1_text.txt
Exported 29.38.1_text.txt
Exported 29.38.2_text.txt
Exported 29.39.1_text.txt
Exported 30.1.1_text.txt
Exported 30.2.1_text.txt
Exported 30.3.1_text.txt
Exported 30.4.1_text.txt
Exported 30.5.1_text.txt
Exported 30.6.1_text.txt
Exported 30.7.1_text.txt
Exported 30.8.1_text.txt
Exported 30.9.1_text.txt
Exported 30.10.1_text.txt
Exported 30.11.1_text.txt
Exported 30.12.1_text.txt
Exported 30.13.1_text

Exported 34.28.1_text.txt
Exported 34.29.1_text.txt
Exported 34.29.2_text.txt
Exported 34.30.1_text.txt
Exported 34.31.1_text.txt
Exported 34.32.1_text.txt
Exported 34.33.1_text.txt
Exported 34.34.1_text.txt
Exported 34.35.1_text.txt
Exported 34.36.1_text.txt
Exported 34.37.1_text.txt
Exported 34.38.1_text.txt
Exported 34.39.1_text.txt
Exported 34.40.1_text.txt
Exported 34.41.1_text.txt
Exported 34.42.1_text.txt
Exported 34.43.1_text.txt
Exported 34.43.2_text.txt
Exported 34.44.1_text.txt
Exported 34.45.1_text.txt
Exported 34.46.1_text.txt
Exported 34.47.1_text.txt
Exported 34.48.1_text.txt
Exported 34.48.2_text.txt
Exported 34.48.3_text.txt
Exported 34.49.1_text.txt
Exported 34.50.1_text.txt
Exported 34.50.2_text.txt
Exported 34.51.1_text.txt
Exported 34.52.1_text.txt
Exported 34.53.1_text.txt
Exported 34.54.1_text.txt
Exported 34.55.1_text.txt
Exported 34.56.1_text.txt
Exported 35.1.1_text.txt
Exported 35.2.1_text.txt
Exported 35.2.2_text.txt
Exported 35.2.3_text.txt
Exported 35.3.1_

In [10]:
#import the source page as soup
## encode with 'latin-1'
soup=BeautifulSoup(open("/Users/dawn/Desktop/KUL/readings/ma thesis/code&data/NH_Eng_1-11.html", encoding="latin-1"), features="lxml")

#look the content of the <a> tags class:place
links=soup.find_all("a", {"class": "place"})

for link in links:
    print(link)

<a about="https://topostext.org/place/380237SAca" class="place" lat="37.992" long="23.707">Academy</a>
<a about="https://topostext.org/place/419125LPal" class="place" lat="41.8896" long="12.4884">Palatine</a>
<a about="https://topostext.org/place/419125LEsq" class="place" lat="41.895" long="12.496">Esquiline</a>
<a about="https://topostext.org/place/419125SCap" class="place" lat="41.8933" long="12.483">Capitol</a>
<a about="https://topostext.org/place/419125PRom" class="place" lat="41.891" long="12.486">Rome</a>
<a about="https://topostext.org/place/352248LIda" class="place" lat="35.2263" long="24.7708">Ida</a>
<a about="https://topostext.org/place/419125PRom" class="place" lat="41.891" long="12.486">Rome</a>
<a about="https://topostext.org/place/380236ASSB" class="place" lat="37.952" long="23.573">Salamis</a>
<a about="https://topostext.org/place/130350REth" class="place" lat="13.01" long="35.01">Ethiopia</a>
<a about="https://topostext.org/place/271307REgy" class="place" lat="27.1" l

<a about="https://topostext.org/place/417129RLat" class="place" lat="41.7" long="12.9">Latium</a>
<a about="https://topostext.org/place/419125WTib" class="place" lat="41.8835" long="12.4755">Tiber</a>
<a about="https://topostext.org/place/419125PRom" class="place" lat="41.891" long="12.486">Rome</a>
<a about="https://topostext.org/place/411146RCam" class="place" lat="41.1" long="14.6">Campania</a>
<a about="https://topostext.org/place/430135RPic" class="place" lat="43" long="13.5">Picenum</a>
<a about="https://topostext.org/place/404160RLeu" class="place" lat="40.4" long="16">Lucania</a>
<a about="https://topostext.org/place/392165RBre" class="place" lat="39.2" long="16.5">Bruttium</a>
<a about="https://topostext.org/place/406163RIta" class="place" lat="40.6" long="16.3">Italy</a>
<a about="https://topostext.org/place/441073LAlp" class="place" lat="44.142" long="7.343">Alps</a>
<a about="https://topostext.org/place/392165RBre" class="place" lat="39.2" long="16.5">Bruttium</a>
<a about=

<a about="https://topostext.org/place/406179PBru" class="place" lat="40.641" long="17.947">Brundusium</a>
<a about="https://topostext.org/place/424163IDio" class="place" lat="42.3926" long="16.2589">Diomedia</a>
<a about="https://topostext.org/place/405195RIll" class="place" lat="40.8" long="19.8">Illyricum</a>
<a about="https://topostext.org/place/447144UAps" class="place" lat="44.6939" long="14.393">Absyrtides</a>
<a about="https://topostext.org/place/000000IEle" class="place" lat="45" long="13">Electrides</a>
<a about="https://topostext.org/place/441152UIad" class="place" lat="44.1155" long="15.2245">Iader</a>
<a about="https://topostext.org/place/431162IIss" class="place" lat="43.061" long="16.182">Issa</a>
<a about="https://topostext.org/place/432166UPha" class="place" lat="43.183" long="16.599">Pharia</a>
<a about="https://topostext.org/place/431162IIss" class="place" lat="43.061" long="16.182">Issa</a>
<a about="https://topostext.org/place/430167UKor" class="place" lat="42.965" 

<a about="https://topostext.org/place/401224LOly" class="place" lat="40.0856" long="22.3586">Olympus</a>
<a about="https://topostext.org/place/399227LTem" class="place" lat="39.877" long="22.576">Tempe</a>
<a about="https://topostext.org/place/399227WPen" class="place" lat="39.9298" long="22.7094">Peneus</a>
<a about="https://topostext.org/place/399227WPen" class="place" lat="39.9298" long="22.7094">Peneus</a>
<a about="https://topostext.org/place/394223RThs" class="place" lat="39.4" long="22.3">Thessaly</a>
<a about="https://topostext.org/place/397227RMag" class="place" lat="39.4" long="22.9">Magnesia</a>
<a about="https://topostext.org/place/400225PLei" class="place" lat="40.0244" long="22.5343">Libethra</a>
<a about="https://topostext.org/place/394230UIol" class="place" lat="39.3663" long="22.9689">Iolcos</a>
<a about="https://topostext.org/place/394230UOrm" class="place" lat="39.3524" long="22.9722">Ormenium</a>
<a about="https://topostext.org/place/393231UMet" class="place" lat="3

<a about="https://topostext.org/place/368263IKin" class="place" lat="36.978" long="26.284">Cinara</a>
<a about="https://topostext.org/place/367251PSik" class="place" lat="36.662" long="25.0901">Sicinus</a>
<a about="https://topostext.org/place/368238IHie" class="place" lat="36.842" long="23.887">Hieracia</a>
<a about="https://topostext.org/place/354269PKas" class="place" lat="35.404" long="26.938">Casos</a>
<a about="https://topostext.org/place/368246IKim" class="place" lat="36.8" long="24.55">Cimolus</a>
<a about="https://topostext.org/place/367244PMel" class="place" lat="36.7389" long="24.4199">Melos</a>
<a about="https://topostext.org/place/368259IAmo" class="place" lat="36.8333" long="25.9">Amorgos</a>
<a about="https://topostext.org/place/368246IPol" class="place" lat="36.76" long="24.64">Polyaegos</a>
<a about="https://topostext.org/place/364255PThe" class="place" lat="36.364" long="25.477">Thera</a>
<a about="https://topostext.org/place/365253IThe" class="place" lat="36.4524" lo

<a about="https://topostext.org/place/257326PThe" class="place" lat="25.684" long="32.641">Thebes</a>
<a about="https://topostext.org/place/259328UCop" class="place" lat="25.9966" long="32.816">Coptos</a>
<a about="https://topostext.org/place/312301WNil" class="place" lat="30.0918" long="31.2313">Nile</a>
<a about="https://topostext.org/place/300740RInd" class="place" lat="30" long="74">India</a>
<a about="https://topostext.org/place/280400RAra" class="place" lat="28" long="40">Arabia</a>
<a about="https://topostext.org/place/253324UPat" class="place" lat="25.3697" long="32.4739">city of Venus</a>
<a about="https://topostext.org/place/260323UDio" class="place" lat="26.0169" long="32.3125">city of Jupiter</a>
<a about="https://topostext.org/place/261326UTen" class="place" lat="26.1418" long="32.6701">Tentyris</a>
<a about="https://topostext.org/place/262319UAby" class="place" lat="26.1855" long="31.9196">Abydus</a>
<a about="https://topostext.org/place/300070RLib" class="place" lat="32.

<a about="https://topostext.org/place/386275LSip" class="place" lat="38.567" long="27.456">Sipylum</a>
<a about="https://topostext.org/place/384271PSmy" class="place" lat="38.4191" long="27.1383">Smyrna</a>
<a about="https://topostext.org/place/385272WMel" class="place" lat="38.46029" long="27.16767">Meles</a>
<a about="https://topostext.org/place/384271PSmy" class="place" lat="38.4191" long="27.1383">Smyrna</a>
<a about="https://topostext.org/place/383281LTmo" class="place" lat="38.3237" long="28.1016">Tmolus</a>
<a about="https://topostext.org/place/383281LTmo" class="place" lat="38.3237" long="28.1016">Tmolus</a>
<a about="https://topostext.org/place/370332LTau" class="place" lat="37" long="33.2">Taurus</a>
<a about="https://topostext.org/place/384271PSmy" class="place" lat="38.4191" long="27.1383">Smyrna</a>
<a about="https://topostext.org/place/385281WHer" class="place" lat="38.5178" long="28.1113">Hermus</a>
<a about="https://topostext.org/place/398305UDor" class="place" lat="39.

<a about="https://topostext.org/place/400310RPhr" class="place" lat="40" long="31">Phrygia</a>
<a about="https://topostext.org/place/401330RGal" class="place" lat="40.1" long="33">Galatia</a>
<a about="https://topostext.org/place/406370UKab" class="place" lat="40.5918" long="36.9571">Neocaesarea</a>
<a about="https://topostext.org/place/397443RArm" class="place" lat="39.702" long="44.298">Armenia</a>
<a about="https://topostext.org/place/408365WLyk" class="place" lat="40.7519" long="36.517">Lycus</a>
<a about="https://topostext.org/place/413363PAmi" class="place" lat="41.312" long="36.315">Amisus</a>
<a about="https://topostext.org/place/412370UThe" class="place" lat="41.215" long="36.967">Themiscyra</a>
<a about="https://topostext.org/place/412367WIri" class="place" lat="41.2054" long="36.7267">Iris</a>
<a about="https://topostext.org/place/408365WLyk" class="place" lat="40.7519" long="36.517">Lycus</a>
<a about="https://topostext.org/place/403359UZel" class="place" lat="40.3041" long

<a about="https://topostext.org/place/339358LLib" class="place" lat="33.925" long="35.765">Libanus</a>
<a about="https://topostext.org/place/355390RSyr" class="place" lat="35.5" long="39">Syria</a>
<a about="https://topostext.org/place/280400RAra" class="place" lat="28" long="40">Arabia</a>
<a about="https://topostext.org/place/406163RIta" class="place" lat="40.6" long="16.3">Italy</a>
<a about="https://topostext.org/place/406163RIta" class="place" lat="40.6" long="16.3">Italy</a>
<a about="https://topostext.org/place/303254UPet" class="place" lat="30.3285" long="35.4423">Petra</a>
<a about="https://topostext.org/place/315344UGaz" class="place" lat="31.523" long="34.432">Gaza</a>
<a about="https://topostext.org/place/355390RSyr" class="place" lat="35.5" long="39">Syria</a>
<a about="https://topostext.org/place/346383UPal" class="place" lat="34.5507" long="38.2688">Palmyra</a>
<a about="https://topostext.org/place/315344UGaz" class="place" lat="31.523" long="34.432">Gaza</a>
<a about="h

<a about="https://topostext.org/place/410290UByz" class="place" lat="41.0119" long="28.9836">Byzantium</a>
<a about="https://topostext.org/place/405268UKar" class="place" lat="40.5482" long="26.7508">Lysimachia</a>
<a about="https://topostext.org/place/403265LChe" class="place" lat="40.3333" long="26.5">Chersonesus</a>
<a about="https://topostext.org/place/406265WMel" class="place" lat="40.55" long="26.46">Melas</a>
<a about="https://topostext.org/place/410250PAbd" class="place" lat="40.95" long="24.9833">Abdera</a>
<a about="https://topostext.org/place/405255PSam" class="place" lat="40.5039" long="25.5348">Samothracia</a>
<a about="https://topostext.org/place/409255PMar" class="place" lat="40.874" long="25.511">Maronea</a>
<a about="https://topostext.org/place/407261PAin" class="place" lat="40.7238" long="26.0812">Aenus</a>
<a about="https://topostext.org/place/418228RMai" class="place" lat="41.75" long="22.75">Maedica</a>
<a about="https://topostext.org/place/415220RPai" class="place

<a about="https://topostext.org/place/389207LAkt" class="place" lat="38.942" long="20.68">Actium</a>
<a about="https://topostext.org/place/434109UVol" class="place" lat="43.4035" long="10.86">Volaterrae</a>
<a about="https://topostext.org/place/398267USke" class="place" lat="39.8074" long="26.7002">Scepsis</a>
<a about="https://topostext.org/place/419125PRom" class="place" lat="41.891" long="12.486">Rome</a>
<a about="https://topostext.org/place/438044UNem" class="place" lat="43.8384" long="4.3604">Nemausus</a>
<a about="https://topostext.org/place/445035UXMi" class="place" lat="44.5173" long="3.4987">Lesura</a>
<a about="https://topostext.org/place/447033UGab" class="place" lat="44.6945" long="3.3434">Gabalicum</a>
<a about="https://topostext.org/place/441073LAlp" class="place" lat="44.142" long="7.343">Alps</a>
<a about="https://topostext.org/place/425136LApe" class="place" lat="42.47" long="13.567">Apennines</a>
<a about="https://topostext.org/place/443083RLig" class="place" lat="44

In [11]:
#number of class=place detected

len(links)

5595

In [12]:
#Constructing of the Dataset and assigning UUID

def toposparse(html):
    if not os.path.isfile(html):
        raise FileNotFoundError("File not found: " + html)
    
    data = []
    with open(html, encoding="latin-1") as file:
        soup = BeautifulSoup(file, features="lxml")
        links = soup.find_all("a", {"class": "place"})
        
        for link in links:
            Place_Name = link.contents[0]  # Place name
            ToposText_ID = link.get('about')  # ToposText ID
            Lat = link.get('lat')
            Long = link.get('long')
            Parent = link.find_parent("p")
            Text = Parent.text  # Extract related text
            Reference = Parent.get("id")  # Indicate book, chapter, paragraph
            
            # Separate the information in Text using the regular expression pattern
            match = re.search(r'§\s+(\d+\.\d+\.\d*)\s+(.*)$', Text)
            if match:
                Chapternparagraph = match.group(1)  # Extract the reference from the pattern
                Text = match.group(2)  # Extract the remaining text from the pattern
                UUID4 = uuid.uuid4()  # Create a unique ID

                data.append({
                    'UUID4': UUID4,
                    'ToposText_ID': ToposText_ID,
                    'Place_Name': Place_Name,
                    'Reference': Reference,
                    'Lat': Lat,
                    'Long': Long,
                    'Chapter&Paragraph': Chapternparagraph,
                    'Text': Text
                })

    df = pd.DataFrame(data)
    return df

In [13]:
df1 = toposparse('/Users/dawn/Desktop/KUL/readings/ma thesis/code&data/NH_Eng_1-11.html')
df2 = toposparse('/Users/dawn/Desktop/KUL/readings/ma thesis/code&data/NH_Eng_12-37.html')

In [14]:
df1.head()

Unnamed: 0,UUID4,ToposText_ID,Place_Name,Reference,Lat,Long,Chapter&Paragraph,Text
0,0fb7ff85-8091-4fe5-b96d-912fc289608b,https://topostext.org/place/380237SAca,Academy,urn:cts:latinLit:phi0978.phi001:1.8.1,37.992,23.707,1.8.1,For my own part I frankly confess that my work...
1,0af04f3d-1795-4ad8-b7c2-1f4472b628c7,https://topostext.org/place/419125LPal,Palatine,urn:cts:latinLit:phi0978.phi001:2.5.1,41.8896,12.4884,2.5.1,For this reason I deem it a mark of human weak...
2,ffde7188-2c80-4f67-a5f3-58d9a9aa4a2c,https://topostext.org/place/419125LEsq,Esquiline,urn:cts:latinLit:phi0978.phi001:2.5.1,41.895,12.496,2.5.1,For this reason I deem it a mark of human weak...
3,e996e816-9510-4a5a-ba7b-d273f02ad8c8,https://topostext.org/place/419125SCap,Capitol,urn:cts:latinLit:phi0978.phi001:2.5.1,41.8933,12.483,2.5.1,For this reason I deem it a mark of human weak...
4,aec7a1e1-99f9-4942-bccd-10308a593252,https://topostext.org/place/419125PRom,Rome,urn:cts:latinLit:phi0978.phi001:2.6.3,41.891,12.486,2.6.3,Below the sun revolves a very large star named...


In [15]:
df1.shape

(5595, 8)

In [16]:
df2.shape

(3281, 8)

In [17]:
geotext_whole = pd.concat([df1, df2], ignore_index=True)
geotext_whole.head()

Unnamed: 0,UUID4,ToposText_ID,Place_Name,Reference,Lat,Long,Chapter&Paragraph,Text
0,0fb7ff85-8091-4fe5-b96d-912fc289608b,https://topostext.org/place/380237SAca,Academy,urn:cts:latinLit:phi0978.phi001:1.8.1,37.992,23.707,1.8.1,For my own part I frankly confess that my work...
1,0af04f3d-1795-4ad8-b7c2-1f4472b628c7,https://topostext.org/place/419125LPal,Palatine,urn:cts:latinLit:phi0978.phi001:2.5.1,41.8896,12.4884,2.5.1,For this reason I deem it a mark of human weak...
2,ffde7188-2c80-4f67-a5f3-58d9a9aa4a2c,https://topostext.org/place/419125LEsq,Esquiline,urn:cts:latinLit:phi0978.phi001:2.5.1,41.895,12.496,2.5.1,For this reason I deem it a mark of human weak...
3,e996e816-9510-4a5a-ba7b-d273f02ad8c8,https://topostext.org/place/419125SCap,Capitol,urn:cts:latinLit:phi0978.phi001:2.5.1,41.8933,12.483,2.5.1,For this reason I deem it a mark of human weak...
4,aec7a1e1-99f9-4942-bccd-10308a593252,https://topostext.org/place/419125PRom,Rome,urn:cts:latinLit:phi0978.phi001:2.6.3,41.891,12.486,2.6.3,Below the sun revolves a very large star named...


In [18]:
geotext_whole.shape

(8876, 8)

In [32]:
geotext_whole['Text'] = geotext_whole['Text'].str.replace("â\x80\x94", "-")

In [34]:
geotext_whole.to_csv('geotext_whole.csv')

In [40]:
geotext_corpus = geotext_whole[['Chapter&Paragraph', 'Text']].drop_duplicates(subset='Text').reset_index(drop=True)
geotext_corpus['Text'] = geotext_corpus['Text'].str.lower()
geotext_corpus

Unnamed: 0,Chapter&Paragraph,Text
0,1.8.1,for my own part i frankly confess that my work...
1,2.5.1,for this reason i deem it a mark of human weak...
2,2.6.3,below the sun revolves a very large star named...
3,2.8.1,this theory leads mortal minds upward to heave...
4,2.9.1,the first person indeed of roman nationality w...
...,...,...
1723,37.70.1,"the 'zathenes,' according to democritus, is an..."
1724,37.74.1,"new, unnamed precious stones come into existen..."
1725,37.76.1,"i, on the other hand, am prepared to explain t..."
1726,37.77.1,for now that i have completed my survey of nat...


In [52]:
for _, row in geotext_corpus.iterrows():
    chapter_paragraph = row['Chapter&Paragraph']
    text = row['Text']
    filename = f"{chapter_paragraph}_text.txt"
    
    with open(filename, 'w', encoding = 'latin-1') as file:
        file.write(text)
    
    print(f"Exported {filename}")

Exported 1.8.1_text.txt
Exported 2.5.1_text.txt
Exported 2.6.3_text.txt
Exported 2.8.1_text.txt
Exported 2.9.1_text.txt
Exported 2.22.1_text.txt
Exported 2.23.1_text.txt
Exported 2.23.2_text.txt
Exported 2.25.1_text.txt
Exported 2.31.1_text.txt
Exported 2.40.1_text.txt
Exported 2.44.2_text.txt
Exported 2.46.1_text.txt
Exported 2.48.1_text.txt
Exported 2.51.1_text.txt
Exported 2.52.1_text.txt
Exported 2.53.1_text.txt
Exported 2.54.1_text.txt
Exported 2.55.3_text.txt
Exported 2.56.1_text.txt
Exported 2.57.1_text.txt
Exported 2.58.1_text.txt
Exported 2.59.1_text.txt
Exported 2.62.1_text.txt
Exported 2.65.1_text.txt
Exported 2.67.1_text.txt
Exported 2.71.1_text.txt
Exported 2.72.1_text.txt
Exported 2.73.1_text.txt
Exported 2.74.1_text.txt
Exported 2.75.1_text.txt
Exported 2.76.1_text.txt
Exported 2.77.1_text.txt
Exported 2.78.1_text.txt
Exported 2.81.1_text.txt
Exported 2.82.1_text.txt
Exported 2.82.2_text.txt
Exported 2.84.1_text.txt
Exported 2.85.1_text.txt
Exported 2.86.1_text.txt
Expor

Exported 6.34.1_text.txt
Exported 6.34.2_text.txt
Exported 6.34.3_text.txt
Exported 6.34.4_text.txt
Exported 6.34.5_text.txt
Exported 6.35.1_text.txt
Exported 6.35.2_text.txt
Exported 6.35.3_text.txt
Exported 6.35.4_text.txt
Exported 6.35.5_text.txt
Exported 6.35.6_text.txt
Exported 6.35.7_text.txt
Exported 6.35.8_text.txt
Exported 6.35.9_text.txt
Exported 6.35.10_text.txt
Exported 6.35.11_text.txt
Exported 6.36.1_text.txt
Exported 6.37.1_text.txt
Exported 6.38.2_text.txt
Exported 6.38.3_text.txt
Exported 6.39.2_text.txt
Exported 6.39.3_text.txt
Exported 6.39.4_text.txt
Exported 6.39.5_text.txt
Exported 6.39.6_text.txt
Exported 6.39.7_text.txt
Exported 6.39.8_text.txt
Exported 6.39.9_text.txt
Exported 6.39.10_text.txt
Exported 6.39.11_text.txt
Exported 7.2.1_text.txt
Exported 7.2.2_text.txt
Exported 7.2.3_text.txt
Exported 7.2.4_text.txt
Exported 7.2.5_text.txt
Exported 7.2.6_text.txt
Exported 7.2.7_text.txt
Exported 7.3.1_text.txt
Exported 7.4.1_text.txt
Exported 7.5.1_text.txt
Export

Exported 13.10.1_text.txt
Exported 13.11.1_text.txt
Exported 13.12.1_text.txt
Exported 13.13.1_text.txt
Exported 13.14.1_text.txt
Exported 13.15.1_text.txt
Exported 13.16.1_text.txt
Exported 13.17.1_text.txt
Exported 13.19.1_text.txt
Exported 13.20.1_text.txt
Exported 13.21.1_text.txt
Exported 13.22.1_text.txt
Exported 13.22.2_text.txt
Exported 13.23.1_text.txt
Exported 13.23.2_text.txt
Exported 13.27.1_text.txt
Exported 13.28.1_text.txt
Exported 13.29.1_text.txt
Exported 13.30.3_text.txt
Exported 13.32.1_text.txt
Exported 13.32.2_text.txt
Exported 13.33.1_text.txt
Exported 13.34.1_text.txt
Exported 13.35.1_text.txt
Exported 13.36.1_text.txt
Exported 13.37.1_text.txt
Exported 13.38.1_text.txt
Exported 13.43.2_text.txt
Exported 13.44.1_text.txt
Exported 13.45.1_text.txt
Exported 13.46.1_text.txt
Exported 13.47.2_text.txt
Exported 13.48.1_text.txt
Exported 13.49.1_text.txt
Exported 13.50.1_text.txt
Exported 13.51.1_text.txt
Exported 14.1.1_text.txt
Exported 14.2.1_text.txt
Exported 14.2.

Exported 22.48.1_text.txt
Exported 22.56.1_text.txt
Exported 22.82.1_text.txt
Exported 23.5.1_text.txt
Exported 23.20.1_text.txt
Exported 23.21.1_text.txt
Exported 23.49.1_text.txt
Exported 23.50.1_text.txt
Exported 23.52.1_text.txt
Exported 23.54.1_text.txt
Exported 23.62.1_text.txt
Exported 23.70.1_text.txt
Exported 23.80.1_text.txt
Exported 24.1.1_text.txt
Exported 24.1.2_text.txt
Exported 24.2.1_text.txt
Exported 24.5.1_text.txt
Exported 24.19.1_text.txt
Exported 24.22.1_text.txt
Exported 24.25.1_text.txt
Exported 24.32.1_text.txt
Exported 24.37.1_text.txt
Exported 24.41.1_text.txt
Exported 24.42.1_text.txt
Exported 24.50.1_text.txt
Exported 24.51.1_text.txt
Exported 24.52.1_text.txt
Exported 24.67.1_text.txt
Exported 24.69.1_text.txt
Exported 24.75.1_text.txt
Exported 24.91.1_text.txt
Exported 24.94.1_text.txt
Exported 24.95.1_text.txt
Exported 24.96.1_text.txt
Exported 24.102.1_text.txt
Exported 24.102.2_text.txt
Exported 24.102.3_text.txt
Exported 24.102.4_text.txt
Exported 24.1

Exported 35.36.6_text.txt
Exported 35.36.9_text.txt
Exported 35.36.10_text.txt
Exported 35.36.11_text.txt
Exported 35.36.13_text.txt
Exported 35.36.14_text.txt
Exported 35.36.15_text.txt
Exported 35.36.17_text.txt
Exported 35.36.20_text.txt
Exported 35.37.1_text.txt
Exported 35.37.2_text.txt
Exported 35.37.6_text.txt
Exported 35.40.1_text.txt
Exported 35.40.2_text.txt
Exported 35.40.3_text.txt
Exported 35.40.5_text.txt
Exported 35.40.6_text.txt
Exported 35.40.8_text.txt
Exported 35.40.9_text.txt
Exported 35.40.12_text.txt
Exported 35.40.13_text.txt
Exported 35.42.1_text.txt
Exported 35.43.1_text.txt
Exported 35.45.1_text.txt
Exported 35.46.1_text.txt
Exported 35.47.1_text.txt
Exported 35.49.2_text.txt
Exported 35.50.1_text.txt
Exported 35.51.1_text.txt
Exported 35.51.2_text.txt
Exported 35.52.1_text.txt
Exported 35.52.2_text.txt
Exported 35.53.1_text.txt
Exported 35.54.1_text.txt
Exported 35.56.1_text.txt
Exported 35.57.1_text.txt
Exported 35.58.1_text.txt
Exported 35.59.1_text.txt
Exp

In [42]:
geotext_whole['Lat'] = geotext_whole['Lat'].astype(float)
geotext_whole['Long'] = geotext_whole['Long'].astype(float)

In [46]:
# Define the latitude and longitude ranges for modern India
lat_range = (8.4, 37.6)
long_range = (68.7, 97.4)

# Create a boolean mask for filtering
mask = (geotext_whole['Lat'].between(*lat_range)) & (geotext_whole['Long'].between(*long_range))

# Apply the mask to filter the dataframe
geotext_india = geotext_whole[mask]

geotext_india.head()

Unnamed: 0,UUID4,ToposText_ID,Place_Name,Reference,Lat,Long,Chapter&Paragraph,Text
85,fcbecd9d-b21e-478e-a59e-05f0470f01ef,https://topostext.org/place/300740RInd,India,urn:cts:latinLit:phi0978.phi001:2.75.1,30.0,74.0,2.75.1,Similarly it is reported that at the town of S...
92,296005a0-14a3-40cb-b5eb-d131859d1173,https://topostext.org/place/300740RInd,India,urn:cts:latinLit:phi0978.phi001:2.75.1,30.0,74.0,2.75.1,Similarly it is reported that at the town of S...
93,58fab8f7-f07c-4dc3-b37e-6a41a0aeef05,https://topostext.org/place/300740RInd,India,urn:cts:latinLit:phi0978.phi001:2.75.1,30.0,74.0,2.75.1,Similarly it is reported that at the town of S...
343,bbce0db4-8ad8-4e3b-bb5b-de56bb1b82ac,https://topostext.org/place/300740RInd,India,urn:cts:latinLit:phi0978.phi001:2.112.1,30.0,74.0,2.112.1,"Our own portion of the earth, which is my subj..."
348,a795f697-1463-4ee7-98c2-b946950e30ec,https://topostext.org/place/38898WGan,Ganges,urn:cts:latinLit:phi0978.phi001:2.112.1,23.7818,89.8009,2.112.1,"Our own portion of the earth, which is my subj..."


In [47]:
geotext_india.loc[indiantext['Place_Name']=='Acesinus']

Unnamed: 0,UUID4,ToposText_ID,Place_Name,Reference,Lat,Long,Chapter&Paragraph,Text
2538,5aa13532-ccf4-4654-822d-42c46e9441b9,https://topostext.org/place/293710WAke,Acesinus,urn:cts:latinLit:phi0978.phi001:4.26.2,29.1513,70.7367,4.26.2,"At this spot begins a well-wooded district, wh..."
4178,e6828b84-374f-479f-9a82-df458a87560b,https://topostext.org/place/293710WAke,Acesinus,urn:cts:latinLit:phi0978.phi001:6.23.1,29.1513,70.7367,6.23.1,"THE INDUS: The Indus, called Sindis by the nat..."


In [48]:
geotext_india.shape

(194, 8)

In [49]:
geotext_india.to_csv('geotext_indianregion.csv')

In [50]:
india_corpus = geotext_india[['Chapter&Paragraph', 'Text']].drop_duplicates(subset='Text').reset_index(drop=True)
india_corpus['Text'] = india_corpus['Text'].str.lower()
india_corpus

Unnamed: 0,Chapter&Paragraph,Text
0,2.75.1,similarly it is reported that at the town of s...
1,2.112.1,"our own portion of the earth, which is my subj..."
2,4.17.4,"such is macedonia, which was once the mistress..."
3,4.26.2,"at this spot begins a well-wooded district, wh..."
4,5.11.1,"the cities of egypt: egypt, besides its boast ..."
...,...,...
127,37.62.1,"the 'lepidotis,' or 'scaly stone,' mimics fish..."
128,37.63.1,"no description of the 'memnonia,' or 'stone of..."
129,37.65.1,the stone that bears the foreign name 'oica' i...
130,37.76.1,"i, on the other hand, am prepared to explain t..."


In [53]:
for _, row in india_corpus.iterrows():
    chapter_paragraph = row['Chapter&Paragraph']
    text = row['Text']
    filename = f"{chapter_paragraph}_text.txt"
    
    with open(filename, 'w', encoding = 'latin-1') as file:
        file.write(text)
    
    print(f"Exported {filename}")

Exported 2.75.1_text.txt
Exported 2.112.1_text.txt
Exported 4.17.4_text.txt
Exported 4.26.2_text.txt
Exported 5.11.1_text.txt
Exported 6.17.2_text.txt
Exported 6.19.1_text.txt
Exported 6.20.2_text.txt
Exported 6.21.1_text.txt
Exported 6.21.2_text.txt
Exported 6.21.3_text.txt
Exported 6.21.4_text.txt
Exported 6.21.5_text.txt
Exported 6.22.1_text.txt
Exported 6.22.2_text.txt
Exported 6.22.3_text.txt
Exported 6.22.4_text.txt
Exported 6.23.1_text.txt
Exported 6.23.2_text.txt
Exported 6.23.3_text.txt
Exported 6.23.5_text.txt
Exported 6.24.1_text.txt
Exported 6.24.3_text.txt
Exported 6.25.3_text.txt
Exported 6.26.1_text.txt
Exported 6.26.4_text.txt
Exported 6.26.5_text.txt
Exported 6.26.7_text.txt
Exported 6.28.3_text.txt
Exported 6.32.3_text.txt
Exported 6.34.5_text.txt
Exported 6.39.2_text.txt
Exported 6.39.3_text.txt
Exported 6.39.4_text.txt
Exported 6.39.5_text.txt
Exported 7.2.6_text.txt
Exported 8.2.1_text.txt
Exported 8.4.1_text.txt
Exported 8.8.1_text.txt
Exported 8.11.1_text.txt
Exp

In [45]:
distinct_places = indiantext['Place_Name'].unique()
distinct_places

array(['India', 'Ganges', 'Acesinus', 'Hydaspes', 'Muziris', 'Baragaza'],
      dtype=object)

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")

In [None]:
# changed to "device = cpu" for not having a discrete GPU 
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="first", device="cpu")

In [None]:
example = "Juba, however, gives another account; he says that there is a city on Mount Megatichos, which lies between Egypt and Ethiopia, by the Arabians known as Myrson, after which come Tacompsos, Aramus, Sesamos, Pide, Mamuda, Orambis, situate near a stream of bitumen, Amodita, Prosda, Parenta, Mama, Tesatta, Gallas, Zoton, Graucome, Emeus, the Pidibotae, the Hebdomecontacometae, Nomades, who dwell in tents, Cyste, Macadagale, Proaprimis, Nups, Detrelis, Patis, the Ganbreves, the Magasnei, Segasmala, Crandala, Denna, Cadeuma, Thena, Batta, Alana, Mascoa, the Scamini, Hora, situate on an island, and then Abala, Androgalis, Sesecre, the Malli, and Agole."

ner_results = ner(example)
print(ner_results)