# DD2417 Final Project - Dating Historical Texts

## Libraries + Imports

In [1]:
import os 
import csv 
import random 
import re 
import string 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

# seed all experiments and setup 
random.seed(42)

## Data

In [2]:
# paths
raw_dataset_path = './Datasets/raw_data'
clean_dataset_path = './Datasets/clean_data'
model_dataset_path = './Datasets/model_data'

In [3]:
# count all the data files in the raw data file 
print(f"count the number of books in each decade directory in the raw data")
total_books = 0
for decade in range(1700, 1900, 10):
    decade_path = f"{raw_dataset_path}/{decade}"
    if os.path.exists(decade_path):
        text_files = [f for f in os.listdir(decade_path) if f.endswith(".txt")]
        print(f"{decade}: {len(text_files)} books")
        total_books += len(text_files)
print(f"total number of books for project: {total_books}")

count the number of books in each decade directory in the raw data
1700: 5 books
1710: 5 books
1720: 4 books
1730: 4 books
1740: 5 books
1750: 5 books
1760: 5 books
1770: 5 books
1780: 5 books
1790: 4 books
1800: 6 books
1810: 5 books
1820: 6 books
1830: 6 books
1840: 6 books
1850: 6 books
1860: 5 books
1870: 5 books
1880: 5 books
1890: 5 books
total number of books for project: 102


In [4]:
# get all the titles of the books
def get_book_titles():
    book_titles = {}
    for year in range(1700, 1900, 10):
        decade_path = f"{raw_dataset_path}/{year}"
        book_titles[year] = []

        print(f"decade: {year}")
        text_files = sorted([f for f in os.listdir(decade_path) if f.endswith('.txt')])
        for filename in text_files:
            file_path = os.path.join(decade_path, filename)
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            title_match = re.search(r"^Title:\s*(.+)$", text, re.MULTILINE)
            book_title = title_match.group(1).strip()
            print(f"book_title: {book_title}")
            book_titles[year].append(book_title)
        print(f"number of titles in decade: {year} -> {len(book_titles[year])}")
        print()

    return book_titles 

In [5]:
# remove all old data files
if os.path.exists(clean_dataset_path):
    print(f"clean up - previous clean_data_files")
    directories = os.listdir(clean_dataset_path)
    directories.sort()
    for dir in directories:
        decade_path = os.path.join(clean_dataset_path, dir)
        if os.path.isdir(decade_path):
            text_files = os.listdir(decade_path)
            text_files.sort()
            for file in text_files:
                if file.endswith(".txt"):
                    file_path = os.path.join(decade_path, file)
                    os.remove(file_path)
                    print(f"succesfully removed {file}")
            os.rmdir(decade_path)
            print(f"successfully removed directory {dir}")
            print()
    os.rmdir(clean_dataset_path)
    print(f"succesfully removed {clean_dataset_path}")
    print()

# create new data files
if os.path.exists(model_dataset_path):
    print(f"clean up - previous model data")
    data_files = [f for f in os.listdir(model_dataset_path) if f.endswith(".csv")]
    for file in data_files:
        file_path = os.path.join(model_dataset_path, file)
        os.remove(file_path)
        print(f"succesfully removed {file}")
    os.rmdir(model_dataset_path)
    print(f"succesfully removed {model_dataset_path}")
    print()

clean up - previous clean_data_files
succesfully removed 1700_1.txt
succesfully removed 1700_2.txt
succesfully removed 1700_3.txt
succesfully removed 1700_4.txt
succesfully removed 1700_5.txt
successfully removed directory 1700

succesfully removed 1710_1.txt
succesfully removed 1710_2.txt
succesfully removed 1710_3.txt
succesfully removed 1710_4.txt
succesfully removed 1710_5.txt
successfully removed directory 1710

succesfully removed 1720_1.txt
succesfully removed 1720_2.txt
succesfully removed 1720_3.txt
succesfully removed 1720_4.txt
successfully removed directory 1720

succesfully removed 1730_1.txt
succesfully removed 1730_2.txt
succesfully removed 1730_3.txt
succesfully removed 1730_4.txt
successfully removed directory 1730

succesfully removed 1740_1.txt
succesfully removed 1740_2.txt
succesfully removed 1740_3.txt
succesfully removed 1740_4.txt
succesfully removed 1740_5.txt
successfully removed directory 1740

succesfully removed 1750_1.txt
succesfully removed 1750_2.txt
suc

In [6]:
if not os.path.exists(clean_dataset_path):
    print(f"create clean data directory")
    os.makedirs(clean_dataset_path)

if not os.path.exists(model_dataset_path):
    print(f"create model data directory for storing data for building models")
    os.makedirs(model_dataset_path)

create clean data directory
create model data directory for storing data for building models


### Data-Preprocessing

In [7]:
def clean_text(text):
    # remove everything up to and including start
    start_match = re.search(
        r"\*\*\* START OF.*?\*\*\*", text, re.IGNORECASE | re.DOTALL
    )
    if start_match:
        text = text[start_match.end():]
    
    # remove everything after end 
    end_match = re.search(
        r'\*\*\* END OF.*?\*\*\*', text, re.IGNORECASE | re.DOTALL
    )
    if end_match:
        text = text[:end_match.start()]
    
    # remove years 
    text = re.sub(r'\b1[0-9]{3}\b', '', text)

    # remove whitespace
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [8]:
def preprocess_text(dataset_path, year):
    print(f"preprocess text")
    print(f"directory year: {year}")
    decade_path = dataset_path + "/" + str(year) + "/"
    cleaned_data_path = f'{clean_dataset_path}/{year}/'
    if not os.path.exists(cleaned_data_path):
        os.makedirs(cleaned_data_path)
    text_list = os.listdir(decade_path)
    # print(f"text files: {text_list}")
    for text_file in text_list:
        if text_file.endswith('.txt'):
            print(f"file name: {text_file}")
            # read file 
            with open(decade_path + text_file, 'r', encoding='utf-8') as f:
                raw_text = f.read()
                print(f"read file sucessfully")

            cleaned_text = clean_text(raw_text)
            print(f"text cleaned succesfully")

            # save file 
            out_file = cleaned_data_path + text_file
            with open(out_file, 'w', encoding='utf-8') as f:
                f.write(cleaned_text)
                print(f"cleaned text succesfully saved to {out_file}")
                print()

### 1700s Data

In [9]:
years = [1700, 1710, 1720, 1730, 1740, 1750, 1760, 1770, 1780, 1790]
for year in years:
    preprocess_text(raw_dataset_path, year)


preprocess text
directory year: 1700
file name: 1700_5.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1700/1700_5.txt

file name: 1700_4.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1700/1700_4.txt

file name: 1700_1.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1700/1700_1.txt

file name: 1700_3.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1700/1700_3.txt

file name: 1700_2.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1700/1700_2.txt

preprocess text
directory year: 1710
file name: 1710_4.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1710/1710_4.txt

file name: 1710_5.txt
read file sucessfully
text cleaned succesfully
clean

### 1800's Data

In [10]:
years = [1800, 1810, 1820, 1830, 1840, 1850, 1860, 1870, 1880, 1890]
for year in years:
    preprocess_text(raw_dataset_path, year)

preprocess text
directory year: 1800
file name: 1800_2.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1800/1800_2.txt

file name: 1800_3.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1800/1800_3.txt

file name: 1800_1.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1800/1800_1.txt

file name: 1800_4.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1800/1800_4.txt

file name: 1800_5.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1800/1800_5.txt

file name: 1800_6.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1800/1800_6.txt

preprocess text
directory year: 1810
file name: 1810_1.txt
read file sucessfully
text cleaned succesfully
clean

### Dataset Splits - Train, Validation, Test splits

In [11]:
def dataset_info():
    years = [i for i in range(1700, 1900, 10)]
    book_titles = get_book_titles()

    book_data = [] 
    for decade in years:
        decade_path = f'{clean_dataset_path}/{decade}'
        if os.path.exists(decade_path):
            text_files = sorted(
                [f for f in os.listdir(decade_path) if f.endswith(".txt")]
            )
            for index, filename in enumerate(text_files):
                if decade in book_titles and index < len(book_titles[decade]):
                    book_title = book_titles[decade][index]
                else: 
                    book_title = f"unknown_book_{index + 1}"
                book_info = {
                'decade': decade,
                'filename': filename,
                'book_title': book_title,
                'filepath': os.path.join(decade_path, filename),
                'book_id': f"{decade}_{book_title[:20].replace(' ', '_')}" 
            }
                book_data.append(book_info)
    return book_data

In [12]:
def create_dataset_splits(book_data):
    train_books, valid_books, test_books = [], [], []

    # group books by decade 
    books_by_decade = {}
    for book in book_data:
        decade = book['decade']
        if decade not in books_by_decade:
            books_by_decade[decade] = []
        books_by_decade[decade].append(book)
    
    for decade, books in books_by_decade.items():
        num_books = len(books)
        if num_books == 4:
            train_books.extend(books[:2])
            valid_books.extend(books[2:3])
            test_books.extend(books[3:4])

        elif num_books == 5:
            train_books.extend(books[:3])
            valid_books.extend(books[3:4])
            test_books.extend(books[4:5])

        elif num_books == 6:
            train_books.extend(books[:4])
            valid_books.extend(books[4:5])
            test_books.extend(books[5:6])
    
    print(f"train data: {train_books}")
    print(f"valid data: {valid_books}")
    print(f"test data: {test_books}")
    print()

    return train_books, valid_books, test_books

In [13]:
book_data = dataset_info()
train_data, valid_data, test_data = create_dataset_splits(book_data)

decade: 1700
book_title: The Battle of the Books, and other Short Pieces
book_title: The Way of the World
book_title: A Tale of a Tub
book_title: An Essay Towards a New Theory of Vision
book_title: The Tatler, Volume 1
number of titles in decade: 1700 -> 5

decade: 1710
book_title: The Spectator, Volume 1
book_title: An Essay on Criticism
book_title: The Rape of the Lock, and Other Poems
book_title: The Journal to Stella
book_title: Three Dialogues Between Hylas and Philonous in Opposition to Sceptics and Atheists
number of titles in decade: 1710 -> 5

decade: 1720
book_title: Robinson Crusoe
book_title: Gulliver's Travels
book_title: The Beggar's Opera
book_title: The Fable of the Bees; Or, Private Vices, Public Benefits
number of titles in decade: 1720 -> 4

decade: 1730
book_title: A Discourse Concerning Ridicule and Irony in Writing (1729)
book_title: A Letter to Dion
book_title: An Essay on Man; Moral Essays and Satires
book_title: A Treatise of Human Nature
number of titles in de

### Random Sampling Passages

In [14]:
def random_sampling_passages(book_list, num_passages=5, passage_length=1500):
    passages_data = []
    for book in book_list:
        with open(book['filepath'], 'r', encoding='utf-8') as f:
            text = f.read()
        
        # randomly sample passages 
        for i in range(num_passages):
            max_start = len(text) - passage_length
            start_pos = random.randint(0, max_start)
            passage = text[start_pos:start_pos + passage_length]
            passage_info = {
                'text': passage,
                'decade': book['decade'],
                'decade_id': (book['decade'] - 1700) // 10,
                'book_title': book['book_title'],
                'book_id': book['book_id'],
                'passage_id': f"{book['book_id']}_passage_{i}"
            }
            passages_data.append(passage_info)
    
    return passages_data

In [15]:
def write_data_to_csv(passages_data, filepath):
    with open(filepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)

        # write header
        writer.writerow(['text', 
                         'decade',
                         'book_title', 
                         'passage_id',
                        'decade_id', 
                         'book_id'])

        for passage in passages_data:
            writer.writerow(
                [
                    passage['text'].replace("\n", " ").replace("\r", " "),
                    passage['decade'],
                    passage['book_title'],
                    passage['passage_id'],
                    passage['decade_id'],
                    passage['book_id']
                ]
            )
        print(f"Save {len(passages_data)} passages to {filepath}")

In [16]:
print(f"get book data and create splits")
book_data = dataset_info()
train_books, validation_books, test_books = create_dataset_splits(book_data)
print(f"Book splits: {len(train_books)} -> training_data, {len(validation_books)} -> validation_data, {len(test_books)} -> test_data")
print()
print(f"Randomly sample passages for creating datasets")
train_passages = random_sampling_passages(train_books, num_passages=20, passage_length=1500)
validation_passages = random_sampling_passages(validation_books, num_passages=15, passage_length=1500)  
test_passages = random_sampling_passages(test_books, num_passages=10, passage_length=1500)
print(f"Passage counts: {len(train_passages)} -> training_passages, {len(validation_passages)} -> validation_passages, {len(test_passages)} -> test_passages")
print()
print(f"write data to csv files")
write_data_to_csv(train_passages, f"{model_dataset_path}/train_passages.csv")
write_data_to_csv(validation_passages, f"{model_dataset_path}/validation_passages.csv")
write_data_to_csv(test_passages, f"{model_dataset_path}/test_passages.csv")

get book data and create splits
decade: 1700
book_title: The Battle of the Books, and other Short Pieces
book_title: The Way of the World
book_title: A Tale of a Tub
book_title: An Essay Towards a New Theory of Vision
book_title: The Tatler, Volume 1
number of titles in decade: 1700 -> 5

decade: 1710
book_title: The Spectator, Volume 1
book_title: An Essay on Criticism
book_title: The Rape of the Lock, and Other Poems
book_title: The Journal to Stella
book_title: Three Dialogues Between Hylas and Philonous in Opposition to Sceptics and Atheists
number of titles in decade: 1710 -> 5

decade: 1720
book_title: Robinson Crusoe
book_title: Gulliver's Travels
book_title: The Beggar's Opera
book_title: The Fable of the Bees; Or, Private Vices, Public Benefits
number of titles in decade: 1720 -> 4

decade: 1730
book_title: A Discourse Concerning Ridicule and Irony in Writing (1729)
book_title: A Letter to Dion
book_title: An Essay on Man; Moral Essays and Satires
book_title: A Treatise of Hum

## 