# DD2417 Final Project - Dating Historical Texts

## Libraries + Imports

In [1]:
import os 
import random 
import re 
import string 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt


## Data

In [2]:
# locations of the raw input texts and of the cleaned output texts
raw_dataset_path = './Datasets/raw_data'
clean_dataset_path = './Datasets/clean_data'

# the cleaned-data directory is created on first run only
if not os.path.exists(clean_dataset_path):
    print("create clean data directory")
    os.makedirs(clean_dataset_path)
    

### Data Preprocessing

In [3]:
def clean_text(text):
    """Strip boilerplate markers, years and extra whitespace from a text.

    Steps, in order:
      1. drop everything up to and including the first "*** START OF ... ***"
         marker (case-insensitive), if one is present;
      2. drop everything from the first "*** END OF ... ***" marker onward,
         if one is present;
      3. delete standalone four-digit numbers beginning with 1 (1000-1999);
      4. collapse every run of whitespace into a single space.

    Returns the resulting string with leading/trailing whitespace removed.
    """
    marker_flags = re.IGNORECASE | re.DOTALL

    # cut the header: keep only what follows the start marker
    header = re.search(r"\*\*\* START OF.*?\*\*\*", text, marker_flags)
    if header is not None:
        text = text[header.end():]

    # cut the footer: keep only what precedes the end marker
    footer = re.search(r'\*\*\* END OF.*?\*\*\*', text, marker_flags)
    if footer is not None:
        text = text[:footer.start()]

    # years are deleted before whitespace is collapsed, so the gap they
    # leave behind is squeezed into a single space
    without_years = re.sub(r'\b1[0-9]{3}\b', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_years)
    return single_spaced.strip()

In [4]:
def preprocess_text(dataset_path, year):
    """Clean every ``.txt`` file of one year folder and save the results.

    Files are read from ``<dataset_path>/<year>/``, passed through
    ``clean_text`` and written under the same file name to
    ``<clean_dataset_path>/<year>/`` (created if it does not exist yet).
    Progress messages are printed for every processed file.
    """
    print("preprocess text")
    print(f"directory year: {year}")

    source_dir = dataset_path + "/" + str(year) + "/"
    target_dir = f'{clean_dataset_path}/{year}/'
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    for name in os.listdir(source_dir):
        # only plain-text files are processed
        if not name.endswith('.txt'):
            continue
        print(f"file name: {name}")

        # load the raw text
        with open(source_dir + name, 'r', encoding='utf-8') as src:
            raw_text = src.read()
            print("read file sucessfully")

        cleaned_text = clean_text(raw_text)
        print("text cleaned succesfully")

        # store the cleaned text under the same file name
        out_file = target_dir + name
        with open(out_file, 'w', encoding='utf-8') as dst:
            dst.write(cleaned_text)
            print(f"cleaned text succesfully saved to {out_file}")
            print()

### 1700s Data

In [5]:
# clean the raw texts of every decade folder from 1700 through 1790
years = list(range(1700, 1800, 10))
for year in years:
    preprocess_text(raw_dataset_path, year)


preprocess text
directory year: 1700
file name: 1700_5.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1700/1700_5.txt

file name: 1700_4.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1700/1700_4.txt

file name: 1700_1.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1700/1700_1.txt

file name: 1700_3.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1700/1700_3.txt

file name: 1700_2.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1700/1700_2.txt

preprocess text
directory year: 1710
file name: 1710_4.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1710/1710_4.txt

file name: 1710_5.txt
read file sucessfully
text cleaned succesfully
clean

### 1800s Data

In [6]:
# clean the raw texts of every decade folder from 1800 through 1890
years = list(range(1800, 1900, 10))
for year in years:
    preprocess_text(raw_dataset_path, year)

preprocess text
directory year: 1800
file name: 1800_2.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1800/1800_2.txt

file name: 1800_3.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1800/1800_3.txt

file name: 1800_1.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1800/1800_1.txt

file name: 1800_4.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1800/1800_4.txt

file name: 1800_5.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1800/1800_5.txt

file name: 1800_6.txt
read file sucessfully
text cleaned succesfully
cleaned text succesfully saved to ./Datasets/clean_data/1800/1800_6.txt

preprocess text
directory year: 1810
file name: 1810_1.txt
read file sucessfully
text cleaned succesfully
clean

### Book and Period Information

In [10]:
def dataset_info():
    """Collect one metadata record per cleaned ``.txt`` file.

    Looks for a folder per decade (1700, 1710, ..., 1890) below
    ``clean_dataset_path``; decades without a folder are skipped. File names
    are visited in sorted order within each decade.

    Returns a list of dicts with the keys 'decade', 'filename', 'filepath'
    and 'book_id' (the decade and the file name joined by an underscore).
    """
    records = []
    for decade in range(1700, 1900, 10):
        decade_path = f'{clean_dataset_path}/{decade}'
        if not os.path.exists(decade_path):
            continue
        for filename in sorted(os.listdir(decade_path)):
            # ignore anything that is not a plain-text book file
            if not filename.endswith('.txt'):
                continue
            records.append({
                'decade': decade,
                'filename': filename,
                'filepath': os.path.join(decade_path, filename),
                'book_id': f"{decade}_{filename}",
            })
    return records

In [11]:
def create_dataset_splits(book_data):
    """Split books into train/validation/test sets, decade by decade.

    Within each decade the books keep their input order: the last book goes
    to the test set, the second-to-last to the validation set, and all
    earlier books to the training set (so 4 books -> 2/1/1, 5 -> 3/1/1,
    6 -> 4/1/1). Decades with fewer than three books cannot fill all three
    splits and are skipped with a printed notice.

    Args:
        book_data: iterable of dicts, each with at least a 'decade' key.

    Returns:
        Tuple ``(train_books, valid_books, test_books)``; each element is a
        flat list of the book dicts taken from ``book_data``.
    """
    train_books, valid_books, test_books = [], [], []

    # group books by decade, preserving the order in which they were given
    books_by_decade = {}
    for book in book_data:
        books_by_decade.setdefault(book['decade'], []).append(book)
    print(f"{books_by_decade}")

    for decade, books in books_by_decade.items():
        if len(books) < 3:
            # not enough books to hold one out for validation and one for test
            print(f"skipping decade {decade}: only {len(books)} book(s)")
            continue
        # extend() with slices keeps every split a flat list of book dicts
        train_books.extend(books[:-2])
        valid_books.extend(books[-2:-1])
        test_books.extend(books[-1:])

    print(f"train data: {train_books}")
    print(f"valid data: {valid_books}")
    print(f"test data: {test_books}")

    return train_books, valid_books, test_books

In [12]:
# gather the metadata records of all cleaned books, then divide them into
# training, validation and test subsets
book_data = dataset_info()
train_data, valid_data, test_data = create_dataset_splits(book_data)

{1700: [{'decade': 1700, 'filename': '1700_1.txt', 'filepath': './Datasets/clean_data/1700/1700_1.txt', 'book_id': '1700_1700_1.txt'}, {'decade': 1700, 'filename': '1700_2.txt', 'filepath': './Datasets/clean_data/1700/1700_2.txt', 'book_id': '1700_1700_2.txt'}, {'decade': 1700, 'filename': '1700_3.txt', 'filepath': './Datasets/clean_data/1700/1700_3.txt', 'book_id': '1700_1700_3.txt'}, {'decade': 1700, 'filename': '1700_4.txt', 'filepath': './Datasets/clean_data/1700/1700_4.txt', 'book_id': '1700_1700_4.txt'}, {'decade': 1700, 'filename': '1700_5.txt', 'filepath': './Datasets/clean_data/1700/1700_5.txt', 'book_id': '1700_1700_5.txt'}], 1710: [{'decade': 1710, 'filename': '1710_1.txt', 'filepath': './Datasets/clean_data/1710/1710_1.txt', 'book_id': '1710_1710_1.txt'}, {'decade': 1710, 'filename': '1710_2.txt', 'filepath': './Datasets/clean_data/1710/1710_2.txt', 'book_id': '1710_1710_2.txt'}, {'decade': 1710, 'filename': '1710_3.txt', 'filepath': './Datasets/clean_data/1710/1710_3.txt',

## 