## Project Setup

In [None]:
import os
raw_dataset_path = './Datasets/raw_data/'

['1890', '1830', '1800', '1720', '.DS_Store', '1780', '1710', '1750', '1760', '1840', '1870', '1860', '1850', '1770', '1740', '1700', '1730', '1790', '1810', '1880', '1820']


## Data Preprocessing

In [4]:
# create new directory to store cleaned texts
directory='./Datasets/cleaned_data'
if not os.path.exists(directory):
  os.makedirs('./Datasets/cleaned_data')

In [5]:
import re 

def clean_text(text):
  # remove everything before start
  start_match = re.search(r'\*\*\* START OF.*?\*\*\*', text, re.IGNORECASE | re.DOTALL)
  if start_match:
    text = text[start_match.end():]

  # remove everything after end
  end_match = re.search(r'\*\*\* END OF.*?\*\*\*', text, re.IGNORECASE | re.DOTALL)
  if end_match:
    text = text[:end_match.start()]

  # remove years 1000's
  text = re.sub(r'\b1[0-9]{3}\b', '', text)

  # remove whitespace
  text = re.sub(r'\s+', ' ', text)

  return text.strip()


In [6]:
years = [1700, 1710, 1720, 1730, 1740, 1750, 1760, 1770, 1780, 1790]

In [10]:
import string

def preprocess_text(dataset_path, year):
  print(f"preprocess text")
  print(f"directory year: {year}")
  decade_path = dataset_path + str(year) + "/"
  cleaned_data_path = f'./Datasets/cleaned_data/{year}/'
  if not os.path.exists(cleaned_data_path):
    os.makedirs(cleaned_data_path)
  # process and clean each text file
  text_list = os.listdir(decade_path)
  for file in text_list:
    if file.endswith('.txt'):
      print(f"file name: {file}")

      # read file
      with open(decade_path + file, 'r', encoding='utf-8') as f:
        raw_text = f.read()

      cleaned_text = clean_text(raw_text)

      # save file
      out_file = cleaned_data_path + file
      with open(out_file, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)
        print(f"cleaned text and save to -> {out_file}")
        print()

### 1700's Texts

In [11]:
years = [1700, 1710, 1720, 1730, 1740, 1750, 1760, 1770, 1780, 1790]
# test_year = years[0]
# preprocess_text(raw_dataset_path, test_year)
for year in years:
  preprocess_text(raw_dataset_path, year)


preprocess text
directory year: 1700
file name: 1700_5.txt
cleaned text and save to -> ./Datasets/cleaned_data/1700/1700_5.txt

file name: 1700_4.txt
cleaned text and save to -> ./Datasets/cleaned_data/1700/1700_4.txt

file name: 1700_1.txt
cleaned text and save to -> ./Datasets/cleaned_data/1700/1700_1.txt

file name: 1700_3.txt
cleaned text and save to -> ./Datasets/cleaned_data/1700/1700_3.txt

file name: 1700_2.txt
cleaned text and save to -> ./Datasets/cleaned_data/1700/1700_2.txt

preprocess text
directory year: 1710
file name: 1710_4.txt
cleaned text and save to -> ./Datasets/cleaned_data/1710/1710_4.txt

file name: 1710_5.txt
cleaned text and save to -> ./Datasets/cleaned_data/1710/1710_5.txt

file name: 1710_2.txt
cleaned text and save to -> ./Datasets/cleaned_data/1710/1710_2.txt

file name: 1710_3.txt
cleaned text and save to -> ./Datasets/cleaned_data/1710/1710_3.txt

file name: 1710_1.txt
cleaned text and save to -> ./Datasets/cleaned_data/1710/1710_1.txt

preprocess text


### 1800's Texts

In [12]:
years = [1800, 1810, 1820, 1830, 1840, 1850, 1860, 1870, 1880, 1890]
for year in years:
  preprocess_text(raw_dataset_path, year)

preprocess text
directory year: 1800
file name: 1800_2.txt
cleaned text and save to -> ./Datasets/cleaned_data/1800/1800_2.txt

file name: 1800_3.txt
cleaned text and save to -> ./Datasets/cleaned_data/1800/1800_3.txt

file name: 1800_1.txt
cleaned text and save to -> ./Datasets/cleaned_data/1800/1800_1.txt

file name: 1800_4.txt
cleaned text and save to -> ./Datasets/cleaned_data/1800/1800_4.txt

file name: 1800_5.txt
cleaned text and save to -> ./Datasets/cleaned_data/1800/1800_5.txt

file name: 1800_6.txt
cleaned text and save to -> ./Datasets/cleaned_data/1800/1800_6.txt

preprocess text
directory year: 1810
file name: 1810_1.txt
cleaned text and save to -> ./Datasets/cleaned_data/1810/1810_1.txt

file name: 1810_3.txt
cleaned text and save to -> ./Datasets/cleaned_data/1810/1810_3.txt

file name: 1810_2.txt
cleaned text and save to -> ./Datasets/cleaned_data/1810/1810_2.txt

file name: 1810_5.txt
cleaned text and save to -> ./Datasets/cleaned_data/1810/1810_5.txt

file name: 1810_