In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [4]:
# Load the volumes table that the scraping cell saves under this file name.
volumes_csv = 'web_scraped_humanist_listserv_volumes.csv'
df = pd.read_csv(volumes_csv)
# Bare trailing expression so the notebook renders the frame as a table.
df

Unnamed: 0,volume_text,volume_link,volume_dates,volume_number,inferred_start_year,inferred_end_year
0,From: MCCARTY@UTOREPAS\nSubject: \nDate: 12 Ma...,https://humanist.kdl.kcl.ac.uk/Archives/Conver...,1987-1988,1,1987,1988
1,From: Sebastian Rahtz \nSubject: C++ and Gnu o...,https://humanist.kdl.kcl.ac.uk/Archives/Conver...,1988-1989,2,1988,1989
2,From: Willard McCarty \nSubject: Happy Birthda...,https://humanist.kdl.kcl.ac.uk/Archives/Conver...,1989-1990,3,1989,1990
3,From: Elaine Brennan & Allen Renear \nSubject:...,https://humanist.kdl.kcl.ac.uk/Archives/Conver...,1990-1991,4,1990,1991
4,From: Elaine Brennan & Allen Renear \nSubject:...,https://humanist.kdl.kcl.ac.uk/Archives/Conver...,1991-1992,5,1991,1992
5,From: Elaine M Brennan \nSubject: Humanist's B...,https://humanist.kdl.kcl.ac.uk/Archives/Conver...,1992-1993,6,1992,1993
6,From: 6500card%ucsbuxa@hub.ucsb.edu (Cheryl A....,https://humanist.kdl.kcl.ac.uk/Archives/Conver...,1993-1994,7,1993,1994
7,From: Andrew Burday \nSubject: Re: 7.0638 Qs: ...,https://humanist.kdl.kcl.ac.uk/Archives/Conver...,1994-1995,8,1994,1995
8,"From: ""Gregory Bloomquist"" \nSubject: Round Ta...",https://humanist.kdl.kcl.ac.uk/Archives/Conver...,1995-1996,9,1995,1996
9,From: Humanist \nSubject: Humanist begins its ...,https://humanist.kdl.kcl.ac.uk/Archives/Conver...,1996-1997,10,1996,1997


In [5]:
pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (from nltk)
  Using cached joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2023.12.25-cp312-cp312-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ------------------- -------------------- 20.5/42.0 kB ? eta -:--:--
     ------------------- -------------------- 20.5/42.0 kB ? eta -:--:--
     -------------------------------------  41.0/42.0 kB 393.8 kB/s eta 0:00:01
     -------------------------------------- 42.0/42.0 kB 288.5 kB/s eta 0:00:00
Collecting tqdm (from nltk)
  Downloading tqdm-4.66.2-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.6 kB ? eta -:--:--
     -------------

In [1]:
 # subset to relevant urls
humanist_urls = [
    "https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/",
    "https://humanist.kdl.kcl.ac.uk/Archives/Current/",
]
# Seconds to wait on the server before giving up; without a timeout a stalled
# connection would hang the cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 60

# One dict per scraped volume: volume_text, volume_link, volume_dates.
volume_dfs = []
# loop through each archive index page
for url in humanist_urls:
    print(f"Getting volumes from {url}")
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on an HTTP error instead of parsing an error page for links.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # href=True skips anchors that have no href attribute, which would
    # otherwise raise KeyError on link['href'].
    for link in soup.find_all('a', href=True):
        href = link['href']
        if not href.endswith('.txt'):
            continue
        # hrefs are treated as relative to the index page, as in the original code.
        volume_link = url + href
        print(f"Getting volume from {volume_link}")
        volume_response = requests.get(volume_link, timeout=REQUEST_TIMEOUT_SECONDS)
        # Do not store an HTTP error page as if it were the volume text.
        volume_response.raise_for_status()
        # NOTE(review): the .txt volumes are passed through the HTML parser, which
        # presumably drops anything shaped like a tag (e.g. <user@host> addresses).
        # Confirm this is intended before switching to volume_response.text.
        text = BeautifulSoup(volume_response.text, "html.parser").get_text()
        # The second dot-separated segment of the file name,
        # e.g. 'humanist.1987-1988.txt' -> '1987-1988'.
        dates = href.split('.')[1]
        volume_dfs.append({'volume_text': text, 'volume_link': volume_link, 'volume_dates': dates})

# Guard against an empty scrape: the steps below need the 'volume_dates' column,
# and the CSV on disk should not be replaced when nothing was collected.
if not volume_dfs:
    raise ValueError("No volumes were scraped; refusing to overwrite the existing CSV.")

scraped_humanist_df = pd.DataFrame(volume_dfs)
volume_dates = scraped_humanist_df['volume_dates']

# Volume number: the first run of digits in the file-name segment, kept only when
# it is at most 2 digits long (a longer run such as 1987 is not treated as a
# volume number).
first_digits = volume_dates.str.extract(r'([0-9]+)', expand=False)
explicit_volume_number = pd.to_numeric(first_digits.where(first_digits.str.len() <= 2))
# Rows without an explicit number fall back to their 1-based position in the table.
positional_volume_number = pd.Series(
    np.arange(1, len(scraped_humanist_df) + 1), index=scraped_humanist_df.index
)
scraped_humanist_df['volume_number'] = (
    explicit_volume_number.fillna(positional_volume_number).astype('Int64')
)

# Start year: the text before the first hyphen, when it is exactly 4 digits.
# End year: the text after the first hyphen, when it is exactly 4 digits.
# Each extract always yields one column, so this cannot fail the way a
# split(expand=True) assigned to exactly two columns does when a segment has
# zero or several hyphens.
start_year = volume_dates.str.extract(r'^([0-9]{4})(?:-|$)', expand=False)
end_year = volume_dates.str.extract(r'^[^-]*-([0-9]{4})$', expand=False)
# Nullable integers keep the columns numeric while still allowing missing years.
scraped_humanist_df['inferred_start_year'] = pd.to_numeric(start_year).astype('Int64')
scraped_humanist_df['inferred_end_year'] = pd.to_numeric(end_year).astype('Int64')

# Most recent (start, end) pair seen or inferred while walking the table in order.
start_year_before = None
end_year_before = None
filled_start_years = []
filled_end_years = []

# Walk the rows in order; a row with both years missing is assumed to follow the
# previous row by exactly one year.
for start, end in zip(
    scraped_humanist_df['inferred_start_year'],
    scraped_humanist_df['inferred_end_year'],
):
    if pd.notna(start) and pd.notna(end):
        # Both years known: remember them as the base for later rows.
        start_year_before = int(start)
        end_year_before = int(end)
        print(start_year_before, end_year_before)
    elif pd.isna(start) and pd.isna(end) and start_year_before is not None:
        # Both years missing and a base exists: advance the base by one year
        # and use it for this row.
        start_year_before += 1
        end_year_before += 1
        start, end = start_year_before, end_year_before
        print(start_year_before, end_year_before)
    # Rows with only one year missing, or with no base yet, are left unchanged.
    filled_start_years.append(start)
    filled_end_years.append(end)

scraped_humanist_df['inferred_start_year'] = pd.array(filled_start_years, dtype='Int64')
scraped_humanist_df['inferred_end_year'] = pd.array(filled_end_years, dtype='Int64')

# Save the dataframe to a csv
scraped_humanist_df.to_csv("web_scraped_humanist_listserv_volumes.csv", index=False)

Getting volumes from https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/
Getting volume from https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1987-1988.txt
Getting volume from https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1988-1989.txt
Getting volume from https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1989-1990.txt
Getting volume from https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1990-1991.txt
Getting volume from https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1991-1992.txt
Getting volume from https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1992-1993.txt
Getting volume from https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1993-1994.txt
Getting volume from https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1994-1995.txt
Getting volume from https://humanist.kdl.kcl.ac.uk/Archives/Converted_Text/humanist.1995-1996.txt
Getting volume from https://humanist.kdl.

In [5]:
from nltk import word_tokenize
from nltk import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer