In [None]:
from bs4 import BeautifulSoup
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import requests
# Fetch the NLTK data this notebook relies on: the English stopword list and
# the Punkt models used by word_tokenize. Recent NLTK releases load Punkt from
# the 'punkt_tab' resource while older ones use 'punkt', so request both.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

class TextProcessor:
    """Download a document, normalise its text and save word counts as CSV.

    The pipeline run by :meth:`main` is: fetch ``url`` -> extract the text ->
    clean it (lowercase, drop stopwords and punctuation, stem) -> count the
    remaining words -> write a ``Word,Frequency`` CSV to ``output_file``.
    """

    def __init__(self, url, output_file, timeout=30):
        """Store the pipeline configuration.

        Args:
            url: Address of the document to download.
            output_file: Path of the CSV file to write.
            timeout: Seconds to wait for the server before giving up.
        """
        self.url = url
        self.output_file = output_file
        self.timeout = timeout

    def extract_text_from_html_file(self, content):
        """Return the text found in the ``<p>``/``<div>`` elements of *content*.

        Args:
            content: Markup (or plain text) of the downloaded document.

        Returns:
            The extracted text as a single space-separated string.
        """
        soup = BeautifulSoup(content, 'html.parser')
        blocks = soup.find_all(['p', 'div'])

        if not blocks:
            # Documents without any <p>/<div> (e.g. plain-text files) would
            # otherwise yield an empty string; use the whole document instead.
            return soup.get_text(separator=' ')

        # A <p> inside a <div> is matched twice by find_all, once on its own
        # and once as part of its parent. Keep only the outermost matches so
        # every word is counted a single time.
        outermost = [b for b in blocks if b.find_parent(['p', 'div']) is None]
        return ' '.join(b.get_text(separator=' ') for b in outermost)

    def clean_text(self, text):
        """Normalise *text* into a space-separated string of word stems.

        Steps: strip leftover tags, replace non-word characters with spaces,
        lowercase, drop English stopwords, strip punctuation, Porter-stem.
        Requires the NLTK ``stopwords`` corpus and Punkt tokenizer data.

        Args:
            text: Raw text to clean.

        Returns:
            The stemmed, filtered words joined by single spaces.
        """
        # Replace leftover tags with a space so adjacent words are not fused.
        text = re.sub(r'<[^>]+>', ' ', text)

        # Remove non-alphanumeric characters
        text = re.sub(r'\W', ' ', text)

        # Convert to lowercase
        text = text.lower()

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in word_tokenize(text) if word not in stop_words]

        # Remove punctuation ('_' survives the \W substitution above) and
        # drop tokens that end up empty.
        strip_punct = str.maketrans("", "", string.punctuation)
        tokens = [token.translate(strip_punct) for token in tokens]
        tokens = [token for token in tokens if token]

        # Stem the *filtered* tokens; stemming the unfiltered token list
        # would silently bring the stopwords back.
        ps = PorterStemmer()
        return ' '.join(ps.stem(token) for token in tokens)

    def save_cleaned_text_to_csv(self, cleaned_text, output_csv_path):
        """Count the words of *cleaned_text* and write them to a CSV file.

        The file has the columns ``Word`` and ``Frequency`` and is ordered by
        descending frequency. Empty input produces a header-only file.

        Args:
            cleaned_text: Whitespace-separated words.
            output_csv_path: Destination path of the CSV file.
        """
        words = cleaned_text.split()

        # Explicit dtype keeps the empty case well-defined; building the frame
        # by column name avoids depending on reset_index() column labels.
        counts = pd.Series(words, dtype="object").value_counts()
        word_counts = pd.DataFrame(
            {'Word': counts.index, 'Frequency': counts.to_numpy()}
        )

        word_counts.to_csv(output_csv_path, index=False)

    def main(self):
        """Run the whole pipeline for ``self.url`` and write ``self.output_file``.

        Raises:
            requests.RequestException: If the download fails, times out or the
                server answers with an HTTP error status.
        """
        # A timeout keeps the call from hanging forever; raise_for_status
        # stops an error page from being analysed as if it were the document.
        response = requests.get(self.url, timeout=self.timeout)
        response.raise_for_status()

        html_text = self.extract_text_from_html_file(response.text)
        cleaned_text = self.clean_text(html_text)
        self.save_cleaned_text_to_csv(cleaned_text, self.output_file)

**Author A**

In [None]:
# Build the word-frequency CSV for the a1 text.
tp = TextProcessor(
    url='https://cs.indstate.edu/~cs40143/authors/a1.txt',
    output_file='/authors/a1.csv',
)
tp.main()

In [None]:
# Build the word-frequency CSV for the a2 text.
tp = TextProcessor(
    url='https://cs.indstate.edu/~cs40143/authors/a2.txt',
    output_file='/authors/a2.csv',
)
tp.main()

In [None]:
# Build the word-frequency CSV for the a3 text.
tp = TextProcessor(
    url='https://cs.indstate.edu/~cs40143/authors/a3.txt',
    output_file='/authors/a3.csv',
)
tp.main()

In [None]:
# Build the word-frequency CSV for the a4 text.
tp = TextProcessor(
    url='https://cs.indstate.edu/~cs40143/authors/a4.txt',
    output_file='/authors/a4.csv',
)
tp.main()

In [None]:
# Build the word-frequency CSV for the a5 text.
tp = TextProcessor(
    url='https://cs.indstate.edu/~cs40143/authors/a5.txt',
    output_file='/authors/a5.csv',
)
tp.main()

In [None]:
# Build the word-frequency CSV for the ua text.
tp = TextProcessor(
    url='https://cs.indstate.edu/~cs40143/authors/ua.txt',
    output_file='/authors/ua.csv',
)
tp.main()

**Author B**

In [None]:
# Build the word-frequency CSV for the b1 text.
tp = TextProcessor(
    url='https://cs.indstate.edu/~cs40143/authors/b1.txt',
    output_file='/authors/b1.csv',
)
tp.main()

In [None]:
# Build the word-frequency CSV for the b2 text.
tp = TextProcessor(
    url='https://cs.indstate.edu/~cs40143/authors/b2.txt',
    output_file='/authors/b2.csv',
)
tp.main()

In [None]:
# Build the word-frequency CSV for the b3 text.
tp = TextProcessor(
    url='https://cs.indstate.edu/~cs40143/authors/b3.txt',
    output_file='/authors/b3.csv',
)
tp.main()

In [None]:
# Build the word-frequency CSV for the b4 text.
tp = TextProcessor(
    url='https://cs.indstate.edu/~cs40143/authors/b4.txt',
    output_file='/authors/b4.csv',
)
tp.main()

In [None]:
# Build the word-frequency CSV for the b5 text.
tp = TextProcessor(
    url='https://cs.indstate.edu/~cs40143/authors/b5.txt',
    output_file='/authors/b5.csv',
)
tp.main()

In [None]:
# Build the word-frequency CSV for the ub text.
tp = TextProcessor(
    url='https://cs.indstate.edu/~cs40143/authors/ub.txt',
    output_file='/authors/ub.csv',
)
tp.main()