# Task 1 – Generation Data Science and ML

Maria Nikitha Suresh<br>
Masters in Data Science<br>
marianikitha@arizona.edu

In [None]:
pip install vaderSentiment

In [1]:
#Importing the required libraries
import requests
from bs4 import BeautifulSoup
import nltk
from nltk import word_tokenize, sent_tokenize, pos_tag
import csv
import time
from urllib.parse import urljoin, urlparse

In [2]:
#Download NLTK's 'averaged_perceptron_tagger' 
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Nikitha\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Function to scrape a webpage and return its content
def scrape_page(url):
    response = requests.get(url)  # Send a GET request to the provided URL
    soup = BeautifulSoup(response.text, 'html.parser')  # Parse the response content using BeautifulSoup
    content = soup.get_text()  # Extract the content from the webpage
    return content  # Return the extracted content

In [4]:
# Function to calculate NLP statistics from the given text
def calculate_nlp_statistics(text):
    words = word_tokenize(text)  # Tokenize the text into words
    sentences = sent_tokenize(text)  # Tokenize the text into sentences
    
    # Perform POS (Part-of-Speech) tagging on the words
    pos_tags = pos_tag(words)
    
    # Count the number of nouns, verbs, and adjectives using POS tags
    noun_count = sum(1 for word, pos in pos_tags if pos.startswith('NN'))
    verb_count = sum(1 for word, pos in pos_tags if pos.startswith('VB'))
    adj_count = sum(1 for word, pos in pos_tags if pos.startswith('JJ'))
    
    # Calculate word count, sentence count, and average word length
    word_count = len(words)
    sentence_count = len(sentences)
    avg_word_length = sum(len(word) for word in words) / word_count if word_count != 0 else 0  
    
    # Return the calculated statistics
    return {
        'word_count': word_count,
        'sentence_count': sentence_count,
        'avg_word_length': avg_word_length,
        'noun_count': noun_count,
        'verb_count': verb_count,
        'adj_count': adj_count
    }


In [5]:
# Function to save detailed results to a CSV file
def save_results_to_csv(results, filename):
    keys = results[0].keys()  
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:  # Opening the CSV file for writing
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)  # Created a DictWriter for writing dictionaries to CSV
        dict_writer.writeheader()  # Writing the column names
        dict_writer.writerows(results)  # Writing the rows


In [6]:
# Function to save aggregated results to a CSV file
def save_aggregated_results(aggregated_result, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:  # Open the CSV file for writing
        writer = csv.DictWriter(output_file, fieldnames=aggregated_result.keys())  # Create a DictWriter fot writing values to CSV
        writer.writeheader()  # Writing the column names
        writer.writerow(aggregated_result)  # Write the single row of aggregated results

In [7]:
# Function to crawl a website and collect up to 'max_pages' URLs
def crawl_website(start_url, max_pages):
    urls_to_scrape = set()  # Initialize a set to store unique URLs to scrape
    urls_to_visit = [start_url]  # Initialize the list of URLs to visit with the start URL
    visited_urls = set()  # Initialize a set to store visited URLs
    
    while urls_to_visit and len(urls_to_scrape) < max_pages:  # Continue until we have enough URLs or no more to visit
        current_url = urls_to_visit.pop(0)  # Get the next URL to visit
        if current_url in visited_urls:  # Skip if the URL has already been visited
            continue
        
        visited_urls.add(current_url)  # Mark the URL as visited
        try:
            response = requests.get(current_url)  # Send a GET request to the current URL
            soup = BeautifulSoup(response.text, 'html.parser')  # Parse the response content
            
            urls_to_scrape.add(current_url)  # Add the current URL to the set of URLs to scrape
            
            # Find all links on the current page
            for link in soup.find_all('a', href=True):
                full_url = urljoin(current_url, link['href'])  # Resolve relative URLs
                parsed_url = urlparse(full_url)  # Parse the URL
                # Add URLs belonging to the same domain and not visited yet
                if parsed_url.netloc == urlparse(start_url).netloc and full_url not in visited_urls:
                    urls_to_visit.append(full_url)  # Add the full URL to the visit list
                    
            # Print the progress of crawling
            print(f"Crawled: {current_url} | Page: {len(urls_to_scrape)}")
            
            time.sleep(1)  # Pause for a second to avoid overwhelming the server
            
        except Exception as e:  # Handle any exceptions that occur during the request
            print(f"Failed to crawl {current_url}: {e}")
    
    return list(urls_to_scrape)  # Return the list of URLs to scrape

In [8]:
# Function to calculate aggregated statistics (averages) from the results
def calculate_aggregated_statistics(results):
    aggregated_result = {
        'avg_word_count': sum(item['word_count'] for item in results) / len(results),  # Calculate average word count
        'avg_sentence_count': sum(item['sentence_count'] for item in results) / len(results),  # Calculate average sentence count
        'avg_word_length': sum(item['avg_word_length'] for item in results) / len(results),  # Calculate average word length
        'avg_noun_count': sum(item['noun_count'] for item in results) / len(results),  # Calculate average noun count
        'avg_verb_count': sum(item['verb_count'] for item in results) / len(results),  # Calculate average verb count
        'avg_adj_count': sum(item['adj_count'] for item in results) / len(results)  # Calculate average adjective count
    }
    return aggregated_result  # Return the aggregated statistics

In [9]:
# Start URL of the website for webscraping
start_url = 'https://www.everydayhealthgroup.com/'

In [10]:
# Crawl the website to collect up to 100 URLs
urls = crawl_website(start_url, max_pages=100)

Crawled: https://www.everydayhealthgroup.com/ | Page: 1
Crawled: https://www.everydayhealthgroup.com/cart | Page: 2
Crawled: https://www.everydayhealthgroup.com/#page | Page: 3
Crawled: https://www.everydayhealthgroup.com/#about | Page: 4
Crawled: https://www.everydayhealthgroup.com/#brands | Page: 5
Crawled: https://www.everydayhealthgroup.com/#team | Page: 6
Crawled: https://www.everydayhealthgroup.com/#news | Page: 7
Crawled: https://www.everydayhealthgroup.com/#culture | Page: 8
Crawled: https://www.everydayhealthgroup.com/#careers | Page: 9
Crawled: https://www.everydayhealthgroup.com/#contact | Page: 10
Crawled: https://www.everydayhealthgroup.com/everydayhealthgroup-careers | Page: 11
Crawled: https://www.everydayhealthgroup.com/dan-stone-bio | Page: 12
Crawled: https://www.everydayhealthgroup.com/tom-dehn-bio | Page: 13
Crawled: https://www.everydayhealthgroup.com/deb-goetz-bio | Page: 14
Crawled: https://www.everydayhealthgroup.com/sean-alford-bio | Page: 15
Crawled: https://w

Crawled: https://www.everydayhealthgroup.com/george-wukoson-bio#page | Page: 98
Crawled: https://www.everydayhealthgroup.com/george-wukoson-bio#about | Page: 99
Crawled: https://www.everydayhealthgroup.com/george-wukoson-bio#brands | Page: 100


In [11]:
# Collect NLP statistics for each page in the list of URLs
results = []
for url in urls:
    print(f"Scraping {url}...")  # Print the URL being scraped
    try:
        content = scrape_page(url)  # Scrape the content of the page
        stats = calculate_nlp_statistics(content)  # Calculate NLP statistics for the page content
        results.append(stats)  # Add the statistics to the results list
    except Exception as e:  # Handle any exceptions during scraping
        print(f"Failed to scrape {url}: {e}")


Scraping https://www.everydayhealthgroup.com/everydayhealthgroup-careers#contact...
Scraping https://www.everydayhealthgroup.com/deb-goetz-bio...
Scraping https://www.everydayhealthgroup.com/dan-stone-bio#careers...
Scraping https://www.everydayhealthgroup.com/george-wukoson-bio#page...
Scraping https://www.everydayhealthgroup.com/cdn-cgi/l/email-protection#2841464e47684d5e4d5a514c4951404d49445c404f5a475d58064b4745...
Scraping https://www.everydayhealthgroup.com/everydayhealthgroup-careers#careers...
Scraping https://www.everydayhealthgroup.com/sean-alford-bio#culture...
Scraping https://www.everydayhealthgroup.com/cdn-cgi/l/email-protection#cda4a3aba28da8bba8bfb4a9acb4a5a8aca1b9a5aabfa2b8bde3aea2a0...
Scraping https://www.everydayhealthgroup.com/sean-alford-bio#brands...
Scraping https://www.everydayhealthgroup.com/cdn-cgi/l/email-protection#cfa6a1a9a08faab9aabdb6abaeb6a7aaaea3bba7a8bda0babfe1aca0a2...
Scraping https://www.everydayhealthgroup.com/everydayhealthgroup-careers#culture...

In [12]:
# Save the results to a CSV file
save_results_to_csv(results, 'nlp_statistics.csv')
print("Scarping complete. Results is saved to 'nlp_statistics.csv'") #Print a message

Scarping complete. Results is saved to 'nlp_statistics.csv'


In [13]:
# Calculate and save the aggregated results
if results:
    aggregated_result = calculate_aggregated_statistics(results)
    save_aggregated_results(aggregated_result, 'aggregated_results.csv')
    

print("Aggregated results containing the statistics is saved to 'aggregated_results.csv'.") # Print a completion message


Aggregated results containing the statistics is saved to 'aggregated_results.csv'.
