<a href="https://colab.research.google.com/github/nelslindahlx/NLP/blob/master/visualize_word_frequency_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required libraries
!pip install requests beautifulsoup4 matplotlib nltk plotly

# Import required libraries
import string
from collections import Counter

import nltk
import plotly.express as px
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK data used later in this notebook:
#   - 'stopwords': the word lists read via stopwords.words('english')
#   - 'wordnet':   presumably the lexical data behind WordNetLemmatizer
#                  (confirm against the NLTK documentation)
#   - 'omw-1.4':   presumably a WordNet companion resource that some NLTK
#                  versions need for lemmatization (TODO confirm)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Function to fetch and parse webpage content
def extract_text_from_webpage(url, timeout=10):
    """Fetch a webpage and return its visible text.

    Args:
        url: Address of the page; must start with 'http://' or 'https://'.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        The page text with ``<script>`` and ``<style>`` elements removed and
        text nodes joined by single spaces. Returns an empty string (after
        printing a message) if the URL is invalid or the request fails.
    """
    # Validate up front: a malformed URL needs no network access and no
    # exception round-trip to be reported.
    if not url.startswith(('http://', 'https://')):
        print("Invalid URL: Please include 'http://' or 'https://'")
        return ""

    try:
        # Fetch the webpage; the timeout bounds how long a dead host can
        # stall the notebook.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove script and style elements so their source code is not
        # counted as page text
        for script_or_style in soup(['script', 'style']):
            script_or_style.decompose()

        # Extract and return text
        return soup.get_text(separator=' ')
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts, and HTTP error statuses
        print(f"Error fetching the URL: {e}")
        return ""
    except ValueError as ve:
        print(ve)
        return ""

# Function to clean, lemmatize, and process text
def clean_and_lemmatize_text(text, additional_stopwords=None):
    """Split *text* into words, drop stopwords, and lemmatize the rest.

    Args:
        text: Raw text to process.
        additional_stopwords: Optional iterable of extra words to exclude
            on top of NLTK's English stopword list. Matching is
            case-insensitive.

    Returns:
        A list of lower-cased, lemmatized words in their original order.
        Tokens that still contain non-alphabetic characters after
        surrounding punctuation is stripped (e.g. "don't", "2024") are
        discarded.
    """
    stop_words = set(stopwords.words('english'))
    if additional_stopwords:
        # Tokens are lower-cased before the stopword check, so the extra
        # stopwords must be lower-cased too or they would never match.
        stop_words.update(word.lower() for word in additional_stopwords)
    lemmatizer = WordNetLemmatizer()

    filtered_words = []
    for raw_token in text.split():
        # Strip surrounding punctuation first: whitespace splitting leaves
        # tokens such as "world." or "(hello", which isalpha() would reject,
        # silently dropping every word adjacent to punctuation.
        word = raw_token.strip(string.punctuation)
        lowered = word.lower()
        if word.isalpha() and lowered not in stop_words:
            filtered_words.append(lemmatizer.lemmatize(lowered))
    return filtered_words

# Function to analyze word frequency
def analyze_word_frequency(words, num_common=10):
    """Return the most frequent entries of *words*.

    Args:
        words: Iterable of hashable items to count.
        num_common: How many of the top entries to return.

    Returns:
        A list of ``(item, count)`` tuples ordered from most to least
        frequent, as produced by ``Counter.most_common``.
    """
    tally = Counter()
    tally.update(words)
    top_entries = tally.most_common(num_common)
    return top_entries

# Function to visualize word frequency interactively
def plot_word_frequency_interactive(word_frequency):
    """Show an interactive bar chart of word counts.

    Args:
        word_frequency: Sequence of ``(word, count)`` pairs, e.g. the
            output of ``Counter.most_common``.

    Returns:
        None. The figure is displayed via ``fig.show()``. If
        *word_frequency* is empty, a message is printed and nothing is
        plotted.
    """
    # zip(*[]) yields nothing, so the two-name unpacking below would raise
    # ValueError; this happens whenever the page could not be fetched or
    # every word was filtered out.
    if not word_frequency:
        print("No word frequencies to plot.")
        return

    words, counts = zip(*word_frequency)
    data = {'Word': words, 'Frequency': counts}
    fig = px.bar(
        data,
        x='Word',
        y='Frequency',
        title='Interactive Word Frequency on Webpage',
        text='Frequency',
    )
    # Place each bar's count label above the bar
    fig.update_traces(textposition='outside')
    fig.update_layout(xaxis_title='Words', yaxis_title='Frequency')
    fig.show()

# URL of the webpage to analyze
url = 'https://civichonors.com'

# Step 1: Extract text from the webpage (empty string if the fetch fails)
text = extract_text_from_webpage(url)

# Step 2: Clean, lemmatize, and process text
additional_stopwords = {'civic', 'honors'}  # Add domain-specific stopwords here
cleaned_words = clean_and_lemmatize_text(text, additional_stopwords)

# Step 3: Analyze word frequency
num_common_words = 10  # Customize the number of most common words to visualize
word_frequency = analyze_word_frequency(cleaned_words, num_common=num_common_words)

# Step 4: Visualize word frequency interactively.
# Skip plotting when nothing was counted (e.g. the fetch failed and `text`
# is empty); the plot step cannot unpack an empty frequency list.
if word_frequency:
    plot_word_frequency_interactive(word_frequency)
else:
    print("No words found to visualize; check the URL and stopword settings.")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
