In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

def extract_content_with_formatting(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.find('h1', class_='entry-title').text.strip()
            content_div = soup.find('div', class_='td-post-content tagdiv-type')
            if content_div:
                content = [title + '\n\n']
                # Iterate through each child in the content div
                for child in content_div.children:
                    if child.name == 'h1':
                        content.append('\n' + child.text.strip() + '\n')  # Add extra newline for headings
                    elif child.name == 'p':
                        content.append(child.text.strip() + '\n')  # Append paragraphs
                    elif child.name in ['ul', 'ol']:  # Handling lists
                        for li in child.find_all('li'):
                            content.append('- ' + li.text.strip())  # Add a dash for list items
                        content.append('')  # Add an extra newline after a list
                    elif child.name == 'li':  # Direct li without ul or ol
                        content.append('- ' + child.text.strip() + '\n')
                    elif child.text.strip():  # Catch all for any other text-containing tags
                        content.append(child.text.strip() + '\n')
                return '\n'.join(content)
            else:
                return "Content block 'td-ss-main-content' not found."
        else:
            return f"Failed to load page, status code: {response.status_code}"
    except Exception as e:
        return f"Failed to extract due to an error: {str(e)}"

def save_text(filename, content):
    with open(f'{filename}.txt', 'w', encoding='utf-8') as file:
        file.write(content)

def process_urls(excel_path):
    # Load the Excel file
    data = pd.read_excel(excel_path)
    for index, row in data.iterrows():
        url_id = row['URL_ID']
        url = row['URL']
        print(f"Processing: {url_id} from {url}")
        content = extract_content_with_formatting(url)
        save_text(os.path.join("/content/drive/MyDrive/Assignment/IN-OUT", url_id), content)
        print(f"Saved content to {url_id}.txt")

# Specify the path to your Excel file
excel_path = '/content/drive/MyDrive/Assignment/IN-OUT/Input.xlsx'
process_urls(excel_path)

Processing: bctech2011 from https://insights.blackcoffer.com/ml-and-ai-based-insurance-premium-model-to-predict-premium-to-be-charged-by-the-insurance-company/
Saved content to bctech2011.txt
Processing: bctech2012 from https://insights.blackcoffer.com/streamlined-integration-interactive-brokers-api-with-python-for-desktop-trading-application/
Saved content to bctech2012.txt
Processing: bctech2013 from https://insights.blackcoffer.com/efficient-data-integration-and-user-friendly-interface-development-navigating-challenges-in-web-application-deployment/
Saved content to bctech2013.txt
Processing: bctech2014 from https://insights.blackcoffer.com/effective-management-of-social-media-data-extraction-strategies-for-authentication-security-and-reliability/
Saved content to bctech2014.txt
Processing: bctech2015 from https://insights.blackcoffer.com/streamlined-trading-operations-interface-for-metatrader-4-empowering-efficient-management-and-monitoring/
Saved content to bctech2015.txt
Processi

In [2]:
import pandas as pd
import re
import nltk
import os
from nltk.tokenize import sent_tokenize, word_tokenize

# Download necessary NLTK data
nltk.download('punkt')

# Load Positive and Negative Sentiment Dictionaries
def load_sentiment_dictionary():
    pos_path = '/content/drive/MyDrive/Assignment/positive-words.txt'
    neg_path = '/content/drive/MyDrive/Assignment/negative-words.txt'

    # Load positive words
    with open(pos_path, 'r', encoding='utf-8', errors='replace') as pos_file:
        positive_words = set([line.strip().lower() for line in pos_file.readlines() if line.strip() and not line.startswith(';')])

    # Load negative words
    with open(neg_path, 'r', encoding='utf-8', errors='replace') as neg_file:
        negative_words = set([line.strip().lower() for line in neg_file.readlines() if line.strip() and not line.startswith(';')])

    print(f"Loaded {len(positive_words)} positive words and {len(negative_words)} negative words.")

    return positive_words, negative_words

# Load Stopwords
def load_stopwords(stopwords_folder):
    stopwords = set()
    for stopword_file in os.listdir(stopwords_folder):
        stopword_path = os.path.join(stopwords_folder, stopword_file)  # Corrected here
        try:
            with open(stopword_path, 'r', encoding='utf-8') as file:
                stopwords.update(line.strip().lower() for line in file if line.strip())
        except UnicodeDecodeError:
            with open(stopword_path, 'r', encoding='latin-1') as file:
                stopwords.update(line.strip().lower() for line in file if line.strip())
    return stopwords

# Sentiment analysis function
def sentiment_analysis(text, positive_words, negative_words, stopwords):
    words = [word for word in re.findall(r'\w+', text.lower()) if word not in stopwords and word.isalpha()]
    print(f"Words to analyze: {words}")

    # Initialize scores
    positive_score = 0
    negative_score = 0

    for word in words:
        if word in positive_words:
            print(f"Positive word found: {word}")
            positive_score += 1
        if word in negative_words:
            print(f"Negative word found: {word}")
            negative_score += 1

    return positive_score, negative_score

# Function to calculate readability metrics
def readability_metrics(text, stopwords):
    sentences = sent_tokenize(text)
    words = [word for word in word_tokenize(text.lower()) if word not in stopwords and word.isalpha()]
    word_count = len(words)
    sentence_count = len(sentences)

    avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
    complex_words = [word for word in words if syllable_count(word) >= 3]
    percentage_complex_words = len(complex_words) / word_count if word_count > 0 else 0

    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    return avg_sentence_length, percentage_complex_words, fog_index, len(complex_words), word_count

# Function to count syllables in a word
def syllable_count(word):
    vowels = 'aeiou'
    word = word.lower()
    count = 0
    prev_char_was_vowel = False
    for char in word:
        if char in vowels:
            if not prev_char_was_vowel:
                count += 1
            prev_char_was_vowel = True
        else:
            prev_char_was_vowel = False
    if word.endswith('e'):
        count -= 1
    return max(1, count)

# Function to count personal pronouns
def personal_pronouns(text):
    pronouns = ["i", "we", "my", "ours", "us"]
    words = word_tokenize(text.lower())
    return sum(1 for word in words if word in pronouns)

# Function to calculate average word length
def avg_word_length(text):
    words = word_tokenize(text)
    total_length = sum(len(word) for word in words)
    return total_length / len(words) if words else 0

# Analyze and save results function
def analyze_and_save_results(content, url_id, positive_words, negative_words, stopwords):
    positive_score, negative_score = sentiment_analysis(content, positive_words, negative_words, stopwords)
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / len(re.findall(r'\w+', content)) if content else 0

    avg_sentence_length, percentage_complex_words, fog_index, complex_word_count, word_count = readability_metrics(content, stopwords)
    syllables_per_word = sum([syllable_count(word) for word in word_tokenize(content)]) / word_count if word_count else 0
    pronoun_count = personal_pronouns(content)
    avg_word_len = avg_word_length(content)

    # Create DataFrame for results
    results = {
        'URL_ID': url_id,
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllables_per_word,
        'PERSONAL PRONOUNS': pronoun_count,
        'AVG WORD LENGTH': avg_word_len
    }

    return results  # Return results instead of saving them directly

# Function to filter URLs
def filter_urls(data, start_id, end_id):
    return data[(data['URL_ID'] >= start_id) & (data['URL_ID'] <= end_id)]

# Example usage
def run_analysis(excel_path, output_excel_path, stopwords_folder):
    positive_words, negative_words = load_sentiment_dictionary()
    stopwords = load_stopwords(stopwords_folder)

    data = pd.read_excel(excel_path)

    # Filter URLs from bctech2021 to bctech2157
    filtered_data = filter_urls(data, 'bctech2021', 'bctech2157')

    all_results = []  # List to hold results for all URLs

    for index, row in filtered_data.iterrows():
        url_id = row['URL_ID']
        url = row['URL']
        print(f"Analyzing content from: {url}")
        content = extract_content_with_formatting(url)  # Call the scraping function
        result = analyze_and_save_results(content, url_id, positive_words, negative_words, stopwords)
        all_results.append(result)  # Append each result to the list

    # Save all results to an Excel file after processing all URLs
    output_df = pd.DataFrame(all_results)
    output_df.to_excel(output_excel_path, index=False)

    print(f"All results saved to {output_excel_path}.")

# Example paths
excel_path = '/content/drive/MyDrive/Assignment/IN-OUT/Input.xlsx'
output_excel_path = '/content/drive/MyDrive/Assignment/Output_Analysis.xlsx'  # Define the output path here

# Run the analysis
run_analysis(excel_path, output_excel_path, '/content/drive/MyDrive/Assignment/Stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loaded 2006 positive words and 4783 negative words.
Analyzing content from: https://insights.blackcoffer.com/automated-campaign-management-system-a-comprehensive-solution-with-linkedin-and-email-integration/
Words to analyze: ['automated', 'campaign', 'management', 'system', 'a', 'comprehensive', 'solution', 'linkedin', 'email', 'integration', 'client', 'background', 'client', 'a', 'leading', 'marketing', 'tech', 'firm', 'worldwide', 'industry', 'type', 'marketing', 'products', 'services', 'ad', 'tech', 'marketing', 'automation', 'lead', 'management', 'organization', 'size', 'problem', 'integrating', 'linkedin', 'email', 'apis', 'automation', 'building', 'a', 'user', 'friendly', 'responsive', 'frontend', 'interface', 'developing', 'a', 'robust', 'backend', 'code', 'campaign', 'automation', 'ensuring', 'secure', 'user', 'authentication', 'data', 'exchange', 'managing', 'campaign', 'creation', 'scheduling', 'tracking', 'handling', 'data', 'storage', 'organization', 'mongodb', 'providing'