In [9]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import requests
from textblob import TextBlob
import syllapy
import re
import nltk


# Download the NLTK 'punkt_tab' tokenizer data at import time.
# NOTE(review): presumably needed by TextBlob's sentence splitting used in
# analyze_text -- confirm against the installed TextBlob/NLTK versions.
nltk.download('punkt_tab')

# File paths (hard-coded, machine-specific Windows locations).
# input_file_path:    Excel workbook read by main(); its 'URL_ID' and 'URL'
#                     columns are used.
# output_file_path:   Excel workbook main() writes the per-article metrics to.
# output_folder_path: directory where save_article() writes one
#                     '<URL_ID>.txt' file per article.
input_file_path = r'C:\Users\Dell\Downloads\Input.xlsx'
output_file_path = r'C:\Users\Dell\Downloads\output.xlsx'
output_folder_path = r'C:\Users\Dell\Downloads\articles'

# Create the article folder up front (no error if it already exists) so
# save_article() can open files inside it.
os.makedirs(output_folder_path, exist_ok=True)

# Load input URLs
def load_input_urls(file_path):
    """Read the Excel workbook at ``file_path`` and return it as a DataFrame.

    The workbook is returned unmodified; no columns are validated here.
    """
    return pd.read_excel(file_path)

# Extract article text
def extract_article_text(url, timeout=30):
    """Fetch ``url`` and return ``(title, article_text)``.

    The title comes from the page's ``<title>`` element ("No Title" when it
    is missing or empty) and the article text is the text of every ``<p>``
    element joined with single spaces.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the whole run.

    Returns:
        A ``(title, article_text)`` tuple, or ``("", "")`` if the page could
        not be fetched or parsed. Failures are reported on stdout and never
        raised, so one bad URL does not abort a batch.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP errors (404, 500, ...) as failures instead of analysing
        # the server's error page as if it were the article.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # ``soup.title.string`` is None when <title> is empty or contains
        # nested tags; get_text() always returns a string.
        title = soup.title.get_text().strip() if soup.title else ""
        if not title:
            title = "No Title"

        article_text = ' '.join(p.get_text() for p in soup.find_all('p'))
        return title, article_text
    except Exception as e:
        # Deliberately broad: this is the per-URL best-effort boundary.
        print(f"Error fetching URL {url}: {e}")
        return "", ""

# Save article to file
def save_article(url_id, title, text):
    """Write one article to ``<output_folder_path>/<url_id>.txt`` as UTF-8.

    The file holds the title, a blank line, then the article text. An
    existing file with the same name is overwritten.
    """
    destination = os.path.join(output_folder_path, f"{url_id}.txt")
    contents = f"{title}\n\n{text}"
    with open(destination, 'w', encoding='utf-8') as handle:
        handle.write(contents)

# Analyze text
def analyze_text(article_text):
    """Compute sentiment and readability metrics for ``article_text``.

    Args:
        article_text: Plain text of the article.

    Returns:
        A dict with the keys ``positive_score``, ``negative_score``,
        ``polarity_score``, ``subjectivity_score``, ``avg_sentence_length``,
        ``percentage_complex_words``, ``fog_index``,
        ``avg_words_per_sentence``, ``complex_word_count``, ``word_count``,
        ``syllable_per_word``, ``personal_pronouns`` and ``avg_word_length``.
        Ratios are 0 when the text has no words or no sentences.
    """
    personal_pronoun_set = frozenset(
        ['i', 'we', 'you', 'he', 'she', 'they', 'me', 'us']
    )

    # --- Sentiment (TextBlob) ---
    blob = TextBlob(article_text)
    # Evaluate each sentence's polarity once and reuse it for both sums.
    sentence_polarities = [
        sentence.sentiment.polarity for sentence in blob.sentences
    ]
    positive_score = sum(p for p in sentence_polarities if p > 0)
    negative_score = sum(p for p in sentence_polarities if p < 0)
    overall_sentiment = blob.sentiment
    polarity_score = overall_sentiment.polarity
    subjectivity_score = overall_sentiment.subjectivity

    # --- Tokenisation ---
    # Strip punctuation glued to whitespace-separated tokens so "you," is
    # recognised as "you" and quotes/commas do not inflate word lengths.
    # Tokens that were pure punctuation (e.g. a lone dash) are dropped.
    stripped_tokens = (
        re.sub(r'^\W+|\W+$', '', token) for token in article_text.split()
    )
    words = [word for word in stripped_tokens if word]
    word_count = len(words)

    # re.split always yields empty or punctuation-only fragments (e.g. after
    # the final period); only fragments containing a word character are
    # counted as sentences.
    sentence_count = sum(
        1 for fragment in re.split(r'[.!?]', article_text)
        if re.search(r'\w', fragment)
    )
    avg_sentence_length = word_count / sentence_count if sentence_count else 0

    # --- Syllables / complexity ---
    # Count syllables once per word and reuse the result for both metrics.
    syllables_per_token = [syllapy.count(word) for word in words]
    syllable_count = sum(syllables_per_token)
    syllable_per_word = syllable_count / word_count if word_count else 0

    complex_word_count = sum(1 for n in syllables_per_token if n >= 3)
    percentage_complex_words = (
        (complex_word_count / word_count) * 100 if word_count else 0
    )

    # --- Pronouns ---
    # All-caps "US" is treated as the country acronym, not the pronoun "us".
    personal_pronouns = sum(
        1 for word in words
        if word != 'US' and word.lower() in personal_pronoun_set
    )

    avg_word_length = (
        sum(len(word) for word in words) / word_count if word_count else 0
    )

    # A text with no complex words still has a fog index driven by its
    # sentence length, so the formula is applied unconditionally (both terms
    # are already 0 for empty text).
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    return {
        'positive_score': positive_score,
        'negative_score': negative_score,
        'polarity_score': polarity_score,
        'subjectivity_score': subjectivity_score,
        'avg_sentence_length': avg_sentence_length,
        'percentage_complex_words': percentage_complex_words,
        'fog_index': fog_index,
        'avg_words_per_sentence': avg_sentence_length,
        'complex_word_count': complex_word_count,
        'word_count': word_count,
        'syllable_per_word': syllable_per_word,
        'personal_pronouns': personal_pronouns,
        'avg_word_length': avg_word_length
    }

# Main function
def main():
    """Run the pipeline: fetch each URL, save its text, analyse it, export.

    Reads the input workbook, and for every row whose page yields non-empty
    text, saves the article to disk and collects its metrics tagged with the
    row's ``URL_ID``. Rows with no extracted text are skipped. The collected
    metrics are written to the output workbook.
    """
    url_table = load_input_urls(input_file_path)

    collected = []
    for _, record in url_table.iterrows():
        url_id, url = record['URL_ID'], record['URL']
        title, text = extract_article_text(url)

        # Nothing to save or analyse when the fetch failed or found no text.
        if not text:
            continue

        save_article(url_id, title, text)
        metrics = analyze_text(text)
        metrics['URL_ID'] = url_id
        collected.append(metrics)

    # Save results to output file
    pd.DataFrame(collected).to_excel(output_file_path, index=False)
    print(f"Analysis complete. Results saved to {output_file_path}")

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Analysis complete. Results saved to C:\Users\Dell\Downloads\output.xlsx
