In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Text is extracted from each URL and saved into text files. The files are written to a directory named 'articles'.

In [None]:
import pandas as pd
import requests
import spacy
from bs4 import BeautifulSoup
import os
import nltk
import string

In [2]:
# Load the spaCy "en_core_web_sm" pipeline into the module-level name `nlp`.
nlp = spacy.load("en_core_web_sm")

def extract_text_from_url(url, timeout=30):
    """Download a web page and return its title and paragraph text.

    Parameters
    ----------
    url : str
        Address of the article to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a single unresponsive host would block the
        whole scraping loop indefinitely.

    Returns
    -------
    tuple
        ``(article_title, article_text)``. The title is ``""`` when the page
        has no ``<title>`` element; the text is the content of every ``<p>``
        element, each followed by a newline. ``(None, None)`` is returned when
        the page cannot be fetched or parsed (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)   # Fetch HTML content from URL
        # Treat 4xx/5xx answers as failures instead of saving the error page as an article.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")   # Parse HTML using BeautifulSoup

        article_title = soup.title.get_text() if soup.title else ""

        # Only the paragraph text is used; the raw HTML is deliberately not run
        # through the spaCy pipeline (the result was never used, and very long
        # pages made that call fail and the article was lost).
        article_text = "".join(p.text + "\n" for p in soup.find_all('p'))

        return article_title, article_text
    except Exception as e:
        # Best-effort scraping: report the problem and let the caller skip this URL.
        print(f"Error fetching or parsing article from {url}: {e}")
        return None, None

# Read the input workbook; the cells below use its "URL_ID" and "URL" columns.
input_df = pd.read_excel("Input.xlsx")




In [None]:

# Extract the article text for each URL and save it as articles/<URL_ID>.txt
os.makedirs("articles", exist_ok=True)   # no-op when the directory already exists

for index, row in input_df.iterrows():
    url_id = row["URL_ID"]
    url = row["URL"]
    article_title, article_text = extract_text_from_url(url)
    if not article_text:
        # The fetch failed or the page had no <p> text: say so instead of skipping silently.
        print(f"No article text saved for URL_ID {url_id} ({url})")
        continue
    # A page without a <title> still has usable body text, so it is saved with an empty title line.
    with open(f"articles/{url_id}.txt", "w", encoding="utf-8") as file:
        file.write(f"{article_title or ''}\n\n{article_text}")

# **Data Analysis**

In [None]:
def clean_text(text, stop_words=None):
    """Return `text` lower-cased, without punctuation and without stop words.

    Parameters
    ----------
    text : str
        Raw article text.
    stop_words : collection of str, optional
        Lower-case words to drop. Defaults to the module-level ``stopwords`` set.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords

    # Remove ASCII punctuation, then split on whitespace. (Splitting on '.'
    # after the periods have been removed would leave the whole text as a
    # single item, so no stop word would ever be filtered out.)
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    lowered = (token.lower() for token in without_punctuation.split())
    return ' '.join(token for token in lowered if token not in stop_words)

# Load stop words
stopwords_files = ["StopWords_Auditor.txt", "StopWords_Currencies.txt", "StopWords_DatesandNumbers.txt", "StopWords_Generic.txt", "StopWords_GenericLong.txt", "StopWords_Geographic.txt", "StopWords_Names.txt"]
stopwords = set()
for file in stopwords_files:
    with open(file, 'r', encoding='latin-1') as f:
        for line in f:
            # clean_text() compares lower-cased tokens, so entries are stored
            # lower-cased and trimmed; blank lines are ignored.
            # Anything after a "|" is treated as an annotation and dropped
            # (assumption about the list format -- has no effect when no "|" is present).
            entry = line.split('|', 1)[0].strip().lower()
            if entry:
                stopwords.add(entry)

# Process each file in the articles directory
articles_dir = "articles"            # raw article texts, one <URL_ID>.txt per article
output_dir = "cleaned_articles"      # destination for the cleaned copies
# exist_ok=True creates the directory only when it is missing, without a separate existence check.
os.makedirs(output_dir, exist_ok=True)


In [None]:
# Write a cleaned copy of every .txt article into the output directory, keeping the file name.
for filename in os.listdir(articles_dir):
    if not filename.endswith(".txt"):
        continue   # ignore anything that is not a saved article

    source_path = os.path.join(articles_dir, filename)
    target_path = os.path.join(output_dir, filename)

    # Both files are handled as UTF-8.
    with open(source_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()

    cleaned = clean_text(raw_text)

    with open(target_path, 'w', encoding='utf-8') as target:
        target.write(cleaned)


# **Tokenization, Sentiment and Readability Analysis**

In [22]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import cmudict

# Sentiment lexicons: each file is read as latin-1 and its lines are collected into a set
positive_words_file = "positive-words.txt"
negative_words_file = "negative-words.txt"


def _read_lines_as_set(path):
    """Return the lines of the latin-1 encoded file `path` as a set of strings."""
    with open(path, 'r', encoding='latin-1') as handle:
        return set(handle.read().splitlines())


positive_words = _read_lines_as_set(positive_words_file)
negative_words = _read_lines_as_set(negative_words_file)

# Load CMU Pronouncing Dictionary for syllable count
try:
    cmu_dict = cmudict.dict()
except LookupError:
    # The corpus is not installed on a fresh kernel: download it once, then retry.
    nltk.download("cmudict", quiet=True)
    cmu_dict = cmudict.dict()

def _estimate_syllables(token):
    """Estimate syllables of a lower-case token by counting runs of vowels."""
    if not any(ch.isalpha() for ch in token):
        return 0   # punctuation marks and pure numbers have no syllables
    if token.endswith(('es', 'ed')):
        token = token[:-2]   # same "es"/"ed" exception as the dictionary lookup
    groups = 0
    previous_was_vowel = False
    for ch in token:
        is_vowel = ch in "aeiouy"
        if is_vowel and not previous_was_vowel:
            groups += 1
        previous_was_vowel = is_vowel
    return max(groups, 1)


# Function to count syllables in a word
def count_syllables(word, pronunciations=None):
    """Return the number of syllables in `word`.

    Parameters
    ----------
    word : str
        Token to measure; matching is case-insensitive.
    pronunciations : dict, optional
        Mapping of lower-case word -> list of pronunciations, each a list of
        phoneme strings in which vowel phonemes end in a stress digit
        (e.g. ``"OW1"``). Defaults to the module-level ``cmu_dict``.

    Returns
    -------
    int
        Number of vowel phonemes in the first listed pronunciation of the word
        (or of the word without a trailing "es"/"ed" when only that form is
        listed). Words missing from the dictionary are estimated from their
        spelling; tokens without letters count as 0.
    """
    if pronunciations is None:
        pronunciations = cmu_dict

    token = word.lower()
    candidates = [token]
    # Handling exceptions for words ending with "es" or "ed"
    if token.endswith(('es', 'ed')):
        candidates.append(token[:-2])

    for candidate in candidates:
        entry = pronunciations.get(candidate)
        if entry:
            # An entry is a list of alternative pronunciations; the phonemes of
            # the first one are inspected, and a phoneme is a vowel when its
            # last character is a stress digit.
            return sum(1 for phoneme in entry[0] if phoneme[-1].isdigit())

    return _estimate_syllables(token)

# Function to calculate readability metrics
def calculate_readability(text):
    """Compute readability statistics for `text`.

    Returns
    -------
    tuple
        ``(avg_sentence_length, percentage_complex_words, fog_index,
        avg_words_per_sentence, complex_word_count, total_words,
        total_syllables, avg_word_length)``.
        A word is "complex" when it has more than two syllables; the fog index
        is ``0.4 * (avg_sentence_length + percentage_complex_words)``.
        All eight values are 0 when the text contains no words.
    """
    sentences = sent_tokenize(text)
    # The tokenizer output can include punctuation marks as separate tokens;
    # only tokens containing a letter or digit are counted as words.
    words = [token for token in word_tokenize(text) if any(ch.isalnum() for ch in token)]
    total_words = len(words)
    total_sentences = len(sentences)

    if total_words == 0 or total_sentences == 0:
        # Empty or punctuation-only text: avoid dividing by zero.
        return 0, 0, 0, 0, 0, 0, 0, 0

    # Look each word up once and reuse the counts for both totals.
    syllable_counts = [count_syllables(word) for word in words]
    total_syllables = sum(syllable_counts)
    complex_word_count = sum(1 for count in syllable_counts if count > 2)  # Count words with more than 2 syllables

    avg_sentence_length = total_words / total_sentences
    percentage_complex_words = (complex_word_count / total_words) * 100  # Calculate percentage of complex words
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_words_per_sentence = total_words / total_sentences
    avg_word_length = sum(len(word) for word in words) / total_words  # Calculate average word length

    return avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence, complex_word_count, total_words, total_syllables, avg_word_length

# Function to perform sentimental analysis
def perform_sentimental_analysis(text):
    """Score `text` against the positive and negative word sets.

    The text is lower-cased and tokenized; every token found in
    ``positive_words`` / ``negative_words`` adds one to the respective score.

    Returns
    -------
    tuple
        ``(positive_score, negative_score, polarity_score, subjectivity_score)``
        where polarity is ``(pos - neg) / (pos + neg + 0.000001)`` and
        subjectivity is ``(pos + neg) / (token count + 0.000001)``.
    """
    tokens = word_tokenize(text.lower())

    positive_score = 0
    negative_score = 0
    for token in tokens:
        if token in positive_words:
            positive_score += 1
        if token in negative_words:
            negative_score += 1

    matched = positive_score + negative_score
    # The small constant keeps both ratios defined when a denominator would be zero.
    polarity_score = (positive_score - negative_score) / (matched + 0.000001)
    subjectivity_score = matched / (len(tokens) + 0.000001)

    return positive_score, negative_score, polarity_score, subjectivity_score

# Process the saved article of every input row, in input_df order.
# URL_ID and URL are later attached to the results by row position, so the
# rows must follow the input order (os.listdir gives no such guarantee) and
# every input row needs exactly one result row.
articles_dir = "articles"
output_data = []

for index, row in input_df.iterrows():
    url_id = row["URL_ID"]
    article_path = os.path.join(articles_dir, f"{url_id}.txt")

    if not os.path.exists(article_path):
        # No article was saved for this URL: keep an empty placeholder row (12 metrics).
        output_data.append([None] * 12)
        continue

    with open(article_path, 'r', encoding='utf-8') as f:
        text = f.read()

    avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence, complex_word_count, total_words, total_syllables, avg_word_length = calculate_readability(text)
    positive_score, negative_score, polarity_score, subjectivity_score = perform_sentimental_analysis(text)

    # The corresponding output column is "SYLLABLE PER WORD", so store the
    # average per word rather than the total syllable count.
    syllables_per_word = total_syllables / total_words if total_words else 0

    output_data.append([positive_score, negative_score, polarity_score, subjectivity_score,
                        avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence,
                        complex_word_count, total_words, syllables_per_word, avg_word_length])



In [None]:
# Column labels, in the same order as the values of each row in output_data
output_columns = [
    "POSITIVE SCORE",
    "NEGATIVE SCORE",
    "POLARITY SCORE",
    "SUBJECTIVITY SCORE",
    "AVG SENTENCE LENGTH",
    "PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX",
    "AVG NUMBER OF WORDS PER SENTENCE",
    "COMPLEX WORD COUNT",
    "WORD COUNT",
    "SYLLABLE PER WORD",
    "AVG WORD LENGTH",
]

# Build the results table and write it to output.xlsx without the index column
output_df = pd.DataFrame(data=output_data, columns=output_columns)
output_df.to_excel("output.xlsx", index=False)


In [23]:

# Put the identifying columns from input_df in front of the metrics.
# NOTE: values are matched by row index, so this relies on output_df holding
# its rows in the same order as input_df.
for position, column in enumerate(["URL_ID", "URL"]):
    if column in output_df.columns:
        # The cell was run before: drop the earlier copy, because insert()
        # raises ValueError when the column already exists.
        output_df = output_df.drop(columns=column)
    output_df.insert(position, column, input_df[column])

output_df.to_excel("output.xlsx", index=False)
