In [None]:
!pip install newspaper3k
!pip install textblob

import pandas as pd
import os
import re
import nltk
from newspaper import Article
from textblob import TextBlob
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import cmudict

# Fetch the NLTK resources used further down: the CMU pronouncing dictionary
# (looked up by syllable_count through cmu_dict) and the 'punkt' tokenizer
# models for sent_tokenize / word_tokenize.
nltk.download('cmudict')
cmu_dict = cmudict.dict()
nltk.download('punkt')

# Input locations (absolute paths under /content): the URL spreadsheet, the
# positive/negative word lists, and the stop-word lists.
path_input   ='/content/Input.xlsx'
path_positive='/content/positive-words.txt'
path_negative='/content/negative-words.txt'
path_stop = [
    '/content/StopWords_Auditor.txt',
    '/content/StopWords_Currencies.txt',
    '/content/StopWords_DatesandNumbers.txt',
    '/content/StopWords_Generic.txt',
    '/content/StopWords_Geographic.txt',
    '/content/StopWords_Names.txt',
    '/content/StopWords_GenericLong.txt'
]

# Load the input spreadsheet; the extraction loop reads its 'URL_ID' and
# 'URL' columns.
df = pd.read_excel(path_input)

def extract_article_text(url):
    """Download and parse the article at *url*.

    Returns a ``(title, text)`` pair taken from the parsed article. Any
    exception raised while downloading or parsing is reported on stdout and
    the function returns ``(None, None)`` instead of propagating it, so one
    bad URL does not stop a batch run.
    """
    try:
        page = Article(url)
        page.download()
        page.parse()
        extracted = (page.title, page.text)
    except Exception as e:
        # Deliberate best-effort: log the failure and signal it with Nones.
        print(f"Error extracting article from {url}: {e}")
        extracted = (None, None)
    return extracted

# Save text files: one '<cleaned URL_ID>.txt' per input row in 'Article_Text'.
# exist_ok=True makes the call safe when the directory is already there,
# without a separate (racy) os.path.exists check.
os.makedirs('Article_Text', exist_ok=True)

# Dictionary to store URLs that worked (cleaned URL_ID -> URL)
url_dict = {}

# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    title, article_text = extract_article_text(url)  # Extract article text

    # Replace every non-word character with '_' so the id is a safe file name.
    # str() guards against pandas having parsed URL_ID as a number, which
    # would make re.sub raise TypeError.
    url_id_clean = re.sub(r'[^\w]', '_', str(url_id))

    file_path = f'Article_Text/{url_id_clean}.txt'
    if title and article_text:
        # Successful extraction: title header, blank line, then the body.
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(f"Title: {title}\n\n")
            file.write(article_text)
        print(f"Article text saved for {url_id_clean}")
        url_dict[url_id_clean] = url  # Store the URL
    else:
        # Create (or truncate to) an empty file so the failed row still
        # shows up when the directory is read back later.
        with open(file_path, 'w', encoding='utf-8'):
            pass
        print(f"Unable to extract article text for {url_id_clean}, created empty file.")

print("Extraction completed.")

def load_words(file_paths, encoding='ISO-8859-1'):
    """Collect the distinct entries found in one or more word-list files.

    Args:
        file_paths: Iterable of paths to text files with one entry per line.
        encoding: Text encoding used to open every file.

    Returns:
        A set with one string per non-blank line, with surrounding
        whitespace removed.

    Raises:
        OSError: If any of the files cannot be opened.
    """
    words = set()
    for file_path in file_paths:
        with open(file_path, 'r', encoding=encoding) as file:
            stripped = (line.strip() for line in file.read().splitlines())
            # Entries with stray whitespace could never equal a token, and
            # blank lines would only add a useless '' entry.
            words.update(entry for entry in stripped if entry)
    return words

# Build the two sentiment lexicons (read as latin-1) and the combined
# stop-word set (read with load_words' default encoding).
positive_words = load_words([path_positive], encoding='latin-1')
negative_words = load_words([path_negative], encoding='latin-1')
stop_words = load_words(path_stop)

# Count syllables in a word
def syllable_count(word):
    """Return the number of syllables in *word* according to ``cmu_dict``.

    The lookup is case-insensitive. Only the first listed pronunciation is
    used, and a syllable is counted for every phoneme whose last character
    is a digit. Words missing from the dictionary yield 0.
    """
    pronunciations = cmu_dict.get(word.lower())
    if pronunciations is None:
        return 0
    first_pronunciation = pronunciations[0]
    return sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())

# Fog index
def fog_index(avg_sentence_length, percentage_complex_words):
    """Return 0.4 times the sum of the two readability inputs.

    Args:
        avg_sentence_length: Average number of words per sentence.
        percentage_complex_words: Share of complex words, as a percentage.
    """
    combined = avg_sentence_length + percentage_complex_words
    return 0.4 * combined

# Linguistic features
def compute_features(text):
    """Compute sentiment and readability features for one article text.

    Args:
        text: The article text to analyse.

    Returns:
        A 13-tuple, in this order: positive_score, negative_score,
        polarity_score, subjectivity_score, avg_sentence_length,
        percentage_complex_words, fog_index_value, avg_words_per_sentence,
        complex_word_count, num_words, syllables_per_word,
        personal_pronouns, avg_word_length.

        When the text yields no sentences or no tokens, every count and
        ratio is returned as zero instead of raising ZeroDivisionError.
    """
    # NOTE(review): tokens are used as produced by word_tokenize (stop_words
    # is not applied here) and the lexicon match below is case-sensitive --
    # confirm that this is the intended scoring.
    sentences = sent_tokenize(text)
    words = word_tokenize(text)

    num_sentences = len(sentences)
    num_words = len(words)

    # Polarity / subjectivity come from TextBlob on the raw text
    sentiment = TextBlob(text).sentiment
    polarity_score = sentiment.polarity
    subjectivity_score = sentiment.subjectivity

    if num_sentences == 0 or num_words == 0:
        # Nothing to average over: avoid dividing by zero below.
        return (0, 0, polarity_score, subjectivity_score, 0.0, 0.0, 0.0,
                0.0, 0, num_words, 0.0, 0, 0.0)

    # Look every token up once; both syllable-based metrics reuse the result.
    syllables = [syllable_count(word) for word in words]

    avg_sentence_length = num_words / num_sentences
    complex_word_count = sum(1 for count in syllables if count > 2)
    percentage_complex_words = (complex_word_count / num_words) * 100

    fog_index_value = fog_index(avg_sentence_length, percentage_complex_words)

    avg_words_per_sentence = num_words / num_sentences
    avg_word_length = sum(len(word) for word in words) / num_words

    pronouns = {'i', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves'}
    personal_pronouns = sum(1 for word in words if word.lower() in pronouns)
    syllables_per_word = sum(syllables) / num_words

    # Lexicon hit counts
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)

    return positive_score, negative_score, polarity_score, subjectivity_score, avg_sentence_length, percentage_complex_words, fog_index_value, avg_words_per_sentence, complex_word_count, num_words, syllables_per_word, personal_pronouns, avg_word_length

# Read the extracted article texts, keyed by the file name up to its first '.'
article_texts = {}
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the output identical from run to run.
for filename in sorted(os.listdir('Article_Text')):
    with open(os.path.join('Article_Text', filename), 'r', encoding='utf-8') as file:
        article_texts[filename.split('.')[0]] = file.read()

# Column accumulators: one list per output column, filled in lockstep
url_ids, urls = [], []
positive_scores, negative_scores = [], []
polarity_scores, subjectivity_scores = [], []
avg_sentence_lengths, percentage_complex_words_list = [], []
fog_indexes, avg_words_per_sentence_list = [], []
complex_word_counts, word_counts = [], []
syllables_per_word_list, personal_pronouns_list = [], []
avg_word_lengths = []

# Feature columns in the order compute_features returns its values, paired
# with the number of decimals to keep (None = store the value unrounded).
feature_columns = (
    (positive_scores, None),
    (negative_scores, None),
    (polarity_scores, 3),
    (subjectivity_scores, 3),
    (avg_sentence_lengths, 3),
    (percentage_complex_words_list, 3),
    (fog_indexes, 3),
    (avg_words_per_sentence_list, 3),
    (complex_word_counts, None),
    (word_counts, None),
    (syllables_per_word_list, 5),
    (personal_pronouns_list, None),
    (avg_word_lengths, 5),
)

# Compute features for each article text
for url_id, text in article_texts.items():
    url_ids.append(url_id)
    urls.append(url_dict.get(url_id, "N/A"))

    if "Title:" not in text:
        # No title header in the file: flag the row in the first feature
        # column and leave the remaining ones empty.
        positive_scores.append("error loading site")
        for column, _ in feature_columns[1:]:
            column.append(None)
        continue

    for (column, digits), value in zip(feature_columns, compute_features(text)):
        column.append(value if digits is None else round(value, digits))

# Pair every output header with its accumulated column, in output order
output_columns = [
    ('URL_ID', url_ids),
    ('URL', urls),
    ('POSITIVE SCORE', positive_scores),
    ('NEGATIVE SCORE', negative_scores),
    ('POLARITY SCORE', polarity_scores),
    ('SUBJECTIVITY SCORE', subjectivity_scores),
    ('AVG SENTENCE LENGTH', avg_sentence_lengths),
    ('PERCENTAGE OF COMPLEX WORDS', percentage_complex_words_list),
    ('FOG INDEX', fog_indexes),
    ('AVG NUMBER OF WORDS PER SENTENCE', avg_words_per_sentence_list),
    ('COMPLEX WORD COUNT', complex_word_counts),
    ('WORD COUNT', word_counts),
    ('SYLLABLE PER WORD', syllables_per_word_list),
    ('PERSONAL PRONOUNS', personal_pronouns_list),
    ('AVG WORD LENGTH', avg_word_lengths),
]
output_df = pd.DataFrame(dict(output_columns))

# Write the table to an Excel workbook without the index column
output_df.to_excel('Text_Analysis_Output.xlsx', index=False)




[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Article text saved for blackassign0001
Article text saved for blackassign0002
Article text saved for blackassign0003
Article text saved for blackassign0004
Article text saved for blackassign0005
Article text saved for blackassign0006
Article text saved for blackassign0007
Article text saved for blackassign0008
Article text saved for blackassign0009
Article text saved for blackassign0010
Article text saved for blackassign0011
Article text saved for blackassign0012
Article text saved for blackassign0013
Article text saved for blackassign0014
Article text saved for blackassign0015
Article text saved for blackassign0016
Article text saved for blackassign0017
Article text saved for blackassign0018
Article text saved for blackassign0019
Article text saved for blackassign0020
Article text saved for blackassign0021
Article text saved for blackassign0022
Article text saved for blackassign0023
Article text saved for blackassign0024
Article text saved for blackassign0025
Article text saved for bl