In [12]:
!pip install beautifulsoup4 openpyxl pandas nltk



    torch (>=1.7.*)
           ~~~~~~^


In [51]:
import nltk  # imported in this cell so it also runs on a fresh kernel, before the main import cell

# Download the 'punkt_tab' tokenizer package into the local NLTK data directory.
# NOTE(review): presumably this is the resource word_tokenize/sent_tokenize load
# on the installed NLTK version — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\meera/nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [54]:
import nltk
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [55]:
import nltk

# Add the project-local NLTK data directory to the search path.
# Guarded so that re-running this cell does not append the same entry repeatedly.
_local_nltk_dir = './nltk_data'  # Ensure the correct relative path
if _local_nltk_dir not in nltk.data.path:
    nltk.data.path.append(_local_nltk_dir)

# Sanity check: report whether the tokenizer data and the stopword corpus can be found.
try:
    try:
        # The notebook downloads 'punkt_tab', so look for that resource first ...
        nltk.data.find('tokenizers/punkt_tab/english/')
    except LookupError:
        # ... and fall back to the legacy pickled Punkt model.
        nltk.data.find('tokenizers/punkt/english.pickle')
    nltk.data.find('corpora/stopwords/english')
    print("Punkt and stopwords are accessible!")
except LookupError as e:
    # nltk.data.find raises LookupError when a resource is missing; show its message.
    print(f"Error: {e}")

Punkt and stopwords are accessible!


In [56]:
def _load_word_set(path):
    """Read a whitespace-separated word list from ``path`` and return it as a set.

    Entries are lowercased because clean_text lowercases every token before the
    tokens are looked up in these sets.
    """
    # NOTE(review): no encoding is passed, so the platform default is used —
    # confirm it matches the encoding of the dictionary files.
    with open(path) as handle:  # context manager closes the file handle deterministically
        return set(handle.read().lower().split())


stop_words = set(stopwords.words('english'))  # Stopwords from nltk_data
positive_words = _load_word_set('./MasterDictionary/positive-words.txt')  # Positive words dictionary
negative_words = _load_word_set('./MasterDictionary/negative-words.txt')  # Negative words dictionary


In [57]:

def clean_text(text):
    """Tokenize ``text`` into lowercase words with stop words removed.

    Every non-word character (anything matched by the regex ``\\W``) is replaced
    by a space before tokenizing, so punctuation never reaches the tokenizer.

    Args:
        text: Raw input string.

    Returns:
        list of lowercase tokens that are not in the module-level ``stop_words`` set.
    """
    without_punctuation = re.sub(r'\W', ' ', text)
    lowered = without_punctuation.lower()
    return [token for token in word_tokenize(lowered) if token not in stop_words]


In [58]:
def count_syllables(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Each run of consecutive vowels (``a e i o u y``) counts as one syllable, one
    is subtracted when the word ends in "e", and any non-empty word is reported
    as having at least one syllable.

    Args:
        word: The word to inspect; case is ignored.

    Returns:
        int: the estimated syllable count, or 0 for an empty (or None) word.
    """
    # Guard: the heuristic inspects the first character, which an empty word lacks.
    if not word:
        return 0

    vowels = "aeiouy"
    word = word.lower()

    # Count the start of every vowel run: a vowel at position 0, or a vowel
    # whose preceding character is not a vowel.
    count = sum(
        1
        for index, char in enumerate(word)
        if char in vowels and (index == 0 or word[index - 1] not in vowels)
    )

    # Discount a trailing "e".
    if word.endswith("e"):
        count -= 1

    # A non-empty word never reports fewer than one syllable.
    return max(count, 1)

In [59]:
def calculate_scores(text):
    """Compute sentiment and readability metrics for one article.

    Word-based metrics are computed on the tokens returned by ``clean_text``
    (lowercased, stop words removed); sentences come from ``sent_tokenize`` on
    the raw text.

    Args:
        text: Raw article text.

    Returns:
        dict mapping metric names (for example 'POSITIVE SCORE', 'FOG INDEX',
        'WORD COUNT') to their numeric values. Ratios whose denominator would be
        zero are reported as 0.
    """
    tokens = clean_text(text)
    sentences = sent_tokenize(text)
    total_words = len(tokens)
    total_sentences = len(sentences)

    # Count positive and negative words
    pos_score = sum(1 for word in tokens if word in positive_words)
    neg_score = sum(1 for word in tokens if word in negative_words)

    # Derived metrics; the small constant keeps the denominators non-zero.
    polarity_score = (pos_score - neg_score) / ((pos_score + neg_score) + 0.000001)
    subjectivity_score = (pos_score + neg_score) / (total_words + 0.000001)

    # Word and sentence metrics: both keys report the same words-per-sentence ratio.
    avg_sentence_length = total_words / total_sentences if total_sentences > 0 else 0
    avg_words_per_sentence = avg_sentence_length

    # Syllable counts are computed once and reused for both the complex-word
    # count and the syllables-per-word average.
    syllable_counts = [count_syllables(word) for word in tokens]

    # Complex words count (words with more than 2 syllables)
    complex_word_count = sum(1 for syllables in syllable_counts if syllables > 2)
    # Stored as a fraction of total_words (not multiplied by 100).
    perc_complex_words = complex_word_count / total_words if total_words > 0 else 0
    fog_index = 0.4 * (avg_sentence_length + perc_complex_words)

    # Word and syllable counts
    avg_word_length = sum(len(word) for word in tokens) / total_words if total_words > 0 else 0
    syllables_per_word = sum(syllable_counts) / total_words if total_words > 0 else 0

    # Personal pronouns: matched case-insensitively on the raw text, but the
    # all-caps form "US" is skipped so the country abbreviation is not counted
    # as the pronoun "us".
    pronoun_matches = re.findall(r'\b(I|we|my|ours|us)\b', text, re.I)
    personal_pronouns = sum(1 for match in pronoun_matches if match != 'US')

    return {
        'POSITIVE SCORE': pos_score,
        'NEGATIVE SCORE': neg_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': perc_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': total_words,
        'SYLLABLE PER WORD': syllables_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length,
    }


In [60]:
def process_articles(input_file, output_file):
    """Score every article in an Excel workbook and write the metrics to another.

    Each row of ``input_file`` is treated as one article: all non-null cells of
    the row are converted to strings and joined with spaces, then passed to
    ``calculate_scores``. Rows whose joined text is blank are skipped.

    Args:
        input_file: Path of the Excel file to read.
        output_file: Path of the Excel file to write; one row per scored article,
            with an 'ARTICLE_ID' column (row index + 1) followed by the metrics.
    """
    articles = pd.read_excel(input_file)
    column_names = articles.columns

    scored_rows = []
    for row_index, record in articles.iterrows():
        # Join every populated cell of the row into a single article string.
        populated_cells = [
            str(record[name]) for name in column_names if pd.notna(record[name])
        ]
        full_text = ' '.join(populated_cells)

        # Nothing to score when the row holds only whitespace.
        if not full_text.strip():
            continue

        scored_rows.append({'ARTICLE_ID': row_index + 1, **calculate_scores(full_text)})

    # Build the frame once from the collected records and save it without the index column.
    pd.DataFrame(scored_rows).to_excel(output_file, index=False)

In [61]:
# Workbook holding the scraped article content, and the workbook the scores are written to.
input_file_path = 'output_with_paragraphs.xlsx'
output_file_path = 'Processed_Output.xlsx'

# Score every article in the input workbook and save the results.
process_articles(input_file=input_file_path, output_file=output_file_path)

In [21]:
# Diagnostic: show the directories NLTK searches when resolving data resources.
print(nltk.data.path)

['C:\\Users\\meera/nltk_data', 'c:\\Users\\meera\\AppData\\Local\\Programs\\Python\\Python311\\nltk_data', 'c:\\Users\\meera\\AppData\\Local\\Programs\\Python\\Python311\\share\\nltk_data', 'c:\\Users\\meera\\AppData\\Local\\Programs\\Python\\Python311\\lib\\nltk_data', 'C:\\Users\\meera\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']
