In [23]:
import pandas as pd
import re
import spacy
from nltk.corpus import cmudict
import json
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer

# Shared, load-once resources used by the helper functions and the main loop.
# Both calls need their data installed beforehand (the 'en_core_web_sm' spaCy
# model package and the NLTK 'cmudict' corpus).
# `nlp` is the spaCy pipeline used to tokenise text.
nlp = spacy.load('en_core_web_sm')
# `d` is the CMU Pronouncing Dictionary as a plain dict. no_vowels() looks words
# up in lowercase and treats each value as a list of pronunciations, counting
# the phonemes whose last character is a digit.
d = cmudict.dict()

# Define functions
def dot_removal_text(text):
    """Mask every period that sits directly between two word characters.

    Periods inside tokens such as "3.14", "example.com" or "U.S.A" are not
    sentence terminators, so they are replaced by the placeholder ``DOT``.
    A period followed by whitespace, punctuation or the end of the text is
    left untouched.

    Args:
        text: The string to process.

    Returns:
        A copy of ``text`` with each interior period replaced by ``DOT``.
    """
    # Lookarounds assert the neighbouring word characters without consuming
    # them, so every period in a chain like "a.b.c" is matched. A pattern that
    # captures the surrounding words would skip every second period, because
    # the word to the right of one match is no longer available as the left
    # side of the next.
    return re.sub(r'(?<=\w)\.(?=\w)', 'DOT', text)

def total_sentence(text):
    """Count the period-delimited sentences in ``text``.

    Periods embedded inside a token (decimals, domain names, ...) are first
    masked by ``dot_removal_text`` so they do not split a sentence. The text
    is then processed line by line; each line is split on ``'.'`` and every
    piece that still contains something other than whitespace counts as one
    sentence. Only ``'.'`` is treated as a terminator; ``'?'`` and ``'!'`` do
    not end a sentence here.

    Args:
        text: The raw article text.

    Returns:
        The number of non-blank, period-delimited segments (0 for empty text).
    """
    masked_text = dot_removal_text(text)
    # No spaCy pass is needed for counting: tokenising and re-joining with
    # spaces only changes whitespace, never the periods or whether a segment
    # is blank, so the count is the same and a full pipeline run per line is
    # avoided. Only the count is returned, so the masked periods are not
    # restored either.
    return sum(
        1
        for line in masked_text.split('\n')
        for segment in line.split('.')
        if segment.strip()
    )

def remove_stopwords(text, stop_words=None):
    """Drop stopwords from ``text`` and return the remaining tokens.

    The text is split on whitespace. A token is dropped when its lowercase
    form is a stopword, either as written or after punctuation attached to
    its edges has been stripped (so "The," and "(and)" are recognised).
    Tokens that are kept are returned unchanged, punctuation included.

    Args:
        text: The string to filter.
        stop_words: Optional iterable of lowercase stopwords. Defaults to the
            NLTK English stopword list.

    Returns:
        The kept tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)

    kept_words = []
    for word in text.split():
        lowered = word.lower()
        # Punctuation is still attached to tokens at this point; without this
        # step "the," would not match "the" and would survive the filter.
        bare = re.sub(r'^[^\w\s]+|[^\w\s]+$', '', lowered)
        if lowered in stop_words or bare in stop_words:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

def remove_punctuation(text):
    """Delete every character that is neither a word character nor whitespace.

    Letters, digits, underscores and all whitespace are kept as they are.
    """
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', text)

def counts(text):
    """Return the total number of word tokens in ``text``.

    The text is tokenised with a default Keras ``Tokenizer`` and every
    occurrence of every word is counted, so a repeated word contributes once
    per occurrence.

    Args:
        text: The string whose words are counted.

    Returns:
        The number of word occurrences found by the tokenizer.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    # `word_index` has one entry per distinct word, so its length is the
    # vocabulary size. `word_counts` maps each word to its number of
    # occurrences; summing those gives the actual word count.
    return sum(tokenizer.word_counts.values())

def calculate_average_word_character_length(total_sentences, total_words):
    """Return ``total_words / total_sentences``, or 0 when there are no sentences.

    Despite the name, the value computed is words per sentence.
    """
    return 0 if total_sentences == 0 else total_words / total_sentences

def calculate_average_word_length(text):
    """Return the mean number of characters per whitespace-separated token.

    Returns 0 when ``text`` contains no tokens.
    """
    tokens = text.split()
    if not tokens:
        return 0
    return sum(map(len, tokens)) / len(tokens)

def no_vowels(word, pronunciations=None):
    """Estimate the number of syllables in ``word``.

    The word is lowercased and looked up in a pronunciation dictionary. When
    it is found, the result is the largest syllable count over its listed
    pronunciations, where a syllable is a phoneme whose last character is a
    digit (in the CMU dictionary the vowel phonemes carry such a stress
    digit). These counts come from the spoken form, so silent "-es"/"-ed"
    endings are already excluded and no spelling correction is applied.

    When the word is not in the dictionary, the count falls back to the
    number of vowel-letter groups, minus one for a trailing "es"/"ed" if more
    than one group was found, and never less than 1.

    Args:
        word: The word to analyse.
        pronunciations: Optional mapping of lowercase word to a list of
            pronunciations (each a list of phoneme strings). Defaults to the
            module-level CMU dictionary ``d``.

    Returns:
        The estimated syllable count as an int.
    """
    if pronunciations is None:
        pronunciations = d
    word = word.lower()

    if word in pronunciations:
        return max(
            sum(1 for phoneme in pronunciation if phoneme[-1].isdigit())
            for pronunciation in pronunciations[word]
        )

    # Spelling-based fallback for out-of-dictionary words: one syllable per
    # run of vowel letters, discounting a trailing "es"/"ed".
    vowel_groups = len(re.findall(r'[aeiouy]+', word))
    if vowel_groups > 1 and word.endswith(('es', 'ed')):
        vowel_groups -= 1
    return max(vowel_groups, 1)

def syllable_count_per_word(words):
    """Map each word in ``words`` to its ``no_vowels`` syllable estimate.

    A word that appears several times ends up as a single key in the result.
    """
    return {word: no_vowels(word) for word in words}

def complex_word_count(syllable_counts, min_syllable=2):
    """Count the entries whose syllable count exceeds ``min_syllable``.

    Args:
        syllable_counts: Mapping of word to syllable count. Values that are
            not ``int`` instances are ignored.
        min_syllable: Exclusive threshold; an entry is counted only when its
            syllable count is strictly greater than this value. The default
            of 2 counts words with three or more syllables.

    Returns:
        The number of qualifying entries.
    """
    return sum(
        1
        for syllable_count in syllable_counts.values()
        # The isinstance test runs first so non-integer values are never compared.
        if isinstance(syllable_count, int) and syllable_count > min_syllable
    )

def count_personal_pronouns(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us.

    Matching is case-insensitive and restricted to whole words. The exact
    all-uppercase token "US" is excluded so that the country abbreviation is
    not counted as the pronoun "us"; other casings such as a
    sentence-initial "Us" are counted.

    Args:
        text: The string to search.

    Returns:
        The number of pronoun occurrences.
    """
    pronouns_pattern = r'\b(I|we|my|ours|us)\b'
    matches = re.findall(pronouns_pattern, text, re.IGNORECASE)
    # Only the exact token "US" is ambiguous with the country name.
    return sum(1 for match in matches if match != 'US')

In [24]:
# Load the two input tables: the scraped articles and the output template
# whose metric columns are filled in by the loop below.
# NOTE(review): both paths are absolute and machine-specific; consider making
# them relative to a configurable data directory.
SCRAPED_ARTICLES_CSV = '/Users/mnu/Desktop/NLP_task/Web_scraping/web_scraped_data.csv'
OUTPUT_TEMPLATE_CSV = '/Users/mnu/Desktop/NLP_task/Sentimental_Analysis/Output_Data_Structure.csv'

dataset = pd.read_csv(SCRAPED_ARTICLES_CSV)
output_dataset = pd.read_csv(OUTPUT_TEMPLATE_CSV)

In [None]:
# Compute the readability metrics for every article in `dataset` and write
# them into the row(s) of `output_dataset` that carry the same URL_ID.

# The per-word syllable table is stored as a JSON string. A column that was
# read from an empty CSV column has a float dtype, and writing a string into
# it is deprecated in recent pandas, so make the column object-typed first.
if 'SYLLABLE PER WORD' in output_dataset.columns:
    output_dataset['SYLLABLE PER WORD'] = output_dataset['SYLLABLE PER WORD'].astype(object)

for url_id, article_text in dataset[['URL_ID', 'article_text']].itertuples(index=False, name=None):
    # An empty CSV cell is read back as NaN (a float), which the text helpers
    # cannot process; treat it as an empty article so every metric becomes 0.
    if not isinstance(article_text, str):
        article_text = ''

    total_sen = total_sentence(article_text)
    stopwords_removed = remove_stopwords(article_text)
    article_no_punctuation = remove_punctuation(stopwords_removed)

    # All word-level metrics are derived from this one token list so that
    # every ratio below shares the same tokenisation and the same denominator.
    words = article_no_punctuation.split()
    total_words = len(words)
    avg_word_length = calculate_average_word_length(article_no_punctuation)

    # `syllable_words` has one key per distinct word (it is what gets
    # exported), while the complex-word count is taken over every occurrence
    # so that it is comparable with `total_words`.
    syllable_words = syllable_count_per_word(words)
    complex_words = sum(1 for word in words if syllable_words[word] > 2)
    pronouns_count = count_personal_pronouns(article_text)

    Average_Sentence_Length = total_words / total_sen if total_sen > 0 else 0
    # Stored as a fraction between 0 and 1, not multiplied by 100.
    # NOTE(review): the Fog Index below adds this fraction as-is; confirm
    # whether a 0-100 percentage is wanted instead.
    Percentage_of_Complex_words = complex_words / total_words if total_words > 0 else 0
    Fog_Index = 0.4 * (Average_Sentence_Length + Percentage_of_Complex_words)
    syllable_words_str = json.dumps(syllable_words)

    # Build the row selector once instead of once per column.
    row_mask = output_dataset['URL_ID'] == url_id
    metrics = {
        'AVG SENTENCE LENGTH': Average_Sentence_Length,
        'PERCENTAGE OF COMPLEX WORDS': Percentage_of_Complex_words,
        'FOG INDEX': Fog_Index,
        'AVG NUMBER OF WORDS PER SENTENCE': Average_Sentence_Length,
        'COMPLEX WORD COUNT': complex_words,
        'WORD COUNT': total_words,
        'SYLLABLE PER WORD': syllable_words_str,
        'PERSONAL PRONOUNS': pronouns_count,
        'AVG WORD LENGTH': avg_word_length,
    }
    for column, value in metrics.items():
        output_dataset.loc[row_mask, column] = value

In [28]:
# Write the completed metrics table to disk without the DataFrame index.
FINAL_OUTPUT_CSV = '/Users/mnu/Desktop/NLP_task/Task_2_to_8/final_output.csv'
output_dataset.to_csv(FINAL_OUTPUT_CSV, index=False)