In [7]:
# Importing Libraries
import pandas as pd
import os
import chardet
import re
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

In [8]:
# Initializing directories
# Both the stop-word folder and the master dictionaries live under one
# shared root, so the common prefix is spelled out only once.
_sentiment_data_root = '/Users/mnu/Desktop/NLP_task/Provided_data/Sentimental_analysis_data'
_master_dictionary_dir = _sentiment_data_root + '/MasterDictionary'

# Directory whose files are read by load_stop_words().
mydir = _sentiment_data_root + '/StopWords'
# Word-list files read by load_word_list().
negativedictionaydir = _master_dictionary_dir + '/negative-words.txt'
positivedictionaydir = _master_dictionary_dir + '/positive-words.txt'

In [9]:
# Initializing datasets
# Input table: the analysis below reads its 'article_text' and 'URL_ID' columns.
dataset = pd.read_csv('/Users/mnu/Desktop/NLP_task/Web_scraping/web_scraped_data.csv')
# Output table loaded from the provided Excel file; the analysis below
# matches its rows on 'URL_ID' and writes the score columns into it.
output_dataset = pd.read_excel('/Users/mnu/Desktop/NLP_task/Provided_data/Output_Data_Structure.xlsx')

In [10]:
# Load stop words from multiple files in a directory
def load_stop_words(mydir):
    """Build a lowercase stop-word set from the files directly inside ``mydir``.

    Each file is read once as bytes, its encoding is guessed with
    ``chardet`` and the bytes are decoded with that guess. Every line is
    split on ``'|'``; each piece has its spaces removed and is lowercased
    before it is added to the result.

    Args:
        mydir: Path of the directory that holds the stop-word files.

    Returns:
        set[str]: The distinct, non-empty cleaned stop words.

    Raises:
        OSError: If ``mydir`` cannot be listed or one of its files cannot
            be read.
    """
    stopwords = set()

    for entry in os.listdir(mydir):
        file_dir = os.path.join(mydir, entry)

        # Sub-directories and hidden entries are not stop-word lists;
        # trying to parse them would either fail or add garbage tokens.
        if entry.startswith('.') or not os.path.isfile(file_dir):
            continue

        with open(file_dir, 'rb') as file:
            raw_data = file.read()

        # chardet reports None when it cannot guess (e.g. an empty file).
        encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
        try:
            # Decode with the detected encoding instead of re-opening the
            # file with the platform default, which ignored the detection.
            content = raw_data.decode(encoding, errors='replace')
        except LookupError:
            # The guessed codec name is unknown to this Python build.
            content = raw_data.decode('utf-8', errors='replace')

        for item in content.splitlines():
            for part in item.split('|'):
                part = part.replace(' ', '').lower()
                # Blank lines and empty pieces carry no stop word.
                if part:
                    stopwords.add(part)
    return stopwords

# Load a list of words from a file
def load_word_list(file_path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Args:
        file_path: Path of the file to read.

    Returns:
        list[str]: One entry per line, without line terminators.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        content = handle.read()
    return content.splitlines()

# Remove stop words from a given text
def remove_stopwords(plaintext):
    """Drop every whitespace-separated word whose lowercase form is a stop word.

    The stop words are taken from the module-level ``stopwords`` collection.

    Args:
        plaintext: Text to filter.

    Returns:
        str: The remaining words, in their original order and casing,
        joined by single spaces.
    """
    kept_words = []
    for token in plaintext.split():
        if token.lower() in stopwords:
            continue
        kept_words.append(token)
    return ' '.join(kept_words)

# Perform sentiment analysis based on positive and negative word lists
def positive_negative_score(text, positive_words, negative_words):
    """Count the tokens of ``text`` found in the positive / negative word lists.

    Matching is case-insensitive: both the tokens and the dictionary
    entries are lowercased before comparison, so a capitalised token such
    as "Good" matches a lowercase dictionary entry. A token present in both
    lists is counted as positive only.

    Args:
        text: Text to score; it is tokenized with ``word_tokenize``.
        positive_words: Iterable of positive words.
        negative_words: Iterable of negative words.

    Returns:
        tuple[float, float]: ``(positive_score, negative_score)``.
    """
    # Sets give constant-time membership tests; the word lists would
    # otherwise be scanned linearly once per token.
    positive_lookup = {word.strip().lower() for word in positive_words}
    negative_lookup = {word.strip().lower() for word in negative_words}

    positive_score, negative_score = 0.0, 0.0

    for token in word_tokenize(text):
        key = token.lower()
        if key in positive_lookup:
            positive_score += 1
        elif key in negative_lookup:
            negative_score += 1

    return positive_score, negative_score

# Count words after stopword removal
def counts(text):
    """Return the total number of word tokens in ``text``.

    The text is tokenized by a fresh Keras ``Tokenizer`` with its default
    settings.

    Args:
        text: Text whose words are counted.

    Returns:
        int: Number of word occurrences, repeated words included.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    # word_index holds one entry per *distinct* word, so its length
    # under-counts texts with repeated words; word_counts maps every word
    # to its number of occurrences, and summing it gives the word total.
    return sum(tokenizer.word_counts.values())

In [11]:
# Load the lookup data used by the analysis: the stop-word set built from
# the files in `mydir`, and the positive / negative word lists read from the
# two dictionary files.
# `stopwords` must keep this name: remove_stopwords() reads it as a global.
stopwords = load_stop_words(mydir)
positive_words = load_word_list(positivedictionaydir)
negative_words = load_word_list(negativedictionaydir)

In [12]:
# Sentiment analysis: score every article in `dataset` and write the results
# into the rows of `output_dataset` that share the article's 'URL_ID'.
score_columns = ['POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE']

for index, row in dataset.iterrows():
    # Extract the article text for the current row
    text = row['article_text']

    # A missing article is read by pandas as a non-string value (NaN);
    # it cannot be tokenized, so its output row is left untouched.
    if not isinstance(text, str):
        continue

    # Remove stopwords from the text
    stopwords_removed = remove_stopwords(text)

    # Word count of the cleaned text
    words_count = counts(stopwords_removed)

    # Score the cleaned text once and unpack both counts from that result
    positive_score, negative_score = positive_negative_score(
        stopwords_removed, positive_words, negative_words
    )
    positive_score = float(positive_score)
    negative_score = float(negative_score)

    # The small constant keeps both divisions defined when a denominator
    # would otherwise be zero.
    Polarity_Score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    Subjectivity_Score = (positive_score + negative_score) / (words_count + 0.000001)

    # Get the corresponding 'URL_ID'
    url_id = row['URL_ID']

    # Build the row selector once and reuse it for all four columns
    row_mask = output_dataset['URL_ID'] == url_id
    row_scores = (positive_score, negative_score, Polarity_Score, Subjectivity_Score)
    for column, value in zip(score_columns, row_scores):
        output_dataset.loc[row_mask, column] = value

# Convert columns to float after all updates
for column in score_columns:
    output_dataset[column] = output_dataset[column].astype(float)

In [16]:
# Write the scored output table to CSV; index=False leaves out the
# DataFrame index column.
output_dataset.to_csv('/Users/mnu/Desktop/NLP_task/Sentimental_Analysis/Output_Data_Structure.csv', index=False)