In [11]:
import pandas as pd
import os
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from textstat import flesch_reading_ease, syllable_count,  lexicon_count

In [12]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\namit\AppData\Roaming\nltk_data...


True

In [13]:
# Directory where your text files are stored
directory = (r'C:\Users\namit\OneDrive\Desktop\blackCoffer\new attempt\extracted_articles')

# Initialize a list to store the data for each text file
data = []

sia = SentimentIntensityAnalyzer()

# Iterate over each file in the directory
for filename in os.listdir(directory):
    if filename.endswith('.txt'):  # Check if the file is a text file
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            text_content = file.read()
            
            sentiment_scores = sia.polarity_scores(text_content)
            positive_score = sentiment_scores['pos']
            negative_score = sentiment_scores['neg']
            polarity_score = sentiment_scores['compound']

            reading_ease = flesch_reading_ease(text_content)

            word_count = lexicon_count(text_content, removepunct=True)
            sentence_count = text_content.count('.') + text_content.count('!') + text_content.count('?')
            avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
            # complex_word_percentage = complex_word_count(text_content) / word_count * 100 if word_count > 0 else 0
            # fog_index = 0.4 * (avg_sentence_length + complex_word_percentage)
            avg_word_length = sum(len(word) for word in text_content.split()) / word_count if word_count > 0 else 0
            syllable_per_word = syllable_count(text_content) / word_count if word_count > 0 else 0
            personal_pronouns = sum(1 for word in nltk.word_tokenize(text_content) if word.lower() in ['i', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves'])
            subjectivity_score = sentiment_scores['neu'] + sentiment_scores['pos'] + sentiment_scores['neg']
            
            # For now, let's print the filename and content to verify
            data.append({
                'URL_ID': filename[:-4],  # Remove the ".txt" extension
                'URL': '',  # Replace this with the URL if available
                'POSITIVE SCORE': positive_score,
                'NEGATIVE SCORE': negative_score,
                'POLARITY SCORE': polarity_score,
                'SUBJECTIVITY SCORE': subjectivity_score,
                'AVG SENTENCE LENGTH': avg_sentence_length,               
                'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
                'WORD COUNT': word_count,
                'SYLLABLE PER WORD': syllable_per_word,
                'PERSONAL PRONOUNS': personal_pronouns,
                'AVG WORD LENGTH': avg_word_length
            })
# Once you're done with the analysis, you can proceed to the next steps


In [18]:
df = pd.DataFrame(data)

# Define the order of columns as per your specified structure
columns_order = ['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
                 'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 
                 'AVG NUMBER OF WORDS PER SENTENCE',  'WORD COUNT',
                 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']

# Reorder columns
df = df[columns_order]

In [17]:
df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,AVG NUMBER OF WORDS PER SENTENCE,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,,0.137,0.020,0.9991,1.000,15.683544,15.683544,1239,1.476998,17,4.713479
1,blackassign0002,,0.149,0.036,0.9994,1.000,18.234568,18.234568,1477,1.748815,10,5.633040
2,blackassign0003,,0.118,0.051,0.9971,1.000,19.196429,19.196429,1075,2.017674,20,6.291163
3,blackassign0004,,0.102,0.145,-0.9950,1.000,20.843137,20.843137,1063,1.934149,7,6.121355
4,blackassign0005,,0.107,0.005,0.9955,1.001,17.300000,17.300000,692,1.723988,6,5.677746
...,...,...,...,...,...,...,...,...,...,...,...,...
93,blackassign0096,,0.074,0.150,-0.9986,1.000,21.673077,21.673077,1127,1.673469,3,5.368234
94,blackassign0097,,0.097,0.096,-0.3939,1.000,27.871795,27.871795,1087,1.500460,13,4.840846
95,blackassign0098,,0.098,0.019,0.9800,1.000,16.291667,16.291667,391,1.767263,1,5.810742
96,blackassign0099,,0.138,0.033,0.9952,1.000,17.111111,17.111111,616,1.480519,5,4.883117
