In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re


df = pd.read_csv("new_input.csv")

# Define the folder paths for StopWords and MasterDictionary
stop_words_folder = "StopWords/"
master_dictionary_folder = "MasterDictionary/"

# Define the file names for positive and negative words
positive_words_file = "positive-words.txt"
negative_words_file = "negative-words.txt"

# Create sets to store positive and negative words
positive_words = set()
negative_words = set()


# Load positive words from the file
positive_words_file_path = master_dictionary_folder + positive_words_file
with open(positive_words_file_path, "r") as file:
    positive_words.update(file.read().splitlines())

# Load negative words from the file
negative_words_file_path = master_dictionary_folder + negative_words_file
with open(negative_words_file_path, "r") as file:
    negative_words.update(file.read().splitlines())

# Load stop words from each file in StopWords folder
stop_words_files = [
    "StopWords_Auditor.txt",
    "StopWords_Currencies.txt",
    "StopWords_DatesandNumbers.txt",
    "StopWords_Generic.txt",
    "StopWords_GenericLong.txt",
    "StopWords_Geographic.txt",
    "StopWords_Names.txt"
]

# Define the folder paths for StopWords and MasterDictionary
stop_words_folder = "StopWords/"
master_dictionary_folder = "MasterDictionary/"
# Initialize an empty set to store all stop words
stop_words = set()

# Load stop words from each file
for file_name in stop_words_files:
    file_path = stop_words_folder + file_name
    with open(file_path, "r") as file:
        stop_words.update(file.read().splitlines())

# Function to count syllables in a word
def syllable_count(word):
    word = word.lower()
    count = 0
    vowels = 'aeiou'
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith(('ed' or 'es')):
        count -= 1
    if count == 0:
        count += 1
    return count


def sent_tokenize(text):
    # Split text into sentences based on punctuation marks (. ! ?)
    sentences = re.split(r'[.!?]', text)
    # Remove empty strings and strip leading/trailing whitespaces
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    return sentences


In [10]:
def perform_text_analysis(text):
    # Step 1: Cleaning using Stop Words Lists
    words = word_tokenize(text)
    cleaned_words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

    # Step 2: Extracting Derived variables
    positive_score = sum(1 for word in cleaned_words if word in positive_words)
    negative_score = sum(1 for word in cleaned_words if word in negative_words)
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(cleaned_words) + 0.000001)

    # Step 3: Analysis of Readability
    # Calculate the number of sentences
    sentences = sent_tokenize(text)
    num_sentences = len(sentences)

    # Calculate the number of words
    num_words = len(cleaned_words)

    # Calculate the average sentence length
    if num_sentences > 0:
        average_sentence_length = num_words / num_sentences
    else:
        average_sentence_length = 0

    # Calculate the average number of words per sentence
    average_words_per_sentence = num_words / (num_sentences + 0.000001)

    # Calculate the number of complex words
    num_complex_words = sum(1 for word in cleaned_words if syllable_count(word) > 2)
    # Step 5: Complex Word Count
    complex_word_count = sum(1 for word in cleaned_words if syllable_count(word) > 2)

    # Step 6: Word Count
    word_count = len(cleaned_words)
    
    # Calculate the average syllables per word
    total_syllables = sum(syllable_count(word) for word in cleaned_words)
    syllables_per_word = total_syllables / (num_words + 0.000001)

    # Calculate the percentage of complex words
    if num_words > 0:
        percentage_complex_words = num_complex_words / num_words
    else:
        percentage_complex_words = 0
        
     # Calculate the number of complex words
    num_complex_words = sum(1 for word in cleaned_words if syllable_count(word) > 2)

    # Calculate the Fog Index
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

    # Step 4: Personal Pronouns
    personal_pronouns_count = sum(1 for word in cleaned_words if word.lower() in ['i', 'we', 'my', 'ours', 'us'])

    # Step 5: Average Word Length
    average_word_length = sum(len(word) for word in cleaned_words) / (len(cleaned_words) + 0.000001)

    return {'Positive Score': positive_score,
            'Negative Score': negative_score,
            'Polarity Score': polarity_score,
            'Subjectivity Score': subjectivity_score,
            'Average Sentence Length': average_sentence_length,
            'Percentage of Complex Words': percentage_complex_words,            
            'Fog Index': fog_index,
            'Average Words Per Sentence': average_words_per_sentence,
            'complex_word_count' : complex_word_count,
            'word_count' : word_count,
            'syllables_per_word': syllables_per_word,
            'Personal Pronouns Count': personal_pronouns_count,
            'Average Word Length': average_word_length,
           }
            

In [11]:
# Apply text analysis function to each row in the DataFrame
df['Analysis Results'] = df['text'].apply(perform_text_analysis)

# Split the analysis results into separate columns
df = pd.concat([df.drop(['Analysis Results'], axis=1), df['Analysis Results'].apply(pd.Series)], axis=1)
# Save the updated DataFrame back to a CSV file
df.to_csv("Output Data Structure.csv", index=False)


In [12]:
df

Unnamed: 0,URL_ID,URL,title,text,Positive Score,Negative Score,Polarity Score,Subjectivity Score,Average Sentence Length,Percentage of Complex Words,Fog Index,Average Words Per Sentence,complex_word_count,word_count,syllables_per_word,Personal Pronouns Count,Average Word Length
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,Rising IT cities and its impact on the economy...,\n\nWe have seen a huge development and depend...,33.0,6.0,0.692308,0.070018,7.050633,0.280072,2.932282,7.050633,156.0,557.0,2.107720,0.0,6.556553
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,Rising IT Cities and Their Impact on the Econo...,"\n\nThroughout history, from the industrial re...",58.0,31.0,0.303371,0.114543,9.475610,0.405405,3.952406,9.475610,315.0,777.0,2.467181,0.0,7.344916
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,"Internet Demand's Evolution, Communication Imp...",\n\nIntroduction\n\n\nIn the span of just a fe...,38.0,24.0,0.225806,0.102819,10.578947,0.532338,4.444514,10.578947,321.0,603.0,2.787728,0.0,8.233831
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,Rise of Cybercrime and its Effect in upcoming ...,"\n\nThe way we live, work, and communicate has...",36.0,75.0,-0.351351,0.182867,11.673077,0.504119,4.870878,11.673077,306.0,607.0,2.655684,0.0,8.026359
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,OTT platform and its impact on the entertainme...,\n\nThe year 2040 is poised to witness a conti...,22.0,8.0,0.466667,0.084507,8.875000,0.371831,3.698732,8.875000,132.0,355.0,2.309859,0.0,7.369014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...,Due to the COVID-19 the repercussion of the en...,"\n\nEpidemics, in general, have both direct an...",27.0,57.0,-0.357143,0.151351,10.471698,0.412613,4.353724,10.471698,229.0,555.0,2.372973,0.0,7.181982
94,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...,Impact of COVID-19 pandemic on office space an...,\n\nCOVID 19 has bought the world to its knees...,21.0,35.0,-0.250000,0.133971,10.717949,0.299043,4.406797,10.717948,125.0,418.0,2.052632,0.0,6.672249
95,blackassign0098,https://insights.blackcoffer.com/contribution-...,Contribution of handicrafts (Visual Arts & Lit...,\n\nHandicrafts is an art of making crafts by ...,6.0,3.0,0.333333,0.038298,9.400000,0.357447,3.902979,9.400000,84.0,235.0,2.195745,0.0,6.851064
96,blackassign0099,https://insights.blackcoffer.com/how-covid-19-...,How COVID-19 is impacting payment preferences?...,\n\n\n\nI would rather pay cash – Before COVID...,16.0,3.0,0.684210,0.066901,7.888889,0.250000,3.255556,7.888889,71.0,284.0,1.989437,0.0,6.257042


In [13]:
# Assuming df is your DataFrame
df.rename(columns={'title': 'Title', 'text': 'Text','complex_word_count':'Complex_word_count', 'word_count': 'Word_count', 'syllables_per_word': 'Syllables_per_word' }, inplace=True)


In [14]:
df

Unnamed: 0,URL_ID,URL,Title,Text,Positive Score,Negative Score,Polarity Score,Subjectivity Score,Average Sentence Length,Percentage of Complex Words,Fog Index,Average Words Per Sentence,Complex_word_count,Word_count,Syllables_per_word,Personal Pronouns Count,Average Word Length
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,Rising IT cities and its impact on the economy...,\n\nWe have seen a huge development and depend...,33.0,6.0,0.692308,0.070018,7.050633,0.280072,2.932282,7.050633,156.0,557.0,2.107720,0.0,6.556553
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,Rising IT Cities and Their Impact on the Econo...,"\n\nThroughout history, from the industrial re...",58.0,31.0,0.303371,0.114543,9.475610,0.405405,3.952406,9.475610,315.0,777.0,2.467181,0.0,7.344916
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,"Internet Demand's Evolution, Communication Imp...",\n\nIntroduction\n\n\nIn the span of just a fe...,38.0,24.0,0.225806,0.102819,10.578947,0.532338,4.444514,10.578947,321.0,603.0,2.787728,0.0,8.233831
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,Rise of Cybercrime and its Effect in upcoming ...,"\n\nThe way we live, work, and communicate has...",36.0,75.0,-0.351351,0.182867,11.673077,0.504119,4.870878,11.673077,306.0,607.0,2.655684,0.0,8.026359
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,OTT platform and its impact on the entertainme...,\n\nThe year 2040 is poised to witness a conti...,22.0,8.0,0.466667,0.084507,8.875000,0.371831,3.698732,8.875000,132.0,355.0,2.309859,0.0,7.369014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...,Due to the COVID-19 the repercussion of the en...,"\n\nEpidemics, in general, have both direct an...",27.0,57.0,-0.357143,0.151351,10.471698,0.412613,4.353724,10.471698,229.0,555.0,2.372973,0.0,7.181982
94,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...,Impact of COVID-19 pandemic on office space an...,\n\nCOVID 19 has bought the world to its knees...,21.0,35.0,-0.250000,0.133971,10.717949,0.299043,4.406797,10.717948,125.0,418.0,2.052632,0.0,6.672249
95,blackassign0098,https://insights.blackcoffer.com/contribution-...,Contribution of handicrafts (Visual Arts & Lit...,\n\nHandicrafts is an art of making crafts by ...,6.0,3.0,0.333333,0.038298,9.400000,0.357447,3.902979,9.400000,84.0,235.0,2.195745,0.0,6.851064
96,blackassign0099,https://insights.blackcoffer.com/how-covid-19-...,How COVID-19 is impacting payment preferences?...,\n\n\n\nI would rather pay cash – Before COVID...,16.0,3.0,0.684210,0.066901,7.888889,0.250000,3.255556,7.888889,71.0,284.0,1.989437,0.0,6.257042
