In [None]:
import numpy as np
import pandas as pd
import re
import requests
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Read the output template: one row per article (URL_ID, URL) whose metric
# columns are still empty at this point.
data = pd.read_csv(filepath_or_buffer="/content/Output Data Structure.xlsx - Sheet1.csv")
data

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,Netclan20241017,https://insights.blackcoffer.com/ai-and-ml-bas...,,,,,,,,,,,,,
1,Netclan20241018,https://insights.blackcoffer.com/enhancing-fro...,,,,,,,,,,,,,
2,Netclan20241019,https://insights.blackcoffer.com/roas-dashboar...,,,,,,,,,,,,,
3,Netclan20241020,https://insights.blackcoffer.com/efficient-pro...,,,,,,,,,,,,,
4,Netclan20241021,https://insights.blackcoffer.com/development-o...,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,Netclan20241159,https://insights.blackcoffer.com/population-an...,,,,,,,,,,,,,
143,Netclan20241160,https://insights.blackcoffer.com/google-lsa-ap...,,,,,,,,,,,,,
144,Netclan20241161,https://insights.blackcoffer.com/healthcare-da...,,,,,,,,,,,,,
145,Netclan20241162,https://insights.blackcoffer.com/budget-sales-...,,,,,,,,,,,,,


In [None]:
# Build the stop-word list from the bundled word-list files.
#
# The files are read line by line instead of through pandas.read_csv because
# the CSV route loses entries in three ways:
#   * on_bad_lines="skip" silently drops every line that has more "|" fields
#     than the first line of the file;
#   * default NA parsing turns tokens such as "NA" or "null" into NaN;
#   * the field before "|" keeps its trailing whitespace, so it can never
#     equal a token taken from the article text.
stop_words = []
addresses = ["/content/StopWords_Auditor.txt",
             "/content/StopWords_Currencies.txt",
             "/content/StopWords_DatesandNumbers.txt",
             "/content/StopWords_Generic.txt",
             "/content/StopWords_GenericLong.txt",
             "/content/StopWords_Geographic.txt",
             "/content/StopWords_Names.txt"]
for address in tqdm(addresses):
    with open(address, encoding="latin1") as stop_file:
        for line in stop_file:
            # Keep only the text before the first "|" (anything after it is
            # treated as an annotation), normalised for case-insensitive lookup.
            word = line.split("|", 1)[0].strip().lower()
            if word:  # skip blank lines
                stop_words.append(word)

100%|██████████| 7/7 [00:00<00:00, 136.69it/s]


In [None]:
# Positive sentiment lexicon, flattened from a single-column frame to a 1-D array.
positive_keywords = (
    pd.read_csv("/content/positive-words.txt", header=None)
    .to_numpy()
    .squeeze()
)



In [None]:
# Negative sentiment lexicon, flattened from a single-column frame to a 1-D array.
negative_keywords = (
    pd.read_csv(
        "/content/negative-words.txt",
        header=None,
        encoding="latin1",
        on_bad_lines="skip",
        sep="|",
    )
    .to_numpy()
    .squeeze()
)

In [None]:
def syllable_count(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Each maximal run of vowels (``a e i o u y``) counts as one syllable, a
    trailing ``e`` is treated as silent, and any word longer than one
    character is credited with at least one syllable.

    One-character input is handled separately: a lone vowel such as "a" or
    "I" is a one-syllable word, while any other single character (stray
    punctuation, a digit, an initial) and the empty string count as 0.

    Parameters
    ----------
    word : str
        The token to analyse; matching is case-insensitive.

    Returns
    -------
    int
        The estimated syllable count.
    """
    vowels = "aeiouy"

    if len(word) <= 1:
        # The explicit truthiness check is needed because `"" in vowels` is True.
        return 1 if (word and word.lower() in vowels) else 0

    word = word.lower()
    # A vowel starts a new syllable when it is the first character or follows
    # a non-vowel.
    count = sum(
        1
        for index, char in enumerate(word)
        if char in vowels and (index == 0 or word[index - 1] not in vowels)
    )
    if word.endswith("e"):
        count -= 1  # silent trailing "e"
    # Multi-character tokens never score below one syllable.
    return max(count, 1)

In [None]:
# Case-insensitive pronoun matcher; "us" alone stays case-sensitive,
# presumably so that the abbreviation "US" is not counted as a pronoun.
_PRONOUN_PATTERN = re.compile(r'\b(I|we|my|ours|(?-i:us))\b', re.I)

# Leading/trailing non-word characters (quotes, commas, brackets, ...) that
# cling to a token after splitting on whitespace.
_EDGE_PUNCTUATION = re.compile(r"^\W+|\W+$")

# Small constant that keeps the polarity/subjectivity denominators non-zero.
_EPSILON = 0.000001


def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _lexicon_set(entries):
    """Return the string entries of a list or array as a set of stripped, lower-cased words."""
    if isinstance(entries, np.ndarray):
        entries = entries.ravel().tolist()
    # Non-string entries (e.g. NaN produced while parsing) can never match a token.
    return {entry.strip().lower() for entry in entries if isinstance(entry, str)}


def main_function(url, timeout=30):
    """Scrape the ``<p>`` text of ``url`` and compute sentiment and readability metrics.

    Processing steps:

    1. Download the page and collect the text of every ``<p>`` element.
    2. Tokenise: words are whitespace-separated chunks with surrounding
       punctuation removed (empty chunks are dropped); sentences are the
       non-blank pieces obtained by splitting each paragraph on ".".
    3. Count positive/negative lexicon hits (case-insensitive), syllables,
       complex words (more than two syllables) and personal pronouns.

    Relies on the notebook-level ``stop_words``, ``positive_keywords``,
    ``negative_keywords`` and ``syllable_count``.

    Parameters
    ----------
    url : str
        Address of the article to analyse.
    timeout : float, optional
        Seconds passed to ``requests.get`` so that an unresponsive server
        cannot block the run indefinitely.

    Returns
    -------
    dict
        Keys match the metric columns of the output sheet. Note that
        "PERCENTAGE OF COMPLEX WORDS" is stored as a fraction
        (complex words / words), and "WORD COUNT" is the number of words
        that are not stop words. All averages are 0.0 for a page without text.

    Raises
    ------
    requests.RequestException
        If the page cannot be downloaded (including a timeout).
    """
    response = requests.get(url, timeout=timeout)

    # PREPARING PARAGRAPHS
    soup = BeautifulSoup(response.content, 'html5lib')
    paragraphs = [tag.get_text().replace("\n", " ") for tag in soup.find_all("p")]

    # PREPARING WORDS AND SENTENCES
    words = []
    sentences = []
    for paragraph in paragraphs:
        for chunk in paragraph.split():  # any whitespace run separates words
            token = _EDGE_PUNCTUATION.sub("", chunk)
            if token:  # drop chunks that were pure punctuation
                words.append(token)
        # A paragraph ending in "." would otherwise contribute an empty
        # trailing "sentence" and deflate the average sentence length.
        sentences.extend(piece for piece in paragraph.split(".") if piece.strip())

    total_words = len(words)
    total_sentences = len(sentences)

    # Sets give constant-time membership tests; every lookup is lower-cased
    # so the sentiment matching agrees with the stop-word matching.
    stop_set = _lexicon_set(stop_words)
    positive_set = _lexicon_set(positive_keywords)
    negative_set = _lexicon_set(negative_keywords)
    lowered_words = [word.lower() for word in words]

    word_count = sum(1 for word in lowered_words if word not in stop_set)
    positive_score = sum(1 for word in lowered_words if word in positive_set)
    negative_score = sum(1 for word in lowered_words if word in negative_set)

    syllables_per_word = [syllable_count(word) for word in words]
    complex_word_count = sum(1 for count in syllables_per_word if count > 2)

    # Sentiment analysis
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + _EPSILON)
    subjectivity_score = (positive_score + negative_score) / (word_count + _EPSILON)

    # Readability analysis
    avg_sentence_len = _ratio(total_words, total_sentences)
    complex_words_percent = _ratio(complex_word_count, total_words)
    fog_index = 0.4 * (avg_sentence_len + complex_words_percent)

    # Average number of words per sentence (same definition as the average
    # sentence length, reported under its own column).
    avg_words_per_sentence = avg_sentence_len

    # Per-word averages
    syll_count = _ratio(sum(syllables_per_word), total_words)
    average_word_len = _ratio(sum(len(word) for word in words), total_words)

    # Personal pronouns
    pronoun_count = sum(len(_PRONOUN_PATTERN.findall(sentence)) for sentence in sentences)

    outputs = {
        "POSITIVE SCORE": positive_score, "NEGATIVE SCORE": negative_score, "POLARITY SCORE": polarity_score,
        "SUBJECTIVITY SCORE": subjectivity_score, "AVG SENTENCE LENGTH": avg_sentence_len,
        "PERCENTAGE OF COMPLEX WORDS": complex_words_percent, "FOG INDEX": fog_index,
        "AVG NUMBER OF WORDS PER SENTENCE": avg_words_per_sentence,
        "COMPLEX WORD COUNT": complex_word_count, "WORD COUNT": word_count, "SYLLABLE PER WORD": syll_count,
        "PERSONAL PRONOUNS": pronoun_count, "AVG WORD LENGTH": average_word_len,
    }
    return outputs

In [None]:
# Score every URL, then write all metric columns back in one index-aligned
# assignment. The element-wise form `data[col][row] = value` is chained
# assignment: it only works through a cached view (with a
# SettingWithCopyWarning that this notebook suppresses) and does not modify
# `data` at all when pandas Copy-on-Write is active.
records = [main_function(url) for url in tqdm(data["URL"])]
metrics = pd.DataFrame(records, index=data.index)
data[list(metrics.columns)] = metrics

100%|██████████| 147/147 [05:17<00:00,  2.16s/it]


In [None]:
# Display the table to inspect the metric columns after scoring.
data

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,Netclan20241017,https://insights.blackcoffer.com/ai-and-ml-bas...,4.0,0,1.000000,0.016461,7.039216,0.292479,2.932678,7.039216,105.0,243.0,2.030641,3.0,6.406685
1,Netclan20241018,https://insights.blackcoffer.com/enhancing-fro...,10.0,5,0.333333,0.027574,9.971154,0.191900,4.065221,9.971154,199.0,544.0,1.693346,9.0,5.273867
2,Netclan20241019,https://insights.blackcoffer.com/roas-dashboar...,10.0,2,0.666667,0.042857,7.250000,0.268473,3.007389,7.250000,109.0,280.0,1.977833,3.0,6.273399
3,Netclan20241020,https://insights.blackcoffer.com/efficient-pro...,22.0,9,0.419355,0.063008,8.493976,0.384397,3.551349,8.493976,271.0,492.0,2.224113,6.0,6.636879
4,Netclan20241021,https://insights.blackcoffer.com/development-o...,3.0,0,1.000000,0.011905,8.195652,0.281167,3.390728,8.195652,106.0,252.0,1.984085,3.0,6.265252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,Netclan20241159,https://insights.blackcoffer.com/population-an...,9.0,6,0.200000,0.022321,8.758333,0.230257,3.595436,8.758333,242.0,672.0,1.837298,5.0,5.475737
143,Netclan20241160,https://insights.blackcoffer.com/google-lsa-ap...,15.0,8,0.304348,0.024033,8.807910,0.203977,3.604755,8.807910,318.0,957.0,1.765876,9.0,5.383579
144,Netclan20241161,https://insights.blackcoffer.com/healthcare-da...,10.0,4,0.428571,0.059072,7.981132,0.198582,3.271885,7.981132,84.0,237.0,1.808511,13.0,5.600473
145,Netclan20241162,https://insights.blackcoffer.com/budget-sales-...,2.0,0,1.000000,0.013514,6.656250,0.300469,2.782688,6.656250,64.0,148.0,2.103286,2.0,6.361502
