In [1]:
import pandas as pd 
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [2]:
# Creating Dataframe of the Details file that contain text
df=pd.read_csv(r'Details.csv')

In [3]:
# program that will generate the word tokens
def word_cleaner(text):
    # First let's tokenize the whole content
    # If the page corresponding to a link is not found than it's content was N/A 
    if text=="N/A":
        return "N/A"
    try:
        # Remove punctuation marks
        text = re.sub(r'[^\w\s]', '', text)
        tokens = word_tokenize(text,preserve_line=True)
    except:
        return "N/A"
    # Return the tokens without punctuation marks
    return tokens

In [4]:
# program that will generate the number of sentence in the content section
def sen_counter(text):
    # If the page corresponding to link does not found then return sentence_length as 0
    if text=="N/A":
        return 0
    # return the number of sentence in the text else return 0
    try:
        sen_token=sent_tokenize(text,language="english")
        return sen_token.__len__()
    except:
        return 0

In [5]:
# Store the word tokens in an array 
word_tokens=[]
for i in range(df.shape[0]):
    word_tokens.append(word_cleaner(df['Content'][i]))

In [6]:
# Store the number of sentence in the text
sen_count=[]
for i in range(df.shape[0]):
    sen_count.append(sen_counter(df['Content'][i]))

In [7]:
# Add the column corresponding to word tokens
df=df.assign(tokens=word_tokens)
# Add the column corresponding to sentence number
df=df.assign(sen_number=sen_count)

In [8]:
df.head()

Unnamed: 0,URL_ID,URL,Title,Content,tokens,sen_number
0,37,https://insights.blackcoffer.com/ai-in-healthc...,AI in healthcare to Improve Patient Outcomes,Introduction“If anything kills over 10 million...,"[IntroductionIf, anything, kills, over, 10, mi...",53
1,38,https://insights.blackcoffer.com/what-if-the-c...,What if the Creation is Taking Over the Creator?,"Human minds, a fascination in itself carrying ...","[Human, minds, a, fascination, in, itself, car...",67
2,39,https://insights.blackcoffer.com/what-jobs-wil...,What Jobs Will Robots Take From Humans in The ...,IntroductionAI is rapidly evolving in the empl...,"[IntroductionAI, is, rapidly, evolving, in, th...",67
3,40,https://insights.blackcoffer.com/will-machine-...,Will Machine Replace The Human in the Future o...,“Anything that could give rise to smarter-than...,"[Anything, that, could, give, rise, to, smarte...",75
4,41,https://insights.blackcoffer.com/will-ai-repla...,Will AI Replace Us or Work With Us?,“Machine intelligence is the last invention th...,"[Machine, intelligence, is, the, last, inventi...",66


* Next step in the Data pre-processing is to convert the tokens to their root form and remove stopwords

In [21]:
def lem_stop(text_list):
    # If Column does not have token list return N/A
    if text_list=="N/A":
        return "N/A"
    # Create a stemmer object
    stmmer=PorterStemmer()
    # covert each word to it's root form
    for i in range(text_list.__len__()):
        text_list[i]=stmmer.stem(text_list[i])
    # create a empty list 
    li=[]
    for i in range(text_list.__len__()):
        if text_list[i] not in stopwords.words("english"):
            li.append(text_list[i]) # if word not a stopword then add it to the list
    return li   # return the list that does not have any stopword

In [22]:
# Program to store all the words in a list that does not contain any stop word
cleaned_tokens=[]
for i in range(df.shape[0]):
    cleaned_tokens.append(lem_stop(df.tokens[i]))

In [24]:
# Add the corresponding column in our data frame
df=df.assign(filtered_token=cleaned_tokens)

In [25]:
df.head()

Unnamed: 0,URL_ID,URL,Title,Content,tokens,sen_number,filtered_token
0,37,https://insights.blackcoffer.com/ai-in-healthc...,AI in healthcare to Improve Patient Outcomes,Introduction“If anything kills over 10 million...,"[introductionif, anyth, kill, over, 10, millio...",53,"[introductionif, anyth, kill, 10, million, peo..."
1,38,https://insights.blackcoffer.com/what-if-the-c...,What if the Creation is Taking Over the Creator?,"Human minds, a fascination in itself carrying ...","[human, mind, a, fascin, in, itself, carri, th...",67,"[human, mind, fascin, carri, potenti, tinker, ..."
2,39,https://insights.blackcoffer.com/what-jobs-wil...,What Jobs Will Robots Take From Humans in The ...,IntroductionAI is rapidly evolving in the empl...,"[introductionai, is, rapidli, evolv, in, the, ...",67,"[introductionai, rapidli, evolv, employ, secto..."
3,40,https://insights.blackcoffer.com/will-machine-...,Will Machine Replace The Human in the Future o...,“Anything that could give rise to smarter-than...,"[anyth, that, could, give, rise, to, smarterth...",75,"[anyth, could, give, rise, smarterthanhuman, i..."
4,41,https://insights.blackcoffer.com/will-ai-repla...,Will AI Replace Us or Work With Us?,“Machine intelligence is the last invention th...,"[machin, intellig, is, the, last, invent, that...",66,"[machin, intellig, last, invent, human, ever, ..."


In [28]:
# Importing the required libraries to get the senitment of each word
from nltk.corpus import wordnet as wn
from afinn import Afinn

In [35]:
def get_word_sentiment(word):
    """
    Generates a list of synonyms for a given word using WordNet, and
    computes the positive and negative scores for each synonym using
    the AFINN sentiment lexicon. Returns the average positive and
    negative scores for all synonyms.
    """
    # Load the AFINN sentiment lexicon
    afinn = Afinn()
    synsets = wn.synsets(word)
    synonyms = set()
    for synset in synsets:
        for lemma in synset.lemmas():
            synonyms.add(lemma.name())
    positive_scores = []
    negative_scores = []
    for synonym in synonyms:
        score = afinn.score(synonym)
        if score > 0:
            positive_scores.append(score)
        elif score < 0:
            negative_scores.append(score)
    if len(positive_scores) > 0:
        avg_positive = sum(positive_scores) / len(positive_scores)
    else:
        avg_positive = 0
    if len(negative_scores) > 0:
        avg_negative = sum(negative_scores) / len(negative_scores)
    else:
        avg_negative = 0
    return avg_positive, avg_negative

In [39]:
# Program to get the score of text
def get_scores(text_list):
    # If it's a "N/A" then return 0,0
    if text_list=="N/A":
        return 0,0
    pos=[]
    neg=[]
    # Get score for each word
    for i in range(text_list.__len__()):
        score_pos,score_neg=get_word_sentiment(text_list[i])
        if abs(score_pos)>abs(score_neg):
            pos.append(1)
        elif abs(score_neg)>abs(score_pos):
            neg.append(1)
    # return total positive and negative score
    return pos.__len__(),neg.__len__()

In [42]:
# Get the Positive and Negative score
pos=[]
neg=[]
for i in range(df.shape[0]):
    text_positive,text_negative=get_scores(df.filtered_token[i]) 
    pos.append(text_positive)
    neg.append(text_negative)

In [44]:
# Add the corresponding columns 
df['POSITIVE SCORE']=list(pos)

In [45]:
# Add the corresponding columns
df['NEGATIVE SCORE']=list(neg)

In [46]:
df.head()

Unnamed: 0,URL_ID,URL,Title,Content,tokens,sen_number,filtered_token,POSITIVE SCORE,NEGATIVE SCORE
0,37,https://insights.blackcoffer.com/ai-in-healthc...,AI in healthcare to Improve Patient Outcomes,Introduction“If anything kills over 10 million...,"[introductionif, anyth, kill, over, 10, millio...",53,"[introductionif, anyth, kill, 10, million, peo...",125,108
1,38,https://insights.blackcoffer.com/what-if-the-c...,What if the Creation is Taking Over the Creator?,"Human minds, a fascination in itself carrying ...","[human, mind, a, fascin, in, itself, carri, th...",67,"[human, mind, fascin, carri, potenti, tinker, ...",134,95
2,39,https://insights.blackcoffer.com/what-jobs-wil...,What Jobs Will Robots Take From Humans in The ...,IntroductionAI is rapidly evolving in the empl...,"[introductionai, is, rapidli, evolv, in, the, ...",67,"[introductionai, rapidli, evolv, employ, secto...",100,111
3,40,https://insights.blackcoffer.com/will-machine-...,Will Machine Replace The Human in the Future o...,“Anything that could give rise to smarter-than...,"[anyth, that, could, give, rise, to, smarterth...",75,"[anyth, could, give, rise, smarterthanhuman, i...",128,90
4,41,https://insights.blackcoffer.com/will-ai-repla...,Will AI Replace Us or Work With Us?,“Machine intelligence is the last invention th...,"[machin, intellig, is, the, last, invent, that...",66,"[machin, intellig, last, invent, human, ever, ...",143,90


In [47]:
#Adding the polarity score column to the dataframe
df['POLARITY SCORE']=(df['POSITIVE SCORE']-df['NEGATIVE SCORE'])/(df['POSITIVE SCORE']+df['NEGATIVE SCORE']+0.000001)

In [48]:
df.head()

Unnamed: 0,URL_ID,URL,Title,Content,tokens,sen_number,filtered_token,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE
0,37,https://insights.blackcoffer.com/ai-in-healthc...,AI in healthcare to Improve Patient Outcomes,Introduction“If anything kills over 10 million...,"[introductionif, anyth, kill, over, 10, millio...",53,"[introductionif, anyth, kill, 10, million, peo...",125,108,0.072961
1,38,https://insights.blackcoffer.com/what-if-the-c...,What if the Creation is Taking Over the Creator?,"Human minds, a fascination in itself carrying ...","[human, mind, a, fascin, in, itself, carri, th...",67,"[human, mind, fascin, carri, potenti, tinker, ...",134,95,0.170306
2,39,https://insights.blackcoffer.com/what-jobs-wil...,What Jobs Will Robots Take From Humans in The ...,IntroductionAI is rapidly evolving in the empl...,"[introductionai, is, rapidli, evolv, in, the, ...",67,"[introductionai, rapidli, evolv, employ, secto...",100,111,-0.052133
3,40,https://insights.blackcoffer.com/will-machine-...,Will Machine Replace The Human in the Future o...,“Anything that could give rise to smarter-than...,"[anyth, that, could, give, rise, to, smarterth...",75,"[anyth, could, give, rise, smarterthanhuman, i...",128,90,0.174312
4,41,https://insights.blackcoffer.com/will-ai-repla...,Will AI Replace Us or Work With Us?,“Machine intelligence is the last invention th...,"[machin, intellig, is, the, last, invent, that...",66,"[machin, intellig, last, invent, human, ever, ...",143,90,0.227468


In [49]:
# Adding the subjectivity score column to the dataframe
df['SUBJECTIVITY SCORE']=(df['POLARITY SCORE']+df['NEGATIVE SCORE'])/(len(df['filtered_token'])+0.000001)

In [50]:
df.head()

Unnamed: 0,URL_ID,URL,Title,Content,tokens,sen_number,filtered_token,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE
0,37,https://insights.blackcoffer.com/ai-in-healthc...,AI in healthcare to Improve Patient Outcomes,Introduction“If anything kills over 10 million...,"[introductionif, anyth, kill, over, 10, millio...",53,"[introductionif, anyth, kill, 10, million, peo...",125,108,0.072961,0.948008
1,38,https://insights.blackcoffer.com/what-if-the-c...,What if the Creation is Taking Over the Creator?,"Human minds, a fascination in itself carrying ...","[human, mind, a, fascin, in, itself, carri, th...",67,"[human, mind, fascin, carri, potenti, tinker, ...",134,95,0.170306,0.834827
2,39,https://insights.blackcoffer.com/what-jobs-wil...,What Jobs Will Robots Take From Humans in The ...,IntroductionAI is rapidly evolving in the empl...,"[introductionai, is, rapidli, evolv, in, the, ...",67,"[introductionai, rapidli, evolv, employ, secto...",100,111,-0.052133,0.973227
3,40,https://insights.blackcoffer.com/will-machine-...,Will Machine Replace The Human in the Future o...,“Anything that could give rise to smarter-than...,"[anyth, that, could, give, rise, to, smarterth...",75,"[anyth, could, give, rise, smarterthanhuman, i...",128,90,0.174312,0.791003
4,41,https://insights.blackcoffer.com/will-ai-repla...,Will AI Replace Us or Work With Us?,“Machine intelligence is the last invention th...,"[machin, intellig, is, the, last, invent, that...",66,"[machin, intellig, last, invent, human, ever, ...",143,90,0.227468,0.791469


In [52]:
df.head()

Unnamed: 0,URL_ID,URL,Title,Content,tokens,sen_number,filtered_token,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,AI in healthcare to Improve Patient Outcomes,Introduction“If anything kills over 10 million...,"[introductionif, anyth, kill, over, 10, millio...",53,"[introductionif, anyth, kill, 10, million, peo...",125,108,0.072961,0.948008,2.150943
1,38,https://insights.blackcoffer.com/what-if-the-c...,What if the Creation is Taking Over the Creator?,"Human minds, a fascination in itself carrying ...","[human, mind, a, fascin, in, itself, carri, th...",67,"[human, mind, fascin, carri, potenti, tinker, ...",134,95,0.170306,0.834827,1.701493
2,39,https://insights.blackcoffer.com/what-jobs-wil...,What Jobs Will Robots Take From Humans in The ...,IntroductionAI is rapidly evolving in the empl...,"[introductionai, is, rapidli, evolv, in, the, ...",67,"[introductionai, rapidli, evolv, employ, secto...",100,111,-0.052133,0.973227,1.701493
3,40,https://insights.blackcoffer.com/will-machine-...,Will Machine Replace The Human in the Future o...,“Anything that could give rise to smarter-than...,"[anyth, that, could, give, rise, to, smarterth...",75,"[anyth, could, give, rise, smarterthanhuman, i...",128,90,0.174312,0.791003,1.52
4,41,https://insights.blackcoffer.com/will-ai-repla...,Will AI Replace Us or Work With Us?,“Machine intelligence is the last invention th...,"[machin, intellig, is, the, last, invent, that...",66,"[machin, intellig, last, invent, human, ever, ...",143,90,0.227468,0.791469,1.727273


In [65]:
import nltk
from nltk.corpus import wordnet

In [66]:
def is_complex(word):
    """
    Check if a given word is complex by checking if it has multiple synsets in WordNet.
    """
    synsets = wordnet.synsets(word)
    return len(synsets) > 1

def count_complex(text_list):
    if text_list=="N/A":
        return 0
    counter=0
    for i in text_list:
        if is_complex(i):
            counter+=1
    return counter

In [67]:
# Program to count complex words in a text
complex_count=[]
for i in range(df.shape[0]):
    complex_count.append(count_complex(df['filtered_token'][i]))

In [71]:
p_o_c_w=[]
for i in range(df.shape[0]):
    p_o_c_w.append(complex_count[i]/len(df['filtered_token'][i]))

In [72]:
# Adding the percentage complex words
df['PERCENTAGE OF COMPLEX WORDS']=p_o_c_w

In [73]:
df.head()

Unnamed: 0,URL_ID,URL,Title,Content,tokens,sen_number,filtered_token,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS
0,37,https://insights.blackcoffer.com/ai-in-healthc...,AI in healthcare to Improve Patient Outcomes,Introduction“If anything kills over 10 million...,"[introductionif, anyth, kill, over, 10, millio...",53,"[introductionif, anyth, kill, 10, million, peo...",125,108,0.072961,0.948008,2.150943,0.518158
1,38,https://insights.blackcoffer.com/what-if-the-c...,What if the Creation is Taking Over the Creator?,"Human minds, a fascination in itself carrying ...","[human, mind, a, fascin, in, itself, carri, th...",67,"[human, mind, fascin, carri, potenti, tinker, ...",134,95,0.170306,0.834827,1.701493,0.553985
2,39,https://insights.blackcoffer.com/what-jobs-wil...,What Jobs Will Robots Take From Humans in The ...,IntroductionAI is rapidly evolving in the empl...,"[introductionai, is, rapidli, evolv, in, the, ...",67,"[introductionai, rapidli, evolv, employ, secto...",100,111,-0.052133,0.973227,1.701493,0.504559
3,40,https://insights.blackcoffer.com/will-machine-...,Will Machine Replace The Human in the Future o...,“Anything that could give rise to smarter-than...,"[anyth, that, could, give, rise, to, smarterth...",75,"[anyth, could, give, rise, smarterthanhuman, i...",128,90,0.174312,0.791003,1.52,0.564189
4,41,https://insights.blackcoffer.com/will-ai-repla...,Will AI Replace Us or Work With Us?,“Machine intelligence is the last invention th...,"[machin, intellig, is, the, last, invent, that...",66,"[machin, intellig, last, invent, human, ever, ...",143,90,0.227468,0.791469,1.727273,0.529234


In [74]:
# Adding the fog index column
df['FOG INDEX']=0.4*(df['AVG SENTENCE LENGTH']/df['PERCENTAGE OF COMPLEX WORDS'])

In [75]:
df.head(2)

Unnamed: 0,URL_ID,URL,Title,Content,tokens,sen_number,filtered_token,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX
0,37,https://insights.blackcoffer.com/ai-in-healthc...,AI in healthcare to Improve Patient Outcomes,Introduction“If anything kills over 10 million...,"[introductionif, anyth, kill, over, 10, millio...",53,"[introductionif, anyth, kill, 10, million, peo...",125,108,0.072961,0.948008,2.150943,0.518158,1.660455
1,38,https://insights.blackcoffer.com/what-if-the-c...,What if the Creation is Taking Over the Creator?,"Human minds, a fascination in itself carrying ...","[human, mind, a, fascin, in, itself, carri, th...",67,"[human, mind, fascin, carri, potenti, tinker, ...",134,95,0.170306,0.834827,1.701493,0.553985,1.228549


In [76]:
# Adding the Average number of words per sentence column
df['AVG NUMBER OF WORDS PER SENTENCE']=len(df['filtered_token'])/df['sen_number']

In [77]:
df.head(2)

Unnamed: 0,URL_ID,URL,Title,Content,tokens,sen_number,filtered_token,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE
0,37,https://insights.blackcoffer.com/ai-in-healthc...,AI in healthcare to Improve Patient Outcomes,Introduction“If anything kills over 10 million...,"[introductionif, anyth, kill, over, 10, millio...",53,"[introductionif, anyth, kill, 10, million, peo...",125,108,0.072961,0.948008,2.150943,0.518158,1.660455,2.150943
1,38,https://insights.blackcoffer.com/what-if-the-c...,What if the Creation is Taking Over the Creator?,"Human minds, a fascination in itself carrying ...","[human, mind, a, fascin, in, itself, carri, th...",67,"[human, mind, fascin, carri, potenti, tinker, ...",134,95,0.170306,0.834827,1.701493,0.553985,1.228549,1.701493


In [82]:
# Adding the complex word count
df['COMPLEX WORD COUNT']=complex_count

In [83]:
df.head(2)

Unnamed: 0,URL_ID,URL,Title,Content,tokens,sen_number,filtered_token,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT
0,37,https://insights.blackcoffer.com/ai-in-healthc...,AI in healthcare to Improve Patient Outcomes,Introduction“If anything kills over 10 million...,"[introductionif, anyth, kill, over, 10, millio...",53,"[introductionif, anyth, kill, 10, million, peo...",125,108,0.072961,0.948008,2.150943,0.518158,1.660455,2.150943,585
1,38,https://insights.blackcoffer.com/what-if-the-c...,What if the Creation is Taking Over the Creator?,"Human minds, a fascination in itself carrying ...","[human, mind, a, fascin, in, itself, carri, th...",67,"[human, mind, fascin, carri, potenti, tinker, ...",134,95,0.170306,0.834827,1.701493,0.553985,1.228549,1.701493,431


In [84]:
# Adding the word count column
df['WORD COUNT']=len(df['filtered_token'])

In [107]:
# Correct the values in the word_count 
for i in range(df.shape[0]):
    df['WORD COUNT'][i]=len(cleaned_tokens[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['WORD COUNT'][i]=len(cleaned_tokens[i])


In [108]:
df.head(1)

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,125,108,0.072961,0.948008,2.150943,0.518158,1.660455,2.150943,585,1129,2211,1,5.943313


In [87]:
# Program to find the number of syllables per word
def count_syllables(word):
    vowels = "aeiouy"
    count = 0
    prev_char = None
    for char in word:
        char = char.lower()
        if char in vowels and (prev_char is None or prev_char not in vowels):
            count += 1
        prev_char = char
    if word.endswith("e"):
        count -= 1
    if word.endswith("le") and len(word) > 2 and word[-3] not in vowels:
        count += 1
    if count == 0:
        count = 1
    return count

def count_syllables_in_sentence(words):
    if words=="N/A":
        return 0
    syllable_count = 0
    for word in words:
        syllable_count += count_syllables(word)
    return syllable_count

In [88]:
# Storing the number of syllables per word
s_c=[]
for i in range(df.shape[0]):
    s_c.append(count_syllables_in_sentence(df['filtered_token'][i]))

In [90]:
# Adding the column corresponding to syllables per word
df['SYLLABLE PER WORD']=list(s_c)

In [91]:
df.head()

Unnamed: 0,URL_ID,URL,Title,Content,tokens,sen_number,filtered_token,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD
0,37,https://insights.blackcoffer.com/ai-in-healthc...,AI in healthcare to Improve Patient Outcomes,Introduction“If anything kills over 10 million...,"[introductionif, anyth, kill, over, 10, millio...",53,"[introductionif, anyth, kill, 10, million, peo...",125,108,0.072961,0.948008,2.150943,0.518158,1.660455,2.150943,585,114,2211
1,38,https://insights.blackcoffer.com/what-if-the-c...,What if the Creation is Taking Over the Creator?,"Human minds, a fascination in itself carrying ...","[human, mind, a, fascin, in, itself, carri, th...",67,"[human, mind, fascin, carri, potenti, tinker, ...",134,95,0.170306,0.834827,1.701493,0.553985,1.228549,1.701493,431,114,1328
2,39,https://insights.blackcoffer.com/what-jobs-wil...,What Jobs Will Robots Take From Humans in The ...,IntroductionAI is rapidly evolving in the empl...,"[introductionai, is, rapidli, evolv, in, the, ...",67,"[introductionai, rapidli, evolv, employ, secto...",100,111,-0.052133,0.973227,1.701493,0.504559,1.348894,1.701493,498,114,1898
3,40,https://insights.blackcoffer.com/will-machine-...,Will Machine Replace The Human in the Future o...,“Anything that could give rise to smarter-than...,"[anyth, that, could, give, rise, to, smarterth...",75,"[anyth, could, give, rise, smarterthanhuman, i...",128,90,0.174312,0.791003,1.52,0.564189,1.077653,1.52,501,114,1581
4,41,https://insights.blackcoffer.com/will-ai-repla...,Will AI Replace Us or Work With Us?,“Machine intelligence is the last invention th...,"[machin, intellig, is, the, last, invent, that...",66,"[machin, intellig, last, invent, human, ever, ...",143,90,0.227468,0.791469,1.727273,0.529234,1.305489,1.727273,525,114,1811


In [92]:
# Program to check if a word is personal pronoun or not 
personal_pronouns = ["i", "me", "you", "he", "him", "she", "her", "it", "we", "us", "they", "them"]

def is_personal_pronoun(word):
    return word.lower() in personal_pronouns


In [93]:
# create a list to store the number of personal prenoun 
p_c=[]
for i in range(df.shape[0]):
    if df['filtered_token'][i]=="N/A":
        p_c.append(0)
        continue
    t=0
    for word in df['filtered_token'][i]:
        if is_personal_pronoun(word):
            t+=1
    p_c.append(t)

In [95]:
# Add the corresponding column to the dataframe
df['PERSONAL PRONOUNS']=list(p_c)

In [98]:
def avg_word_length(words):
    t=0
    for word in words:
        t+=len(word)
    return t

In [100]:
# create a list to store the average word length
a_c=[]
for i in range(df.shape[0]):
    if df['filtered_token'][i]=="N/A":
        a_c.append(0)
        continue
    a_c.append(avg_word_length(df['filtered_token'][i])/len(df['filtered_token'][i]))

In [102]:
# Add the average word length column to the data frame
df['AVG WORD LENGTH']=list(a_c)

In [103]:
df.head(1)

Unnamed: 0,URL_ID,URL,Title,Content,tokens,sen_number,filtered_token,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,AI in healthcare to Improve Patient Outcomes,Introduction“If anything kills over 10 million...,"[introductionif, anyth, kill, over, 10, millio...",53,"[introductionif, anyth, kill, 10, million, peo...",125,108,0.072961,0.948008,2.150943,0.518158,1.660455,2.150943,585,114,2211,1,5.943313


In [104]:
# drop the columns hat are now not required
df.drop(['Title','Content','tokens','sen_number','filtered_token'],axis=1,inplace=True)

In [105]:
df.head(2)

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,125,108,0.072961,0.948008,2.150943,0.518158,1.660455,2.150943,585,114,2211,1,5.943313
1,38,https://insights.blackcoffer.com/what-if-the-c...,134,95,0.170306,0.834827,1.701493,0.553985,1.228549,1.701493,431,114,1328,3,5.24036


In [124]:
# Saving the results in the csv file
df.to_csv('Result.csv',index=False)

In [123]:
df.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,125,108,0.072961,0.948008,1129,0.518158,1.660455,1129,585,1129,2211,1,5.943313
1,38,https://insights.blackcoffer.com/what-if-the-c...,134,95,0.170306,0.834827,778,0.553985,1.228549,778,431,778,1328,3,5.24036
2,39,https://insights.blackcoffer.com/what-jobs-wil...,100,111,-0.052133,0.973227,987,0.504559,1.348894,987,498,987,1898,2,5.782168
3,40,https://insights.blackcoffer.com/will-machine-...,128,90,0.174312,0.791003,888,0.564189,1.077653,888,501,888,1581,3,5.35473
4,41,https://insights.blackcoffer.com/will-ai-repla...,143,90,0.227468,0.791469,992,0.529234,1.305489,992,525,992,1811,6,5.535282
