In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# imports

import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [None]:
# paths to files

npath = '/content/drive/MyDrive/Text_Analysis_Assignment/MasterDictionary/negative-words.txt'
ppath = '/content/drive/MyDrive/Text_Analysis_Assignment/MasterDictionary/positive-words.txt'
stop_dir_path = '/content/drive/MyDrive/Text_Analysis_Assignment/StopWords'
input = '/content/drive/MyDrive/Text_Analysis_Assignment/Input.xlsx'
out = '/content/drive/MyDrive/Text_Analysis_Assignment/Output Data Structure.xlsx'

In [None]:
# function to extract article

def para_ex(url):
  page = requests.get(url)
  soup = BeautifulSoup(page.text, 'html')

  try:
    para = soup.find('div', class_="td-post-content tagdiv-type")
    return para.text

  except AttributeError:

      try:
        para = soup.find_all('div', class_="tdb-block-inner td-fix-index")[14]
        return para.text
      except IndexError:
        return ' '

# function to extract title

def title_ex(url):
  page = requests.get(url)
  soup = BeautifulSoup(page.text, 'html')

  try:
    head = soup.find('h1')
    final = head.text
    return final
  except:
    head = ' '
    return head

In [None]:
#creating clean function to remove spaces and punctuations and join the title and rest of the article

def clean(head ,tail):
  head = head.strip()
  head = head.translate({ord(i): None for i in ':,.!?;'})
  title = head.split(' ')

  alt = tail.strip()
  alt = alt.translate({ord(i): None for i in ':,.!?;'})
  art = alt.split(' ')

  all_text = title + art
  return all_text

In [None]:
# function to calculate number of sentences

def sentence(para):
  sen_list = para.split('.')
  for sent in sen_list:
    if len(sent) < 5:
      sen_list.remove(sent)
  sen_num = len(sen_list)
  return (sen_num)

In [None]:
#function to count words, after removing stop words

def tot_words(all_text):
  tot = 0
  for word in all_text:
    if word not in swords:
      tot += 1
  return tot

In [None]:
#function to calculate avg word length

def avg_wlength(all_text):
  c = 0
  for word in all_text:
    c += len(word)
  return c/len(all_text)

In [None]:
#function to count words with more than two syllables

def complex_words(all_text):
  count = 0
  for word in all_text:
    syl = word.count('a') + word.count('e') + word.count('i') + word.count('o') + word.count('u')
    #handling the 'es' and 'ed' exception
    if word.endswith('es') or word.endswith('ed'):
      syl -= 1
    if syl > 2:
      count += 1
  return count

In [None]:
#function to calculate average syllable per word

def avg_syl(text):
  count = 0
  for word in text:
    syl = word.count('a') + word.count('e') + word.count('i') + word.count('o') + word.count('u')
    #handling the 'es' and 'ed' exception
    if word.endswith('es') or word.endswith('ed'):
      syl -= 1
    count += syl

  avg = count/len(text)
  return avg

In [None]:
#function to count number of personal pronouns

def count_pronoun_inst(stri, w):
    regexPattern = "\\b" + w + "\\b"

    count=0

    for m in re.finditer(regexPattern, stri):

        count=count+1

    return count

In [None]:
#creating an array of negative words

nwords = []
with open(npath, 'r', encoding='windows-1252') as f:
    for word in f:
      nwords.append(word.strip())

#creating an array of positive words

pwords = []
with open(ppath, 'r', encoding='windows-1252') as f:
    for word in f:
      pwords.append(word.strip())

#creating an array of stop words

swords = []
for filename in os.listdir(stop_dir_path):
  with open(stop_dir_path + '/' + filename, 'r', encoding='windows-1252') as f:
    for word in f:
      swords.append(word.strip())


In [None]:
#function to calculate positive score

def pscore(all_text):

  pscore = 0
  for word in all_text:
    if word not in swords:
      if word in pwords:
        pscore += 1
    else:
      continue
  return(pscore)

#function to calculate negative score

def nscore(all_text):

   nscore = 0
   for word in all_text:
    if word not in swords:
      if word in nwords:
        nscore += 1
    else:
      continue
   return(nscore * (-1))

In [None]:
#required lists corresponding to different analysis criterions

df = pd.read_excel(out)

positive = []
negative = []
polarity = []
subjectivity = []
complex_count = []
per_of_complexw = []
fog_index = []
word_count = []
avg_wcount = []
avg_sen_lgth = []
per_pronouns = []
syl_per_word = []

pronouns = ['I', 'we', 'my', 'ours', 'us']



#loop iterating all URLs and calculating their respective parameters

for i in range(len(df)):
    url = df['URL'].iloc[i]

    article = para_ex(url)
    title = title_ex(url)
    complete = clean(title, article)
    ps = pscore(complete)
    ns = nscore(complete)
    total = tot_words(complete)
    complexw = complex_words(complete)
    avg_word_count = avg_wlength(complete)
    total_sentences = sentence(article)
    avg_syllables = avg_syl(complete)

    number_of_pronouns = 0
    for w in pronouns:
      number_of_pronouns += count_pronoun_inst(article, w)

    pol_score = (ps - ns) / ((ps + ns) + 0.000001)

    subj_score = (ps + ns)/ ((total) + 0.000001)

    comp_per = (complexw/len(complete)) * 100

    if total_sentences != 0:
      words_per_sentence = len(complete) / total_sentences
    else:
      words_per_sentence = 0

    fog_value = 0.4 * (words_per_sentence + comp_per)


    positive.append(ps)

    negative.append(ns)

    polarity.append(pol_score)

    subjectivity.append(subj_score)

    per_of_complexw.append(comp_per)

    complex_count.append(complexw)

    word_count.append(total)

    avg_wcount.append(avg_word_count)

    avg_sen_lgth.append(words_per_sentence)

    fog_index.append(fog_value)

    per_pronouns.append(number_of_pronouns)

    syl_per_word.append(avg_syllables)

#populating columns

df['POSITIVE SCORE'] = positive
df['NEGATIVE SCORE'] = negative
df['POLARITY SCORE'] = polarity
df['SUBJECTIVITY SCORE'] = subjectivity
df['PERCENTAGE OF COMPLEX WORDS'] = per_of_complexw
df['COMPLEX WORD COUNT'] = complex_count
df['WORD COUNT'] = word_count
df['AVG WORD LENGTH'] = avg_wcount
df['FOG INDEX'] = fog_index
df['AVG NUMBER OF WORDS PER SENTENCE'] = avg_sen_lgth
df['AVG SENTENCE LENGTH'] = avg_sen_lgth
df['PERSONAL PRONOUNS'] = per_pronouns
df['SYLLABLE PER WORD'] = syl_per_word

df.to_excel(out, index=False)
df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,33,-5,1.357143,0.045603,15.538462,18.646865,13.674130,15.538462,226,614,1.683993,6,4.737624
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,58,-29,3.000000,0.034982,17.792683,28.512680,18.522145,17.792683,416,829,2.029472,3,5.583962
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,37,-23,4.285714,0.021212,18.543860,37.086093,22.251981,18.543860,392,660,2.259224,13,6.282876
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,35,-69,-3.058824,-0.054575,19.961538,35.452794,22.165733,19.961538,368,623,2.204239,4,6.151252
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,19,-8,2.454545,0.029810,16.950000,28.023599,17.989440,16.950000,190,369,1.997050,6,5.709440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...,27,-52,-3.160000,-0.040519,21.980392,25.245317,18.890284,21.980392,283,617,1.909010,2,5.337199
96,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...,21,-34,-4.230770,-0.028571,28.289474,21.116279,19.762301,28.289474,227,455,1.746977,5,4.810233
97,blackassign0098,https://insights.blackcoffer.com/contribution-...,5,-2,2.333333,0.012146,15.480000,29.974160,18.181664,15.480000,116,247,1.974160,0,5.759690
98,blackassign0099,https://insights.blackcoffer.com/how-covid-19-...,13,-2,1.363636,0.034591,17.057143,17.922948,13.992036,17.057143,107,318,1.666667,3,4.956449
