Text Analysis Project

1) Importing Libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from textstat.textstat import textstatistics

2) Importing input file

In [None]:
# Load the input sheet, keep only the ID and URL columns, and cap it at the first 114 rows.
input_frame = pd.read_csv('Input.csv')
df = input_frame[['URL_ID', 'URL']].iloc[:114]
df

3) Data Extraction

In [None]:
# Download every article and keep "<title>. <body>" per URL. A page that cannot be fetched or
# parsed is stored as None so that `contents` stays aligned row-for-row with `df`.

# The User-Agent request header lets servers and network peers identify the application,
# operating system, vendor, and/or version of the requesting user agent. It does not change
# between requests, so it is built once outside the loop.
headers = {'User-Agent': 'Chrome/74.0.3729.169'}
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server would hang the loop forever

contents = []
for url in df['URL']:
    try:
        page = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)  # load the page
        print(page)  # prints the response object for each request
        soup = BeautifulSoup(page.content, 'html.parser')  # parse the HTML
        title = soup.find_all('h1')[0].text  # the first <h1> is used as the article title
        body = soup.find_all(attrs={'class': 'td-post-content'})[0].text  # required content
        # replace non-breaking spaces and end-of-line symbols with (double) spaces
        body = body.replace('\xa0', "  ").replace('\n', "  ")
        contents.append(title + '. ' + body)  # merge title and content
    except Exception as exc:
        # Scraping stays best-effort: report the failure and record None for this row.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        print(f'Failed to extract {url}: {exc!r}')
        contents.append(None)

In [None]:
# Attach the scraped text to the URL table by row position (left join on the index).
scraped_frame = pd.DataFrame(contents)
df1 = df.merge(scraped_frame, how="left", left_index=True, right_index=True)

In [None]:
# Give the merged frame readable column names; the third column holds the scraped text.
df1.columns = ['URL_ID', 'URL', 'Text_Contents']
df1

In [None]:
# Persist the scraped articles to disk.
# NOTE(review): the analysis section below reads "Output.csv", not this file — presumably the
# same table under a different name; confirm.
df1.to_csv('collected_data.csv')

TEXT ANALYSIS

1) Sentiment Analysis

In [2]:
# Load the scraped articles for analysis and summarise the columns.
# NOTE(review): the extraction section saves 'collected_data.csv', but this cell reads
# "Output.csv" — presumably the same table under a different file name; confirm, since a
# fresh Restart & Run All would otherwise not produce the file read here.
df=pd.read_csv("Output.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     114 non-null    int64  
 1   URL_ID         114 non-null    float64
 2   URL            114 non-null    object 
 3   Text_Contents  111 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 3.7+ KB


In [3]:
# Remove the saved row-index column. Rebinding instead of dropping in place, together with
# errors='ignore', keeps this cell safe to re-run: a second run no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
df  # display the frame after removing the saved index column

Unnamed: 0,URL_ID,URL,Text_Contents
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,AI in healthcare to Improve Patient Outcomes. ...
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,What if the Creation is Taking Over the Creato...
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,What Jobs Will Robots Take From Humans in The ...
3,40.0,https://insights.blackcoffer.com/will-machine-...,Will Machine Replace The Human in the Future o...
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,Will AI Replace Us or Work With Us?. “Machin...
...,...,...,...
109,146.0,https://insights.blackcoffer.com/blockchain-fo...,Blockchain for Payments. Reconciling with th...
110,147.0,https://insights.blackcoffer.com/the-future-of...,The future of Investing. What Is an Investme...
111,148.0,https://insights.blackcoffer.com/big-data-anal...,Big Data Analytics in Healthcare. Quality an...
112,149.0,https://insights.blackcoffer.com/business-anal...,Business Analytics In The Healthcare Industry....


1.1 Cleaning StopWords

In [5]:
# Import all stop-word lists.
# Every line of every file becomes one entry, right-stripped and upper-cased.
# NOTE(review): the first path spells the directory 'StopWords' while the others use
# 'Stopwords'. On a case-sensitive file system only one spelling can exist — confirm the real
# directory name and align the paths.
StopWords_Auditor = 'StopWords/StopWords_Auditor.txt'
StopWords_Currencies = 'Stopwords/StopWords_Currencies.txt'
StopWords_Generic = 'Stopwords/StopWords_Generic.txt'
StopWords_GenericLong = 'Stopwords/StopWords_GenericLong.txt'
StopWords_DatesandNumbers = 'Stopwords/StopWords_DatesandNumbers.txt'
StopWords_Geographic = 'Stopwords/StopWords_Geographic.txt'
StopWords_Names = 'Stopwords/StopWords_Names.txt'

stop_words = []
for stopword_path in (
    StopWords_Auditor,
    StopWords_Currencies,
    StopWords_Generic,
    StopWords_GenericLong,
    StopWords_DatesandNumbers,
    StopWords_Geographic,
    StopWords_Names,
):
    # `with` closes each file handle as soon as its lines have been read.
    with open(stopword_path, 'r') as stopword_file:
        stop_words.extend(line.rstrip().upper() for line in stopword_file)

In [6]:

# Function to remove stopwords and tokenize the text data
def text_prep(x: str) -> list:
    """Tokenise one article and drop stop words.

    The text is upper-cased, every run of non-letter characters is replaced by a single
    space, the result is split with ``word_tokenize``, and tokens found in the notebook's
    ``stop_words`` collection are removed.

    Args:
        x: Article text. ``None`` or a float NaN (a missing article) is accepted.

    Returns:
        The remaining upper-case tokens in their original order; an empty list when the
        article is missing.
    """
    # A missing article must not be stringified: str() would turn it into the bogus
    # token 'NONE' / 'NAN' and give the row a word count of 1.
    if x is None or (isinstance(x, float) and x != x):
        return []
    operation = re.sub('[^a-zA-Z]+', ' ', str(x).upper()).strip()
    blocked = set(stop_words)  # constant-time membership instead of a list scan per token
    return [token for token in word_tokenize(operation) if token not in blocked]
# Run the cleaner over every article and keep the token lists next to the raw text.
tokenized_text = list(map(text_prep, df['Text_Contents']))
df["tokenized_text"] = tokenized_text

In [7]:
# Word count = number of tokens left after cleaning.
df['num_words'] = df['tokenized_text'].map(len)

1.2 Creating a dictionary of Positive and Negative Words

In [8]:
# Import the master dictionary: one word per line, right-stripped and upper-cased.
# The `with` blocks close each file once it has been read.
positive = 'MasterDictionary/positive-words.txt'
with open(positive, 'r') as positive_file:
    positive_dict = [word.rstrip().upper() for word in positive_file]

negative = 'MasterDictionary/negative-words.txt'
with open(negative, 'r') as negative_file:
    negative_dict = [word.rstrip().upper() for word in negative_file]

In [9]:
# Keep only dictionary words that are not stop words.
# The stop words are copied into a set once so each lookup is constant-time instead of a
# full scan of the stop-word list per dictionary word.
stop_word_lookup = set(stop_words)
positive_master = [word for word in positive_dict if word not in stop_word_lookup]
negative_master = [word for word in negative_dict if word not in stop_word_lookup]

1.3 Extracting Derived Variables

In [10]:
# Positive Score: number of tokens (repeats included) that appear in the positive dictionary.
# A set gives constant-time lookups; scanning the list for every token is needlessly slow.
positive_lookup = set(positive_master)
num_pos = df["tokenized_text"].map(
    lambda tokens: sum(1 for token in tokens if token in positive_lookup)
)
df["positive_score"] = num_pos

In [11]:
# Negative Score: number of tokens (repeats included) that appear in the negative dictionary.
# A set gives constant-time lookups; scanning the list for every token is needlessly slow.
negative_lookup = set(negative_master)
num_neg = df["tokenized_text"].map(
    lambda tokens: sum(1 for token in tokens if token in negative_lookup)
)
df["negative_score"] = num_neg

In [12]:
# Polarity Score = (positive - negative) / (positive + negative + 0.000001), to 2 decimals.
# The small constant keeps the denominator non-zero when both scores are 0.
score_gap = df['positive_score'] - df['negative_score']
score_sum = df['positive_score'] + df['negative_score']
df['polarity_score'] = (score_gap / (score_sum + 0.000001)).round(2)

In [13]:
# Subjectivity Score = (positive + negative) / (word count + 0.000001), to 2 decimals.
sentiment_hits = df['positive_score'] + df['negative_score']
df['subjectivity_score'] = (sentiment_hits / (df['num_words'] + 0.000001)).round(2)

2. Analysis of Readability

2.1 Average Sentence Length

In [14]:
# Average Sentence Length = cleaned word count / number of sentences, to 1 decimal.
# Rows with missing text are skipped by na_action='ignore', so their sentence count stays NaN.
df['num_sent'] = df['Text_Contents'].map(
    lambda article: len(sent_tokenize(article)),
    na_action='ignore',
)
df['avg_sent_len'] = (df['num_words'] / df['num_sent']).round(1)

2.2 Percentage of Complex Words

In [15]:
_syllable_analyzer = None  # created on first use, then shared by every call


def syllables_count(text):
    """Return the syllable count that textstat reports for ``text``.

    The ``textstatistics`` object is created once and reused, rather than being rebuilt
    for every word that is measured.
    """
    global _syllable_analyzer
    if _syllable_analyzer is None:
        _syllable_analyzer = textstatistics()
    return _syllable_analyzer.syllable_count(text)


def complex_words(text):
    """Return how many distinct words in ``text`` have more than two syllables.

    Args:
        text: An iterable of words (for example a token list).

    Returns:
        The number of distinct complex words; a repeated word counts once.

    NOTE(review): the notebook divides this distinct-word count by the total token count
    (``num_words``) to get the complex-word share — confirm that mixing distinct and total
    counts is intended.
    """
    # Only distinct words matter, so de-duplicate first and measure each word once.
    return sum(1 for word in set(text) if syllables_count(word) > 2)

In [16]:
# Count complex words per article, then express them as a share of the word count.
df['complex_words'] = df['tokenized_text'].apply(complex_words)
df['complex_words_percent'] = (df['complex_words'] / df['num_words']).round(2)

2.3 Fog Index

In [17]:
# Fog index = 0.4 * (average sentence length + complex-word share).
# NOTE(review): complex_words_percent is stored as a 0-1 fraction, whereas the usual Gunning
# fog formula adds the percentage on a 0-100 scale — confirm which definition is intended.
readability_sum = df['avg_sent_len'] + df['complex_words_percent']
df['Fog_index'] = 0.4 * readability_sum

3. Average Number of Words per Sentence

In [18]:
# Same ratio as avg_sent_len, kept to two decimals instead of one.
words_per_sentence = df['num_words'] / df['num_sent']
df['avg_words_per_sent'] = words_per_sentence.round(2)

4. Complex Word Count

Already calculated in the `complex_words` column.

5. Word Count

Already calculated in the `num_words` column.

6. Syllable Count Per Word

(The textstat library used here handles syllable-counting exceptions by default.)

In [19]:
# Total syllables per article (tokens re-joined with spaces), then the per-word average.
joined_tokens = df['tokenized_text'].map(" ".join)
df['syll_count'] = joined_tokens.map(syllables_count)
df['syll_per_word'] = df['syll_count'] / df['num_words']

7. Personal Pronouns

In [20]:
# Whole-word match for I / we / my / ours in any case, plus "us" in lower case only.
_PRONOUN_PATTERN = re.compile(r'\b(I|we|my|ours|(?-i:us))\b', re.I)


def psnl_pronoun(text):
    """Return every personal-pronoun match in ``text``, in order of appearance.

    Matching is case-insensitive except for "us", which must be lower case so that the
    upper-case "US" is not counted.
    """
    return _PRONOUN_PATTERN.findall(text)

`(?-i:us)` is an inline modifier group that switches case-insensitive matching off for `us` only, so that part of the pattern is CASE SENSITIVE. As a result, it matches the pronoun "us" but not "US".

In [21]:
# Count pronoun matches in the raw text; rows with missing text are skipped and stay NaN.
df['psnl_pronouns'] = df['Text_Contents'].map(
    lambda article: len(psnl_pronoun(article)),
    na_action='ignore',
)

8. Average Word Length

In [22]:
def text_len(text):
    """Return the average length, in characters, of the alphabetic words in ``text``.

    Every run of non-letter characters is treated as a word separator.

    Args:
        text: A string, or an iterable of string fragments that are concatenated first.

    Returns:
        Total characters divided by the number of words, or 0.0 when the text contains
        no alphabetic words (this case used to raise ZeroDivisionError).
    """
    text = ''.join(text)  # leaves a str unchanged; concatenates an iterable of fragments
    words = re.sub('[^a-zA-Z]+', ' ', text).split()
    if not words:
        return 0.0
    return sum(map(len, words)) / len(words)

In [23]:
# Average word length of the raw text; rows with missing text are skipped and stay NaN.
df['avg_word_len'] = df['Text_Contents'].map(text_len, na_action='ignore')

Formatting the Output Structure

In [24]:
df.columns  # list every column computed so far before selecting the output subset

Index(['URL_ID', 'URL', 'Text_Contents', 'tokenized_text', 'num_words',
       'positive_score', 'negative_score', 'polarity_score',
       'subjectivity_score', 'num_sent', 'avg_sent_len', 'complex_words',
       'complex_words_percent', 'Fog_index', 'avg_words_per_sent',
       'syll_count', 'syll_per_word', 'psnl_pronouns', 'avg_word_len'],
      dtype='object')

In [25]:
# Keep only the columns that go into the final report, in report order.
output_columns = [
    'URL_ID',
    'URL',
    'positive_score',
    'negative_score',
    'polarity_score',
    'subjectivity_score',
    'avg_sent_len',
    'complex_words_percent',
    'Fog_index',
    'avg_words_per_sent',
    'complex_words',
    'num_words',
    'syll_per_word',
    'psnl_pronouns',
    'avg_word_len',
]
df = df[output_columns]

In [26]:
# Replace the working column names with the report headers (same order as the selection).
df.columns = [
    'URL_ID',
    'URL',
    'POSITIVE SCORE',
    'NEGATIVE SCORE',
    'POLARITY SCORE',
    'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGTH',
]
df.head()


Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,66,34,0.32,0.1,12.4,0.25,5.06,12.42,246,969,2.303406,1.0,5.566338
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,58,37,0.22,0.17,6.8,0.23,2.812,6.79,126,550,2.156364,7.0,4.733425
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,65,35,0.3,0.12,9.6,0.26,3.944,9.59,217,825,2.384242,3.0,5.366647
3,40.0,https://insights.blackcoffer.com/will-machine-...,66,28,0.4,0.15,6.5,0.22,2.688,6.52,136,626,2.193291,17.0,4.794313
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,60,27,0.38,0.12,9.4,0.24,3.856,9.39,181,751,2.270306,16.0,5.028016


In [27]:
# Write the final report; index = False leaves the row index out of the file.
df.to_csv('Output-Data-Structure.csv', index = False)