In [3]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [5]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
import re

In [36]:
class sentiment_analysis:
    def __init__(self):
        
        self.input_dataframe= self.get_input_dataframe()
        self.output_dataframe = self.get_output_dataframe()
        
        self.positive_words=self.get_positive_words()
        self.negative_words=self.get_negative_words()
        
        self.stop_words= self.get_stop_words()
        
        self.stemmer = PorterStemmer()
        
        self.main_code = self.main_code(0)
        
        #returns the input data frame 
        
    def get_input_dataframe(self):
        
        input_data=pd.read_excel('C:/Sentiment_analysis_project/input.xlsx')
        return input_data
        
        # returns the structure of the output data frame. 
        
    def get_output_dataframe(self):
        
        output_data = pd.read_excel('C:/Sentiment_analysis_project/Output_structure.xlsx')
        return output_data
    
        # main code updates the output data frame based on the calculated values. It interacts with the method named 'get_all_output_features'
    
    def main_code(self,i):
        
        self.index=i
        url=self.input_dataframe.loc[self.index,'URL']
        
        response = requests.get(url)
        html_content = response.text
        soup = BeautifulSoup(html_content, "html.parser")

        output_filename = "C:/Sentiment_analysis_project/Extracted_text/extracted_paragraphs_{}.txt"

        output_file = output_filename.format(self.index)

        min_paragraph_length = 200

        paragraphs = soup.find_all('p')

        words=[]

        with open(output_file, "w") as file:
            
            count_of_sentences = 0
            
            count_of_words_with_token = 0
            
            total_characters=0
            
            count_of_pronouns =0
            
            

            for p in paragraphs:
                
                para_text=p.get_text()
                
                if 'We provide intelligence, accelerate innovation and implement technology' not in para_text and len(para_text)>min_paragraph_length: 
                    
                    pronounRegex = re.compile(r'I|we|my|ours|us',re.I)
                    pronouns = pronounRegex.findall(para_text)
                    
                    count_of_pronouns +=len(pronouns)
                    
                    count_of_sentences += len(sent_tokenize(para_text))
                    
                    count_of_words_with_token += len(word_tokenize(para_text))
                    
                    file.write(para_text + "\n")

                    paragraph_words=para_text.split(' ')
                    
                    for word in paragraph_words:
    
                        word=word.replace(".","")
                        word=word.replace('"',"")
                        word=word.replace(',',"")

                        if word.lower() not in self.stop_words and len(word)>=1:
                        
                            total_characters+=len(word.lower())
                            words.append(word.lower())
        
        
        
        name_of_output_features = ['POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
                                   'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
                                   'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
                                   'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
                                   'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']
        
        # calling the 'get_all_output_features'
        values_of_output_features = self.get_all_output_features(words,count_of_sentences,count_of_words_with_token,total_characters,count_of_pronouns)
        
        # updating the desried feaatures for ith row
        for j in range (len(name_of_output_features)):
            
            self.output_dataframe.loc[i,name_of_output_features[j]]=values_of_output_features[j]
        
        if i<len(self.input_dataframe)-1:
            
        #calling the main_code method recursively for every ith row of the dataframe
            self.main_code(i+1)
            
        # after iterating through everyrow and updating all the values we call this method.
        #This will return the final output data frame
        else:
            final_output_dataframe = self.get_final_output_dataframe()
        
            return final_output_dataframe
            
    # get the list of positive words
    
    def get_positive_words(self):
        
        self.positive_words_list=[]
        
        positive_words_filename = "C:/Sentiment_analysis_project/positive-words.txt"
        
        with open(positive_words_filename,"r",encoding='utf-8') as file:
            
            for line in file:
                
                positive_words_inline= line.split()
                for positive_word in positive_words_inline:
                    self.positive_words_list.append(positive_word.lower()) 
                    
        return self.positive_words_list
    
    #get the list of negative words
    
    def get_negative_words(self):
        
        self.negative_words_list=[]
        negative_words_filename = "C:/Sentiment_analysis_project/negative-words.txt"
        
        with open(negative_words_filename,"r", encoding='utf-8') as file:
            
                
            for line in file:
                    
                negative_words_inline= line.split()
                for negative_word in negative_words_inline:
                    self.negative_words_list.append(negative_word.lower())
                    
        return self.negative_words_list 
    
    #get the list of stop words
    
    def get_stop_words(self) :
        
        self.stop_words_list=[]
        
        combined_stop_words_filename = "C:/Sentiment_analysis_project/Stop_words/combined_stop_words.txt"
        
        with open(combined_stop_words_filename,"r") as file:
            
            for line in file:
                
                combined_stop_words_inline= line.split()
                
                for stop_word in combined_stop_words_inline:
                    
                    self.stop_words_list.append(stop_word.lower())
                    
        currency_stop_words_filename = "C:/Sentiment_analysis_project/Stop_words/StopWords_Currencies.txt"
        
        with open(currency_stop_words_filename,"r") as file:
            
            for line in file:
                
                currency_stop_words_inline= line.rstrip('\n').split('|')
                
                for stop_word in currency_stop_words_inline:
                    
                    self.stop_words_list.append(stop_word.lower())
        
        return self.stop_words_list
    
    # this methods get a list of output features for ith row.It takes the input from the 'main_code' method.
    #This method also uses the output from get_count_of_complex_and_syllables method
    def get_all_output_features(self,word_list_of_ith_url,count_of_sentences,count_of_words,total_useful_characters,personal_pronouns):
        
        self.words=word_list_of_ith_url
        
        positive_score=0
        negative_score=0
        subjectivity_score=0
        
        for word in self.words:
            
            if word in self.positive_words:
                positive_score+=1
                
            if word in self.negative_words:
                negative_score+=1
                
        polarity_score = ((positive_score-negative_score)/((positive_score + negative_score)+0.000001)) +0.000001
        subjectivity_score = (positive_score+negative_score)/(len(self.words)+0.000001)
        
        # calling the get_count_of_complex_and_syllables method
        
        syllabes_and_complex_count = self.get_count_of_complex_and_syllables(self.words)
        
        count_of_syllables = syllabes_and_complex_count[0]
        count_of_complex_words =syllabes_and_complex_count[1]
        
        if count_of_words==0:
            
            syllable_per_word = 0
            avg_word_length = 0
            percentage_of_complex_words = 0
            
        else:
            
            syllable_per_word = count_of_syllables/count_of_words
            avg_word_length = total_useful_characters//count_of_words
            percentage_of_complex_words = (count_of_complex_words/count_of_words)*100
        
        
        if count_of_sentences==0:
            
                avg_sentence_length    =0
                avg_words_per_sentence =0
        else:
                
            avg_sentence_length = total_useful_characters//count_of_sentences
        
            avg_words_per_sentence = count_of_words//count_of_sentences
        
        fog_index = 0.4 * (avg_sentence_length + percentage_of_complex_words)

        return [positive_score,negative_score,polarity_score,subjectivity_score,
                avg_sentence_length,percentage_of_complex_words,fog_index,avg_words_per_sentence,
               count_of_complex_words,count_of_words,syllable_per_word,personal_pronouns,avg_word_length]
    
    # this method takes the input of the word list from the get_all_output_features method
    # and returns a list of length 2 back into that method
    
    def get_count_of_complex_and_syllables(self,list_of_words):
        
        total_count_of_syllables =0
        count_of_complex_words =0
        vowels ='aeoiuy'
        
        for word in list_of_words:
            count_of_syllables_in_word =0
            
            if word[0] in vowels:
                count_of_syllables_in_word +=1
                
            if len(word)<=1:
                continue
                
            else:
                
                for word_index in range(1,len(word)):

                    if word[word_index] in  vowels and word[word_index - 1] not in vowels:

                        count_of_syllables_in_word +=1
                
            if word.endswith('e'):
                count_of_syllables_in_word -= 1
                
            if word.endswith('le'):
                count_of_syllables_in_word += 1
                
            if count_of_syllables_in_word == 0:
                count_of_syllables_in_word += 1
                
            total_count_of_syllables += count_of_syllables_in_word 
            
            if count_of_syllables_in_word >=3:
                count_of_complex_words +=1
                
        return [total_count_of_syllables,count_of_complex_words]
                        
    # This method is called in the end of 'main_code' method. This returns the final output data frame
    
    def get_final_output_dataframe(self):
        
        self.output_dataframe.to_excel('C:Sentiment_analysis_project/final_output_dataframe.xlsx')
        
        return self.output_dataframe
    
         

In [37]:
obj=sentiment_analysis()



UnicodeDecodeError: 'utf-8' codec can't decode byte 0xef in position 3551: invalid continuation byte

In [33]:
obj.get_final_output_dataframe()

NameError: name 'obj' is not defined