# **Report Analysis - Organizing reports in the blink of an eye**


## The main idea of this project is to automate the analysis of financial reports, giving the client an upper hand in assessing a report's readability, constraint, uncertainty and complexity, as well as the proportion of sentiment present in the reports. This proof of concept will save a considerable amount of the time spent on reports, which will lead to better time management. A future version might even run a regression on the fog index, which would allow the employees to be sorted according to the difficulty of the reports.

In [None]:
!pip install english_words

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import re # for string processing
import nltk # text analytics
import time # to borrow time for request-response
import requests # to fetch the information from the desired website
import string # to use in NLP preprocessing
import pandas as pd # data analyatics
import numpy as np # one dimensional data analysis
from nltk.corpus import stopwords # stopwords
from bs4 import BeautifulSoup # web-scraping tool
from nltk.tokenize import word_tokenize, sent_tokenize # NLP tools
from nltk.stem import WordNetLemmatizer, PorterStemmer # NLP tools
from english_words import english_words_lower_alpha_set, english_words_set # NLP tools

In [None]:
# prerequisites required for text analytics: the 'punkt' tokenizer data, the WordNet corpus and the stopword corpus
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# reading the Excel file that lists the filings to analyse;
# its SECFNAME column is used later to build each filing's download URL
report = pd.read_excel('/content/drive/MyDrive/Capstone Project/cik_list.xlsx')
report.head()

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt


In [None]:
# building the full download link of every filing: archive root + relative file name
url_start = 'https://www.sec.gov/Archives/'
urls = [url_start + relative_name for relative_name in report['SECFNAME']]
report['URL'] = urls

In [None]:
# requesting the first filing with a browser-like User-Agent, then parsing the response with 'html.parser'.
# NOTE(review): SEC's EDGAR access guidance asks automated clients to identify themselves in the
# User-Agent header - confirm that this browser string is acceptable.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61/63 Safari/537.36'}
# the timeout keeps the cell from hanging forever on a stalled connection
data = requests.get('https://www.sec.gov/Archives/edgar/data/3662/0000950170-98-000413.txt', headers = headers, timeout = 30)
# fail loudly on an HTTP error instead of silently analysing an error page
data.raise_for_status()
page = data.text
text = BeautifulSoup(page, 'html.parser')
text

-----BEGIN PRIVACY-ENHANCED MESSAGE-----
Proc-Type: 2001,MIC-CLEAR
Originator-Name: webmaster@www.sec.gov
Originator-Key-Asymmetric:
 MFgwCgYEVQgBAQICAf8DSgAwRwJAW2sNKK9AVtBzYZmr6aGjlWyK3XmZv3dTINen
 TWSM7vrzLADbmYQaionwg5sDW3P6oaM5D3tdezXMm7z1T+B+twIDAQAB
MIC-Info: RSA-MD5,RSA,
 EvPdKfnjzBIjWkEk2RgNCk1/52qXomHpN+LDwL/XTT/XBuAzk70AYYrsxlQbyiqr
 V5559QRyTgPe9PfVt0db9Q==

<sec-document>0000950170-98-000413.txt : 19980309
<sec-header>0000950170-98-000413.hdr.sgml : 19980309
ACCESSION NUMBER:		0000950170-98-000413
CONFORMED SUBMISSION TYPE:	10-K405
PUBLIC DOCUMENT COUNT:		21
CONFORMED PERIOD OF REPORT:	19971228
FILED AS OF DATE:		19980306
SROS:			NYSE

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			SUNBEAM CORP/FL/
		CENTRAL INDEX KEY:			0000003662
		STANDARD INDUSTRIAL CLASSIFICATION:	ELECTRIC HOUSEWARES &amp; FANS [3634]
		IRS NUMBER:				251638266
		STATE OF INCORPORATION:			DE
		FISCAL YEAR END:			1229

	FILING VALUES:
		FORM TYPE:		10-K405
		SEC ACT:		
		SEC FILE NUMBER:	001-00052

In [None]:
# checking the type of the scraped data: `.text` is the plain-text content of the parsed document
type(text.text)

str

In [None]:
# WordNet lemmatizer, used during cleaning to reduce every word to its base form
lem = WordNetLemmatizer()

In [None]:
# cleaning steps for text data.
# (raw strings keep regex escapes such as \d and \s from being read as invalid string escapes)
clean_text = re.sub(r'\d+', ' ', text.text) # replacing every run of digits with a blank space
clean_text = re.sub(r'\n+', ' ', clean_text) # replacing new-line markers with a blank space
clean_text = re.sub(r'\t+', ' ', clean_text) # replacing tab markers with a blank space
clean_text = clean_text.lower() # transforming to lowercase
clean_text = re.sub(r'[^a-z]', ' ', clean_text) # replacing everything that is not a letter with a blank space
clean_text = re.sub(r'\s+', ' ', clean_text) # collapsing runs of white space into a single blank space
# tokenizing once, then in a single pass: converting each word to its root form, keeping only
# the lemmas that are known English words and that are at least two characters long
clean_text = ' '.join(
  lemma
  for lemma in (lem.lemmatize(word) for word in word_tokenize(clean_text))
  if len(lemma) >= 2 and lemma in english_words_lower_alpha_set
)
clean_text

'begin privacy message type clear name sec key info md accession type public document count period of of date company data company name sunbeam corp fl central index key standard industrial electric fan irs state of de fiscal year end value form type sec act sec file film business address street south congress avenue street suite city beach state fl zip business phone mail address street south congress avenue street suite city beach state fl zip company name sunbeam company inc de date of name change state and exchange commission washington form mark one annual pursuant to section or of the exchange act of for the fiscal year december or transition pursuant to section or of the exchange act of no fee for the transition period from to commission file sunbeam sunbeam exact name of registrant in it delaware state or other jurisdiction of or congress avenue suite beach florida address of principal executive office zip code registrant telephone area code pursuant to section of the act title

In [None]:
# assembling the different types of stopwords into one lower-cased list
stopword_dir = '/content/drive/MyDrive/Capstone Project/'
stopword_files = [
  'stopWords_Auditor.txt',
  'stopWords_Currencies.txt',
  'stopWords_DatesandNumbers.txt',
  'stopWords_GenericLong.txt',
  'stopWords_Geographic.txt',
  'stopWords_Names.txt',
]
stopword = []
for file_name in stopword_files:
  # explicit encoding: the lists contain non-ASCII entries (e.g. 'córdoba'),
  # so the result must not depend on the platform's default encoding
  with open(stopword_dir + file_name, encoding='utf-8') as stopword_file:
    stopword.extend(stopword_file.read().lower().split())
print(stopword)

['ernst', 'young', 'deloitte', 'touche', 'kpmg', 'pricewaterhousecoopers', 'pricewaterhouse', 'coopers', 'afghani', 'ariary', 'baht', 'balboa', 'birr', 'bolivar', 'boliviano', 'cedi', 'colon', 'córdoba', 'dalasi', 'denar', 'dinar', 'dirham', 'dobra', 'dong', 'dram', 'escudo', 'euro', 'florin', 'forint', 'gourde', 'guarani', 'gulden', 'hryvnia', 'kina', 'kip', 'konvertibilna', 'marka', 'koruna', 'krona', 'krone', 'kroon', 'kuna', 'kwacha', 'kwanza', 'kyat', 'lari', 'lats', 'lek', 'lempira', 'leone', 'leu', 'lev', 'lilangeni', 'lira', 'litas', 'loti', 'manat', 'metical', 'naira', 'nakfa', 'new', 'lira', 'new', 'sheqel', 'ngultrum', 'nuevo', 'sol', 'ouguiya', 'pataca', 'peso', 'pound', 'pula', 'quetzal', 'rand', 'real', 'renminbi', 'rial', 'riel', 'ringgit', 'riyal', 'ruble', 'rufiyaa', 'rupee', 'rupee', 'rupiah', 'shilling', 'som', 'somoni', 'special', 'drawing', 'rights', 'taka', 'tala', 'tenge', 'tugrik', 'vatu', 'won', 'yen', 'zloty', 'hundred', 'thousand', 'million', 'billion', 'tril

In [None]:
# number of words in the cleaned data
len(clean_text.split())

127294

In [None]:
# removing stopwords from the corpus
# (a set gives constant-time lookups; scanning the whole stopword list for every token does not)
stopword_lookup = set(stopword)
clean_text = ' '.join([word for word in word_tokenize(clean_text) if word not in stopword_lookup])
clean_text

'begin privacy message type clear sec info md accession type public document count period company data company sunbeam corp fl central index standard industrial electric fan irs de fiscal end form type sec act sec file film business address congress avenue suite fl zip business phone mail address congress avenue suite fl zip company sunbeam company de change exchange commission form pursuant section exchange act fiscal transition pursuant section exchange act transition period commission file sunbeam sunbeam exact registrant jurisdiction congress avenue suite address principal executive office zip code registrant telephone area code pursuant section act title class exchange common par exchange pursuant section act check registrant section exchange act period registrant wa file subject past check delinquent pursuant item section chapter registrant knowledge definitive proxy information part form form aggregate market class registrant affiliate wa share registrant common document proxy s

In [None]:
# number of words in the cleaned corpus once the stopwords are removed
len(clean_text.split())

40504

In [None]:
# reading the positive words
# (explicit encoding so the result does not depend on the platform's default)
with open('/content/drive/MyDrive/Capstone Project/positiveWords.txt', encoding='utf-8') as file_7:
  positive = file_7.read()
# removing stopwords from the positive words; the set makes every lookup constant-time
stopword_lookup = set(stopword)
positivewords = [word for word in word_tokenize(positive) if word not in stopword_lookup]
positivewords[:5]

['a+', 'abound', 'abounds', 'abundance', 'abundant']

In [None]:
# reading the negative words
# (explicit encoding so the result does not depend on the platform's default)
with open('/content/drive/MyDrive/Capstone Project/negativeWords.txt', encoding='utf-8') as file_8:
  negative = file_8.read()
# removing stopwords from the negative words; the set makes every lookup constant-time
stopword_lookup = set(stopword)
negativewords = [word for word in word_tokenize(negative) if word not in stopword_lookup]
negativewords[:5]

['abnormal', 'abolish', 'abominable', 'abominably', 'abominate']

In [None]:
# calculating the positive score: one point for every corpus token found among the positive words
positive_lookup = set(positivewords) # constant-time membership instead of scanning the list per token
positivescore = sum(1 for word in word_tokenize(clean_text) if word in positive_lookup)
positivescore

2004

In [None]:
# calculating the negative score: one point for every corpus token found among the negative words
negative_lookup = set(negativewords) # constant-time membership instead of scanning the list per token
negativescore = sum(1 for word in word_tokenize(clean_text) if word in negative_lookup)
negativescore

1207

In [None]:
# calculating the polarity score: (positive - negative) / (positive + negative);
# the tiny constant in the denominator avoids a division by zero when no sentiment word was found
polarityscore = (positivescore - negativescore) / ((positivescore + negativescore) + 0.000001)
polarityscore

0.2482092805206449

In [None]:
# calculating the subjectivity score: share of the corpus tokens that are positive or negative words;
# the tiny constant in the denominator avoids a division by zero on an empty corpus
subjectivityscore = (positivescore + negativescore) / ((len(word_tokenize(clean_text))) + 0.000001)
subjectivityscore

0.07927612087499318

In [None]:
# text processing for the sentence-level (readability) metrics
text_clean = text.text.lower()
text_clean = re.sub(r'[^a-z.]',' ', text_clean) # keeping only letters and periods (the sentence markers)
text_clean = re.sub(r'\s+',' ', text_clean) # collapsing runs of white space
# dropping one-letter tokens but keeping the '.' tokens that word_tokenize splits off:
# filtering those out as well erases the sentence boundaries the split below relies on
text_clean = ' '.join([word for word in word_tokenize(text_clean) if len(word)>=2 or word == '.'])
text_clean = re.split(r'\.+', text_clean) # one entry per sentence
text_clean = [sent.strip() for sent in text_clean]
text_clean = [sent for sent in text_clean if len(sent.split()) > 3] # discarding fragments of three words or fewer
text_clean = '. '.join(text_clean)
text_clean = ' '.join([word for word in text_clean.split() if len(word)>=2])
# removing the residue of the message header that precedes this particular filing
text_clean = re.sub('gov originator key asymmetric mfgwcgyevqgbaqicaf dsgawrwjaw snkk avtbzyzmr agjlwyk xmzv dtinen twsm vrzladbmyqaionwg sdw oam tdezxmm twidaqab mic info rsa md rsa evpdkfnjzbijwkek rgnck qxomhpn ldwl xtt xbuazk ayyrsxlqbyiqr qrytgpe pfvt db.', '', text_clean)
text_clean

'begin privacy enhanced message proc type mic clear originator name webmaster www.  sgml accession number conformed submission type public document count conformed period of report filed as of date sros nyse filer company data company conformed name sunbeam corp fl central index key standard industrial classification electric housewares fans irs number state of incorporation de fiscal year end filing values form type sec act sec file number film number business address street south congress avenue street suite city delray beach state fl zip business phone mail address street south congress avenue street suite city delray beach state fl zip former company former conformed name sunbeam oster company inc de date of name change united states securities and exchange commission washington d. form mark one annual report pursuant to section or of the securities exchange act of for the fiscal year ended december or transition report pursuant to section or of the securities exchange act of no fe

In [None]:
# calculating the average sentence length:
# number of whitespace-separated words divided by the number of '.'-separated segments
averagesentencelength = len(text_clean.split())/len(text_clean.split('.'))
averagesentencelength

191.50480769230768

In [None]:
# calculating the complex words score: a token counts as complex when it contains
# more than two vowels (the vowel count stands in for a syllable count)
complexwords = sum(
  1
  for token in word_tokenize(text_clean)
  if sum(1 for letter in token if letter in 'aeiou') > 2
)
complexwords

48540

In [None]:
# calculating the percentage of complex words among all tokens of the sentence-level text
percentage_of_complex_word = (complexwords / len(word_tokenize(text_clean))) * 100
percentage_of_complex_word

30.38402553910676

In [None]:
# calculating the fog index: 0.4 * (average sentence length + percentage of complex words)
fog_index = 0.4 * (averagesentencelength + percentage_of_complex_word)
fog_index

88.75553329256577

In [None]:
# reading the dictionary of constraining words (its 'Word' column is normalized in the next step)
constrain = pd.read_excel('/content/drive/MyDrive/Capstone Project/constraining_dictionary.xlsx')
constrain.head()

In [None]:
# normalizing the vocabulary: lower-casing every constraining word so it can match the lower-cased corpus
constrainwords = list(map(str.lower, constrain['Word']))
constrainwords[:5]

['abide', 'abiding', 'bound', 'bounded', 'commit']

In [None]:
# reading the dictionary of uncertainty words (its 'Word' column is normalized in the next step)
uncertain = pd.read_excel('/content/drive/MyDrive/Capstone Project/uncertainty_dictionary.xlsx')
uncertain.head()

Unnamed: 0,Word
0,ABEYANCE
1,ABEYANCES
2,ALMOST
3,ALTERATION
4,ALTERATIONS


In [None]:
# normalizing the vocabulary: lower-casing every uncertainty word so it can match the lower-cased corpus
uncertainwords = list(map(str.lower, uncertain['Word']))
uncertainwords[:5]

['abeyance', 'abeyances', 'almost', 'alteration', 'alterations']

In [None]:
# calculating the constraining score: one point for every corpus token found among the constraining words
constrain_lookup = set(constrainwords) # constant-time membership instead of scanning the list per token
constrainingscore = sum(1 for word in word_tokenize(clean_text) if word in constrain_lookup)
constrainingscore

401

In [None]:
# calculating the uncertainty score: one point for every corpus token found among the uncertainty words
uncertain_lookup = set(uncertainwords) # constant-time membership instead of scanning the list per token
uncertainityscore = sum(1 for word in word_tokenize(clean_text) if word in uncertain_lookup)
uncertainityscore

123

In [None]:
# getting NLTK's stopword list for English only
stop = stopwords.words('english')
stop[:5]

['i', 'me', 'my', 'myself', 'we']

In [None]:
# calculating the words in the corpus, ignoring NLTK's English stopwords
english_stop_lookup = set(stop) # constant-time membership instead of scanning the list per token
wordcount = [word for word in word_tokenize(clean_text) if word not in english_stop_lookup]
len(wordcount)

40504

In [None]:
# positive word proportion: positive score divided by the number of words in the corpus
positive_word_proportion = positivescore / len(wordcount)
positive_word_proportion

0.04947659490420699

In [None]:
# negative word proportion: negative score divided by the number of words in the corpus
negative_word_proportion = negativescore / len(wordcount)
negative_word_proportion

0.02979952597274343

In [None]:
# uncertainty word proportion: uncertainty score divided by the number of words in the corpus
uncertainty_word_proportion = uncertainityscore / len(wordcount)
uncertainty_word_proportion

0.0030367371123839622

In [None]:
# constraining word proportion: constraining score divided by the number of words in the corpus
constraining_word_proportion = constrainingscore / len(wordcount)
constraining_word_proportion

0.009900256764763974

In [None]:
# constraining words all over the report, counted on the sentence-level text
constrain_lookup = set(constrainwords) # constant-time membership instead of scanning the list per token
constraining_words_whole_report = [word for word in word_tokenize(text_clean) if word in constrain_lookup]
len(constraining_words_whole_report)

1483

In [None]:
# Columns written to `report`, in the order in which the metrics are computed and printed.
_METRIC_COLUMNS = (
    'positive score',
    'negative score',
    'polarity score',
    'subjectivity score',
    'average sentence length',
    'complex words',
    'percentage of complex word',
    'fog index',
    'constraining score',
    'uncertainity score',
    'word count',
    'positive word proportion',
    'negative word proportion',
    'uncertainty word proportion',
    'constraining word proportion',
    'constraining words whole report',
)

# Residue of the message header that precedes one specific filing; used as a regex,
# so it is simply a no-op for documents that do not contain it.
_HEADER_RESIDUE = 'gov originator key asymmetric mfgwcgyevqgbaqicaf dsgawrwjaw snkk avtbzyzmr agjlwyk xmzv dtinen twsm vrzladbmyqaionwg sdw oam tdezxmm twidaqab mic info rsa md rsa evpdkfnjzbijwkek rgnck qxomhpn ldwl xtt xbuazk ayyrsxlqbyiqr qrytgpe pfvt db.'


def _fetch_filing_text(url):
    """Download one filing and return its plain text with the markup stripped."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'}
    # The timeout keeps one stalled connection from blocking the whole batch.
    data = requests.get(url, headers=headers, timeout=30)
    time.sleep(5)  # pause between consecutive requests
    # An HTTP error page must not be scored as if it were a filing.
    data.raise_for_status()
    return BeautifulSoup(data.text, 'html.parser').text


def _scoring_tokens(raw_text, stopword_lookup):
    """Return the lemmatized, dictionary-checked, stopword-free tokens used for the word scores."""
    # Every non-letter (digits, newlines, tabs, punctuation) becomes a blank,
    # then runs of blanks are collapsed.
    cleaned = re.sub(r'[^a-z]', ' ', raw_text.lower())
    cleaned = re.sub(r'\s+', ' ', cleaned)
    lemmas = (lem.lemmatize(token) for token in word_tokenize(cleaned))
    return [
        lemma
        for lemma in lemmas
        if len(lemma) >= 2
        and lemma in english_words_lower_alpha_set
        and lemma not in stopword_lookup
    ]


def _sentence_text(raw_text):
    """Return the filing as lower-case sentences joined by '. ' for the readability metrics."""
    text_clean = re.sub(r'[^a-z.]', ' ', raw_text.lower())
    text_clean = re.sub(r'\s+', ' ', text_clean)
    # Drop one-letter tokens but keep the '.' tokens that word_tokenize splits off:
    # filtering those out as well erases the sentence boundaries needed by the split below.
    text_clean = ' '.join(
        token for token in word_tokenize(text_clean) if len(token) >= 2 or token == '.'
    )
    sentences = (sentence.strip() for sentence in re.split(r'\.+', text_clean))
    # Fragments of three words or fewer are not treated as sentences.
    text_clean = '. '.join(sentence for sentence in sentences if len(sentence.split()) > 3)
    text_clean = ' '.join(word for word in text_clean.split() if len(word) >= 2)
    return re.sub(_HEADER_RESIDUE, '', text_clean)


def _count_matches(tokens, vocabulary):
    """Count the tokens that belong to *vocabulary* (one point per occurrence)."""
    return sum(1 for token in tokens if token in vocabulary)


def _is_complex(token):
    """Tell whether *token* has more than two vowels (the vowel count stands in for syllables)."""
    return sum(1 for letter in token if letter in 'aeiou') > 2


def _ratio(numerator, denominator):
    """Divide, returning 0.0 instead of raising when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def analysis_generator(urls):
    """Download every filing in *urls*, score it and store the metrics in `report`.

    For each URL the filing is fetched, cleaned and scored; every metric is
    printed as it was before (URL first, a line of asterisks last) and, once
    all filings are processed, written to the notebook-level `report`
    DataFrame as one new column per metric.

    Relies on these notebook-level names: `lem`, `stopword`, `positivewords`,
    `negativewords`, `constrainwords`, `uncertainwords`, `stop`,
    `english_words_lower_alpha_set` and `report`.

    Args:
        urls: iterable of filing URLs, one per row of `report` and in the
            same order.

    Returns:
        True once every column has been written.

    Raises:
        requests.RequestException: a download failed, timed out or returned
            an HTTP error status; `report` is left untouched in that case.
        ValueError: raised by pandas when the number of URLs differs from the
            number of rows in `report`.
    """
    # Sets give constant-time membership tests; the word lists are long and
    # every token of every filing is looked up in them.
    stopword_lookup = set(stopword)
    positive_lookup = set(positivewords)
    negative_lookup = set(negativewords)
    constrain_lookup = set(constrainwords)
    uncertain_lookup = set(uncertainwords)
    english_stop_lookup = set(stop)

    columns = {name: [] for name in _METRIC_COLUMNS}
    for url in urls:
        print(url)
        raw_text = _fetch_filing_text(url)

        # Word-level corpus: sentiment, constraining and uncertainty scores.
        tokens = _scoring_tokens(raw_text, stopword_lookup)
        positivescore = _count_matches(tokens, positive_lookup)
        negativescore = _count_matches(tokens, negative_lookup)
        # The tiny constant avoids a division by zero when nothing matched.
        polarityscore = (positivescore - negativescore) / ((positivescore + negativescore) + 0.000001)
        subjectivityscore = (positivescore + negativescore) / (len(tokens) + 0.000001)
        constrainingscore = _count_matches(tokens, constrain_lookup)
        uncertainityscore = _count_matches(tokens, uncertain_lookup)
        wordcount = sum(1 for token in tokens if token not in english_stop_lookup)

        # Sentence-level text: readability metrics.
        text_clean = _sentence_text(raw_text)
        sentence_tokens = word_tokenize(text_clean)
        # Words are counted by whitespace so that the '.' tokens produced by
        # word_tokenize are not counted as words; str.split('.') always
        # returns at least one segment, so the divisor is never zero.
        averagesentencelength = len(text_clean.split()) / len(text_clean.split('.'))
        complexwords = sum(1 for token in sentence_tokens if _is_complex(token))
        percentage_of_complex_word = _ratio(complexwords, len(sentence_tokens)) * 100
        fog_index = 0.4 * (averagesentencelength + percentage_of_complex_word)
        constraining_words_whole_report = _count_matches(sentence_tokens, constrain_lookup)

        # Same order as _METRIC_COLUMNS.
        metrics = (
            positivescore,
            negativescore,
            polarityscore,
            subjectivityscore,
            averagesentencelength,
            complexwords,
            percentage_of_complex_word,
            fog_index,
            constrainingscore,
            uncertainityscore,
            wordcount,
            _ratio(positivescore, wordcount),
            _ratio(negativescore, wordcount),
            _ratio(uncertainityscore, wordcount),
            _ratio(constrainingscore, wordcount),
            constraining_words_whole_report,
        )
        for name, value in zip(_METRIC_COLUMNS, metrics):
            print(value)
            columns[name].append(value)
        print('**********************************')

    # Written only after every filing succeeded, so a failure never leaves
    # `report` with a partial set of columns.
    for name in _METRIC_COLUMNS:
        report[name] = columns[name]
    return True

In [None]:
analysis_generator(urls) # scrape and score every filing in `urls`; the metrics are added to `report` as new columns

https://www.sec.gov/Archives/edgar/data/3662/0000950170-98-000413.txt
2004
1207
0.2482092805206449
0.07927612087499318
192.01322115384616
48540
30.38402553910676
88.95889867718118
401
123
40504
0.04947659490420699
0.02979952597274343
0.0030367371123839622
0.009900256764763974
1483
**********************************
https://www.sec.gov/Archives/edgar/data/3662/0000950170-98-001001.txt
1170
763
0.21055354360550774
0.07526965460553446
276.68684210526317
32522
30.931796349663788
123.04745538197079
309
118
25681
0.045558973560219616
0.029710681048245785
0.004594836649663175
0.012032241735134924
1042
**********************************
https://www.sec.gov/Archives/edgar/data/3662/0000950172-98-000783.txt
5
1
0.6666665555555741
0.02564102553144861
164.8
243
29.49029126213592
77.71611650485437
0
0
234
0.021367521367521368
0.004273504273504274
0.0
0.0
5
**********************************
https://www.sec.gov/Archives/edgar/data/3662/0000950170-98-002145.txt
924
618
0.19844357963784462
0.076227198

True

In [None]:
# allow up to 30 columns to be shown when a DataFrame is displayed
pd.set_option('display.max_columns', 30)

In [None]:
# saving the data (without the DataFrame index) for further analysis
report.to_csv('/content/drive/MyDrive/Capstone Project/report.csv', index = False)