In [1]:
import pandas as pd
import numpy as np
import requests
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
import re
import nltk

In [2]:
#read excel file 'cik_list' 
cik_data = pd.read_excel('cik_list.xlsx')
print(cik_data.shape)
cik_data.head()

(152, 6)


Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt


In [3]:
#generating a column containing full link of the report for corresponding row
cik_data['link'] = 'https://www.sec.gov/Archives/' + cik_data['SECFNAME'].astype(str)

In [5]:
# https://pypi.org/project/requests-random-user-agent/
#this was required as I was not able to get reports from site do request limit exceeded
import requests_random_user_agent

requests.Session()

reports = []
for link in cik_data.link:
    l = requests.get(link)
    report = l.text
    soup = BeautifulSoup(report, "html.parser")
    reports.append(soup.get_text())
                   
print(len(reports))

152


### Cleaning Using Stopwords Lists

In [7]:
#read stopwords text and save it to a list stopwords 
with open('StopWords_Generic.txt','r') as x:
    StopWords = x.read()
StopWords = StopWords.split('\n')
print(len(StopWords))

121


There are 121 stop words in the list provided by SRAF

In [8]:
#function for removing stopwords
def rem_stopwords(words, StopWords):
    not_stopwords = []
    for i in words:
        if i not in StopWords:
            not_stopwords.append(i)
    return not_stopwords
    

In [9]:
# https://stackoverflow.com/a/47091490/4084039
#to preprocess text data, removing extra characters and expressions. Expanding contracted words like we'll to we will
def preprocessed(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r"\\r\\n", "", phrase)
    phrase = re.sub('[\d%/$]', '', phrase) #to remove digits
    phrase = phrase.replace('\\r', ' ') # \r, \n , \t remove from string
    phrase = phrase.replace('\\"', ' ')
    phrase = phrase.replace('\\n', ' ')
    phrase = re.sub('[^A-Za-z0-9]+', ' ', phrase) #remove special character
    return phrase

In [10]:
#removing stopwords from the text in reports
from tqdm import tqdm
texts_without_stopwords = []
# tqdm is for printing the status bar
for i in tqdm(range(len(reports))):
    text = reports[i]
    text = preprocessed(text)
    text = ' '.join(e for e in text.upper().split() if e not in StopWords)
    texts_without_stopwords.append(text.strip())
    

100%|█████████████████████████████████████████████████████████████████| 152/152 [00:20<00:00,  7.54it/s]


###  1.2 Creating Dictionary of Positive and Negative Words

In [11]:
#reading master dictionary.xlsx file of LoughranMcDonald
master_dict = pd.read_excel('LoughranMcDonald_MasterDictionary_2020.xlsx')
print(master_dict.shape)
master_dict.head(2)

(86531, 17)


Unnamed: 0,Word,Seq_num,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Strong_Modal,Weak_Modal,Constraining,Complexity,Syllables,Source
0,AARDVARK,1,312,1.42205e-08,1.335201e-08,3.700747e-06,96,0,0,0,0,0,0,0,0,2,12of12inf
1,AARDVARKS,2,3,1.367356e-10,8.882163e-12,9.362849e-09,1,0,0,0,0,0,0,0,0,2,12of12inf


In [12]:
#positive and negative word dictionary of words that are not in stopwords
PositiveWords = [x for x in master_dict[master_dict['Positive'] != 0]['Word']]
pos_dict = [i for i in PositiveWords if i not in StopWords]
print('NO. of positive words:',len(pos_dict))
 
NegativeWords = [x for x in master_dict[master_dict['Negative'] != 0]['Word']]
neg_dict = [i for i in NegativeWords if i not in StopWords]
print('NO. of negative words:',len(neg_dict))


NO. of positive words: 354
NO. of negative words: 2355


### Extracting Derived Variables

In [13]:
#function for tokenizing the text report and reports after getting tokenized
def tokenize(text):
    TokenWords = word_tokenize(text) #tokenizer
    return TokenWords

tokenized_reports = list(map(tokenize, texts_without_stopwords))

### Positive Score

In [14]:
#calculate positive_score i.e, no. of positive words in the report
positive_score = []
for i in tqdm(tokenized_reports):
    pos_score_i = 0
    for x in i:
        if x in pos_dict:
            pos_score_i += 1
    positive_score.append(pos_score_i)       

100%|█████████████████████████████████████████████████████████████████| 152/152 [00:30<00:00,  4.96it/s]


### Negative Score

In [15]:
#calculate negative_score i.e, no. of negative words in the report
negative_score = []
for i in tqdm(tokenized_reports):
    neg_score_i = 0
    for x in i:
        if x in neg_dict:
            neg_score_i -= 1
    neg_score_i = -neg_score_i
    negative_score.append(neg_score_i)

100%|█████████████████████████████████████████████████████████████████| 152/152 [03:47<00:00,  1.50s/it]


### Polarity Score

In [16]:
#polarity score = (positive score - negative score)/((positive score + negative score)+0.000001)
polarity_score = [(positive_score[i] - negative_score[i])/((positive_score[i] + negative_score[i])+ 0.000001) for i in range(len(positive_score))]

In [17]:
#function for sentence tokenizing the text report and tokenized reports
def tokenize_sent(text):
    token_sent = sent_tokenize(text) #sentence tokenizer
    return token_sent

tokenized__sent_reports = list(map(tokenize_sent, reports))

### Average Sentence Length

In [18]:
#average sentence length
avg_sent_length = [(len(tokenized_reports[i]))/(len(tokenized__sent_reports[i])) for i in range(len(tokenized_reports))]

### Complex Word Count

In [19]:
#function for complex word count

def complex_word_count(text):
    complex_word = 0
    for word in text:
        count = 0
        vowels = ['a','e','i','o','u']
        if not (len(word) > 2 and (word[-2:] == 'ES' or word[-2:] == 'ED')): # removing exceptions        
            for x in word:
                if(x.lower() in vowels):
                    count = count +1
                        
        if(count > 2):
            complex_word += 1
    return complex_word 

#complex word count
complex_word_count = list(map(complex_word_count, tokenized_reports))          

### Percentage of Complex Words

In [20]:
#percentage of complex words
per_complex_words = [(complex_word_count[i]/len(tokenized_reports[i])) for i in range(len(tokenized_reports))]

### Fog Index

In [21]:
#fog index = 0.4 * (Average Sentence Length + Percentage of Complex words)
def fog_index(average_sentance_length , percentage_complex_words):
    return 0.4*(average_sentance_length + percentage_complex_words)
fog_ind = list(map(fog_index, avg_sent_length, per_complex_words))

### Word Count

In [22]:
#word caount, already cleaned words(removed stopwords and punctuations) in tokenized words
word_count = [len(i) for i in tokenized_reports]


In [23]:
#reading excel file of uncertainity and constraining dictionary
uncertainty_dict = pd.read_excel('uncertainty_dictionary.xlsx')
constraining_dict = pd.read_excel('constraining_dictionary.xlsx')

### Uncertainity Score

In [24]:
#uncertainity score is no. of words found in reports which are available in uncertainity dictionary
uncertainity = []
for i in tqdm(tokenized_reports):
    count = 0
    for x in i:
        if x in list(uncertainty_dict['Word']):
            count += 1
    uncertainity.append(count)


100%|█████████████████████████████████████████████████████████████████| 152/152 [03:43<00:00,  1.47s/it]


### Constraining Score

In [25]:
#constraining score is no. of words found in reports which are available in constraining dictionary
constraining = []
for i in tqdm(tokenized_reports):
    count = 0
    for x in i:
        if x in list(constraining_dict['Word']):
            count += 1
    constraining.append(count)

100%|█████████████████████████████████████████████████████████████████| 152/152 [02:30<00:00,  1.01it/s]


### Positive, Negative, Uncertainty & Constraining Word Proportion

In [26]:
#positive word proportion
pos_word_prop = [(positive_score[i]/len(tokenized_reports[i])) for i in range(len(tokenized_reports))]

#negative word proportion
neg_word_prop = [(negative_score[i]/len(tokenized_reports[i])) for i in range(len(tokenized_reports))]

#uncertainty word proportion
uncertainty_word_prop = [(uncertainity[i]/len(tokenized_reports[i])) for i in range(len(tokenized_reports))]

#constraining word proportion score
constraining_word_prop = [(constraining[i]/len(tokenized_reports[i])) for i in range(len(tokenized_reports))]

###  Constraining Words for Whole report

In [27]:
# Constraining Words for Whole report, which is similar to constraining score
constraining_words_report = constraining

In [31]:
#creating columns in the cik list of the variables as required
cik_data['positive_score'] = positive_score
cik_data['negative_score'] = negative_score
cik_data['polarity_score'] = polarity_score
cik_data['average_sentence_length'] = avg_sent_length
cik_data['percentage_of_complex_words'] = per_complex_words
cik_data['fog_index'] = fog_ind
cik_data['complex_word_count'] = complex_word_count
cik_data['word_count'] = word_count
cik_data['uncertainty_score'] = uncertainity
cik_data['constraining_score'] = constraining
cik_data['positive_word_proportion'] = pos_word_prop
cik_data['negative_word_proportion'] = neg_word_prop
cik_data['uncertainty_word_proportion'] = uncertainty_word_prop
cik_data['constraining_word_proportion'] = constraining_word_prop
cik_data['constraining_words_whole_report'] = constraining_words_report
output_data = cik_data.drop('link', axis = 1)
print(output_data.shape)
output_data.head()

(152, 21)


Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME,positive_score,negative_score,polarity_score,average_sentence_length,...,fog_index,complex_word_count,word_count,uncertainty_score,constraining_score,positive_word_proportion,negative_word_proportion,uncertainty_word_proportion,constraining_word_proportion,constraining_words_whole_report
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt,1065,2997,-0.475628,23.506687,...,9.569851,38934,93157,940,1487,0.011432,0.032171,0.01009,0.015962,1487
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt,585,1477,-0.43259,24.213942,...,9.858303,26098,60438,859,1046,0.009679,0.024438,0.014213,0.017307,1046
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt,2,8,-0.6,29.05,...,11.75494,196,581,9,5,0.003442,0.013769,0.015491,0.008606,5
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt,368,1442,-0.59337,18.954509,...,7.755565,20453,47083,552,716,0.007816,0.030627,0.011724,0.015207,716
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt,3,8,-0.454545,27.88,...,11.279977,223,697,10,4,0.004304,0.011478,0.014347,0.005739,4


In [32]:
#Exporting csv file for the output of output_data dataframe
output_data.to_csv('Output.csv', index = False)

In [None]:
#polarity score gives info about if a text is negative or positive

