In [106]:
# Import requests to retrive Web Urls example HTML. TXT 
import requests

# Import BeautifulSoup
from bs4 import BeautifulSoup

# import re module for REGEXes
import re

# import pandas
import pandas as pd

In [107]:
myfile = open("C:\\Users\\Petros\\Desktop\\sec_filing\\sec_edgar_filings\\AAPL\\10-K\\0000320193-18-000145.txt","r") #returns file handle
raw_10k = myfile.read()

print(raw_10k[0:1300])

<SEC-DOCUMENT>0000320193-18-000145.txt : 20181105
<SEC-HEADER>0000320193-18-000145.hdr.sgml : 20181105
<ACCEPTANCE-DATETIME>20181105080140
ACCESSION NUMBER:		0000320193-18-000145
CONFORMED SUBMISSION TYPE:	10-K
PUBLIC DOCUMENT COUNT:		88
CONFORMED PERIOD OF REPORT:	20180929
FILED AS OF DATE:		20181105
DATE AS OF CHANGE:		20181105

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			APPLE INC
		CENTRAL INDEX KEY:			0000320193
		STANDARD INDUSTRIAL CLASSIFICATION:	ELECTRONIC COMPUTERS [3571]
		IRS NUMBER:				942404110
		STATE OF INCORPORATION:			CA
		FISCAL YEAR END:			0930

	FILING VALUES:
		FORM TYPE:		10-K
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	001-36743
		FILM NUMBER:		181158788

	BUSINESS ADDRESS:	
		STREET 1:		ONE APPLE PARK WAY
		CITY:			CUPERTINO
		STATE:			CA
		ZIP:			95014
		BUSINESS PHONE:		(408) 996-1010

	MAIL ADDRESS:	
		STREET 1:		ONE APPLE PARK WAY
		CITY:			CUPERTINO
		STATE:			CA
		ZIP:			95014

	FORMER COMPANY:	
		FORMER CONFORMED NAME:	APPLE COMPUTER INC
		DATE OF NA

In [108]:
# Regex to find <DOCUMENT> tags
doc_start_pattern = re.compile(r'<DOCUMENT>')
doc_end_pattern = re.compile(r'</DOCUMENT>')
# Regex to find <TYPE> tag prceeding any characters, terminating at new line
type_pattern = re.compile(r'<TYPE>[^\n]+')

In [109]:
### There are many <Document> Tags in this text file, each as specific exhibit like 10-K, EX-10.17 etc
### First filter will give us document tag start <end> and document tag end's <start> 
### We will use this to later grab content in between these tags
doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]

### Type filter is interesting, it looks for <TYPE> with Not flag as new line, ie terminare there, with + sign
### to look for any char afterwards until new line \n. This will give us <TYPE> followed Section Name like '10-K'
### Once we have have this, it returns String Array, below line will with find content after <TYPE> ie, '10-K' 
### as section names
doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]

In [110]:
document = {}

# Create a loop to go through each section type and save only the 10-K section in the dictionary
for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
    if doc_type == '10-K':
        document[doc_type] = (raw_10k[doc_start:doc_end])

In [111]:
# display excerpt the document
document['10-K'][0:500]

'\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>a10-k20189292018.htm\n<DESCRIPTION>10-K\n<TEXT>\n<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">\n<html>\n\t<head>\n\t\t<!-- Document created using Wdesk 1 -->\n\t\t<!-- Copyright 2018 Workiva -->\n\t\t<title>Document</title>\n\t</head>\n\t<body style="font-family:Times New Roman;font-size:10pt;">\n<div><a name="s3540C27286EF5B0DA103CC59028B96BE"></a></div><div style="line-height:120%;text-align:center;font-size:10pt;"><div sty'

In [112]:
# Write the regex
regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')

# Use finditer to math the regex
matches = regex.finditer(document['10-K'])

# Write a for loop to print the matches
for match in matches:
    print(match)

<re.Match object; span=(38318, 38327), match='>Item 1A.'>
<re.Match object; span=(39347, 39356), match='>Item 1B.'>
<re.Match object; span=(46148, 46156), match='>Item 7.'>
<re.Match object; span=(47281, 47290), match='>Item 7A.'>
<re.Match object; span=(48357, 48365), match='>Item 8.'>
<re.Match object; span=(119131, 119140), match='>Item 1A.'>
<re.Match object; span=(197023, 197032), match='>Item 1B.'>
<re.Match object; span=(333318, 333326), match='>Item 7.'>
<re.Match object; span=(729984, 729993), match='>Item 7A.'>
<re.Match object; span=(741774, 741782), match='>Item 8.'>


In [113]:
# Matches
matches = regex.finditer(document['10-K'])

# Create the dataframe
test_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])

test_df.columns = ['item', 'start', 'end']
test_df['item'] = test_df.item.str.lower()

# Display the dataframe
test_df.head()

Unnamed: 0,item,start,end
0,>item 1a.,38318,38327
1,>item 1b.,39347,39356
2,>item 7.,46148,46156
3,>item 7a.,47281,47290
4,>item 8.,48357,48365


In [114]:
# Get rid of unnesesary charcters from the dataframe
test_df.replace('&#160;',' ',regex=True,inplace=True)
test_df.replace('&nbsp;',' ',regex=True,inplace=True)
test_df.replace(' ','',regex=True,inplace=True)
test_df.replace('\.','',regex=True,inplace=True)
test_df.replace('>','',regex=True,inplace=True)

# display the dataframe
test_df.head()

Unnamed: 0,item,start,end
0,item1a,38318,38327
1,item1b,39347,39356
2,item7,46148,46156
3,item7a,47281,47290
4,item8,48357,48365


In [115]:
# Drop duplicates
pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')

# Display the dataframe
pos_dat

Unnamed: 0,item,start,end
5,item1a,119131,119140
6,item1b,197023,197032
7,item7,333318,333326
8,item7a,729984,729993
9,item8,741774,741782


In [116]:
# Set item as the dataframe index
pos_dat.set_index('item', inplace=True)

# display the dataframe
pos_dat

Unnamed: 0_level_0,start,end
item,Unnamed: 1_level_1,Unnamed: 2_level_1
item1a,119131,119140
item1b,197023,197032
item7,333318,333326
item7a,729984,729993
item8,741774,741782


In [117]:
# Get Item 1a
item_1a_raw = document['10-K'][pos_dat['start'].loc['item1a']:pos_dat['start'].loc['item1b']]

# Get Item 7
item_7_raw = document['10-K'][pos_dat['start'].loc['item7']:pos_dat['start'].loc['item7a']]

# Get Item 7a
item_7a_raw = document['10-K'][pos_dat['start'].loc['item7a']:pos_dat['start'].loc['item8']]

In [118]:
item_1a_raw[0:1000]

'>Item 1A.</font></div></td><td style="vertical-align:top;"><div style="line-height:120%;text-align:justify;font-size:9pt;"><font style="font-family:Helvetica,sans-serif;font-size:9pt;font-weight:bold;">Risk Factors</font></div></td></tr></table><div style="line-height:120%;padding-top:8px;text-align:justify;font-size:9pt;"><font style="font-family:Helvetica,sans-serif;font-size:9pt;">The following discussion of risk factors contains forward-looking statements. These risk factors may be important to understanding other statements in this Form 10-K. The following information should be read in conjunction with Part II, Item&#160;7, &#8220;Management&#8217;s Discussion and Analysis of Financial Condition and Results of Operations&#8221; and the consolidated financial statements and related notes in Part II, Item&#160;8, &#8220;Financial Statements and Supplementary Data&#8221; of this Form 10-K.</font></div><div style="line-height:120%;padding-top:16px;text-align:justify;font-size:9pt;"><

In [119]:
### First convert the raw text we have to exrtacted to BeautifulSoup object 
item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')

In [120]:
### By just applying .pretiffy() we see that raw text start to look oragnized, as BeautifulSoup
### apply indentation according to the HTML Tag tree structure
print(item_1a_content.prettify()[0:1000])

<html>
 <body>
  <p>
   &gt;Item 1A.
  </p>
  <td style="vertical-align:top;">
   <div style="line-height:120%;text-align:justify;font-size:9pt;">
    <font style="font-family:Helvetica,sans-serif;font-size:9pt;font-weight:bold;">
     Risk Factors
    </font>
   </div>
  </td>
  <div style="line-height:120%;padding-top:8px;text-align:justify;font-size:9pt;">
   <font style="font-family:Helvetica,sans-serif;font-size:9pt;">
    The following discussion of risk factors contains forward-looking statements. These risk factors may be important to understanding other statements in this Form 10-K. The following information should be read in conjunction with Part II, Item 7, “Management’s Discussion and Analysis of Financial Condition and Results of Operations” and the consolidated financial statements and related notes in Part II, Item 8, “Financial Statements and Supplementary Data” of this Form 10-K.
   </font>
  </div>
  <div style="line-height:120%;padding-top:16px;text-align:justify;fon

In [121]:
### Our goal is though to remove html tags and see the content
### Method get_text() is what we need, \n\n is optional, I just added this to read text 
### more cleanly, it's basically new line character between sections. 
#print(item_1a_content.get_text("\n\n")[0:1500])
print(item_1a_content.get_text("\n\n"))

>Item 1A.

Risk Factors

The following discussion of risk factors contains forward-looking statements. These risk factors may be important to understanding other statements in this Form 10-K. The following information should be read in conjunction with Part II, Item 7, “Management’s Discussion and Analysis of Financial Condition and Results of Operations” and the consolidated financial statements and related notes in Part II, Item 8, “Financial Statements and Supplementary Data” of this Form 10-K.

The business, financial condition and operating results of the Company can be affected by a number of factors, whether currently known or unknown, including but not limited to those described below, any one or more of which could, directly or indirectly, cause the Company’s actual financial condition and operating results to vary materially from past, or from anticipated future, financial condition and operating results. Any of these factors, in whole or in part, could materially and adverse

In [122]:
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Petros\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Petros\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [123]:
#nltk.download()
from nltk.tokenize import sent_tokenize
#item_tk_1a = item_1a_content.get_text("\n\n")[0:180]
item_tk_1a = item_1a_content.get_text("\n\n")
#print(sent_tokenize(item_tk_1a))

In [124]:
item_tk_1a.split()

['>Item',
 '1A.',
 'Risk',
 'Factors',
 'The',
 'following',
 'discussion',
 'of',
 'risk',
 'factors',
 'contains',
 'forward-looking',
 'statements.',
 'These',
 'risk',
 'factors',
 'may',
 'be',
 'important',
 'to',
 'understanding',
 'other',
 'statements',
 'in',
 'this',
 'Form',
 '10-K.',
 'The',
 'following',
 'information',
 'should',
 'be',
 'read',
 'in',
 'conjunction',
 'with',
 'Part',
 'II,',
 'Item',
 '7,',
 '“Management’s',
 'Discussion',
 'and',
 'Analysis',
 'of',
 'Financial',
 'Condition',
 'and',
 'Results',
 'of',
 'Operations”',
 'and',
 'the',
 'consolidated',
 'financial',
 'statements',
 'and',
 'related',
 'notes',
 'in',
 'Part',
 'II,',
 'Item',
 '8,',
 '“Financial',
 'Statements',
 'and',
 'Supplementary',
 'Data”',
 'of',
 'this',
 'Form',
 '10-K.',
 'The',
 'business,',
 'financial',
 'condition',
 'and',
 'operating',
 'results',
 'of',
 'the',
 'Company',
 'can',
 'be',
 'affected',
 'by',
 'a',
 'number',
 'of',
 'factors,',
 'whether',
 'currently'

In [125]:
from nltk.tokenize import word_tokenize
tokenized_word_1a=word_tokenize(item_tk_1a)
#print(tokenized_word)

In [126]:
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))
print(stop_words)

{'yourself', 'it', 'not', 'has', "should've", 'been', 'more', 'shouldn', 'off', 'hasn', "wasn't", "mightn't", 'your', 'by', "didn't", 'its', 'mightn', 'ourselves', 'on', 'very', "needn't", 'be', 'doesn', 'himself', "hadn't", 'below', 'won', 'from', 'should', 'isn', "mustn't", 'same', 'all', 'any', 'because', 'each', 'no', "it's", 'm', 'if', 'up', 'over', 'ma', 'down', "shan't", 'who', 'through', 'theirs', 'about', 'herself', "you're", 'hers', 'nor', 'or', 'her', "you'll", 'weren', 'what', 'wasn', 've', 'is', 'for', 'he', 'an', 'with', 'only', "aren't", 'yourselves', 'again', 'there', 'aren', 'why', 'does', 'his', 'am', 'they', 'y', 'having', 'were', 'then', 'to', "wouldn't", "weren't", 'couldn', "hasn't", 'when', 'ours', "isn't", 'than', "you've", 'have', 'do', 'as', 'other', 'themselves', 'after', 'o', 'and', 'you', 'of', 'few', 'd', 'did', "you'd", 'under', 'didn', 'myself', 'ain', 'which', 'before', 'but', 'those', 'while', 'their', 'can', 'most', 'yours', 'a', 'them', 'too', "doesn

In [127]:
tokenized_sent_1a = tokenized_word_1a
filtered_sent_1a=[]
for w in tokenized_sent_1a:
    if w not in stop_words:
        filtered_sent_1a.append(w)
#print("Tokenized Sentence:",tokenized_sent)
#print("Filterd Sentence:",filtered_sent)

In [154]:
# !!! https://github.com/TiesdeKok/UW_Python_Camp/blob/master/Materials/Session_5/EDGAR_walkthrough.ipynb
pos_list_df = pd.read_excel("C:\\Users\\Petros\\Desktop\\positive-master.xlsx")
neg_list_df = pd.read_excel("C:\\Users\\Petros\\Desktop\\negative-master.xlsx")
#print(pos_list_df)
#print(neg_list_df)

pos_list = pos_list_df['positive'].tolist()
#print(pos_list)
neg_list = neg_list_df['negative'].tolist()
#print(neg_list)

#Calculate tone based on Loughran and McDonald dictionary

pos_words, neg_words, num_words = 0, 0, 0
for w in filtered_sent_1a:
    #print(w)
    if w in neg_list:
        neg_words += 1
            
    if w in pos_list:
        pos_words += 1
    
    num_words += 1
    
#print(num_words)
#print(neg_words)
#print(pos_words)

In [155]:
tone_data = []
tone_data.append([pos_words, neg_words, num_words])
tone_df = pd.DataFrame(tone_data, columns = ['pos_words', 'neg_words', 'num_words'])
tone_df.head()

Unnamed: 0,pos_words,neg_words,num_words
0,52,326,6187


In [156]:
tone_df['tone_score'] = (tone_df['pos_words'] - tone_df['neg_words']) / tone_df['num_words']
tone_df

Unnamed: 0,pos_words,neg_words,num_words,tone_score
0,52,326,6187,-0.044286


In [151]:
# !!! https://github.com/TiesdeKok/UW_Python_Camp/blob/master/Materials/Session_5/EDGAR_walkthrough.ipynb
pos_list_df = pd.read_excel("C:\\Users\\Petros\\Desktop\\positive-master.xlsx")
neg_list_df = pd.read_excel("C:\\Users\\Petros\\Desktop\\negative-master.xlsx")
#print(pos_list_df)
#print(neg_list_df)

pos_list = pos_list_df['positive'].tolist()
#print(pos_list)
neg_list = neg_list_df['negative'].tolist()
#print(neg_list)

pos_words, neg_words, num_words = 0, 0, 0
for w in tokenized_sent_1a:
    #print(w)
    if w in neg_list:
        neg_words += 1
            
    if w in pos_list:
        pos_words += 1
    
    num_words += 1
    
#print(num_words)
#print(neg_words)
#print(pos_words)

In [152]:
tone_data = []
tone_data.append([pos_words, neg_words, num_words])
tone_df = pd.DataFrame(tone_data, columns = ['pos_words', 'neg_words', 'num_words'])
tone_df.head()

Unnamed: 0,pos_words,neg_words,num_words
0,52,332,9171


In [153]:
tone_df['tone_score'] = (tone_df['pos_words'] - tone_df['neg_words']) / tone_df['num_words']
tone_df

Unnamed: 0,pos_words,neg_words,num_words,tone_score
0,52,332,9171,-0.030531


In [131]:
item_7_raw[0:1000]

'>Item 7.</font></div></td><td style="vertical-align:top;"><div style="line-height:120%;text-align:justify;font-size:9pt;"><font style="font-family:Helvetica,sans-serif;font-size:9pt;font-weight:bold;">Management&#8217;s Discussion and Analysis of Financial Condition and Results of Operations</font></div></td></tr></table><div style="line-height:120%;padding-top:8px;text-align:justify;font-size:9pt;"><font style="font-family:Helvetica,sans-serif;font-size:9pt;font-style:italic;">This section and other parts of this Annual Report on Form 10-K (&#8220;Form 10-K&#8221;) contain forward-looking statements, within the meaning of the Private Securities Litigation Reform Act of 1995, that involve risks and uncertainties. Forward-looking statements provide current expectations of future events based on certain assumptions and include any statement that does not directly relate to any historical or current fact. Forward-looking statements can also be identified by words such as &#8220;future,&#

In [132]:
### First convert the raw text we have to exrtacted to BeautifulSoup object 
item_7_content = BeautifulSoup(item_7_raw, 'lxml')

In [133]:
### By just applying .pretiffy() we see that raw text start to look oragnized, as BeautifulSoup
### apply indentation according to the HTML Tag tree structure
print(item_7_content.prettify()[0:1000])

<html>
 <body>
  <p>
   &gt;Item 7.
  </p>
  <td style="vertical-align:top;">
   <div style="line-height:120%;text-align:justify;font-size:9pt;">
    <font style="font-family:Helvetica,sans-serif;font-size:9pt;font-weight:bold;">
     Management’s Discussion and Analysis of Financial Condition and Results of Operations
    </font>
   </div>
  </td>
  <div style="line-height:120%;padding-top:8px;text-align:justify;font-size:9pt;">
   <font style="font-family:Helvetica,sans-serif;font-size:9pt;font-style:italic;">
    This section and other parts of this Annual Report on Form 10-K (“Form 10-K”) contain forward-looking statements, within the meaning of the Private Securities Litigation Reform Act of 1995, that involve risks and uncertainties. Forward-looking statements provide current expectations of future events based on certain assumptions and include any statement that does not directly relate to any historical or current fact. Forward-looking statements can also be identified by word

In [134]:
### Our goal is though to remove html tags and see the content
### Method get_text() is what we need, \n\n is optional, I just added this to read text 
### more cleanly, it's basically new line character between sections. 
#print(item_1a_content.get_text("\n\n")[0:1500])
print(item_7_content.get_text("\n\n"))

>Item 7.

Management’s Discussion and Analysis of Financial Condition and Results of Operations

This section and other parts of this Annual Report on Form 10-K (“Form 10-K”) contain forward-looking statements, within the meaning of the Private Securities Litigation Reform Act of 1995, that involve risks and uncertainties. Forward-looking statements provide current expectations of future events based on certain assumptions and include any statement that does not directly relate to any historical or current fact. Forward-looking statements can also be identified by words such as “future,”  “anticipates,”  “believes,”  “estimates,”  “expects,”  “intends,”  “plans,”  “predicts,”  “will,”  “would,”  “could,”  “can,”  “may,”  and similar terms. Forward-looking statements are not guarantees of future performance and the Company’s actual results may differ significantly from the results discussed in the forward-looking statements. Factors that might cause such differences include, but are not

In [135]:
#nltk.download()
from nltk.tokenize import sent_tokenize
#item_tk_1a = item_1a_content.get_text("\n\n")[0:180]
item_tk_7 = item_7_content.get_text("\n\n")
#print(sent_tokenize(item_tk_1a))

In [136]:
from nltk.tokenize import word_tokenize
tokenized_word_7=word_tokenize(item_tk_7)
#print(tokenized_word)

In [137]:
tokenized_sent_7 = tokenized_word_7
filtered_sent_7=[]
for w in tokenized_sent_7:
    if w not in stop_words:
        filtered_sent_7.append(w)
#print("Tokenized Sentence:",tokenized_sent)
#print("Filterd Sentence:",filtered_sent)

In [138]:
# !!! https://github.com/TiesdeKok/UW_Python_Camp/blob/master/Materials/Session_5/EDGAR_walkthrough.ipynb
pos_list_df = pd.read_excel("C:\\Users\\Petros\\Desktop\\positive-master.xlsx")
neg_list_df = pd.read_excel("C:\\Users\\Petros\\Desktop\\negative-master.xlsx")
#print(pos_list_df)
#print(neg_list_df)

pos_list = pos_list_df['positive'].tolist()
#print(pos_list)
neg_list = neg_list_df['negative'].tolist()
#print(neg_list)

pos_words, neg_words, num_words = 0, 0, 0
for w in filtered_sent_7:
    #print(w)
    if w in neg_list:
        neg_words += 1
            
    if w in pos_list:
        pos_words += 1
    
    num_words += 1
    
#print(num_words)
#print(neg_words)
#print(pos_words)

In [139]:
tone_data = []
tone_data.append([pos_words, neg_words, num_words])
tone_df = pd.DataFrame(tone_data, columns = ['pos_words', 'neg_words', 'num_words'])
tone_df.head()

Unnamed: 0,pos_words,neg_words,num_words
0,46,80,7385


In [140]:
tone_df['tone_score'] = (tone_df['pos_words'] - tone_df['neg_words']) / tone_df['num_words']
tone_df

Unnamed: 0,pos_words,neg_words,num_words,tone_score
0,46,80,7385,-0.004604


In [141]:
item_7a_raw[0:1000]

'>Item 7A.</font></div></td><td style="vertical-align:top;"><div style="line-height:120%;text-align:justify;font-size:9pt;"><font style="font-family:Helvetica,sans-serif;font-size:9pt;font-weight:bold;">Quantitative and Qualitative Disclosures About Market Risk</font></div></td></tr></table><div style="line-height:120%;padding-top:12px;text-align:justify;font-size:9pt;"><font style="font-family:Helvetica,sans-serif;font-size:9pt;font-weight:bold;">Interest Rate and Foreign Currency Risk Management</font></div><div style="line-height:120%;padding-top:8px;text-align:justify;font-size:9pt;"><font style="font-family:Helvetica,sans-serif;font-size:9pt;">The Company regularly reviews its foreign exchange forward and option positions and interest rate swaps, both on a stand-alone basis and in conjunction with its underlying foreign currency and interest rate exposures. Given the effective horizons of the Company&#8217;s risk management activities and the anticipatory nature of the exposures, 

In [142]:
### First convert the raw text we have to exrtacted to BeautifulSoup object 
item_7a_content = BeautifulSoup(item_7a_raw, 'lxml')

In [143]:
### By just applying .pretiffy() we see that raw text start to look oragnized, as BeautifulSoup
### apply indentation according to the HTML Tag tree structure
print(item_7a_content.prettify()[0:1000])

<html>
 <body>
  <p>
   &gt;Item 7A.
  </p>
  <td style="vertical-align:top;">
   <div style="line-height:120%;text-align:justify;font-size:9pt;">
    <font style="font-family:Helvetica,sans-serif;font-size:9pt;font-weight:bold;">
     Quantitative and Qualitative Disclosures About Market Risk
    </font>
   </div>
  </td>
  <div style="line-height:120%;padding-top:12px;text-align:justify;font-size:9pt;">
   <font style="font-family:Helvetica,sans-serif;font-size:9pt;font-weight:bold;">
    Interest Rate and Foreign Currency Risk Management
   </font>
  </div>
  <div style="line-height:120%;padding-top:8px;text-align:justify;font-size:9pt;">
   <font style="font-family:Helvetica,sans-serif;font-size:9pt;">
    The Company regularly reviews its foreign exchange forward and option positions and interest rate swaps, both on a stand-alone basis and in conjunction with its underlying foreign currency and interest rate exposures. Given the effective horizons of the Company’s risk management 

In [144]:
### Our goal is though to remove html tags and see the content
### Method get_text() is what we need, \n\n is optional, I just added this to read text 
### more cleanly, it's basically new line character between sections. 
#print(item_1a_content.get_text("\n\n")[0:1500])
print(item_7a_content.get_text("\n\n"))

>Item 7A.

Quantitative and Qualitative Disclosures About Market Risk

Interest Rate and Foreign Currency Risk Management

The Company regularly reviews its foreign exchange forward and option positions and interest rate swaps, both on a stand-alone basis and in conjunction with its underlying foreign currency and interest rate exposures. Given the effective horizons of the Company’s risk management activities and the anticipatory nature of the exposures, there can be no assurance these positions will offset more than a portion of the financial impact resulting from movements in either foreign exchange or interest rates. Further, the recognition of the gains and losses related to these instruments may not coincide with the timing of gains and losses related to the underlying economic exposures and, therefore, may adversely affect the Company’s financial condition and operating results.

Interest Rate Risk

The Company’s exposure to changes in interest rates relates primarily to the Com

In [145]:
#nltk.download()
from nltk.tokenize import sent_tokenize
#item_tk_1a = item_1a_content.get_text("\n\n")[0:180]
item_tk_7a = item_7a_content.get_text("\n\n")
#print(sent_tokenize(item_tk_1a))

In [146]:
from nltk.tokenize import word_tokenize
tokenized_word_7a=word_tokenize(item_tk_7a)
#print(tokenized_word)

In [147]:
tokenized_sent_7a = tokenized_word_7a
filtered_sent_7a=[]
for w in tokenized_sent_7a:
    if w not in stop_words:
        filtered_sent_7a.append(w)
#print("Tokenized Sentence:",tokenized_sent)
#print("Filterd Sentence:",filtered_sent)

In [148]:
# !!! https://github.com/TiesdeKok/UW_Python_Camp/blob/master/Materials/Session_5/EDGAR_walkthrough.ipynb
pos_list_df = pd.read_excel("C:\\Users\\Petros\\Desktop\\positive-master.xlsx")
neg_list_df = pd.read_excel("C:\\Users\\Petros\\Desktop\\negative-master.xlsx")
#print(pos_list_df)
#print(neg_list_df)

pos_list = pos_list_df['positive'].tolist()
#print(pos_list)
neg_list = neg_list_df['negative'].tolist()
#print(neg_list)

pos_words, neg_words, num_words = 0, 0, 0
for w in filtered_sent_7a:
    #print(w)
    if w in neg_list:
        neg_words += 1
            
    if w in pos_list:
        pos_words += 1
    
    num_words += 1
    
#print(num_words)
#print(neg_words)
#print(pos_words)

In [149]:
tone_data = []
tone_data.append([pos_words, neg_words, num_words])
tone_df = pd.DataFrame(tone_data, columns = ['pos_words', 'neg_words', 'num_words'])
tone_df.head()

Unnamed: 0,pos_words,neg_words,num_words
0,8,20,820


In [150]:
tone_df['tone_score'] = (tone_df['pos_words'] - tone_df['neg_words']) / tone_df['num_words']
tone_df

Unnamed: 0,pos_words,neg_words,num_words,tone_score
0,8,20,820,-0.014634
