In [1]:
# Import requests to retrive Web Urls example HTML. TXT 
import requests

# Import BeautifulSoup
from bs4 import BeautifulSoup

# import re module for REGEXes
import re

# import pandas
import pandas as pd

In [2]:
myfile = open("C:\\Users\\Petros\\Desktop\\sec_filing\\sec_edgar_filings\\AAPL\\10-K\\0000320193-18-000145.txt","r") #returns file handle
raw_10k = myfile.read()

print(raw_10k[0:1300])

<SEC-DOCUMENT>0000320193-18-000145.txt : 20181105
<SEC-HEADER>0000320193-18-000145.hdr.sgml : 20181105
<ACCEPTANCE-DATETIME>20181105080140
ACCESSION NUMBER:		0000320193-18-000145
CONFORMED SUBMISSION TYPE:	10-K
PUBLIC DOCUMENT COUNT:		88
CONFORMED PERIOD OF REPORT:	20180929
FILED AS OF DATE:		20181105
DATE AS OF CHANGE:		20181105

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			APPLE INC
		CENTRAL INDEX KEY:			0000320193
		STANDARD INDUSTRIAL CLASSIFICATION:	ELECTRONIC COMPUTERS [3571]
		IRS NUMBER:				942404110
		STATE OF INCORPORATION:			CA
		FISCAL YEAR END:			0930

	FILING VALUES:
		FORM TYPE:		10-K
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	001-36743
		FILM NUMBER:		181158788

	BUSINESS ADDRESS:	
		STREET 1:		ONE APPLE PARK WAY
		CITY:			CUPERTINO
		STATE:			CA
		ZIP:			95014
		BUSINESS PHONE:		(408) 996-1010

	MAIL ADDRESS:	
		STREET 1:		ONE APPLE PARK WAY
		CITY:			CUPERTINO
		STATE:			CA
		ZIP:			95014

	FORMER COMPANY:	
		FORMER CONFORMED NAME:	APPLE COMPUTER INC
		DATE OF NA

In [3]:
# Regex to find <DOCUMENT> tags
doc_start_pattern = re.compile(r'<DOCUMENT>')
doc_end_pattern = re.compile(r'</DOCUMENT>')
# Regex to find <TYPE> tag prceeding any characters, terminating at new line
type_pattern = re.compile(r'<TYPE>[^\n]+')

In [4]:
### There are many <Document> Tags in this text file, each as specific exhibit like 10-K, EX-10.17 etc
### First filter will give us document tag start <end> and document tag end's <start> 
### We will use this to later grab content in between these tags
doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]

### Type filter is interesting, it looks for <TYPE> with Not flag as new line, ie terminare there, with + sign
### to look for any char afterwards until new line \n. This will give us <TYPE> followed Section Name like '10-K'
### Once we have have this, it returns String Array, below line will with find content after <TYPE> ie, '10-K' 
### as section names
doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]

In [5]:
document = {}

# Create a loop to go through each section type and save only the 10-K section in the dictionary
for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
    if doc_type == '10-K':
        document[doc_type] = (raw_10k[doc_start:doc_end])

In [6]:
# display excerpt the document
document['10-K'][0:500]

'\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>a10-k20189292018.htm\n<DESCRIPTION>10-K\n<TEXT>\n<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">\n<html>\n\t<head>\n\t\t<!-- Document created using Wdesk 1 -->\n\t\t<!-- Copyright 2018 Workiva -->\n\t\t<title>Document</title>\n\t</head>\n\t<body style="font-family:Times New Roman;font-size:10pt;">\n<div><a name="s3540C27286EF5B0DA103CC59028B96BE"></a></div><div style="line-height:120%;text-align:center;font-size:10pt;"><div sty'

In [7]:
# Write the regex
regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')

# Use finditer to math the regex
matches = regex.finditer(document['10-K'])

# Write a for loop to print the matches
for match in matches:
    print(match)

<re.Match object; span=(38318, 38327), match='>Item 1A.'>
<re.Match object; span=(39347, 39356), match='>Item 1B.'>
<re.Match object; span=(46148, 46156), match='>Item 7.'>
<re.Match object; span=(47281, 47290), match='>Item 7A.'>
<re.Match object; span=(48357, 48365), match='>Item 8.'>
<re.Match object; span=(119131, 119140), match='>Item 1A.'>
<re.Match object; span=(197023, 197032), match='>Item 1B.'>
<re.Match object; span=(333318, 333326), match='>Item 7.'>
<re.Match object; span=(729984, 729993), match='>Item 7A.'>
<re.Match object; span=(741774, 741782), match='>Item 8.'>


In [8]:
# Matches
matches = regex.finditer(document['10-K'])

# Create the dataframe
test_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])

test_df.columns = ['item', 'start', 'end']
test_df['item'] = test_df.item.str.lower()

# Display the dataframe
test_df.head()

Unnamed: 0,item,start,end
0,>item 1a.,38318,38327
1,>item 1b.,39347,39356
2,>item 7.,46148,46156
3,>item 7a.,47281,47290
4,>item 8.,48357,48365


In [9]:
# Get rid of unnesesary charcters from the dataframe
test_df.replace('&#160;',' ',regex=True,inplace=True)
test_df.replace('&nbsp;',' ',regex=True,inplace=True)
test_df.replace(' ','',regex=True,inplace=True)
test_df.replace('\.','',regex=True,inplace=True)
test_df.replace('>','',regex=True,inplace=True)

# display the dataframe
test_df.head()

Unnamed: 0,item,start,end
0,item1a,38318,38327
1,item1b,39347,39356
2,item7,46148,46156
3,item7a,47281,47290
4,item8,48357,48365


In [10]:
# Drop duplicates
pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')

# Display the dataframe
pos_dat

Unnamed: 0,item,start,end
5,item1a,119131,119140
6,item1b,197023,197032
7,item7,333318,333326
8,item7a,729984,729993
9,item8,741774,741782


In [11]:
# Set item as the dataframe index
pos_dat.set_index('item', inplace=True)

# display the dataframe
pos_dat

Unnamed: 0_level_0,start,end
item,Unnamed: 1_level_1,Unnamed: 2_level_1
item1a,119131,119140
item1b,197023,197032
item7,333318,333326
item7a,729984,729993
item8,741774,741782


In [12]:
# Get Item 1a
item_1a_raw = document['10-K'][pos_dat['start'].loc['item1a']:pos_dat['start'].loc['item1b']]

# Get Item 7
item_7_raw = document['10-K'][pos_dat['start'].loc['item7']:pos_dat['start'].loc['item7a']]

# Get Item 7a
item_7a_raw = document['10-K'][pos_dat['start'].loc['item7a']:pos_dat['start'].loc['item8']]

In [13]:
item_1a_raw[0:1000]

'>Item 1A.</font></div></td><td style="vertical-align:top;"><div style="line-height:120%;text-align:justify;font-size:9pt;"><font style="font-family:Helvetica,sans-serif;font-size:9pt;font-weight:bold;">Risk Factors</font></div></td></tr></table><div style="line-height:120%;padding-top:8px;text-align:justify;font-size:9pt;"><font style="font-family:Helvetica,sans-serif;font-size:9pt;">The following discussion of risk factors contains forward-looking statements. These risk factors may be important to understanding other statements in this Form 10-K. The following information should be read in conjunction with Part II, Item&#160;7, &#8220;Management&#8217;s Discussion and Analysis of Financial Condition and Results of Operations&#8221; and the consolidated financial statements and related notes in Part II, Item&#160;8, &#8220;Financial Statements and Supplementary Data&#8221; of this Form 10-K.</font></div><div style="line-height:120%;padding-top:16px;text-align:justify;font-size:9pt;"><

In [14]:
### First convert the raw text we have to exrtacted to BeautifulSoup object 
item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')

In [15]:
### By just applying .pretiffy() we see that raw text start to look oragnized, as BeautifulSoup
### apply indentation according to the HTML Tag tree structure
print(item_1a_content.prettify()[0:1000])

<html>
 <body>
  <p>
   &gt;Item 1A.
  </p>
  <td style="vertical-align:top;">
   <div style="line-height:120%;text-align:justify;font-size:9pt;">
    <font style="font-family:Helvetica,sans-serif;font-size:9pt;font-weight:bold;">
     Risk Factors
    </font>
   </div>
  </td>
  <div style="line-height:120%;padding-top:8px;text-align:justify;font-size:9pt;">
   <font style="font-family:Helvetica,sans-serif;font-size:9pt;">
    The following discussion of risk factors contains forward-looking statements. These risk factors may be important to understanding other statements in this Form 10-K. The following information should be read in conjunction with Part II, Item 7, “Management’s Discussion and Analysis of Financial Condition and Results of Operations” and the consolidated financial statements and related notes in Part II, Item 8, “Financial Statements and Supplementary Data” of this Form 10-K.
   </font>
  </div>
  <div style="line-height:120%;padding-top:16px;text-align:justify;fon

In [16]:
### Our goal is though to remove html tags and see the content
### Method get_text() is what we need, \n\n is optional, I just added this to read text 
### more cleanly, it's basically new line character between sections. 
#print(item_1a_content.get_text("\n\n")[0:1500])
print(item_1a_content.get_text("\n\n"))

>Item 1A.

Risk Factors

The following discussion of risk factors contains forward-looking statements. These risk factors may be important to understanding other statements in this Form 10-K. The following information should be read in conjunction with Part II, Item 7, “Management’s Discussion and Analysis of Financial Condition and Results of Operations” and the consolidated financial statements and related notes in Part II, Item 8, “Financial Statements and Supplementary Data” of this Form 10-K.

The business, financial condition and operating results of the Company can be affected by a number of factors, whether currently known or unknown, including but not limited to those described below, any one or more of which could, directly or indirectly, cause the Company’s actual financial condition and operating results to vary materially from past, or from anticipated future, financial condition and operating results. Any of these factors, in whole or in part, could materially and adverse

In [38]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt') # one time execution
import re

#item_tk_1a = item_1a_content.get_text("\n\n")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Petros\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [39]:
item_tk_1a = item_1a_content.get_text()
sentences = []
sentences = sent_tokenize(item_tk_1a)
sentences

['>Item 1A.Risk FactorsThe following discussion of risk factors contains forward-looking statements.',
 'These risk factors may be important to understanding other statements in this Form 10-K.',
 'The following information should be read in conjunction with Part II, Item\xa07, “Management’s Discussion and Analysis of Financial Condition and Results of Operations” and the consolidated financial statements and related notes in Part II, Item\xa08, “Financial Statements and Supplementary Data” of this Form 10-K.The business, financial condition and operating results of the Company can be affected by a number of factors, whether currently known or unknown, including but not limited to those described below, any one or more of which could, directly or indirectly, cause the Company’s actual financial condition and operating results to vary materially from past, or from anticipated future, financial condition and operating results.',
 'Any of these factors, in whole or in part, could material

In [19]:
#sentences = [y for x in sentences for y in x]

In [20]:
# remove punctuations, numbers and special characters
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")
print(clean_sentences)
# make alphabets lowercase
clean_sentences = [s.lower() for s in clean_sentences]


0       Item  A Risk FactorsThe following discussion ...
1      These risk factors may be important to underst...
2      The following information should be read in co...
3      Any of these factors  in whole or in part  cou...
4      In addition  a majority of the Company s suppl...
                             ...                        
170    Due to economic and political conditions  tax ...
171    The Company s effective tax rates could be aff...
172    The Company regularly assesses the likelihood ...
173    There can be no assurance as to the outcome of...
174    If the Company s effective tax rates were to i...
Length: 175, dtype: object


In [21]:
print(clean_sentences)

[' item  a risk factorsthe following discussion of risk factors contains forward looking statements ', 'these risk factors may be important to understanding other statements in this form    k ', 'the following information should be read in conjunction with part ii  item     management s discussion and analysis of financial condition and results of operations  and the consolidated financial statements and related notes in part ii  item     financial statements and supplementary data  of this form    k the business  financial condition and operating results of the company can be affected by a number of factors  whether currently known or unknown  including but not limited to those described below  any one or more of which could  directly or indirectly  cause the company s actual financial condition and operating results to vary materially from past  or from anticipated future  financial condition and operating results ', 'any of these factors  in whole or in part  could materially and ad

In [22]:
nltk.download('stopwords')# one time execution

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Petros\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [24]:
# function to remove stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

In [25]:
# remove stopwords from the sentences
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

In [26]:
clean_sentences

['item risk factorsthe following discussion risk factors contains forward looking statements',
 'risk factors may important understanding statements form k',
 'following information read conjunction part ii item management discussion analysis financial condition results operations consolidated financial statements related notes part ii item financial statements supplementary data form k business financial condition operating results company affected number factors whether currently known unknown including limited described one could directly indirectly cause company actual financial condition operating results vary materially past anticipated future financial condition operating results',
 'factors whole part could materially adversely affect company business financial condition operating results stock price following factors well factors affecting company financial condition operating results past financial performance considered reliable indicator future performance investors use his

In [27]:
# Extract word vectors
word_embeddings = {}
f = open('C:\\Users\\Petros\\Desktop\\glove.6B\\glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [28]:
sentence_vectors = []
for i in clean_sentences:
    if len(i) != 0:
        v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
    else:
        v = np.zeros((100,))
    sentence_vectors.append(v)

In [29]:
len(sentence_vectors)

175

In [30]:
# The next step is to find similarities among the sentences. 
# We will use cosine similarity to find similarity between a pair of sentences. 
#Let's create an empty similarity matrix for this task and populate it with cosine similarities of the sentences.


# similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
for i in range(len(sentences)):
    for j in range(len(sentences)):
        if i != j:
            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

In [33]:
import networkx as nx

nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [34]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

In [37]:
# Specify number of sentences to form the summary
sn = 14

# Generate summary
for i in range(sn):
    print(ranked_sentences[i][1])

Any such costs, which may rise in the future as a result of changes in these laws and regulations or in their interpretation, could individually or in the aggregate make the Company’s products and services less attractive to the Company’s customers, delay the introduction of new products in one or more regions, or cause the Company to change or limit its business practices.
Quality problems could also adversely affect the experience for users of the Company’s products and services, and result in harm to the Company’s reputation, loss of competitive advantage, poor market acceptance, reduced demand for products and services, delay in new product and services introductions and lost revenue.The Company relies on access to third-party digital content, which may not be available to the Company on commercially reasonable terms or at all.The Company contracts with numerous third parties to offer their digital content to customers.
The Company could also experience a significant increase in pa