This notebook shows how to calculate two readability grade levels for a piece of text:
1. [Flesch–Kincaid readability Grade Index](https://en.wikipedia.org/wiki/Flesch–Kincaid_readability_tests)  
2. [Gunning Fog Index](https://en.wikipedia.org/wiki/Gunning_fog_index)

In [81]:
import glob
import os.path
import unicodedata
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.corpus import cmudict
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import RegexpTokenizer, sent_tokenize

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [64]:
def get_links(cik, priorto, count):
    """
    Return the full-text (.txt) document URLs listed on an SEC EDGAR
    10-K search page.

    Parameters
    ----------
    cik : str or int
        Value for the `CIK` query parameter (the company identifier).
    priorto : str or int
        Value for the `dateb` query parameter; the caller in this notebook
        passes a 'YYYYMMDD' string.
    count : str or int
        Value for the `count` query parameter.

    Returns
    -------
    list of str
        One URL per `filinghref` element of the XML response, rewritten
        from `*-index.htm` to `*.txt`.
    """
    search_url = "http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK="+ \
        str(cik)+"&type=10-K&dateb="+str(priorto)+\
        "&owner=exclude&output=xml&count="+str(count)

    # download the search result and parse it
    data = requests.get(search_url).text
    soup = BeautifulSoup(data, "lxml")

    links = []
    # the lxml HTML parser lower-cases tag names, hence 'filinghref'
    for href in soup.find_all('filinghref'):
        url = href.string
        if url is None:
            # element without a single text child: nothing to convert
            continue

        # convert http://*-index.htm to http://*.txt
        if url.rsplit(".", 1)[-1] == "htm":
            url += "l"
        links.append(url.replace('-index.html', '') + ".txt")
    return links

# clean up the soup we construct from the links
def clean_soup(link):
    """
    Download `link` and return it as a simplified BeautifulSoup tree.

    - `script` and `style` tags are removed together with their content.
    - Pure formatting tags (`font`, `a`, `b`, `i`, `u`) are unwrapped, i.e.
      replaced by their children.
    - Presentation attributes (`class`, `id`, `name`, `style`,
      `cellpadding`, `cellspacing`) are stripped from the remaining tags.

    Parameters
    ----------
    link : str
        URL of the document to download.

    Returns
    -------
    bs4.BeautifulSoup
        The cleaned parse tree.
    """
    data = requests.get(link).text
    soup = BeautifulSoup(data, "lxml")
    blacklist = ["script", "style"]
    attrlist = ["class", "id", "name", "style", 'cellpadding', 'cellspacing']
    skiptags = ['font', 'a', 'b', 'i', 'u']

    # find_all() returns a list snapshot, so editing the tree while looping is safe
    for tag in soup.find_all():
        tag_name = tag.name.lower()

        if tag_name in blacklist:
            # blacklisted tags are removed in their entirety
            tag.extract()
            continue

        if tag_name in skiptags:
            # keep the content, drop the wrapper; the detached tag needs no more work
            tag.unwrap()
            continue

        for attribute in attrlist:
            # deleting an attribute that is absent is a no-op for bs4 tags
            del tag[attribute]

    return soup


# normalize the text
# remove some escape characters
def normtxt(txt):
    """
    Return `txt` in Unicode NFKD (compatibility decomposition) form.

    For example, a non-breaking space becomes a plain space and the
    ligature 'fi' becomes the two letters 'f' and 'i'.
    """
    normalized = unicodedata.normalize("NFKD", txt)
    return normalized

# get section from 10K
# looks for the term "item 1a" and collects text until "item 1b" is found
# returns None if there is no appropriate section found
# raise error when it cannot find the end of the section

def extract_section(soup, section='1a', section_end='1b'):
    """
    Collect the text between an "item <section>" heading and the next
    element mentioning "item <section_end>".

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed filing (e.g. the result of `clean_soup`).
    section : str
        Item label that starts the section (matched case-insensitively as
        'item ' + section).
    section_end : str
        Item label that ends the section.

    Returns
    -------
    str or None
        The collected text with pieces separated by single spaces, or None
        when no heading for `section` was found.

    Raises
    ------
    ValueError
        If the document ends before the end marker is found.
    """
    search_next = ["p", "div", "table"]
    start_marker = 'item ' + section
    end_marker = 'item ' + section_end

    # Find the heading: a table/div whose text has at most 5 words and
    # contains the start marker. The word limit avoids long sentences or
    # tables that merely mention the item. There is deliberately no `break`:
    # the LAST matching element in the document is used.
    myitem = None
    for item in soup.find_all(("table", "div")):
        txt = normtxt(item.get_text())
        if len(txt.split()) > 5:
            continue
        if start_marker in txt.lower():
            myitem = item

    if myitem is None:
        return None

    des = myitem.find_next(search_next)
    if des is None:
        # nothing follows the heading at all: empty section
        return ""

    pieces = []
    while True:
        # NOTE(review): the cursor is advanced before it is inspected, so the
        # first element found after the heading is never examined. This keeps
        # the existing behaviour; confirm whether it is intentional.
        des = des.find_next(search_next)

        if des is None:
            raise ValueError("end section not properly found")

        raw = des.get_text()
        txt = normtxt(raw)

        if end_marker in txt.lower():
            break

        # keep only elements with more than two words and without a '|'
        # character; normalisation gets rid of escape characters
        if len(raw.split()) > 2 and '|' not in txt:
            pieces.append(txt)

    return " ".join(pieces)
    
    

def get_files(cik, company, n=5, max_n=20):
    """
    Download the "item 1a" section of a company's 10-K filings and store
    each one as `Data/<company>_10k_<year>.txt`.

    Parameters
    ----------
    cik : str
        Company identifier passed to `get_links`.
    company : str
        Prefix used for the output file names.
    n : int
        Stop after this many files have been written or found on disk.
    max_n : int
        Number of filings requested from the search page.

    Notes
    -----
    The year in the file name is NOT read from the filing: labels are handed
    out in order starting at 2017 and counting down, one per stored file.
    Filings in which no section is found are skipped without using up a
    label. A ValueError raised by `extract_section` is not caught here.
    """
    data_dir = 'Data'
    mylinks = get_links(cik, '20170601', str(max_n))
    dates = range(2017, 1000, -1)
    print("downloading 10-Ks item 1A for CIK =",cik, "...")

    # make sure the output folder exists before the first write
    os.makedirs(data_dir, exist_ok=True)

    i = 0
    for link in mylinks:
        filename = company+"_10k_"+str(dates[i])+".txt"
        filepath = os.path.join(data_dir, filename)

        # look for the file where it is actually written (inside Data/)
        if os.path.isfile(filepath):
            print("skipping "+filename)
        else:
            soup = clean_soup(link)
            section = extract_section(soup)

            if section is None:
                # no usable section in this filing: try the next link
                continue

            print("writing "+filepath+" in Data folder ...")

            with open(filepath,"w") as f:
                f.write(section)

        i += 1
        if i >= n:
            break

In [65]:
# company identifiers passed to get_files, keyed by company name
CIK = {
    'ebay': '0001065088',
    'apple': '0000320193',
    'sears': '0001310067',
}

# (key into CIK, prefix for the output file names)
for cik_key, file_prefix in [('ebay', 'EBAY'), ('apple', 'AAPL'), ('sears', 'SHLDQ')]:
    get_files(CIK[cik_key], file_prefix)

downloading 10-Ks item 1A for CIK = 0001065088 ...
writing Data/EBAY_10k_2017.txt in Data folder ...
writing Data/EBAY_10k_2016.txt in Data folder ...
writing Data/EBAY_10k_2015.txt in Data folder ...
writing Data/EBAY_10k_2014.txt in Data folder ...
writing Data/EBAY_10k_2013.txt in Data folder ...
downloading 10-Ks item 1A for CIK = 0000320193 ...
writing Data/AAPL_10k_2017.txt in Data folder ...
writing Data/AAPL_10k_2016.txt in Data folder ...
writing Data/AAPL_10k_2015.txt in Data folder ...
writing Data/AAPL_10k_2014.txt in Data folder ...
writing Data/AAPL_10k_2013.txt in Data folder ...
downloading 10-Ks item 1A for CIK = 0001310067 ...
writing Data/SHLDQ_10k_2017.txt in Data folder ...
writing Data/SHLDQ_10k_2016.txt in Data folder ...
writing Data/SHLDQ_10k_2015.txt in Data folder ...
writing Data/SHLDQ_10k_2014.txt in Data folder ...
writing Data/SHLDQ_10k_2013.txt in Data folder ...


In [72]:
def safeRead(fname):
    """
    Read the text file `fname` and return its content with every ';'
    replaced by '.'.

    NOTE(review): the replacement presumably makes semicolons count as
    sentence boundaries for the later sentence tokenisation; confirm.
    """
    with open(fname, 'r') as handle:
        contents = handle.read()
    return contents.replace(';', '.')
# load the three sample texts compared in this notebook
text_phy, text_alice, text_10k = (
    safeRead(sample_path)
    for sample_path in ('Data/physics.txt', 'Data/alice.txt', 'Data/AAPL_10k_2017.txt')
)

In [73]:
# show the first 500 characters of each sample text
for sample_text in (text_10k, text_phy, text_alice):
    print(sample_text[:500]+"...\n")

The following discussion of risk factors contains forward-looking statements. These risk factors may be important to understanding other statements in this Form 10-K. The following information should be read in conjunction with Part II, Item 7, “Management’s Discussion and Analysis of Financial Condition and Results of Operations” and the consolidated financial statements and related notes in Part II, Item 8, “Financial Statements and Supplementary Data” of this Form 10-K. The business, financia...

In particle physics, supersymmetry (SUSY) is a principle that proposes a relationship between two basic classes of elementary particles: bosons, which have an integer-valued spin, and fermions, which have a half-integer spin. A type of spacetime symmetry, supersymmetry is a possible candidate for undiscovered particle physics, and seen as an elegant solution to many current problems in particle physics if confirmed correct, which could resolve various areas where current theories are believ

## 1. Pre-processing

In [82]:
# Fetch the NLTK data packages used below.
# 'cmudict' is the pronouncing dictionary loaded with cmudict.dict()
nltk.download('cmudict')
# 'wordnet' is the data behind WordNetLemmatizer
nltk.download('wordnet')
# 'punkt' is presumably the model used by sent_tokenize (confirm for the installed NLTK version)
nltk.download('punkt')

[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/amiteshsinha/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/amiteshsinha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/amiteshsinha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [85]:
# pronouncing dictionary, indexed by lower-case word in syllable_count
d = cmudict.dict()
# NOTE(review): `sno` is not used by any code visible in this notebook
sno = SnowballStemmer('english')
# lemmatizer used by hard_word_count before counting syllables
wnl = WordNetLemmatizer()

def syllable_count(word):
    """
    Return the number of syllables in `word`.

    The word is looked up (lower-cased) in the pronouncing dictionary `d`.
    Each pronunciation is a sequence of phoneme strings; those ending in a
    digit are counted, and the smallest count over all listed
    pronunciations is returned. Words missing from the dictionary fall back
    to the `_syllables` heuristic.
    """
    try:
        pronunciations = d[word.lower()]
    except KeyError:
        # if word not found in cmudict
        return _syllables(word)

    counts = [sum(1 for phoneme in pron if phoneme[-1].isdigit())
              for pron in pronunciations]
    return np.min(counts)

def _syllables(word):
#referred from stackoverflow.com/questions/14541303/count-the-number-of-syllables-in-a-word
    count = 0
    vowels = 'aeiouy'
    word = word.lower()
    if word[0] in vowels:
        count +=1
    for index in range(1,len(word)):
        if word[index] in vowels and word[index-1] not in vowels:
            count +=1
    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count+=1
    if count == 0:
        count +=1
    return count

# tokenizer whose pattern matches runs of characters that are neither digits
# nor non-word characters, i.e. letters (and '_', which \w also covers)
word_tokenizer = RegexpTokenizer(r'[^\d\W]+')

def word_tokenize(sent):
    """
    Split `sent` into purely alphabetic tokens.

    Candidates come from `word_tokenizer`; any candidate that is not
    entirely alphabetic (e.g. one containing '_') is dropped.

    Note: this definition replaces the `word_tokenize` imported from nltk
    at the top of the notebook.
    """
    candidates = word_tokenizer.tokenize(sent)
    return list(filter(str.isalpha, candidates))

def sentence_count(text):
    """
    Return the number of sentences in `text`, split with `sent_tokenize`
    (the same splitter used by `flesch_index` and `fog_index`).
    """
    return len(sent_tokenize(text))

def word_count(sent):
    """Return the number of alphabetic tokens `word_tokenize` finds in `sent`."""
    tokens = word_tokenize(sent)
    return len(tokens)

def hard_word_count(sent):
    """
    Return the number of "hard" words in `sent`.

    A token is hard when its verb lemma (`wnl.lemmatize(..., pos='v')`)
    has three or more syllables according to `syllable_count`.
    """
    n_hard = 0
    for token in word_tokenize(sent):
        lemma = wnl.lemmatize(token, pos='v')
        if syllable_count(lemma) >= 3:
            n_hard += 1
    return n_hard

In [84]:
# quick check: tokenize the first 100 characters of the 10-K text
word_tokenize(text_10k[:100])

['The',
 'following',
 'discussion',
 'of',
 'risk',
 'factors',
 'contains',
 'forward',
 'looking',
 'statements',
 'These',
 'risk',
 'factors',
 'may']

## 2. Readability Grade-Levels

Here, we will implement the two readability indices (grade levels). They are defined by

\begin{align}
\textrm{Flesch–Kincaid Grade} 
= 0.39 \left(
\frac{\textrm{Number of words}}{\textrm{Number of sentences}}
\right) \\
+11.8
\left(
\frac{\textrm{Number of syllables}}{\textrm{Number of words}}
\right)
-15.59
\end{align}

and

\begin{align}
\textrm{Gunning-Fog Grade} 
=\; &0.4 \bigg[ 
\left(
\frac{\textrm{Number of words}}{\textrm{Number of sentences}}
\right) 
+100
\left(
\frac{\textrm{Number of hard words}}{\textrm{Number of words}}
\right)
\bigg]
\end{align}

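To count syllables, use the `syllable_count` function defined in the pre-processing section above. If it is instead provided as a separate `syllable_count` module, you can access it via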

```
from syllable_count import syllable_count
syllable_count("syllable")
```

Below, implement the functions `flesch_index` and `fog_index`, which compute the readability grade level of a given text.

In [86]:
def flesch_index(text):
    """
    Return the Flesch-Kincaid grade level of `text`:

        0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59

    Sentences come from `sent_tokenize`, words from `word_tokenize` and
    syllables from `syllable_count`.
    """
    sentences = sent_tokenize(text)
    # tokenize each sentence once and reuse the tokens for both totals
    tokens_per_sentence = [word_tokenize(s) for s in sentences]

    n_sentences = len(sentences)
    n_words = np.sum([len(tokens) for tokens in tokens_per_sentence])
    n_syllables = np.sum([np.sum([syllable_count(w) for w in tokens])
                          for tokens in tokens_per_sentence])

    words_per_sentence = n_words/n_sentences
    syllables_per_word = n_syllables/n_words
    return 0.39*words_per_sentence + 11.8*syllables_per_word - 15.59

def fog_index(text):
    """
    Return the Gunning-Fog grade level of `text`:

        0.4 * ((words / sentences) + 100 * (hard words / words))

    Sentences come from `sent_tokenize`; words and hard words are counted
    per sentence with `word_count` and `hard_word_count`.
    """
    sentences = sent_tokenize(text)

    n_sentences = len(sentences)
    n_words = np.sum(list(map(word_count, sentences)))
    n_hard_words = np.sum(list(map(hard_word_count, sentences)))

    words_per_sentence = n_words/n_sentences
    hard_word_percentage = 100.0*(n_hard_words/n_words)
    return 0.4*(words_per_sentence + hard_word_percentage)

## 3. Results

In [87]:
# Flesch-Kincaid and Gunning-Fog grade for each sample text
for sample_text in (text_alice, text_phy, text_10k):
    print(flesch_index(sample_text), fog_index(sample_text))

7.78094652406 9.73654188948
16.3171712123 19.3225332001
18.2108288106 21.5614490682


We expect a grade level around 7-10 for `alice.txt`, and around 16-19 for `physics.txt`, and 18+ for financial documents! 

It turns out that 10-Ks are really *hard*-to-read legal documents!
Now, let's compute the readability for all the 10-Ks we have.

In [89]:
# every extracted 10-K section in the Data folder, in sorted order
# (pure Python instead of the `!ls` shell call, so it also runs without a POSIX shell)
filelist_10k = sorted(glob.glob('Data/*10k*txt'))


flesch = []
fog = []

for file in filelist_10k:
    # same loading rule as for the sample texts (';' is replaced by '.')
    text = safeRead(file)
    flesch.append(flesch_index(text))
    fog.append(fog_index(text))
    print(file, flesch[-1],fog[-1])

Data/AAPL_10k_2013.txt 18.1336596757 21.4219541786
Data/AAPL_10k_2014.txt 18.1536894665 21.533048686
Data/AAPL_10k_2015.txt 18.2144706379 21.6060051245
Data/AAPL_10k_2016.txt 18.2620196893 21.6361424013
Data/AAPL_10k_2017.txt 18.2108288106 21.5614490682
Data/EBAY_10k_2013.txt 17.2088261149 19.4673717189
Data/EBAY_10k_2014.txt 17.522305957 19.844332095
Data/EBAY_10k_2015.txt 17.1741438469 19.5172704435
Data/EBAY_10k_2016.txt 16.8119978036 19.2121925858
Data/EBAY_10k_2017.txt 16.988036714 19.3980211714
Data/SHLDQ_10k_2013.txt 16.8126305116 19.2154420317
Data/SHLDQ_10k_2014.txt 17.1138126995 19.5253765922
Data/SHLDQ_10k_2015.txt 18.304118527 21.0016011567
Data/SHLDQ_10k_2016.txt 18.7321020854 21.4781606764
Data/SHLDQ_10k_2017.txt 17.755571973 20.6452057848


Superficially, and according to our readability metrics, reading 10-Ks is harder than reading articles on theoretical physics!