# A look at some of the readability score metrics

<A HREF="https://en.wikipedia.org/wiki/Readability">Wikipedia - Readability</A><BR>
<A HREF="https://www.geeksforgeeks.org/readability-index-pythonnlp/">GeeksforGeeks Readability - Index in Python</A><BR>
<A HREF="https://pypi.org/project/readability/">Readability python package</A><BR>
    <A HREF="https://pypi.org/project/textstat/">Textstat python package</A>

In [38]:
import os
import re
import unicodedata
import numpy as np
import pandas as pd
import warnings
import nltk
import en_core_web_md
import matplotlib.pyplot as plt
import plotly.express as px
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from collections import defaultdict
from textblob import TextBlob
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize
import readability
import pprint # pretty print for easy printing of ordered dictionary
# import spacy # Needed for GeeksforGeeks code which is a mess
from textstat.textstat import textstatistics
spacy.load('en_core_web_md')

<spacy.lang.en.English at 0x7f4d037abd60>

In [2]:
tidy_data = pd.read_csv('tidy_data.csv')
tidy_data['date'] = pd.to_datetime(tidy_data['date'], format='%Y-%m-%d')
tidy_data.head(2)

Unnamed: 0,date,source,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,...,sadness,surprise,trust,num_sents,num_words,num_unique_words,depth,TBsubjectivity,TBpolarity,words_per_sentence
0,2008-06-04,nyt,0.064458,0.065088,0.0624,0.064962,0.064416,0.063408,0.055598,0.06492,...,0.07563,0.046218,0.130252,47,1459,620,7.276596,0.406976,0.13588,31.042553
1,2008-06-04,oba,0.064649,0.06467,0.06444,0.064398,0.064471,0.064638,0.054967,0.064503,...,0.042017,0.048739,0.159664,217,5856,939,5.986175,0.445383,0.167408,26.986175


In [4]:
sentences = pd.read_csv('sentence_depth.csv')
sentences['date'] = pd.to_datetime(sentences['date'], format='%Y-%m-%d')
sentences.head(2)

Unnamed: 0,date,source,sentence,depth
0,2004-07-28,oba,"On behalf of the great state of Illinois, cros...",8
1,2004-07-28,oba,"Tonight is a particular honor for me because, ...",5


### First, using the readability package

In [9]:
# Text should be encoded with UTF-8, one sentence per line, tokens space-separated.
results = readability.getmeasures(sentences['sentence'][0], lang='en')
pprint.pprint(results)

OrderedDict([('readability grades',
              OrderedDict([('Kincaid', 11.765714285714285),
                           ('ARI', 15.447142857142858),
                           ('Coleman-Liau', 11.701783714285718),
                           ('FleschReadingEase', 60.57928571428573),
                           ('GunningFogIndex', 16.914285714285715),
                           ('LIX', 60.142857142857146),
                           ('SMOGIndex', 13.954451150103322),
                           ('RIX', 9.0),
                           ('DaleChallIndex', 11.228514285714287)])),
             ('sentence info',
              OrderedDict([('characters_per_word', 4.857142857142857),
                           ('syll_per_word', 1.3928571428571428),
                           ('words_per_sentence', 28.0),
                           ('sentences_per_paragraph', 1.0),
                           ('type_token_ratio', 0.8214285714285714),
                           ('characters', 136),
              

In [11]:
pprint.pprint(results['readability grades'])

OrderedDict([('Kincaid', 11.765714285714285),
             ('ARI', 15.447142857142858),
             ('Coleman-Liau', 11.701783714285718),
             ('FleschReadingEase', 60.57928571428573),
             ('GunningFogIndex', 16.914285714285715),
             ('LIX', 60.142857142857146),
             ('SMOGIndex', 13.954451150103322),
             ('RIX', 9.0),
             ('DaleChallIndex', 11.228514285714287)])


### Now, try GeeksforGeeks code. OK, their code is a mess and needs some work

In [35]:
# Splits the text into sentences, using
# Spacy's sentence segmentation which can
# be found at https://spacy.io/usage/spacy-101
def break_sentences(text):
    nlp = spacy.load('en_core_web_md')
    doc = nlp(text)
    return list(doc.sents)
 
# Returns Number of Words in the text
def word_count(text):
    sentences = break_sentences(text)
    words = 0
    for sentence in sentences:
        words += len([token for token in sentence])
    return words
 
# Returns the number of sentences in the text
def sentence_count(text):
    sentences = break_sentences(text)
    return len(sentences)
 
# Returns average sentence length
def avg_sentence_length(text):
    words = word_count(text)
    sentences = sentence_count(text)
    average_sentence_length = float(words / sentences)
    return average_sentence_length
 
# Textstat is a python package, to calculate statistics from
# text to determine readability,
# complexity and grade level of a particular corpus.
# Package can be found at https://pypi.python.org/pypi/textstat
def syllables_count(word):
    return textstatistics().syllable_count(word)
 
# Returns the average number of syllables per
# word in the text
def avg_syllables_per_word(text):
    syllable = syllables_count(text)
    words = word_count(text)
    ASPW = float(syllable) / float(words)
    return _legacy_round(ASPW, 1)
 
# Return total Difficult Words in a text
def difficult_words(text):
     
    nlp = spacy.load('en_core_web_md')
    doc = nlp(text)
    # Find all words in the text
    words = []
    sentences = break_sentences(text)
    for sentence in sentences:
        words += [str(token) for token in sentence]
 
    # difficult words are those with syllables >= 2
    # easy_word_set is provide by Textstat as
    # a list of common words
    diff_words_set = set()
     
    for word in words:
        syllable_count = syllables_count(word)
        if word not in nlp.Defaults.stop_words and syllable_count >= 2:
            diff_words_set.add(word)
 
    return len(diff_words_set)
 
# A word is polysyllablic if it has more than 3 syllables
# this functions returns the number of all such words
# present in the text
def poly_syllable_count(text):
    count = 0
    words = []
    sentences = break_sentences(text)
    for sentence in sentences:
        words += [token for token in sentence]
     
 
    for word in words:
        syllable_count = syllables_count(word)
        if syllable_count >= 3:
            count += 1
    return count
 
 
def flesch_reading_ease(text):
    """
        Implements Flesch Formula:
        Reading Ease score = 206.835 - (1.015 × ASL) - (84.6 × ASW)
        Here,
          ASL = average sentence length (number of words
                divided by number of sentences)
          ASW = average word length in syllables (number of syllables
                divided by number of words)
    """
    FRE = 206.835 - float(1.015 * avg_sentence_length(text)) -\
          float(84.6 * avg_syllables_per_word(text))
    return _legacy_round(FRE, 2)
 
 
def gunning_fog(text):
    per_diff_words = (difficult_words(text) / word_count(text) * 100) + 5
    grade = 0.4 * (avg_sentence_length(text) + per_diff_words)
    return grade
 
 
def smog_index(text):
    """
        Implements SMOG Formula / Grading
        SMOG grading = 3 + ?polysyllable count.
        Here,
           polysyllable count = number of words of more
          than two syllables in a sample of 30 sentences.
    """
 
    if sentence_count(text) >= 3:
        poly_syllab = poly_syllable_count(text)
        SMOG = (1.043 * (30*(poly_syllab / sentence_count(text)))**0.5) \
                + 3.1291
        return legacy_round(SMOG, 1)
    else:
        return 0
 
 
def dale_chall_readability_score(text):
    """
        Implements Dale Challe Formula:
        Raw score = 0.1579*(PDW) + 0.0496*(ASL) + 3.6365
        Here,
            PDW = Percentage of difficult words.
            ASL = Average sentence length
    """
    words = word_count(text)
    # Number of words not termed as difficult words
    count = word_count - difficult_words(text)
    if words > 0:
 
        # Percentage of words not on difficult word list
 
        per = float(count) / float(words) * 100
     
    # diff_words stores percentage of difficult words
    diff_words = 100 - per
 
    raw_score = (0.1579 * diff_words) + \
                (0.0496 * avg_sentence_length(text))
     
    # If Percentage of Difficult Words is greater than 5 %, then;
    # Adjusted Score = Raw Score + 3.6365,
    # otherwise Adjusted Score = Raw Score
 
    if diff_words > 5:      
 
        raw_score += 3.6365
         
    return legacy_round(score, 2)

In [29]:
difficult_words(sentences['sentence'][0])

11

In [30]:
poly_syllable_count(sentences['sentence'][0])

AttributeError: 'spacy.tokens.span.Span' object has no attribute 'split'

In [36]:
flesch_reading_ease(sentences['sentence'][0])

NameError: name '_legacy_round' is not defined