In [1]:
import string
import nltk
import enchant
from spello.model import SpellCorrectionModel
import textstat

nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\livio\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\livio\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [2]:
#load spell checker model
sp = SpellCorrectionModel(language='en')
sp.load('./spello_model/en_large.pkl')


from spello.model import SpellCorrectionModel 
sp = SpellCorrectionModel(language='en')  
sp.load('/home/ubuntu/model.pkl')
sp.config.min_length_for_spellcorrection = 4 # default is 3
sp.config.max_length_for_spellcorrection = 12 # default is 15
sp.save(model_save_dir='/home/ubuntu/')




<spello.model.SpellCorrectionModel at 0x1ed5d349190>

In [5]:
def removePunctuationFromTokenized(contentsTokenized):
    excludePuncuation = set(string.punctuation)
    
    # manually add additional punctuation to remove
    doubleSingleQuote = '\'\''
    doubleDash = '--'
    doubleTick = '``'

    excludePuncuation.add(doubleSingleQuote)
    excludePuncuation.add(doubleDash)
    excludePuncuation.add(doubleTick)

    filteredContents = [word for word in contentsTokenized if word not in excludePuncuation]
    return filteredContents

#### Lexical Diversity

In [26]:
def lexical_diversity(text):
   return (len(set(text)) / len(text))

#### Percentage of Uppercased and Lowercased Words 

In [20]:
def percentage_uppercased(text_tokenized):
    islower = 0;
    isupper = 0;
    total_words = len(text_tokenized);

    for word in text_tokenized:      
        if word.islower():
            islower +=1         
        elif word.isupper():
            isupper+=1         

    return (islower/total_words,isupper/total_words)

#### Spell Checking Against Dictionary

In [6]:
def spell_checker_1(text_tokenized):
    d = enchant.Dict("en_US")
    mistakes = 0
    for word in text_tokenized:
        if not d.check(word):
            mistakes+=1
    return mistakes        

#### Spell Checking with ML model

In [7]:
def spell_checker_2(raw_text):
    corrected = sp.spell_correct(raw_text)
    return len(corrected['correction_dict'])

#### Unrecognized Words by POS Tagger

In [8]:
def unknown_words(text_tokenized):
    text_tagged = nltk.pos_tag(text_tokenized,tagset='universal')
    unknown = 0
    for t in text_tagged:
         if t[1] == "X":
                #return t
                unknown += 1
    return unknown/len(text_tagged)            

#### Average Sentence Length

In [9]:
import doctest
import re


def avg_sentence_length(text):
    
    terminating_punct = "[!?.]"
    punct = r"\W"  # non-word characters
    sentences = [
        s.strip()  # without trailing whitespace
        for s in re.split(
            terminating_punct,
            "".join(text).replace("\n", " "),  # text as 1 string
        )
        if s.strip()  # non-empty
    ]

    def wordcount(s):
        """Split sentence s on punctuation
        and return number of non-empty words
        """
        return len([w for w in re.split(punct, s) if w])
    #map each sentece to its wordcount then sum all the wordcounts
    return sum(map(wordcount, sentences)) / len(sentences)


In [25]:
path = "./data/input/input_file.txt"

with open(path, "r") as ifile:
    raw_text = ifile.read()
    
#raw_text = "Ciao mi chiamo Livio. Questa è una prova sdfsdf sdf sd f"    
print("Lexical diversity:" + str(lexical_diversity(raw_text)))

text_tokenized = removePunctuationFromTokenized(nltk.word_tokenize(raw_text))
lower,upper = percentage_uppercased(text_tokenized)
print("All lowercase: "+ "{0:.3f}".format(lower))
print("All uppercase: "+ "{0:.3f}".format(upper))
print("Spelling mistakes 1: "  + str(spell_checker_1(text_tokenized)))
print("Spelling mistakes 2: "  + str(spell_checker_2(raw_text)))
print("Unrecognized by POS tagger: "+str(unknown_words(text_tokenized)))
print("Average sentence length: " +"{0:.3f}".format(avg_sentence_length(raw_text)))
print("Readability (CLI): " + str(textstat.coleman_liau_index(raw_text)))
print("Readability (ARI): " + str(textstat.automated_readability_index(raw_text)))

Lexical diversity:0.2545454545454545
All lowercase: 0.593
All uppercase: 0.074
Spelling mistakes 1: 6
Spelling mistakes 2: 2
Unrecognized by POS tagger: 0.0
Average sentence length: 4.667
Readability (CLI): 7.57
Readability (ARI): 6.4
