In [None]:
import string
import nltk
import enchant
from spello.model import SpellCorrectionModel
import textstat
import re

# Fetch the NLTK resources needed by nltk.pos_tag(..., tagset='universal'),
# which unrecognized_by_pos calls: the perceptron tagger and the universal tagset.
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

In [None]:
# Load the spell checker model: create an English spello SpellCorrectionModel
# and restore its state from ./spello_model/en.pkl. `sp` is the module-level
# instance that spell_checker_2 uses.
# NOTE(review): the .pkl extension suggests the file is unpickled on load --
# presumably only a model file from a trusted source should be used; confirm.
sp = SpellCorrectionModel(language='en')
sp.load('./spello_model/en.pkl')

In [None]:
def removePunctuationFromTokenized(contentsTokenized):
    """Return the tokens of ``contentsTokenized`` that are not punctuation.

    A token is dropped when it is exactly one character of
    ``string.punctuation`` or exactly one of the two-character marks
    (two single quotes, two dashes, two backticks). Longer tokens that merely
    contain punctuation are kept. The order of the surviving tokens is
    preserved and the input is not modified.
    """
    # Two-character marks are not part of string.punctuation, so add them.
    multiCharMarks = {"''", "--", "``"}
    punctuationTokens = set(string.punctuation) | multiCharMarks

    keptTokens = []
    for token in contentsTokenized:
        if token in punctuationTokens:
            continue
        keptTokens.append(token)
    return keptTokens

#### Lexical Diversity

In [None]:
def lexical_diversity(text):
    """Return the ratio of distinct items to total items in ``text``.

    Parameters:
        text: a sized collection of hashable items (the notebook passes a
            list of word tokens).

    Returns:
        ``len(set(text)) / len(text)`` as a float, or ``0.0`` when ``text``
        is empty (instead of raising ``ZeroDivisionError``).
    """
    total = len(text)
    if total == 0:
        # No tokens at all: define the diversity as 0 rather than dividing by 0.
        return 0.0
    return len(set(text)) / total

#### Percentage of Uppercased and Lowercased Words 

In [None]:
def percentage_uppercased(text_tokenized):
    """Return the fractions of all-lowercase and all-uppercase tokens.

    Parameters:
        text_tokenized: list of string tokens.

    Returns:
        A tuple ``(lowercase_fraction, uppercase_fraction)``, each in [0, 1].
        Tokens that are neither (mixed case, or without cased characters such
        as "123") count toward the total only, so the two values need not sum
        to 1. Returns ``(0.0, 0.0)`` for an empty token list instead of
        raising ``ZeroDivisionError``.
    """
    total_words = len(text_tokenized)
    if total_words == 0:
        return (0.0, 0.0)

    # str.islower() and str.isupper() are mutually exclusive, so two
    # independent counts match the original if/elif chain.
    lowercase_count = sum(1 for word in text_tokenized if word.islower())
    uppercase_count = sum(1 for word in text_tokenized if word.isupper())

    return (lowercase_count / total_words, uppercase_count / total_words)

#### Spell Checking Against Dictionary

In [None]:
def spell_checker_1(text_tokenized):
    """Count the tokens rejected by the enchant "en_US" dictionary.

    Parameters:
        text_tokenized: iterable of string tokens.

    Returns:
        The number of tokens for which ``Dict.check`` returns a falsy value.
    """
    dictionary = enchant.Dict("en_US")
    return sum(1 for token in text_tokenized if not dictionary.check(token))

#### Spell Checking with ML model

In [None]:
def spell_checker_2(raw_text,text_tokenized):
    """Count the tokens that the spello model decided to correct.

    Runs the module-level model ``sp`` on ``raw_text`` and counts how many
    entries of ``text_tokenized`` appear as keys of the ``'correction_dict'``
    in its result. A token occurring several times is counted each time.

    Parameters:
        raw_text: the untokenized text handed to ``sp.spell_correct``.
        text_tokenized: iterable of tokens to look up.

    Returns:
        The number of tokens found in the correction dictionary.
    """
    result = sp.spell_correct(raw_text)
    # The dictionary is looked up per token, exactly as in the original loop.
    return sum(1 for token in text_tokenized if token in result['correction_dict'])

#### Unrecognized Words by POS Tagger

In [None]:
def unrecognized_by_pos(text_tokenized):
    """Return the fraction of tokens that the POS tagger tags as "X".

    The tokens are tagged with ``nltk.pos_tag`` using the universal tagset;
    this function treats the "X" tag as "unrecognized".

    Parameters:
        text_tokenized: list of string tokens.

    Returns:
        ``count("X") / len(tokens)`` as a float, or ``0.0`` for an empty token
        list (instead of raising ``ZeroDivisionError``).
    """
    if not text_tokenized:
        # Nothing to tag: skip the tagger and avoid dividing by zero.
        return 0.0

    text_tagged = nltk.pos_tag(text_tokenized, tagset='universal')
    unknown = sum(1 for _token, tag in text_tagged if tag == "X")
    return unknown / len(text_tagged)

#### Average Sentence Length

In [None]:
def avg_sentence_length(text):
    """Return the average number of words per sentence in ``text``.

    Sentences are obtained by splitting on ``!``, ``?`` and ``.`` after
    newlines are replaced by spaces; blank pieces are discarded. Words are the
    non-empty pieces produced by splitting a sentence on non-word characters,
    so "He's" counts as two words. Every period ends a sentence, including
    those inside abbreviations such as "U.S.".

    Parameters:
        text: a string, or an iterable of strings that is concatenated without
            separators before processing.

    Returns:
        Total word count divided by sentence count as a float, or ``0.0`` when
        the text contains no sentences (empty or punctuation-only input)
        instead of raising ``ZeroDivisionError``.
    """
    terminating_punct = r"[!?.]"
    non_word = r"\W"

    flattened = "".join(text).replace("\n", " ")  # text as one string
    sentences = [
        piece.strip()
        for piece in re.split(terminating_punct, flattened)
        if piece.strip()  # keep non-empty sentences only
    ]
    if not sentences:
        return 0.0

    total_words = sum(
        len([word for word in re.split(non_word, sentence) if word])
        for sentence in sentences
    )
    return total_words / len(sentences)


#### Acronyms 1

In [None]:
def acronym1(s):
    """Find acronyms made of uppercase letters and digits in ``s``.

    A match is a whole word (delimited by word boundaries on both sides) that
    is either digits followed by an uppercase letter and further uppercase
    letters/digits (e.g. "3D"), or an uppercase letter followed by at least
    one uppercase letter or digit (e.g. "NASA", "B2B").

    Both alternatives sit inside a single group so that the ``\\b`` anchors
    apply to each of them; without the group, fragments such as "BC" in "aBC"
    or "3D" in "3Dx" were matched.

    Parameters:
        s: the text to search.

    Returns:
        A list of matched strings in order of appearance.
    """
    return re.findall(r"\b(?:[0-9]+[A-Z][A-Z0-9]*|[A-Z][A-Z0-9]+)\b", s)

#### Acronyms 2

In [None]:
def acronym2(s):
    """Find runs of two or more uppercase letters and/or dots in ``s``.

    Matches all-uppercase words ("NASA") and dotted forms ("U.S.A"). The match
    must start and end on a word boundary, so a final dot followed by
    whitespace or the end of the text is not included: "U.S.A. " yields
    "U.S.A".

    Parameters:
        s: the text to search.

    Returns:
        A list of matched strings in order of appearance.
    """
    pattern = re.compile(r"\b[A-Z\.]{2,}\b")
    return [match.group(0) for match in pattern.finditer(s)]

#### Acronyms

In [None]:
def acronym(s):
    """Find acronyms in ``s`` that are not ordinary English words.

    Candidates are whole words that are digits followed by uppercase
    letters/digits ("3D"), an uppercase letter followed by uppercase
    letters/digits ("NASA", "B2B"), or runs of two or more uppercase letters
    and dots ("U.S.A"). A candidate is discarded when its lowercase form is
    accepted by the enchant "en_US" dictionary (e.g. "IT" -> "it").

    The result is built with a filter instead of removing items from the list
    while iterating over it, which skipped the candidate that followed each
    removed one. The first two alternatives are grouped so the ``\\b`` anchors
    apply to both sides of each of them.

    Parameters:
        s: the text to search.

    Returns:
        A list of the retained acronyms in order of appearance.
    """
    candidates = re.findall(
        r"\b(?:[0-9]+[A-Z][A-Z0-9]*|[A-Z][A-Z0-9]+)\b|\b[A-Z\.]{2,}\b", s
    )
    d = enchant.Dict("en_US")
    return [candidate for candidate in candidates if not d.check(candidate.lower())]


In [None]:
# Compute and print every metric defined above for a single corpus file.
path = "./data/corpora/acronimi2.txt"

# NOTE(review): the file is read with the platform default encoding -- confirm
# the corpus encoding and pass it explicitly if it differs.
with open(path, "r") as ifile:
    raw_text = ifile.read()

# Uncomment to try the functions on an inline sample instead of the file:
#raw_text = "Sample text to try functions"
text_tokenized = removePunctuationFromTokenized(nltk.word_tokenize(raw_text))

# percentage_uppercased returns fractions of *words* (tokens), so the labels
# say "words" rather than "sentences".
lower, upper = percentage_uppercased(text_tokenized)
print(f"Lexical diversity: {lexical_diversity(text_tokenized):.3f}")
print(f"Lowercase words: {lower:.3f}")
print(f"Uppercase words: {upper:.3f}")
print(f"Spelling mistakes 1: {spell_checker_1(text_tokenized)}")
print(f"Spelling mistakes 2: {spell_checker_2(raw_text, text_tokenized)}")
print(f"Unrecognized by POS tagger: {unrecognized_by_pos(text_tokenized)}")
print(f"Average sentence length: {avg_sentence_length(raw_text):.3f}")
print(f"Readability (CLI): {textstat.coleman_liau_index(raw_text)}")
print(f"Readability (ARI): {textstat.automated_readability_index(raw_text)}")
print(f"Acronyms: {acronym(raw_text)}")

In [5]:
from allennlp.predictors.predictor import Predictor

# Load the pretrained coreference model. These lines must stay active:
# `predictor` is not defined anywhere else, so with them commented out the
# cell raises NameError on a fresh kernel (Restart & Run All).
model_url = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
predictor = Predictor.from_path(model_url)

text = "Joe Biden is president. He's healty."


prediction = predictor.predict(document=text)  # get prediction
print("Clsuters:-")
for cluster in prediction['clusters']:
    print(cluster)  # list of clusters (the indices of spaCy tokens)
# Recorded output for this text: [[0, 1], [5, 5]]
print('\n\n') #Newline

print('Coref resolved: ',predictor.coref_resolved(text))  # resolved text
# Recorded output for this text: Joe Biden is president. Joe Biden's healty.
# Result: Joseph Robinette Biden Jr. is an American politician who is the 

Clsuters:-
[[0, 1], [5, 5]]



Coref resolved:  Joe Biden is president. Joe Biden's healty.
