#### Configuration

In [1]:
# Notebook-wide settings read by the indicator functions.
cfg = dict(
    uploadDir="./data",           # folder whose files are scored
    optimal_sentence_length=16,   # target average sentence length, in words
)

# Every indicator a file can receive. Each file gets its own copy of this
# mapping, and an indicator stays None until it has been computed.
indicatorsTemplate = dict.fromkeys(
    (
        "parsable",
        "confidence_tokenizer",
        "confidence_pos",
        "confidence_ner",
        "confidence_chunker",
        "fit",
        "spelling_mistakes",
        "avg_sentence_len",
        "perc_lowercase",
        "perc_uppercase",
        "lexical_diversity",
        "recognized_by_pos",
        "acronyms",
        "present_in_dictionary",
        "readability_cli",
        "readability_ari",
    )
)

#### Initialization

In [2]:
from http import server
import copy
import os
from os import listdir
from os.path import isfile, join
import random
from subprocess import check_output
from threading import Thread
import time
import string
import nltk
import enchant
from spello.model import SpellCorrectionModel
import re
import textstat

In [3]:
# Build the spell-correction model (`sp`) that computeSpellingMistakes calls,
# and load its weights from the notebook-relative ./spello_model folder.
sp = SpellCorrectionModel(language='en')
# sp.load('./spello_model/en_large.pkl')
sp.load('./spello_model/en.pkl')


# NOTE(review): the statements below re-import the class, rebind `sp` and load
# a different pickle from an absolute /home/ubuntu path, so the model loaded
# above is discarded. This looks like a pasted usage example -- confirm which
# model is actually intended and whether the absolute paths exist on the
# machine running the notebook.
from spello.model import SpellCorrectionModel 
sp = SpellCorrectionModel(language='en')  
sp.load('/home/ubuntu/model.pkl')
# Presumably these bound the word lengths considered for correction --
# TODO confirm against the spello documentation.
sp.config.min_length_for_spellcorrection = 4 # default is 3
sp.config.max_length_for_spellcorrection = 12 # default is 15
# Persist the re-configured model under /home/ubuntu/.
sp.save(model_save_dir='/home/ubuntu/')




<spello.model.SpellCorrectionModel at 0x18e560fa6e0>

In [4]:
# Dictionary keyed by file name; each value is that file's own mapping of
# indicator name -> computed value.
files = {}

In [5]:
# helper shared by the token-based indicators
def removePunctuationFromTokenized(contentsTokenized):
    """Return the tokens of ``contentsTokenized`` that are not punctuation.

    A token is dropped when it is one of the single characters in
    ``string.punctuation`` or one of the multi-character marks ``''``,
    ``--`` and `````` . The order of the remaining tokens is preserved.
    """
    # Multi-character marks that string.punctuation cannot contain.
    extraMarks = {"''", "--", "``"}
    punctuationTokens = set(string.punctuation) | extraMarks
    return [token for token in contentsTokenized if token not in punctuationTokens]

In [6]:
def computeSpellingMistakes(filename, indicator):
    """Store the share of a file's tokens that the spell checker left untouched.

    The file ``filename`` is read from ``cfg["uploadDir"]``, tokenized with
    ``nltk.word_tokenize`` and stripped of punctuation tokens. Every token
    found in the ``correction_dict`` returned by ``sp.spell_correct`` counts
    as a mistake. The percentage of tokens that are *not* mistakes is written
    to ``files[filename][indicator]`` as the first four characters of its
    string form.

    When the file yields no tokens the indicator is set to ``None`` instead
    of raising ``ZeroDivisionError``.

    Args:
        filename: name of a file inside ``cfg["uploadDir"]``; must already be
            a key of ``files``.
        indicator: key under which the result is stored.
    """
    print(f"running computation of {indicator} for {filename}")
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(os.path.join(cfg["uploadDir"], filename), "r") as f:
        raw_text = f.read()

    text_tokenized = removePunctuationFromTokenized(
        nltk.word_tokenize(raw_text))
    if not text_tokenized:
        # Nothing to score: a percentage over zero tokens is undefined.
        files[filename][indicator] = None
        return

    corrections = sp.spell_correct(raw_text)['correction_dict']
    mistakes = sum(1 for w in text_tokenized if w in corrections)
    result = (1 - (mistakes / len(text_tokenized))) * 100
    # Kept as a 4-character string because later cells parse this format.
    files[filename][indicator] = str(result)[0:4]

In [7]:
def computeRecognizedByPOS(filename, indicator):
    """Store the share of a file's tokens that the POS tagger did not tag ``X``.

    The file ``filename`` is read from ``cfg["uploadDir"]``, tokenized with
    ``nltk.word_tokenize``, stripped of punctuation tokens and tagged with
    ``nltk.pos_tag`` using the ``universal`` tagset. Tokens tagged ``"X"``
    are counted as unknown; the percentage of the remaining tokens is written
    to ``files[filename][indicator]`` as the first four characters of its
    string form.

    When the file yields no tokens the indicator is set to ``None`` instead
    of raising ``ZeroDivisionError``.

    Args:
        filename: name of a file inside ``cfg["uploadDir"]``; must already be
            a key of ``files``.
        indicator: key under which the result is stored.
    """
    print(f"running computation of {indicator} for {filename}")
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(os.path.join(cfg["uploadDir"], filename), "r") as f:
        raw_text = f.read()

    text_tokenized = removePunctuationFromTokenized(
        nltk.word_tokenize(raw_text))
    if not text_tokenized:
        # Nothing to score: a percentage over zero tokens is undefined.
        files[filename][indicator] = None
        return

    text_tagged = nltk.pos_tag(text_tokenized, tagset='universal')
    # Each tagged item is a (token, tag) pair; only the tag matters here.
    unknown = sum(1 for _, tag in text_tagged if tag == "X")
    result = (1 - (unknown / len(text_tagged))) * 100
    # Kept as a 4-character string because later cells parse this format.
    files[filename][indicator] = str(result)[0:4]

In [8]:
def wordcount(s):
    """Count the words in the sentence ``s``.

    The string is split at every non-word character and the empty pieces
    produced by consecutive separators are ignored.
    """
    pieces = re.split(r"\W", s)
    return sum(1 for piece in pieces if piece)

def computeAvgSentLen(filename, indicator):
    """Store how close a file's average sentence length is to the optimum.

    The file ``filename`` is read from ``cfg["uploadDir"]``, newlines are
    turned into spaces and the text is split into sentences at ``!``, ``?``
    and ``.``. The average of ``wordcount`` over the non-empty sentences is
    capped at twice ``cfg["optimal_sentence_length"]`` and converted to a
    percentage: 100 when the average equals the optimum, 0 when it is zero
    or at least twice the optimum. The value is written to
    ``files[filename][indicator]`` as the first four characters of its string
    form.

    When no sentence is found the indicator is set to ``None`` instead of
    raising ``ZeroDivisionError``.

    Args:
        filename: name of a file inside ``cfg["uploadDir"]``; must already be
            a key of ``files``.
        indicator: key under which the result is stored.
    """
    print(f"running computation of {indicator} for {filename}")
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(os.path.join(cfg["uploadDir"], filename), "r") as f:
        raw_text = f.read()

    terminating_punct = "[!?.]"
    # raw_text is already a single string; only line breaks need flattening.
    flattened = raw_text.replace("\n", " ")
    sentences = [
        s.strip()
        for s in re.split(terminating_punct, flattened)
        if s.strip()  # skip pieces that are empty or whitespace only
    ]
    if not sentences:
        # Nothing to score: an average over zero sentences is undefined.
        files[filename][indicator] = None
        return

    avgSentenceLength = sum(map(wordcount, sentences)) / len(sentences)
    optimalSentenceLen = cfg["optimal_sentence_length"]
    # Cap the average so that the score bottoms out at 0 rather than going
    # negative for very long sentences.
    avgSentenceLength = min(avgSentenceLength, 2 * optimalSentenceLen)
    result = (1 - abs(optimalSentenceLen - avgSentenceLength) /
              optimalSentenceLen) * 100
    # Kept as a 4-character string because later cells parse this format.
    files[filename][indicator] = str(result)[0:4]

In [9]:
def computePresentInDictionary(filename, indicator):
    """Store the share of a file's tokens accepted by the ``en_US`` dictionary.

    The file ``filename`` is read from ``cfg["uploadDir"]``, tokenized with
    ``nltk.word_tokenize`` and stripped of punctuation tokens. Each token is
    checked with ``enchant.Dict("en_US").check``; the percentage of accepted
    tokens is written to ``files[filename][indicator]`` as the first four
    characters of its string form.

    When the file yields no tokens the indicator is set to ``None`` instead
    of raising ``ZeroDivisionError``.

    Args:
        filename: name of a file inside ``cfg["uploadDir"]``; must already be
            a key of ``files``.
        indicator: key under which the result is stored.
    """
    print(f"running computation of {indicator} for {filename}")
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(os.path.join(cfg["uploadDir"], filename), "r") as f:
        raw_text = f.read()

    text_tokenized = removePunctuationFromTokenized(
        nltk.word_tokenize(raw_text))
    if not text_tokenized:
        # Nothing to score: a percentage over zero tokens is undefined.
        files[filename][indicator] = None
        return

    d = enchant.Dict("en_US")
    correct = sum(1 for word in text_tokenized if d.check(word))
    result = (correct / len(text_tokenized)) * 100
    # Kept as a 4-character string because later cells parse this format.
    files[filename][indicator] = str(result)[0:4]

In [10]:
def computeLexicalDiversity(filename, indicator):
    """Store the percentage of distinct tokens among a file's tokens.

    The file ``filename`` is read from ``cfg["uploadDir"]``, tokenized with
    ``nltk.word_tokenize`` and stripped of punctuation tokens. The number of
    distinct tokens divided by the total number of tokens, times 100, is
    written to ``files[filename][indicator]`` as the first four characters of
    its string form. Tokens are compared exactly as produced by the
    tokenizer (case-sensitive).

    When the file yields no tokens the indicator is set to ``None`` instead
    of raising ``ZeroDivisionError``.

    Args:
        filename: name of a file inside ``cfg["uploadDir"]``; must already be
            a key of ``files``.
        indicator: key under which the result is stored.
    """
    print(f"running computation of {indicator} for {filename}")
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(os.path.join(cfg["uploadDir"], filename), "r") as f:
        raw_text = f.read()

    text_tokenized = removePunctuationFromTokenized(
        nltk.word_tokenize(raw_text))
    if not text_tokenized:
        # Nothing to score: a ratio over zero tokens is undefined.
        files[filename][indicator] = None
        return

    # TODO normalize

    result = (len(set(text_tokenized)) / len(text_tokenized)) * 100
    # Kept as a 4-character string because later cells parse this format.
    files[filename][indicator] = str(result)[0:4]

In [11]:
def setJavaIndicators(filename, result):
    """Copy the five Java-computed indicators into ``files[filename]``.

    ``result`` is indexed positionally: items 0 to 4 supply, in this order,
    ``parsable``, ``confidence_tokenizer``, ``confidence_pos``,
    ``confidence_ner`` and ``confidence_chunker``. Only the first four
    characters of each item are stored.
    """
    javaKeys = (
        "parsable",
        "confidence_tokenizer",
        "confidence_pos",
        "confidence_ner",
        "confidence_chunker",
    )
    fileIndicators = files[filename]
    # Index explicitly so that a too-short result still raises IndexError.
    for position, key in enumerate(javaKeys):
        fileIndicators[key] = result[position][0:4]

def computeJavaIndicators(filename):
    """Run the Java indicators jar on ``filename`` and store what it prints.

    The jar ``./java-indicators/java-indicators.jar`` is launched with the
    absolute path of the file (resolved inside ``cfg["uploadDir"]``) and the
    absolute path of ``./java-indicators/models``. Its output is decoded,
    split on commas and handed to ``setJavaIndicators``.
    """
    # The jar receives absolute paths for both the input file and the models.
    targetFile = os.path.abspath(os.path.join(cfg["uploadDir"], filename))
    modelsDir = os.path.abspath("./java-indicators/models")
    command = ['java', '-jar', './java-indicators/java-indicators.jar',
               targetFile, modelsDir]
    rawOutput = check_output(command)
    fields = rawOutput.decode().split(",")
    setJavaIndicators(filename, fields)

In [12]:
def computeAcronyms(filename, indicator):
    """Store the share of a file's tokens that are not acronyms.

    The file ``filename`` is read from ``cfg["uploadDir"]``. Acronym
    candidates are the substrings of the raw text matched by the regular
    expression below; the text is also tokenized with ``nltk.word_tokenize``
    and stripped of punctuation tokens. Every token equal to one of the
    candidates counts as an acronym, and the percentage of tokens that are
    *not* acronyms is written to ``files[filename][indicator]`` as the first
    four characters of its string form.

    When the file yields no tokens the indicator is set to ``None`` instead
    of raising ``ZeroDivisionError``.

    Args:
        filename: name of a file inside ``cfg["uploadDir"]``; must already be
            a key of ``files``.
        indicator: key under which the result is stored.
    """
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(os.path.join(cfg["uploadDir"], filename), "r") as f:
        raw_text = f.read()

    text_tokenized = removePunctuationFromTokenized(
        nltk.word_tokenize(raw_text))
    if not text_tokenized:
        # Nothing to score: a percentage over zero tokens is undefined.
        files[filename][indicator] = None
        return

    # A set gives constant-time membership tests; the original list was
    # scanned once per token.
    acronyms = set(re.findall(
        r"\b(?:[0-9]+[A-Z][A-Z0-9]*)|(?:[A-Z][A-Z0-9]+)\b|\b[A-Z\.]{2,}\b", raw_text))
    acronyms_count = sum(1 for word in text_tokenized if word in acronyms)
    result = (1 - (acronyms_count / len(text_tokenized))) * 100
    # Kept as a 4-character string because later cells parse this format.
    files[filename][indicator] = str(result)[0:4]

In [13]:
def computeReadabilityCli( filename, indicator):
    """Store a percentage derived from the file's Coleman-Liau index.

    The file ``filename`` is read from ``cfg["uploadDir"]`` and scored with
    ``textstat.coleman_liau_index``. Scores above 18 are treated as 18; the
    score is then mapped linearly so that 3 gives 100 and 18 gives 0. The
    value is written to ``files[filename][indicator]`` as the first four
    characters of its string form.
    """
    print(f"running computation of {indicator} for {filename}")
    sourcePath = os.path.join(cfg["uploadDir"], filename)
    with open(sourcePath, "r") as f:
        raw_text = f.read()

    bestScore, ceiling = 3, 18
    # Anything harder to read than the ceiling counts as the ceiling.
    cappedScore = min(textstat.coleman_liau_index(raw_text), ceiling)
    distance = abs(bestScore - cappedScore)
    result = (1 - distance / (ceiling - bestScore)) * 100
    files[filename][indicator] = str(result)[0:4]

In [14]:
def computeReadabilityAri(filename, indicator):
    """Store a percentage derived from the file's Automated Readability Index.

    The file ``filename`` is read from ``cfg["uploadDir"]`` and scored with
    ``textstat.automated_readability_index``. Scores above 18 are treated as
    18; the score is then mapped linearly so that 3 gives 100 and 18 gives 0.
    The value is written to ``files[filename][indicator]`` as the first four
    characters of its string form.
    """
    print(f"running computation of {indicator} for {filename}")
    sourcePath = os.path.join(cfg["uploadDir"], filename)
    with open(sourcePath, "r") as f:
        raw_text = f.read()

    bestScore, ceiling = 3, 18
    # Anything harder to read than the ceiling counts as the ceiling.
    cappedScore = min(textstat.automated_readability_index(raw_text), ceiling)
    distance = abs(bestScore - cappedScore)
    result = (1 - distance / (ceiling - bestScore)) * 100
    files[filename][indicator] = str(result)[0:4]

In [15]:
# Regular files found directly inside the upload folder.
listOfFileNames = [
    entry
    for entry in listdir(cfg["uploadDir"])
    if isfile(join(cfg["uploadDir"], entry))
]

# Python-side indicators, in the order they are computed for every file.
indicatorJobs = (
    (computeSpellingMistakes, "spelling_mistakes"),
    (computeAvgSentLen, "avg_sentence_len"),
    (computeLexicalDiversity, "lexical_diversity"),
    (computeRecognizedByPOS, "recognized_by_pos"),
    (computeAcronyms, "acronyms"),
    (computePresentInDictionary, "present_in_dictionary"),
    (computeReadabilityCli, "readability_cli"),
    (computeReadabilityAri, "readability_ari"),
)

for filename in listOfFileNames:
    # Give the file its own blank set of indicators, then fill it in:
    # first the Java-computed values, then the Python-computed ones.
    files[filename] = copy.deepcopy(indicatorsTemplate)
    computeJavaIndicators(filename)
    for computeIndicator, indicatorName in indicatorJobs:
        computeIndicator(filename, indicatorName)
    

running computation of spelling_mistakes for acronimi2.txt
running computation of avg_sentence_len for acronimi2.txt
running computation of lexical_diversity for acronimi2.txt
running computation of recognized_by_pos for acronimi2.txt
running computation of present_in_dictionary for acronimi2.txt
running computation of readability_cli for acronimi2.txt
running computation of readability_ari for acronimi2.txt
running computation of spelling_mistakes for prose.txt
running computation of avg_sentence_len for prose.txt
running computation of lexical_diversity for prose.txt
running computation of recognized_by_pos for prose.txt
running computation of present_in_dictionary for prose.txt
running computation of readability_cli for prose.txt
running computation of readability_ari for prose.txt


In [16]:
# Regular files found directly inside the upload folder.
listOfFileNames = [
    entry
    for entry in listdir(cfg["uploadDir"])
    if isfile(join(cfg["uploadDir"], entry))
]

# One row per file: its indicator values, in the order of the template keys.
indicatorsList = [list(files[name].values()) for name in listOfFileNames]

In [50]:
import pandas as pd

# One row per file, one column per indicator, converted to floats.
ind_df = (
    pd.DataFrame(indicatorsList, columns=list(indicatorsTemplate.keys()))
    # The indicator functions keep only the first four characters of each
    # value, so 100.0 arrives as the string "100.". The dot is escaped and
    # the pattern anchored so that only that exact string is rewritten; an
    # unescaped "." would match "100" followed by any character.
    .replace({r'^100\.$': '100'}, regex=True)
    # Indicators that were never computed are None and become NaN here.
    .astype(float)
)
ind_df

Unnamed: 0,parsable,confidence_tokenizer,confidence_pos,confidence_ner,confidence_chunker,fit,spelling_mistakes,avg_sentence_len,perc_lowercase,perc_uppercase,lexical_diversity,recognized_by_pos,acronyms,present_in_dictionary,readability_cli,readability_ari
0,100.0,98.4,64.8,97.7,80.7,,60.8,37.5,,,95.6,95.6,86.9,39.1,64.8,71.3
1,100.0,98.6,92.0,99.4,94.2,,97.0,98.9,,,42.6,100.0,100.0,98.3,65.8,41.3


In [39]:
# Rewrite any remaining "100." strings as "100" and make every column float.
# The dot is escaped and the pattern anchored so that only that exact string
# is rewritten; an unescaped "." would match "100" followed by any character.
# The result is re-bound instead of mutating ind_df in place, so re-running
# this cell is harmless.
ind_df = ind_df.replace({r'^100\.$': '100'}, regex=True).astype(float)

In [56]:
# Mean of every indicator column across all files, as a one-column DataFrame
# indexed by indicator name.
columnMeans = ind_df.mean(axis="index")
df2 = columnMeans.to_frame()


Unnamed: 0,0
parsable,100.0
confidence_tokenizer,98.5
confidence_pos,78.4
confidence_ner,98.55
confidence_chunker,87.45
fit,
spelling_mistakes,78.9
avg_sentence_len,68.2
perc_lowercase,
perc_uppercase,


In [57]:
# Show only the indicators that have a mean, i.e. drop the rows holding NaN.
df2.dropna(axis=0, how="any")

Unnamed: 0,0
parsable,100.0
confidence_tokenizer,98.5
confidence_pos,78.4
confidence_ner,98.55
confidence_chunker,87.45
spelling_mistakes,78.9
avg_sentence_len,68.2
lexical_diversity,69.1
recognized_by_pos,97.8
acronyms,93.45
