#### Configuration

In [1]:
# Notebook-wide settings: where the corpus files are read from, where the
# summary CSV is written, and the average sentence length (in words) that
# scores 100 for the "avg_sentence_len" indicator.
cfg = {
    "uploadDir": "./data/corpora/nltkCorpus",
    "resultDir": "./data/results",
    "optimal_sentence_length": 16,
}

# Every analysed file starts from a copy of this template: one entry per
# indicator, in output-column order, all initially unset (None).
indicatorsTemplate = dict.fromkeys([
    "parsable",
    "confidence_tokenizer",
    "confidence_pos",
    "confidence_ner",
    "confidence_chunker",
    "fit",
    "spelling_mistakes",
    "avg_sentence_len",
    "perc_lowercase",
    "perc_uppercase",
    "lexical_diversity",
    "recognized_by_pos",
    "acronyms",
    "present_in_dictionary",
    "readability_cli",
    "readability_ari",
])

#### Initialization

In [2]:
from http import server
import copy
import os
from os import listdir
from os.path import isfile, join
import random
from subprocess import check_output
from threading import Thread
import time
import string
import nltk
import enchant
from spello.model import SpellCorrectionModel
import re
import textstat
import pandas as pd

In [3]:
# Spell-correction model used by computeSpellingMistakes (via sp.spell_correct).
sp = SpellCorrectionModel(language='en')
# The model file is loaded from a path relative to the current working directory.
sp.load('./spello_model/en_large.pkl')


# NOTE(review): the lines below look like a pasted spello usage example. If they
# are executed they rebind `sp` to a model loaded from an absolute,
# machine-specific path, change its min/max word-length settings and save a copy
# back to that directory -- confirm whether they are meant to run at all.
from spello.model import SpellCorrectionModel 
sp = SpellCorrectionModel(language='en')  
sp.load('/home/ubuntu/model.pkl')
sp.config.min_length_for_spellcorrection = 4 # default is 3
sp.config.max_length_for_spellcorrection = 12 # default is 15
sp.save(model_save_dir='/home/ubuntu/')




<spello.model.SpellCorrectionModel at 0x247687f4ee0>

In [4]:
# Dictionary mapping each analysed filename to its own dict of indicator values
# (a copy of indicatorsTemplate that the compute* functions fill in).
files = dict()

In [5]:
# helper function
def removePunctuationFromTokenized(contentsTokenized):
    """Return the tokens of ``contentsTokenized`` that are not punctuation.

    A token is dropped when it is exactly one of the characters in
    ``string.punctuation`` or one of the multi-character marks ``''``,
    ``--`` and `` `` ``. Order and duplicates of the kept tokens are preserved.
    """
    # string.punctuation only covers single characters, so the
    # multi-character marks have to be listed explicitly.
    punctuationTokens = set(string.punctuation) | {"''", "--", "``"}
    return [token for token in contentsTokenized if token not in punctuationTokens]

In [6]:
def computeSpellingMistakes(filename, indicator):
    """Store the percentage of tokens that the spell checker did not correct.

    The file ``filename`` inside ``cfg["uploadDir"]`` is tokenized with
    ``nltk.word_tokenize`` and stripped of punctuation tokens. A token counts
    as a mistake when it is a key of the ``correction_dict`` returned by
    ``sp.spell_correct`` for the raw text. The score
    ``(1 - mistakes / tokens) * 100`` is written to
    ``files[filename][indicator]`` as the first four characters of its string
    form; a file without tokens gets ``"0"``.
    """
    # Only hold the file open while reading; the `with` block closes it,
    # so no explicit close() is needed.
    with open(os.path.join(cfg["uploadDir"], filename), "r") as f:
        raw_text = f.read()

    text_tokenized = removePunctuationFromTokenized(
        nltk.word_tokenize(raw_text))
    if len(text_tokenized) == 0:
        result = 0
    else:
        correction_dict = sp.spell_correct(raw_text)['correction_dict']
        mistakes = sum(1 for w in text_tokenized if w in correction_dict)
        result = (1 - (mistakes / len(text_tokenized))) * 100
    files[filename][indicator] = str(result)[0:4]

In [7]:
def computeRecognizedByPOS(filename, indicator):
    """Store the percentage of tokens the POS tagger did not tag as ``"X"``.

    The file ``filename`` inside ``cfg["uploadDir"]`` is tokenized with
    ``nltk.word_tokenize``, stripped of punctuation tokens and tagged with
    ``nltk.pos_tag(..., tagset='universal')``. The score
    ``(1 - unknown / tokens) * 100`` is written to
    ``files[filename][indicator]`` as the first four characters of its string
    form; a file without tokens gets ``"0"``.
    """
    # Only hold the file open while reading; the `with` block closes it,
    # so no explicit close() is needed.
    with open(os.path.join(cfg["uploadDir"], filename), "r") as f:
        raw_text = f.read()

    text_tokenized = removePunctuationFromTokenized(
        nltk.word_tokenize(raw_text))
    if len(text_tokenized) == 0:
        result = 0
    else:
        text_tagged = nltk.pos_tag(text_tokenized, tagset='universal')
        unknown = sum(1 for t in text_tagged if t[1] == "X")
        result = (1 - (unknown / len(text_tagged))) * 100
    files[filename][indicator] = str(result)[0:4]

In [8]:
def wordcount(s):
    """Count the words in sentence s.

    The sentence is split at every non-word character (regex ``\\W``) and
    the non-empty pieces are counted.
    """
    return sum(1 for piece in re.split(r"\W", s) if piece)

def computeAvgSentLen(filename, indicator):
    """Store how close the average sentence length is to the configured optimum.

    The file ``filename`` inside ``cfg["uploadDir"]`` is split into sentences
    at ``!``, ``?`` and ``.`` (newlines are treated as spaces). The average
    of ``wordcount`` over the non-empty sentences is capped at twice
    ``cfg["optimal_sentence_length"]`` and turned into
    ``(1 - |optimal - average| / optimal) * 100``. The score is written to
    ``files[filename][indicator]`` as the first four characters of its string
    form. A file with no sentences (empty, or only terminators/whitespace)
    gets ``"0"``, matching the empty-input handling of the other indicators,
    instead of raising ZeroDivisionError.
    """
    # Only hold the file open while reading; the `with` block closes it.
    with open(os.path.join(cfg["uploadDir"], filename), "r") as f:
        raw_text = f.read()

    terminating_punct = "[!?.]"
    one_line_text = raw_text.replace("\n", " ")
    sentences = [
        s.strip()
        for s in re.split(terminating_punct, one_line_text)
        if s.strip()  # skip empty fragments between consecutive terminators
    ]

    if not sentences:
        result = 0
    else:
        # Map each sentence to its word count, then average the counts.
        avgSentenceLength = sum(map(wordcount, sentences)) / len(sentences)
        optimalSentenceLen = cfg["optimal_sentence_length"]
        # Cap at twice the optimum so the score never drops below 0.
        if avgSentenceLength > 2 * optimalSentenceLen:
            avgSentenceLength = 2 * optimalSentenceLen
        result = (1 - abs(optimalSentenceLen - avgSentenceLength) /
                  optimalSentenceLen) * 100
    files[filename][indicator] = str(result)[0:4]

In [9]:
def computePresentInDictionary(filename, indicator):
    """Store the percentage of tokens accepted by the ``en_US`` dictionary.

    The file ``filename`` inside ``cfg["uploadDir"]`` is tokenized with
    ``nltk.word_tokenize`` and stripped of punctuation tokens; each remaining
    token is checked with ``enchant.Dict("en_US").check``. The score
    ``correct / tokens * 100`` is written to ``files[filename][indicator]``
    as the first four characters of its string form; a file without tokens
    gets ``"0"``.
    """
    # Only hold the file open while reading; the `with` block closes it,
    # so no explicit close() is needed.
    with open(os.path.join(cfg["uploadDir"], filename), "r") as f:
        raw_text = f.read()

    text_tokenized = removePunctuationFromTokenized(
        nltk.word_tokenize(raw_text))
    if len(text_tokenized) == 0:
        result = 0
    else:
        d = enchant.Dict("en_US")
        correct = sum(1 for word in text_tokenized if d.check(word))
        result = (correct / len(text_tokenized)) * 100
    files[filename][indicator] = str(result)[0:4]

In [10]:
def computeLexicalDiversity(filename, indicator):
    """Store the percentage of distinct tokens among all tokens of a file.

    The file ``filename`` inside ``cfg["uploadDir"]`` is tokenized with
    ``nltk.word_tokenize`` and stripped of punctuation tokens. The score
    ``distinct / tokens * 100`` (tokens are compared case-sensitively, as
    produced by the tokenizer) is written to ``files[filename][indicator]``
    as the first four characters of its string form; a file without tokens
    gets ``"0"``.
    """
    # Only hold the file open while reading; the `with` block closes it,
    # so no explicit close() is needed.
    with open(os.path.join(cfg["uploadDir"], filename), "r") as f:
        raw_text = f.read()

    text_tokenized = removePunctuationFromTokenized(nltk.word_tokenize(raw_text))
    if len(text_tokenized) == 0:
        result = 0
    else:
        result = (len(set(text_tokenized)) / len(text_tokenized)) * 100
    files[filename][indicator] = str(result)[0:4]

In [11]:
def setJavaIndicators(filename, result):
    """Copy the five Java-computed scores into ``files[filename]``.

    ``result`` is read positionally: entries 0-4 go to "parsable",
    "confidence_tokenizer", "confidence_pos", "confidence_ner" and
    "confidence_chunker", each cut to its first four characters.
    """
    javaIndicatorKeys = (
        "parsable",
        "confidence_tokenizer",
        "confidence_pos",
        "confidence_ner",
        "confidence_chunker",
    )
    for position, key in enumerate(javaIndicatorKeys):
        files[filename][key] = result[position][0:4]

def computeJavaIndicators(filename):
    """Run the Java indicator jar on one corpus file and store its scores.

    The jar receives the absolute path of the file (resolved inside
    ``cfg["uploadDir"]``) and the absolute path of the models directory.
    Its standard output is decoded, split on commas and handed to
    ``setJavaIndicators``.
    """
    jarCommand = [
        'java', '-jar', './java-indicators/java-indicators.jar',
        # The jar is given absolute paths for both of its arguments.
        os.path.abspath(os.path.join(cfg["uploadDir"], filename)),
        os.path.abspath("./java-indicators/models"),
    ]
    rawOutput = check_output(jarCommand)
    setJavaIndicators(filename, rawOutput.decode().split(","))

In [12]:
def computeAcronyms(filename, indicator):
    """Store the percentage of tokens that are not acronyms.

    Acronym candidates are the regex matches in the raw text of the file
    ``filename`` inside ``cfg["uploadDir"]``. A candidate whose lower-cased
    form is accepted by ``enchant.Dict("en_US")`` is treated as an ordinary
    upper-case word and discarded. The score
    ``(1 - acronym_tokens / tokens) * 100`` is written to
    ``files[filename][indicator]`` as the first four characters of its string
    form; a file without tokens gets ``"0"``.
    """
    # Only hold the file open while reading; the `with` block closes it.
    with open(os.path.join(cfg["uploadDir"], filename), "r") as f:
        raw_text = f.read()

    text_tokenized = removePunctuationFromTokenized(
        nltk.word_tokenize(raw_text))
    if len(text_tokenized) == 0:
        result = 0
    else:
        acronym_candidates = re.findall(r"\b(?:[0-9]+[A-Z][A-Z0-9]*)|(?:[A-Z][A-Z0-9]+)\b|\b[A-Z\.]{2,}\b", raw_text)
        # Drop upper-case words that are present in the dictionary. This builds
        # a new set instead of calling list.remove() while iterating, which
        # skipped the element after every removal and left dictionary words in.
        d = enchant.Dict("en_US")
        acronyms = {
            candidate
            for candidate in set(acronym_candidates)
            if not d.check(candidate.lower())
        }
        acronyms_count = sum(1 for word in text_tokenized if word in acronyms)
        result = (1 - (acronyms_count / len(text_tokenized))) * 100
    files[filename][indicator] = str(result)[0:4]

In [13]:
def computeReadabilityCli( filename, indicator):
    """Store a percentage score derived from the Coleman-Liau index of a file.

    The index of the file ``filename`` inside ``cfg["uploadDir"]`` is
    computed with ``textstat.coleman_liau_index`` and capped at ``worstScore``
    (18). It is mapped to ``(1 - |optimal - score| / (worst - optimal)) * 100``
    with ``optimalScore`` 3, so an index of 3 scores 100 and an index of 18
    or more scores 0. The score is written to ``files[filename][indicator]``
    as the first four characters of its string form.
    """
    # Only hold the file open while reading; the `with` block closes it,
    # so no explicit close() is needed.
    with open(os.path.join(cfg["uploadDir"], filename), "r") as f:
        raw_text = f.read()

    optimalScore = 3
    worstScore = 18
    score = textstat.coleman_liau_index(raw_text)
    if score > worstScore:
        score = worstScore

    # NOTE(review): there is no lower cap, so an index below
    # optimalScore - (worstScore - optimalScore) yields a negative result;
    # confirm whether that is intended.
    result = (1 - abs(optimalScore - score) /
              (worstScore - optimalScore)) * 100
    files[filename][indicator] = str(result)[0:4]

In [14]:
def computeReadabilityAri(filename, indicator):
    """Store a percentage score derived from the Automated Readability Index.

    The index of the file ``filename`` inside ``cfg["uploadDir"]`` is
    computed with ``textstat.automated_readability_index`` and capped at
    ``worstScore`` (18). It is mapped to
    ``(1 - |optimal - score| / (worst - optimal)) * 100`` with
    ``optimalScore`` 3, so an index of 3 scores 100 and an index of 18 or
    more scores 0. The score is written to ``files[filename][indicator]`` as
    the first four characters of its string form.
    """
    # Only hold the file open while reading; the `with` block closes it,
    # so no explicit close() is needed.
    with open(os.path.join(cfg["uploadDir"], filename), "r") as f:
        raw_text = f.read()

    optimalScore = 3
    worstScore = 18
    score = textstat.automated_readability_index(raw_text)
    if score > worstScore:
        score = worstScore

    # NOTE(review): there is no lower cap, so an index below
    # optimalScore - (worstScore - optimalScore) yields a negative result;
    # confirm whether that is intended.
    result = (1 - abs(optimalScore - score) /
              (worstScore - optimalScore)) * 100
    files[filename][indicator] = str(result)[0:4]

In [15]:
from IPython.display import clear_output

# Only regular files directly inside the upload directory are analysed.
listOfFileNames = [
    fileName
    for fileName in listdir(cfg["uploadDir"])
    if isfile(join(cfg["uploadDir"], fileName))
]

# Python-side indicators, in the order they are computed for every file.
# "fit", "perc_lowercase" and "perc_uppercase" have no entry here, so they
# keep the template value None.
indicatorComputations = (
    (computeSpellingMistakes, "spelling_mistakes"),
    (computeAvgSentLen, "avg_sentence_len"),
    (computeLexicalDiversity, "lexical_diversity"),
    (computeRecognizedByPOS, "recognized_by_pos"),
    (computeAcronyms, "acronyms"),
    (computePresentInDictionary, "present_in_dictionary"),
    (computeReadabilityCli, "readability_cli"),
    (computeReadabilityAri, "readability_ari"),
)

i = 1
totalFiles = len(listOfFileNames)
for filename in listOfFileNames:
    # Replace the previous progress line instead of appending a new one.
    clear_output(wait=False)
    print(f"Analyzing {filename} | {str(i)} of {totalFiles}")
    i += 1
    # Each file gets its own copy of the template before it is filled in.
    files[filename] = copy.deepcopy(indicatorsTemplate)
    computeJavaIndicators(filename)
    for computeIndicator, indicatorName in indicatorComputations:
        computeIndicator(filename, indicatorName)
    

Analyzing wsj_0199 | 199 of 199


In [16]:
# List the corpus files again and collect, for each one, its indicator values
# in template order (one inner list per file).
listOfFileNames = [
    fileName
    for fileName in listdir(cfg["uploadDir"])
    if isfile(join(cfg["uploadDir"], fileName))
]

indicatorsList = [list(files[name].values()) for name in listOfFileNames]

In [17]:
# One row per file, one column per indicator. The stored scores are strings
# cut to four characters, so a score of 100.0 arrives as "100."; that text is
# rewritten to "100" before every column is converted to float.
ind_df = (
    pd.DataFrame(indicatorsList, columns=list(indicatorsTemplate.keys()))
    .replace({'100.': '100'}, regex=True)
    .astype(float)
)
ind_df

Unnamed: 0,parsable,confidence_tokenizer,confidence_pos,confidence_ner,confidence_chunker,fit,spelling_mistakes,avg_sentence_len,perc_lowercase,perc_uppercase,lexical_diversity,recognized_by_pos,acronyms,present_in_dictionary,readability_cli,readability_ari
0,100.0,99.6,83.9,91.6,97.7,,88.4,28.1,,,92.3,100.0,100.0,80.7,59.8,61.30
1,100.0,99.8,96.6,97.6,95.6,,95.6,56.2,,,95.6,100.0,95.6,91.3,11.2,2.00
2,100.0,99.0,93.0,98.0,94.9,,97.2,88.7,,,51.9,100.0,100.0,95.1,32.8,0.00
3,100.0,99.6,94.5,99.4,97.0,,98.5,74.5,,,56.1,100.0,100.0,98.2,29.4,0.00
4,100.0,97.9,81.4,87.2,95.5,,97.7,22.7,,,75.5,100.0,100.0,86.6,65.0,69.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,50.0,99.1,94.7,82.5,95.2,,100.0,75.0,,,83.3,100.0,100.0,100.0,17.4,0.00
195,100.0,99.1,94.1,86.2,96.3,,100.0,75.0,,,94.4,100.0,100.0,94.4,15.1,16.00
196,100.0,99.2,94.1,90.7,97.2,,97.6,87.5,,,78.5,100.0,100.0,95.2,14.4,5.99
197,100.0,99.6,91.0,95.9,95.3,,99.3,90.9,,,51.8,100.0,100.0,99.6,37.9,0.00


In [18]:
# Rewrite any remaining "100." text to "100" and make sure every column is
# float. This repeats the cleanup done when ind_df was built, so it changes
# nothing if that cell has already run.
ind_df = ind_df.replace({'100.': '100'}, regex=True).astype(float)

In [19]:
# Column-wise mean over all files, shown as a single-row frame. Indicators
# whose mean is NaN are dropped before transposing.
df2 = ind_df.mean(axis=0).to_frame().dropna().T
df2

Unnamed: 0,parsable,confidence_tokenizer,confidence_pos,confidence_ner,confidence_chunker,spelling_mistakes,avg_sentence_len,lexical_diversity,recognized_by_pos,acronyms,present_in_dictionary,readability_cli,readability_ari
0,98.81407,99.032663,92.731658,97.240201,95.789447,97.943216,72.729648,63.233668,99.99397,99.407035,95.517588,33.727035,8.336784


In [20]:
# Name the CSV after the last path component of the corpus directory, e.g.
# "./data/corpora/nltkCorpus" -> "nltkCorpus.csv". The previous
# str.replace() stripped the whole configured path, leaving an empty stem
# and a file called just ".csv".
outputCsvName = os.path.basename(os.path.normpath(cfg["uploadDir"])) + ".csv"
outputPath = os.path.join(cfg["resultDir"], outputCsvName)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(cfg["resultDir"], exist_ok=True)
df2.to_csv(outputPath, index=False)