In [None]:
# Importing libraries

import language_check
import numpy as np
import os
import pandas as pd
import sys, re, os, nltk
import requests
import warnings
from nltk import word_tokenize
from nltk.corpus import words, wordnet
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tree import Tree
from pandas.core.common import SettingWithCopyWarning
from stanfordcorenlp import StanfordCoreNLP
from textstat.textstat import textstat

# Directory handed to the stanfordcorenlp wrapper. Set the CORENLP_HOME
# environment variable to run the notebook on another machine; the original
# author's local path is kept as the fallback so existing setups keep working.
CORENLP_HOME = os.environ.get(
    'CORENLP_HOME',
    r'C:\Users\hp word\anaconda3\Lib\site-packages\stanfordcorenlp',
)
nlp = StanfordCoreNLP(CORENLP_HOME)

# Silence pandas' SettingWithCopyWarning for the rest of the notebook.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Common functions

In [None]:
# Shared helper objects used by the feature-extraction cells below.

# Tokenizer that keeps only runs of word characters (\w+), so punctuation is dropped.
tokenizer = RegexpTokenizer(r'\w+')
output_list = []  # not referenced in the cells visible here
# Set built from the entries of NLTK's `words` corpus.
wordset = set(words.words())
lmtzr = WordNetLemmatizer()
# English stopword list from NLTK's stopwords corpus.
stopwords = nltk.corpus.stopwords.words('english')
# LanguageTool checker for US English; the error-feature cell calls tool.check() on each sentence.
tool = language_check.LanguageTool('en-US')

def restrict2TwoDecimals(D):
    """Return ``D`` rounded to two decimal places with the built-in ``round``."""
    rounded = round(D, 2)
    return rounded

def handleDivByZero(n_1, n_2):
    """Divide ``n_1`` by ``n_2`` and round to two decimals, with two fallbacks.

    Returns 0.0 when ``n_2`` equals zero, and also when the quotient is
    greater than 10000. Otherwise the quotient is returned after being passed
    through ``restrict2TwoDecimals``.
    """
    if n_2 == 0.0:
        return 0.0

    quotient = n_1 / n_2
    if quotient > 10000:
        return 0.0
    return restrict2TwoDecimals(quotient)

# Reading output of previous txt to csv converter

In [None]:
# Load the essays written out by the earlier txt-to-csv conversion step.
# NOTE(review): 'ANSI' is not a portable codec name -- Python appears to resolve it
# only on Windows (as an alias of 'mbcs'); confirm before running elsewhere.
testing_data = pd.read_csv('test_data.csv' ,encoding='ANSI')
# test_text is a second name for the same DataFrame object (no copy is made),
# so every feature column added below also appears on testing_data.
test_text = testing_data

# Readability Features

In [None]:
# Readability scores come straight from the textstat library.
# Each entry pairs an output column with the textstat function that fills it.
readability_measures = [
    ('fre', textstat.flesch_reading_ease),
    ('fkg', textstat.flesch_kincaid_grade),
    ('cli', textstat.coleman_liau_index),
    ('ari', textstat.automated_readability_index),
    ('dcrs', textstat.dale_chall_readability_score),
    ('dw', textstat.difficult_words),
    ('lwf', textstat.linsear_write_formula),
    ('gf', textstat.gunning_fog),
]

for index, row in test_text.iterrows():
    essay = row['text']
    # Compute every score before writing any, so a row is only modified
    # once all measures have succeeded for it.
    scores = [(column, measure(essay)) for column, measure in readability_measures]
    for column, score in scores:
        test_text.at[index, column] = score
test_text

Unnamed: 0,doc_id,text,native_language,overall_score,fre,fkg,cli,ari,dcrs,dw,lwf,gf
0,doc2797.txt,"Dear Mrs Smith,\r\nI am sad to read about Rich...",French,30,59.87,11.9,8.71,14.1,6.95,48.0,11.000000,13.98
1,doc2798.txt,Dear Mrs Smith\r\nI am writing to reply your ...,Chinese,27,57.61,10.7,9.05,11.8,7.01,73.0,8.166667,11.72
2,doc2799.txt,"Dear Maria\r\nFirst at all, I am sorry about R...",Catalan,34,63.73,10.4,9.35,12.8,6.95,52.0,7.285714,12.29
3,doc2829.txt,Dear Mrs Smith\r\nAs you know I am in charge f...,Spanish,29,69.65,10.2,6.74,11.6,6.39,29.0,6.000000,12.46
4,doc2868.txt,"Dear Mrs Maria Smith,\r\nas the leader of the ...",Greek,29,83.96,4.7,7.29,6.2,5.59,32.0,4.500000,6.86
...,...,...,...,...,...,...,...,...,...,...,...,...
92,doc3285.txt,Dear Mrs Maria Smith.\r\nI was really pleased ...,Korean,20,57.40,10.8,9.40,12.1,6.73,42.0,5.222222,11.16
93,doc3288.txt,"Dear Mrs. Smith,\r\nThank you for your letter....",Thai,25,71.75,7.3,9.68,9.7,6.54,37.0,5.888889,9.28
94,doc3291.txt,Dear Mrs Maria Smith\r\nThank you for your let...,German,23,72.29,9.2,7.90,11.5,6.29,49.0,4.777778,11.26
95,doc3298.txt,Dear Mrs Maria Smith\r\nthankyou for your lett...,Italian,22,50.88,17.4,8.78,22.1,7.37,34.0,8.666667,20.22


# Word Level Features

In [None]:
def _countMTLDFactors(tokens, ttrThreshold):
    """Count MTLD factors in a single pass over ``tokens``.

    Tokens are compared case-insensitively. The running type-token ratio (TTR)
    of the current segment is updated after every token; each time it drops
    below ``ttrThreshold`` one full factor is counted and the segment restarts.
    If the pass ends inside a segment whose TTR is still above the threshold,
    a partial factor ``(1 - ttr) / (1 - ttrThreshold)`` is added.

    NOTE(review): a trailing segment whose TTR equals the threshold exactly
    contributes nothing; this is kept as-is to leave existing scores unchanged.
    """
    segmentTypes = set()   # set membership is O(1); a list made this quadratic
    segmentLength = 0
    factors = 0
    ttr = 1
    for token in tokens:
        segmentTypes.add(token.lower())
        segmentLength += 1
        ttr = len(segmentTypes) / segmentLength
        if ttr < ttrThreshold:
            # Segment complete: count one factor and start a fresh segment.
            factors += 1
            segmentTypes.clear()
            segmentLength = 0

    # segmentLength > 0 means the last token did not close a segment.
    if segmentLength > 0 and ttr > ttrThreshold:
        factors += (1 - ttr) / (1 - ttrThreshold)
    return factors


def getMTLD(tokens, ttrThreshold=0.72):
    """Return the Measure of Textual Lexical Diversity (MTLD) of ``tokens``.

    The score is the mean of ``len(tokens) / factors`` computed once reading
    the tokens front-to-back and once back-to-front.

    Parameters
    ----------
    tokens : sequence of str
        Word tokens of one text (must support ``len`` and ``reversed``).
    ttrThreshold : float, optional
        TTR value below which a factor is counted. Defaults to 0.72.

    Returns
    -------
    float
        The MTLD score, or 0.0 when either pass yields zero factors (empty
        input, or a text in which no token repeats). Dividing by a zero factor
        count used to raise ``ZeroDivisionError`` before the infinite-score
        fallback could return 0.0.
    """
    forwardFactors = _countMTLDFactors(tokens, ttrThreshold)
    backwardFactors = _countMTLDFactors(reversed(tokens), ttrThreshold)

    if forwardFactors == 0 or backwardFactors == 0:
        return 0.0

    numTokens = len(tokens)
    forwardScore = numTokens / forwardFactors
    backwardScore = numTokens / backwardFactors
    return float((forwardScore + backwardScore) / 2)

In [None]:
# Lexical-diversity features computed on lower-cased word tokens.
for index, row in test_text.iterrows():
    # The \w+ tokenizer already discards punctuation. The former
    # str.replace('[^\w\s]', '') call did a literal (non-regex) replacement,
    # so it never changed the text and has been removed.
    tokens_low_np = tokenizer.tokenize(row['text'].lower())
    num_types = len(set(tokens_low_np))
    num_tokens = len(tokens_low_np)

    test_text.at[index, 'Word_numWords'] = num_tokens

    if num_tokens == 0:
        # No word tokens: every ratio is undefined (num_tokens/num_types would
        # raise ZeroDivisionError and log(0) is -inf), so report 0.0 throughout.
        for column in ('Word_TTR', 'Word_CTTR', 'Word_RTTR',
                       'Word_BilogTTR', 'Word_UberIndex', 'Word_MTLD'):
            test_text.at[index, column] = 0.0
        continue

    # Guard against a ZeroDivisionError from getMTLD (e.g. when no token repeats).
    try:
        mtld = getMTLD(tokens_low_np)
    except ZeroDivisionError:
        mtld = 0.0

    test_text.at[index, 'Word_TTR'] = handleDivByZero(num_types, num_tokens)
    test_text.at[index, 'Word_CTTR'] = handleDivByZero(num_types, np.sqrt(2.0*num_tokens))
    test_text.at[index, 'Word_RTTR'] = handleDivByZero(num_types, np.sqrt(num_tokens))
    test_text.at[index, 'Word_BilogTTR'] = handleDivByZero(np.log(num_types), np.log(num_tokens))
    # When every token is unique log(num_tokens/num_types) is 0 and handleDivByZero returns 0.0.
    test_text.at[index, 'Word_UberIndex'] = handleDivByZero(np.log(num_tokens)**2, np.log(num_tokens/num_types))
    test_text.at[index, 'Word_MTLD'] = restrict2TwoDecimals(mtld)

test_text

Unnamed: 0,doc_id,text,native_language,overall_score,fre,fkg,cli,ari,dcrs,dw,lwf,gf,Word_numWords,Word_TTR,Word_CTTR,Word_RTTR,Word_BilogTTR,Word_UberIndex,Word_MTLD
0,doc2797.txt,"Dear Mrs Smith,\r\nI am sad to read about Rich...",French,30,59.87,11.9,8.71,14.1,6.95,48.0,11.000000,13.98,402.0,0.55,7.86,11.12,0.90,61.02,87.44
1,doc2798.txt,Dear Mrs Smith\r\nI am writing to reply your ...,Chinese,27,57.61,10.7,9.05,11.8,7.01,73.0,8.166667,11.72,528.0,0.51,8.22,11.62,0.89,57.64,98.65
2,doc2799.txt,"Dear Maria\r\nFirst at all, I am sorry about R...",Catalan,34,63.73,10.4,9.35,12.8,6.95,52.0,7.285714,12.29,395.0,0.57,8.01,11.32,0.91,63.52,125.94
3,doc2829.txt,Dear Mrs Smith\r\nAs you know I am in charge f...,Spanish,29,69.65,10.2,6.74,11.6,6.39,29.0,6.000000,12.46,322.0,0.50,6.30,8.92,0.88,47.68,53.95
4,doc2868.txt,"Dear Mrs Maria Smith,\r\nas the leader of the ...",Greek,29,83.96,4.7,7.29,6.2,5.59,32.0,4.500000,6.86,393.0,0.48,6.74,9.53,0.88,48.75,78.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,doc3285.txt,Dear Mrs Maria Smith.\r\nI was really pleased ...,Korean,20,57.40,10.8,9.40,12.1,6.73,42.0,5.222222,11.16,346.0,0.55,7.18,10.16,0.90,56.53,106.84
93,doc3288.txt,"Dear Mrs. Smith,\r\nThank you for your letter....",Thai,25,71.75,7.3,9.68,9.7,6.54,37.0,5.888889,9.28,285.0,0.55,6.58,9.30,0.89,53.59,64.58
94,doc3291.txt,Dear Mrs Maria Smith\r\nThank you for your let...,German,23,72.29,9.2,7.90,11.5,6.29,49.0,4.777778,11.26,534.0,0.49,7.93,11.21,0.88,54.51,83.82
95,doc3298.txt,Dear Mrs Maria Smith\r\nthankyou for your lett...,Italian,22,50.88,17.4,8.78,22.1,7.37,34.0,8.666667,20.22,365.0,0.47,6.40,9.06,0.87,46.62,77.30


# Error features

In [None]:
# Used language_check library implementation for error detection
# Each sentence is checked separately and every reported match is counted as a
# spelling, duplication or "other" error according to its issue type.
for index, row in test_text.iterrows():
    sentences = sent_tokenize(row['text'])
    num_sents = len(sentences)
    spelling_mistake = 0
    duplicate_mistake = 0
    other_mistake = 0

    for sentence in sentences:
        for match in tool.check(sentence):
            issue_type = match.locqualityissuetype
            if issue_type == 'misspelling':
                spelling_mistake += 1
            elif issue_type == 'duplication':
                duplicate_mistake += 1
            else:
                other_mistake += 1

    all_mistakes = spelling_mistake + duplicate_mistake + other_mistake

    # handleDivByZero yields 0.0 instead of raising ZeroDivisionError when an
    # essay has no sentences or, for the last feature, no detected errors.
    test_text.at[index, 'SpellingErrorsPerSen'] = handleDivByZero(spelling_mistake, num_sents)
    test_text.at[index, 'DuplicateErrorsPerSen'] = handleDivByZero(duplicate_mistake, num_sents)
    test_text.at[index, 'OtherErrorsPerSen'] = handleDivByZero(other_mistake, num_sents)
    test_text.at[index, 'AllErrorsPerSen'] = handleDivByZero(all_mistakes, num_sents)
    test_text.at[index, 'SpellingErrorsWrtAllErrors'] = handleDivByZero(spelling_mistake, all_mistakes)
test_text

Unnamed: 0,doc_id,text,native_language,overall_score,fre,fkg,cli,ari,dcrs,dw,...,Word_CTTR,Word_RTTR,Word_BilogTTR,Word_UberIndex,Word_MTLD,SpellingErrorsPerSen,DuplicateErrorsPerSen,OtherErrorsPerSen,AllErrorsPerSen,SpellingErrorsWrtAllErrors
0,doc2797.txt,"Dear Mrs Smith,\r\nI am sad to read about Rich...",French,30,59.87,11.9,8.71,14.1,6.95,48.0,...,7.86,11.12,0.90,61.02,87.44,0.15,0.0,0.27,0.42,0.36
1,doc2798.txt,Dear Mrs Smith\r\nI am writing to reply your ...,Chinese,27,57.61,10.7,9.05,11.8,7.01,73.0,...,8.22,11.62,0.89,57.64,98.65,0.28,0.0,0.53,0.81,0.35
2,doc2799.txt,"Dear Maria\r\nFirst at all, I am sorry about R...",Catalan,34,63.73,10.4,9.35,12.8,6.95,52.0,...,8.01,11.32,0.91,63.52,125.94,0.52,0.0,0.61,1.13,0.46
3,doc2829.txt,Dear Mrs Smith\r\nAs you know I am in charge f...,Spanish,29,69.65,10.2,6.74,11.6,6.39,29.0,...,6.30,8.92,0.88,47.68,53.95,0.84,0.0,0.32,1.16,0.73
4,doc2868.txt,"Dear Mrs Maria Smith,\r\nas the leader of the ...",Greek,29,83.96,4.7,7.29,6.2,5.59,32.0,...,6.74,9.53,0.88,48.75,78.23,0.11,0.0,0.19,0.31,0.36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,doc3285.txt,Dear Mrs Maria Smith.\r\nI was really pleased ...,Korean,20,57.40,10.8,9.40,12.1,6.73,42.0,...,7.18,10.16,0.90,56.53,106.84,0.39,0.0,0.83,1.22,0.32
93,doc3288.txt,"Dear Mrs. Smith,\r\nThank you for your letter....",Thai,25,71.75,7.3,9.68,9.7,6.54,37.0,...,6.58,9.30,0.89,53.59,64.58,0.44,0.0,0.30,0.74,0.60
94,doc3291.txt,Dear Mrs Maria Smith\r\nThank you for your let...,German,23,72.29,9.2,7.90,11.5,6.29,49.0,...,7.93,11.21,0.88,54.51,83.82,0.27,0.0,0.49,0.76,0.35
95,doc3298.txt,Dear Mrs Maria Smith\r\nthankyou for your lett...,Italian,22,50.88,17.4,8.78,22.1,7.37,34.0,...,6.40,9.06,0.87,46.62,77.30,1.05,0.0,0.81,1.86,0.56


# Parts Of Speech (POS) Features

In [None]:
# Used NLTK library implementation of POS features
# Run 'nltk.help.upenn_tagset()' to get list of The Penn Treebank's POS tags

# Penn Treebank tags that make up each multi-tag category counted below.
PRONOUN_TAGS = ("PRP", "PRP$", "WP", "WP$")
VERB_TAGS = ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")
ADJECTIVE_TAGS = ("JJ", "JJR", "JJS")
ADVERB_TAGS = ("RB", "RBR", "RBS", "RP")
NOUN_TAGS = ("NN", "NNS")
PROPER_NOUN_TAGS = ("NNP", "NNPS")


def _countTags(tagCounts, tags):
    """Sum the occurrences recorded in ``tagCounts`` for every tag in ``tags``."""
    return sum(tagCounts.get(tag, 0) for tag in tags)


for index, row in test_text.iterrows():
    tagged = nltk.pos_tag(word_tokenize(row['text']))

    # One pass over the tagged tokens: tally each tag and collect distinct verb forms.
    tagCounts = {}
    uniqueVerbs = set()
    for word, tag in tagged:
        tagCounts[tag] = tagCounts.get(tag, 0) + 1
        if tag in VERB_TAGS:
            uniqueVerbs.add(word)

    numPronouns = _countTags(tagCounts, PRONOUN_TAGS)
    perpronouns = tagCounts.get("PRP", 0)
    whperpronouns = tagCounts.get("WP", 0)
    numVerbs = _countTags(tagCounts, VERB_TAGS)
    numAdj = _countTags(tagCounts, ADJECTIVE_TAGS)
    numAdverbs = _countTags(tagCounts, ADVERB_TAGS)
    numNouns = _countTags(tagCounts, NOUN_TAGS)
    numProperNouns = _countTags(tagCounts, PROPER_NOUN_TAGS)
    numPrepositions = tagCounts.get("IN", 0)
    numInterjections = tagCounts.get("UH", 0)
    numConjunct = tagCounts.get("CC", 0)
    numModals = tagCounts.get("MD", 0)
    numDeterminers = tagCounts.get("DT", 0)
    numauxverbs = numModals  # only modals (MD) are counted as auxiliaries

    numFunctionWords = (numPronouns + numAdverbs + numPrepositions + numInterjections
                        + numConjunct + numModals + numDeterminers)
    numLexicals = numAdj + numNouns + numVerbs + numAdverbs + numProperNouns
    # Tokens whose tag falls in any counted category (punctuation etc. excluded).
    TotalWords = (numPronouns + numVerbs + numAdj + numAdverbs + numPrepositions
                  + numInterjections + numConjunct + numNouns + numProperNouns
                  + numModals + numDeterminers)
    # NOTE(review): modals (MD) are not part of numVerbs, yet they are subtracted
    # here -- kept from the original computation; confirm this is intended.
    numVerbsOnly = numVerbs - numauxverbs
    numUniqueVerbs = len(uniqueVerbs)

    # Shares of all counted words. handleDivByZero returns 0.0 instead of
    # raising when an essay contains no counted word at all.
    perWordCounts = [
        # Nouns include proper nouns; the sum must be taken before dividing
        # (previously only numProperNouns was divided by TotalWords).
        ('POS_numNouns', numNouns + numProperNouns),
        ('POS_numProperNouns', numProperNouns),
        ('POS_numPronouns', numPronouns),
        ('POS_numConjunct', numConjunct),
        ('POS_numAdjectives', numAdj),
        ('POS_numVerbs', numVerbs),
        ('POS_numAdverbs', numAdverbs),
        ('POS_numModals', numModals),
        ('POS_numPrepositions', numPrepositions),
        ('POS_numInterjections', numInterjections),
        ('POS_numPerPronouns', perpronouns),
        ('POS_numWhPronouns', whperpronouns),
        ('POS_numLexicals', numLexicals),
        ('POS_numFunctionWords', numFunctionWords),
        ('POS_numDeterminers', numDeterminers),
        ('POS_numVerbsVB', tagCounts.get("VB", 0)),
        ('POS_numVerbsVBD', tagCounts.get("VBD", 0)),
        ('POS_numVerbsVBG', tagCounts.get("VBG", 0)),
        ('POS_numVerbsVBN', tagCounts.get("VBN", 0)),
        ('POS_numVerbsVBP', tagCounts.get("VBP", 0)),
        ('POS_numVerbsVBZ', tagCounts.get("VBZ", 0)),
    ]
    for column, count in perWordCounts:
        test_text.at[index, column] = handleDivByZero(count, TotalWords)

    # Variation measures relative to the number of lexical words / distinct verbs.
    test_text.at[index, 'POS_advVar'] = handleDivByZero(numAdverbs, numLexicals)
    test_text.at[index, 'POS_adjVar'] = handleDivByZero(numAdj, numLexicals)
    test_text.at[index, 'POS_modVar'] = handleDivByZero(numAdj + numAdverbs, numLexicals)
    test_text.at[index, 'POS_nounVar'] = handleDivByZero(numNouns + numProperNouns, numLexicals)
    test_text.at[index, 'POS_verbVar1'] = handleDivByZero(numVerbsOnly, numUniqueVerbs)
    test_text.at[index, 'POS_verbVar2'] = handleDivByZero(numVerbsOnly, numLexicals)
    # Guarded inline rather than via handleDivByZero: that helper zeroes any
    # ratio above 10000, which a squared count could legitimately exceed.
    if numUniqueVerbs:
        squaredVerbVar1 = restrict2TwoDecimals((numVerbsOnly*numVerbsOnly)/numUniqueVerbs)
    else:
        squaredVerbVar1 = 0.0
    test_text.at[index, 'POS_squaredVerbVar1'] = squaredVerbVar1
    test_text.at[index, 'POS_correctedVV1'] = handleDivByZero(numVerbsOnly, np.sqrt(2.0*numUniqueVerbs))

test_text

Unnamed: 0,doc_id,text,native_language,overall_score,fre,fkg,cli,ari,dcrs,dw,...,POS_numVerbsVBP,POS_numVerbsVBZ,POS_advVar,POS_adjVar,POS_modVar,POS_nounVar,POS_verbVar1,POS_verbVar2,POS_squaredVerbVar1,POS_correctedVV1
0,doc2797.txt,"Dear Mrs Smith,\r\nI am sad to read about Rich...",French,30,59.87,11.9,8.71,14.1,6.95,48.0,...,0.02,0.03,0.13,0.11,0.25,0.44,1.04,0.24,55.08,5.25
1,doc2798.txt,Dear Mrs Smith\r\nI am writing to reply your ...,Chinese,27,57.61,10.7,9.05,11.8,7.01,73.0,...,0.03,0.06,0.15,0.08,0.23,0.43,1.44,0.30,126.95,7.97
2,doc2799.txt,"Dear Maria\r\nFirst at all, I am sorry about R...",Catalan,34,63.73,10.4,9.35,12.8,6.95,52.0,...,0.04,0.03,0.12,0.13,0.25,0.39,1.05,0.29,62.16,5.57
3,doc2829.txt,Dear Mrs Smith\r\nAs you know I am in charge f...,Spanish,29,69.65,10.2,6.74,11.6,6.39,29.0,...,0.02,0.03,0.06,0.12,0.18,0.46,1.48,0.34,87.03,6.60
4,doc2868.txt,"Dear Mrs Maria Smith,\r\nas the leader of the ...",Greek,29,83.96,4.7,7.29,6.2,5.59,32.0,...,0.07,0.06,0.13,0.11,0.24,0.41,1.73,0.31,119.03,7.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,doc3285.txt,Dear Mrs Maria Smith.\r\nI was really pleased ...,Korean,20,57.40,10.8,9.40,12.1,6.73,42.0,...,0.02,0.05,0.18,0.17,0.34,0.37,1.15,0.22,51.92,5.10
93,doc3288.txt,"Dear Mrs. Smith,\r\nThank you for your letter....",Thai,25,71.75,7.3,9.68,9.7,6.54,37.0,...,0.03,0.03,0.12,0.07,0.19,0.55,1.13,0.21,39.52,4.45
94,doc3291.txt,Dear Mrs Maria Smith\r\nThank you for your let...,German,23,72.29,9.2,7.90,11.5,6.29,49.0,...,0.04,0.04,0.14,0.11,0.25,0.41,1.47,0.30,128.29,8.01
95,doc3298.txt,Dear Mrs Maria Smith\r\nthankyou for your lett...,Italian,22,50.88,17.4,8.78,22.1,7.37,34.0,...,0.05,0.05,0.10,0.16,0.26,0.40,1.65,0.31,100.57,7.09


# Syntactic Features

In [None]:
# Constituency-parse features: every sentence is parsed with Stanford CoreNLP
# and phrase statistics are accumulated over all subtrees of the parse.
WH_PHRASE_LABELS = ("WHNP", "WHPP", "WHADVP", "WHADJP")

for index, row in test_text.iterrows():
    sentences = sent_tokenize(row['text'])
    numSentences = len(sentences)

    totalParseTreeHeight = 0
    numWords = 0
    numSubtrees = 0
    # For NP / VP / PP: number of such phrases and summed number of direct children.
    phraseCounts = {"NP": 0, "VP": 0, "PP": 0}
    phraseSizes = {"NP": 0, "VP": 0, "PP": 0}
    numWhPhrases = 0
    reducedRelClauses = 0
    numConjPhrases = 0

    for sentence in sentences:
        try:
            tree = Tree.fromstring(nlp.parse(sentence))
        except Exception:
            # Best effort: a sentence that cannot be parsed contributes nothing,
            # but it is still included in numSentences below. Catching Exception
            # (not a bare except) lets KeyboardInterrupt stop a long run.
            continue

        totalParseTreeHeight += tree.height()
        numWords += len(tree.leaves())
        for st in tree.subtrees():
            numSubtrees += 1
            label = st.label()
            if label in phraseCounts:
                phraseCounts[label] += 1
                phraseSizes[label] += len(st)
            elif label in WH_PHRASE_LABELS:
                numWhPhrases += 1
            elif label == "RRC":
                reducedRelClauses += 1
            elif label == "CONJP":
                numConjPhrases += 1

    # handleDivByZero returns 0.0 instead of raising when an essay has no sentences.
    test_text.at[index, 'SYN_numSentences'] = numSentences
    test_text.at[index, 'SYN_avgSentenceLength'] = handleDivByZero(numWords, numSentences)
    test_text.at[index, 'SYN_avgParseTreeHeightPerSen'] = handleDivByZero(totalParseTreeHeight, numSentences)
    test_text.at[index, 'SYN_numSubtreesPerSen'] = handleDivByZero(numSubtrees, numSentences)
    test_text.at[index, 'SYN_numNPsPerSen'] = handleDivByZero(phraseCounts["NP"], numSentences)
    test_text.at[index, 'SYN_numVPsPerSen'] = handleDivByZero(phraseCounts["VP"], numSentences)
    test_text.at[index, 'SYN_numPPsPerSen'] = handleDivByZero(phraseCounts["PP"], numSentences)
    test_text.at[index, 'SYN_numNPSize'] = handleDivByZero(phraseSizes["NP"], phraseCounts["NP"])
    test_text.at[index, 'SYN_numVPSize'] = handleDivByZero(phraseSizes["VP"], phraseCounts["VP"])
    test_text.at[index, 'SYN_numPPSize'] = handleDivByZero(phraseSizes["PP"], phraseCounts["PP"])
    test_text.at[index, 'SYN_numWHPsPerSen'] = handleDivByZero(numWhPhrases, numSentences)
    test_text.at[index, 'SYN_numRRCsPerSen'] = handleDivByZero(reducedRelClauses, numSentences)
    test_text.at[index, 'SYN_numConjPPerSen'] = handleDivByZero(numConjPhrases, numSentences)

test_text

Unnamed: 0,doc_id,text,native_language,overall_score,fre,fkg,cli,ari,dcrs,dw,...,SYN_numConstituentsPerSen,SYN_numNPsPerSen,SYN_numVPsPerSen,SYN_numPPsPerSen,SYN_numNPSize,SYN_numVPSize,SYN_numPPSize,SYN_numWHPsPerSen,SYN_numRRCsPerSen,SYN_numConjPPerSen
0,doc2797.txt,"Dear Mrs Smith,\r\nI am sad to read about Rich...",French,30,59.87,11.9,8.71,14.1,6.95,48.0,...,0.0,5.04,3.46,2.04,1.87,2.31,2.04,0.15,0.0,0.00
1,doc2798.txt,Dear Mrs Smith\r\nI am writing to reply your ...,Chinese,27,57.61,10.7,9.05,11.8,7.01,73.0,...,0.0,5.66,3.84,2.03,1.83,2.39,2.05,0.44,0.0,0.00
2,doc2799.txt,"Dear Maria\r\nFirst at all, I am sorry about R...",Catalan,34,63.73,10.4,9.35,12.8,6.95,52.0,...,0.0,5.48,4.26,1.26,1.94,2.26,2.03,0.30,0.0,0.04
3,doc2829.txt,Dear Mrs Smith\r\nAs you know I am in charge f...,Spanish,29,69.65,10.2,6.74,11.6,6.39,29.0,...,0.0,6.37,4.32,1.95,1.91,2.17,1.97,0.26,0.0,0.00
4,doc2868.txt,"Dear Mrs Maria Smith,\r\nas the leader of the ...",Greek,29,83.96,4.7,7.29,6.2,5.59,32.0,...,0.0,3.58,2.44,0.83,1.80,2.20,2.07,0.14,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,doc3285.txt,Dear Mrs Maria Smith.\r\nI was really pleased ...,Korean,20,57.40,10.8,9.40,12.1,6.73,42.0,...,0.0,4.87,3.57,1.61,1.81,2.18,2.08,0.26,0.0,0.09
93,doc3288.txt,"Dear Mrs. Smith,\r\nThank you for your letter....",Thai,25,71.75,7.3,9.68,9.7,6.54,37.0,...,0.0,3.81,2.19,0.78,2.03,2.19,2.00,0.11,0.0,0.00
94,doc3291.txt,Dear Mrs Maria Smith\r\nThank you for your let...,German,23,72.29,9.2,7.90,11.5,6.29,49.0,...,0.0,4.46,3.27,1.10,1.84,2.28,2.07,0.24,0.0,0.02
95,doc3298.txt,Dear Mrs Maria Smith\r\nthankyou for your lett...,Italian,22,50.88,17.4,8.78,22.1,7.37,34.0,...,0.0,6.62,3.95,1.86,1.82,2.20,2.08,0.19,0.0,0.00


# Discourse Features

### Referring Expressions

In [None]:
# Referring-expression features: pronoun, definite-article and proper-noun
# densities per noun, per sentence and per word.
for index, row in test_text.iterrows():
    essay = row['text']
    numSentences = len(sent_tokenize(essay))
    tagged = nltk.pos_tag(word_tokenize(essay))

    numWords = 0
    numPersonalPronouns = 0
    numPossessivePronouns = 0
    numDefiniteArticles = 0
    numProperNouns = 0
    numNouns = 0
    for word, tag in tagged:
        # Only tokens that start with a letter count as words.
        if not word[0].isalpha():
            continue
        numWords += 1
        if tag == 'DT' and word.lower() == 'the':
            numDefiniteArticles += 1
        elif tag == 'PRP':
            numPersonalPronouns += 1
        elif tag == 'PRP$':
            numPossessivePronouns += 1
        elif tag.startswith('NN'):
            # Proper nouns (NNP, NNPS) are counted as nouns as well.
            numNouns += 1
            if tag.startswith('NNP'):
                numProperNouns += 1

    numPronouns = numPersonalPronouns + numPossessivePronouns

    # handleDivByZero returns 0.0 instead of raising ZeroDivisionError when an
    # essay has no nouns, no words or no sentences.
    test_text.at[index, 'DISC_RefExprPronounsPerNoun'] = handleDivByZero(numPronouns, numNouns)
    test_text.at[index, 'DISC_RefExprPronounsPerSen'] = handleDivByZero(numPronouns, numSentences)
    test_text.at[index, 'DISC_RefExprPronounsPerWord'] = handleDivByZero(numPronouns, numWords)
    test_text.at[index, 'DISC_RefExprPerPronounsPerSen'] = handleDivByZero(numPersonalPronouns, numSentences)
    test_text.at[index, 'DISC_RefExprPerProPerWord'] = handleDivByZero(numPersonalPronouns, numWords)
    test_text.at[index, 'DISC_RefExprPossProPerSen'] = handleDivByZero(numPossessivePronouns, numSentences)
    test_text.at[index, 'DISC_RefExprPossProPerWord'] = handleDivByZero(numPossessivePronouns, numWords)
    test_text.at[index, 'DISC_RefExprDefArtPerSen'] = handleDivByZero(numDefiniteArticles, numSentences)
    test_text.at[index, 'DISC_RefExprDefArtPerWord'] = handleDivByZero(numDefiniteArticles, numWords)
    test_text.at[index, 'DISC_RefExprProperNounsPerNoun'] = handleDivByZero(numProperNouns, numNouns)

test_text

Unnamed: 0,doc_id,text,native_language,overall_score,fre,fkg,cli,ari,dcrs,dw,...,DISC_RefExprPronounsPerNoun,DISC_RefExprPronounsPerSen,DISC_RefExprPronounsPerWord,DISC_RefExprPerPronounsPerSen,DISC_RefExprPerProPerWord,DISC_RefExprPossProPerSen,DISC_RefExprPossProPerWord,DISC_RefExprDefArtPerSen,DISC_RefExprDefArtPerWord,DISC_RefExprProperNounsPerNoun
0,doc2797.txt,"Dear Mrs Smith,\r\nI am sad to read about Rich...",French,30,59.87,11.9,8.71,14.1,6.95,48.0,...,0.31,1.15,0.08,0.88,0.06,0.27,0.02,1.08,0.07,0.10
1,doc2798.txt,Dear Mrs Smith\r\nI am writing to reply your ...,Chinese,27,57.61,10.7,9.05,11.8,7.01,73.0,...,0.27,1.06,0.07,0.91,0.06,0.16,0.01,1.16,0.07,0.13
2,doc2799.txt,"Dear Maria\r\nFirst at all, I am sorry about R...",Catalan,34,63.73,10.4,9.35,12.8,6.95,52.0,...,0.62,2.17,0.13,1.83,0.11,0.35,0.02,0.87,0.05,0.28
3,doc2829.txt,Dear Mrs Smith\r\nAs you know I am in charge f...,Spanish,29,69.65,10.2,6.74,11.6,6.39,29.0,...,0.30,1.26,0.07,1.05,0.06,0.21,0.01,1.68,0.10,0.31
4,doc2868.txt,"Dear Mrs Maria Smith,\r\nas the leader of the ...",Greek,29,83.96,4.7,7.29,6.2,5.59,32.0,...,0.50,1.28,0.12,0.86,0.08,0.42,0.04,0.47,0.04,0.14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,doc3285.txt,Dear Mrs Maria Smith.\r\nI was really pleased ...,Korean,20,57.40,10.8,9.40,12.1,6.73,42.0,...,0.43,1.43,0.10,1.04,0.07,0.39,0.03,0.61,0.04,0.14
93,doc3288.txt,"Dear Mrs. Smith,\r\nThank you for your letter....",Thai,25,71.75,7.3,9.68,9.7,6.54,37.0,...,0.31,1.04,0.10,0.78,0.08,0.26,0.03,0.70,0.07,0.28
94,doc3291.txt,Dear Mrs Maria Smith\r\nThank you for your let...,German,23,72.29,9.2,7.90,11.5,6.29,49.0,...,0.42,1.20,0.09,1.00,0.08,0.20,0.02,0.56,0.04,0.09
95,doc3298.txt,Dear Mrs Maria Smith\r\nthankyou for your lett...,Italian,22,50.88,17.4,8.78,22.1,7.37,34.0,...,0.60,2.24,0.13,1.81,0.10,0.43,0.02,1.19,0.07,0.13


### Content Overlap features

In [None]:
def getGeneralTag(specificTag):
    """Map a fine-grained POS tag to a coarse category name.

    Tags starting with VB / JJ / RB / NN map to "VERB" / "ADJECTIVE" /
    "ADVERB" / "NOUN"; tags starting with PRP or WP map to "PRONOUN"; the
    exact tag "WRB" maps to "ADVERB". Anything else yields "NOTAG".
    """
    if specificTag == "WRB":
        return "ADVERB"

    prefixCategories = (
        (("VB",), "VERB"),
        (("JJ",), "ADJECTIVE"),
        (("RB",), "ADVERB"),
        (("PRP", "WP"), "PRONOUN"),
        (("NN",), "NOUN"),
    )
    for prefixes, category in prefixCategories:
        if specificTag.startswith(prefixes):
            return category
    return "NOTAG"

def lemma_function(word, tag):
    """Return the lemma of ``word`` as produced by NLTK's ``WordNetLemmatizer``.

    ``tag`` is accepted but not used: ``lemmatize`` is called with the word
    only. A fresh lemmatizer instance is created on every call.
    """
    return WordNetLemmatizer().lemmatize(word)
    
def returnFormattedSentence(sentences):
    """Turn raw sentences into per-token property lists.

    Each sentence is tokenized and POS-tagged with NLTK. Every token becomes a
    four-item list ``[lower-cased word, lemma, POS tag, general tag]`` and each
    sentence becomes the list of its token lists, in input order.
    """
    formatted = []
    for sentence in sentences:
        tagged = nltk.pos_tag(word_tokenize(sentence))
        formatted.append([
            [word.lower(), lemma_function(word, tag), tag, getGeneralTag(tag)]
            for word, tag in tagged
        ])
    return formatted

def isThereNounOverlap(sent_1, sent_2):
    """Return True if some NOUN entry of ``sent_1`` also occurs in ``sent_2``.

    Entries are the four-item token lists ``[word, lemma, tag, general tag]``;
    an entry "occurs" in ``sent_2`` when an equal list is contained in it.
    """
    return any(entry[3] == "NOUN" and entry in sent_2 for entry in sent_1)

def isThereArgumentOverlap(sent_1, sent_2):
    """Return True if the two sentences share a noun or pronoun argument.

    The sentences overlap when any of these holds:
    * ``isThereNounOverlap`` reports a shared noun entry;
    * a PRONOUN entry of ``sent_1`` is contained in ``sent_2``;
    * a NOUN or PRONOUN entry of ``sent_1`` has the same lemma and the same
      general tag as some entry of ``sent_2``.
    """
    if isThereNounOverlap(sent_1, sent_2):
        return True

    for entry in sent_1:
        generalTag = entry[3]
        if generalTag == "PRONOUN" and entry in sent_2:
            return True
        if generalTag not in ("NOUN", "PRONOUN"):
            continue
        lemma = entry[1]
        if any(other[1] == lemma and other[3] == generalTag for other in sent_2):
            return True
    return False

def isThereStemOverlap(sent_1, sent_2):
    """Return True if the two sentences share a lemma involving a noun or pronoun.

    True when ``isThereNounOverlap`` or ``isThereArgumentOverlap`` already
    report an overlap. Otherwise true when a tagged (non-NOTAG) entry of
    ``sent_1`` has the same lemma as an entry of ``sent_2`` and either the
    ``sent_1`` entry is a NOUN or PRONOUN, or the ``sent_2`` entry is a NOUN.
    """
    if isThereNounOverlap(sent_1, sent_2) or isThereArgumentOverlap(sent_1, sent_2):
        return True

    for entry in sent_1:
        lemma, generalTag = entry[1], entry[3]
        if generalTag == "NOTAG":
            continue
        firstIsNominal = generalTag == "NOUN" or generalTag == "PRONOUN"
        for other in sent_2:
            if other[1] == lemma and (firstIsNominal or other[3] == "NOUN"):
                return True
    return False

def contentWordOverlap(sent_1, sent_2):
    """Count lemma matches between the content words of ``sent_1`` and ``sent_2``.

    Arguments are lists of word entries where index 1 holds the lemma and
    index 3 holds the general tag. Entries of ``sent_1`` tagged ``"NOTAG"``
    or ``"PRONOUN"`` are ignored; entries of ``sent_2`` are not filtered.
    Every (sent_1 entry, sent_2 entry) pair with equal lemmas adds one, so
    repeated words are counted once per pairing.

    Returns
    -------
    int
        The number of matching pairs (0 when there are none).
    """
    return sum(
        1
        for entry in sent_1
        if entry[3] not in ("NOTAG", "PRONOUN")
        for other in sent_2
        if entry[1] == other[1]
    )

In [None]:
# For every essay, count sentence pairs that share nouns / arguments / stems
# and the number of shared content words, then store each count divided by the
# number of sentences as a DISC_* column of test_text.
#   local  = adjacent sentence pairs only (j - i == 1)
#   global = every sentence pair (i < j)
for index, row in test_text.iterrows():
    essay = row['text']
    sentences = sent_tokenize(essay)
    formatted_sentences = returnFormattedSentence(sentences)
    totalSentencesSize = len(sentences)

    localNounOverlapCount = 0
    localArgumentOverlapCount = 0
    localStemOverlapCount = 0
    localContentWordOverlap = 0

    globalNounOverlapCount = 0
    globalArgumentOverlapCount = 0
    globalStemOverlapCount = 0
    globalContentWordOverlap = 0

    for i in range(totalSentencesSize):
        for j in range(i + 1, totalSentencesSize):
            sent_1, sent_2 = formatted_sentences[i], formatted_sentences[j]

            # The three checks are nested: a noun overlap also counts as an
            # argument and a stem overlap, and an argument overlap also counts
            # as a stem overlap. Short-circuiting keeps the weaker checks from
            # running once a stronger one has succeeded.
            hasNounOverlap = isThereNounOverlap(sent_1, sent_2)
            hasArgumentOverlap = hasNounOverlap or isThereArgumentOverlap(sent_1, sent_2)
            hasStemOverlap = hasArgumentOverlap or isThereStemOverlap(sent_1, sent_2)
            tempContentOverlap = contentWordOverlap(sent_1, sent_2)

            # Booleans add as 0 / 1.
            globalNounOverlapCount += hasNounOverlap
            globalArgumentOverlapCount += hasArgumentOverlap
            globalStemOverlapCount += hasStemOverlap
            globalContentWordOverlap += tempContentOverlap

            if (j - i) == 1:
                localNounOverlapCount += hasNounOverlap
                localArgumentOverlapCount += hasArgumentOverlap
                localStemOverlapCount += hasStemOverlap
                localContentWordOverlap += tempContentOverlap

    # With no sentences there are no pairs, so every count is 0. Dividing by 1
    # then stores 0.0 instead of raising ZeroDivisionError on an empty essay.
    sentenceDenominator = totalSentencesSize if totalSentencesSize else 1

    test_text.at[index, 'total_sentences'] = totalSentencesSize
    # Insertion order is kept so the columns are created in the same order as before.
    overlapCounts = {
        'DISC_localNounOverlapCount': localNounOverlapCount,
        'DISC_localArgumentOverlapCount': localArgumentOverlapCount,
        'DISC_localStemOverlapCount': localStemOverlapCount,
        'DISC_localContentWordOverlapCount': localContentWordOverlap,
        'DISC_globalNounOverlapCount': globalNounOverlapCount,
        'DISC_globalArgumentOverlapCount': globalArgumentOverlapCount,
        'DISC_globalStemOverlapCount': globalStemOverlapCount,
        'DISC_globalContentWordOverlapCount': globalContentWordOverlap,
    }
    for columnName, overlapCount in overlapCounts.items():
        test_text.at[index, columnName] = restrict2TwoDecimals(overlapCount / sentenceDenominator)

test_text

Unnamed: 0,doc_id,text,native_language,overall_score,fre,fkg,cli,ari,dcrs,dw,...,DISC_RefExprProperNounsPerNoun,total_sentences,DISC_localNounOverlapCount,DISC_localArgumentOverlapCount,DISC_localStemOverlapCount,DISC_localContentWordOverlapCount,DISC_globalNounOverlapCount,DISC_globalArgumentOverlapCount,DISC_globalStemOverlapCount,DISC_globalContentWordOverlapCount
0,doc2797.txt,"Dear Mrs Smith,\r\nI am sad to read about Rich...",French,30,59.87,11.9,8.71,14.1,6.95,48.0,...,0.10,26.0,0.19,0.42,0.42,0.38,0.96,2.58,2.58,3.85
1,doc2798.txt,Dear Mrs Smith\r\nI am writing to reply your ...,Chinese,27,57.61,10.7,9.05,11.8,7.01,73.0,...,0.13,32.0,0.25,0.38,0.38,0.59,1.28,2.66,2.72,5.19
2,doc2799.txt,"Dear Maria\r\nFirst at all, I am sorry about R...",Catalan,34,63.73,10.4,9.35,12.8,6.95,52.0,...,0.28,23.0,0.09,0.30,0.30,0.61,0.74,3.09,3.13,2.78
3,doc2829.txt,Dear Mrs Smith\r\nAs you know I am in charge f...,Spanish,29,69.65,10.2,6.74,11.6,6.39,29.0,...,0.31,19.0,0.05,0.42,0.42,0.68,0.58,2.26,2.32,4.05
4,doc2868.txt,"Dear Mrs Maria Smith,\r\nas the leader of the ...",Greek,29,83.96,4.7,7.29,6.2,5.59,32.0,...,0.14,36.0,0.11,0.22,0.22,0.47,1.47,3.72,3.75,7.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,doc3285.txt,Dear Mrs Maria Smith.\r\nI was really pleased ...,Korean,20,57.40,10.8,9.40,12.1,6.73,42.0,...,0.14,23.0,0.30,0.48,0.48,1.00,1.26,2.96,2.96,5.91
93,doc3288.txt,"Dear Mrs. Smith,\r\nThank you for your letter....",Thai,25,71.75,7.3,9.68,9.7,6.54,37.0,...,0.28,27.0,0.30,0.44,0.44,0.44,1.26,2.44,2.48,2.81
94,doc3291.txt,Dear Mrs Maria Smith\r\nThank you for your let...,German,23,72.29,9.2,7.90,11.5,6.29,49.0,...,0.09,41.0,0.07,0.17,0.20,0.49,0.66,2.37,2.49,6.17
95,doc3298.txt,Dear Mrs Maria Smith\r\nthankyou for your lett...,Italian,22,50.88,17.4,8.78,22.1,7.37,34.0,...,0.13,21.0,0.33,0.71,0.71,1.10,0.67,4.81,4.86,8.10


In [None]:
# path to Discourse output text marked with connective tags.
path = 'C:\\Users\\hp word\\Documents\\University of Bath Labs\\SEM 2\\Research project dissertation\\Codes_Libraries\\My_code\\Feature_engineering\\DiscourseFeaturesData\\TestDiscOutputTxt'

# For every tagged file, count the lines carrying each connective marker and
# store the counts per sentence in the DISCPlus_* columns of the row whose
# doc_id equals the file name.
# r=root, d=directories, f = files
for r, d, f in os.walk(path):
    for file in f:
        # Compute the row selector once; it is reused for the lookup and every write.
        docMask = test_text['doc_id'] == str(file)
        if not docMask.any():
            # A file without a matching essay would otherwise fail with an
            # opaque IndexError on `.values[0]`; skip it and say so instead.
            warnings.warn("No row in test_text has doc_id {!r}; skipping {}".format(
                file, os.path.join(r, file)))
            continue

        # The with-block closes the file; only the read needs it open.
        with open(os.path.join(r, file), "r") as myfile:
            data = myfile.readlines()

        # Each count is the number of LINES that contain the marker.
        numNonDiscConnectives = sum('#0' in line for line in data)
        numCompConnectives = sum('#Comparison' in line for line in data)
        numExpConnectives = sum('#Expansion' in line for line in data)
        numContConnectives = sum('#Contingency' in line for line in data)
        # Temporal connectives only feed the totals below; no per-sentence
        # temporal column is written.
        numTempConnectives = sum('#Temporal' in line for line in data)

        numDiscConnectives = numCompConnectives + numExpConnectives + numContConnectives + numTempConnectives
        numConnectives = numDiscConnectives + numNonDiscConnectives

        # If several rows share the doc_id, the first row's sentence count is used.
        numSentences = test_text.loc[docMask]['total_sentences'].values[0]

        # Insertion order is kept so the columns are created in the same order as before.
        connectiveCounts = {
            'DISCPlus_00_numConnectivesPerSen': numConnectives,
            'DISCPlus_01_numDiscConnectivesPerSen': numDiscConnectives,
            'DISCPlus_02_numNonDiscConnectivesPerSen': numNonDiscConnectives,
            'DISCPlus_03_numCompConnectivesPerSen': numCompConnectives,
            'DISCPlus_04_numExpConnectivesPerSen': numExpConnectives,
            'DISCPlus_05_numContConnectives': numContConnectives,
        }
        for columnName, connectiveCount in connectiveCounts.items():
            # A zero sentence count stores 0.0 rather than inf.
            perSentence = connectiveCount / numSentences if numSentences else 0.0
            test_text.loc[docMask, columnName] = restrict2TwoDecimals(perSentence)
test_text

Unnamed: 0,doc_id,text,native_language,overall_score,fre,fkg,cli,ari,dcrs,dw,...,DISC_globalNounOverlapCount,DISC_globalArgumentOverlapCount,DISC_globalStemOverlapCount,DISC_globalContentWordOverlapCount,DISCPlus_00_numConnectivesPerSen,DISCPlus_01_numDiscConnectivesPerSen,DISCPlus_02_numNonDiscConnectivesPerSen,DISCPlus_03_numCompConnectivesPerSen,DISCPlus_04_numExpConnectivesPerSen,DISCPlus_05_numContConnectives
0,doc2797.txt,"Dear Mrs Smith,\r\nI am sad to read about Rich...",French,30,59.87,11.9,8.71,14.1,6.95,48.0,...,0.96,2.58,2.58,3.85,0.88,0.27,0.62,0.04,0.04,0.04
1,doc2798.txt,Dear Mrs Smith\r\nI am writing to reply your ...,Chinese,27,57.61,10.7,9.05,11.8,7.01,73.0,...,1.28,2.66,2.72,5.19,1.03,0.44,0.59,0.06,0.31,0.00
2,doc2799.txt,"Dear Maria\r\nFirst at all, I am sorry about R...",Catalan,34,63.73,10.4,9.35,12.8,6.95,52.0,...,0.74,3.09,3.13,2.78,1.39,0.61,0.78,0.22,0.17,0.22
3,doc2829.txt,Dear Mrs Smith\r\nAs you know I am in charge f...,Spanish,29,69.65,10.2,6.74,11.6,6.39,29.0,...,0.58,2.26,2.32,4.05,0.95,0.37,0.58,0.11,0.11,0.11
4,doc2868.txt,"Dear Mrs Maria Smith,\r\nas the leader of the ...",Greek,29,83.96,4.7,7.29,6.2,5.59,32.0,...,1.47,3.72,3.75,7.33,1.17,0.69,0.47,0.03,0.36,0.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,doc3285.txt,Dear Mrs Maria Smith.\r\nI was really pleased ...,Korean,20,57.40,10.8,9.40,12.1,6.73,42.0,...,1.26,2.96,2.96,5.91,1.61,0.70,0.91,0.13,0.35,0.04
93,doc3288.txt,"Dear Mrs. Smith,\r\nThank you for your letter....",Thai,25,71.75,7.3,9.68,9.7,6.54,37.0,...,1.26,2.44,2.48,2.81,0.96,0.41,0.56,0.00,0.15,0.11
94,doc3291.txt,Dear Mrs Maria Smith\r\nThank you for your let...,German,23,72.29,9.2,7.90,11.5,6.29,49.0,...,0.66,2.37,2.49,6.17,0.90,0.27,0.63,0.02,0.17,0.02
95,doc3298.txt,Dear Mrs Maria Smith\r\nthankyou for your lett...,Italian,22,50.88,17.4,8.78,22.1,7.37,34.0,...,0.67,4.81,4.86,8.10,1.24,0.48,0.76,0.10,0.14,0.14
