In [None]:
# Importing libraries

import language_check
import numpy as np
import os
import pandas as pd
import requests
import sys, re, os, nltk
import warnings
from nltk import word_tokenize
from nltk.corpus import words, wordnet
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tree import Tree
from pandas.core.common import SettingWithCopyWarning
from stanfordcorenlp import StanfordCoreNLP
from textstat.textstat import textstat

# The feature cells fill the DataFrame cell-by-cell; silence pandas'
# SettingWithCopyWarning so those assignments do not flood the output.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Location of the Stanford CoreNLP distribution. Set the CORENLP_HOME
# environment variable to run on another machine; the fallback is the
# original author's local install path.
CORENLP_PATH = os.environ.get(
    'CORENLP_HOME',
    r'C:\Users\hp word\anaconda3\Lib\site-packages\stanfordcorenlp',
)
nlp = StanfordCoreNLP(CORENLP_PATH)

# Common functions

In [None]:
# Shared NLP resources used by the feature-extraction cells below.

# Splits text into runs of word characters (the pattern \w+), so punctuation
# never appears as a token.
tokenizer = RegexpTokenizer(r'\w+')
# NOTE(review): output_list is not referenced in the visible part of the
# notebook — confirm it is still needed.
output_list = []
# Set built from the NLTK `words` corpus.
# NOTE(review): not referenced in the visible cells — confirm it is still needed.
wordset = set(words.words())
# NOTE(review): lemma_function below creates its own WordNetLemmatizer instead
# of using this instance.
lmtzr = WordNetLemmatizer()
# English stop-word list from NLTK.
stopwords = nltk.corpus.stopwords.words('english')
# LanguageTool checker for US English; the error-feature cell calls tool.check().
tool = language_check.LanguageTool('en-US')

def restrict2TwoDecimals(D):
    """Return ``D`` rounded to two decimal places with the built-in ``round``."""
    rounded = round(D, 2)
    return rounded

def handleDivByZero(n_1, n_2):
    """Divide ``n_1`` by ``n_2`` and round the quotient to two decimals.

    Returns 0.0 instead of dividing when ``n_2`` equals zero, and also
    returns 0.0 when the quotient is greater than 10000.
    """
    if n_2 == 0.0:
        return 0.0
    quotient = n_1 / n_2
    if quotient > 10000:
        return 0.0
    return restrict2TwoDecimals(quotient)

# Reading output of previous txt to csv converter

In [None]:
# Load the essay table written by the earlier txt-to-csv conversion step.
# NOTE(review): as far as I know Python only registers the 'ansi' codec alias
# on Windows (system ANSI code page); on other platforms this call is expected
# to fail with an unknown-encoding error — confirm and consider an explicit
# codec such as 'cp1252'.
training_data = pd.read_csv('training_data.csv' ,encoding='ANSI')
# train_text is a second name for the same DataFrame object (no copy is made),
# so every feature column added to train_text also appears on training_data.
train_text = training_data

# Readability Features

In [None]:
# Readability scores are delegated to the textstat library.
# Each output column is paired with the textstat function that fills it;
# the dict order is the order in which the columns are created.
readability_scorers = {
    'fre': textstat.flesch_reading_ease,
    'fkg': textstat.flesch_kincaid_grade,
    'cli': textstat.coleman_liau_index,
    'ari': textstat.automated_readability_index,
    'dcrs': textstat.dale_chall_readability_score,
    'dw': textstat.difficult_words,
    'lwf': textstat.linsear_write_formula,
    'gf': textstat.gunning_fog,
}

for index, row in train_text.iterrows():
    essay = row['text']
    # Score the essay with every formula first, then store the results.
    scores = {column: scorer(essay) for column, scorer in readability_scorers.items()}
    for column, score in scores.items():
        train_text.at[index, column] = score
train_text

Unnamed: 0,doc_id,text,native_language,overall_score,fre,fkg,cli,ari,dcrs,dw,lwf,gf
0,doc100.txt,"10 June 2000\r\nDear Manager,\r\nI would like ...",Korean,29,73.10,8.9,8.07,11.3,6.69,45.0,10.166667,10.75
1,doc1000.txt,DECEMBER 12TH\r\nPRINCIPAL MR. ROBERTSON\r\nDE...,Catalan,28,46.27,17.1,10.69,21.7,7.74,43.0,9.000000,18.89
2,doc1002.txt,To Mr. Robertson\r\nI am writing to tell you s...,Korean,32,77.37,7.2,7.31,8.8,6.46,57.0,7.142857,9.60
3,doc1003.txt,"Dear Mrs. Jane Clark,\r\nRecently I was at the...",Russian,29,72.29,9.2,7.43,11.3,6.49,38.0,11.166667,12.10
4,doc1005.txt,"Dear Sir/Madam,\r\nI was to Circle Theatre to...",Polish,30,85.39,4.2,6.71,5.9,6.20,43.0,5.100000,5.94
...,...,...,...,...,...,...,...,...,...,...,...,...
1136,doc985.txt,"Dear manager,\r\nI am writing to complain abou...",Russian,29,59.03,10.1,10.50,12.0,7.72,60.0,6.750000,11.99
1137,doc988.txt,"Dear Mrs Ryan,\r\nI am delighted to answer you...",French,35,26.01,27.0,7.34,32.8,8.23,25.0,14.500000,29.07
1138,doc992.txt,"Dear Mr Robertson,\r\nI am writing to tell you...",Greek,26,72.09,9.3,7.20,10.9,6.05,28.0,7.833333,10.74
1139,doc997.txt,"Dear Mr Robertson,\r\nWe would like to thank y...",Polish,27,79.40,6.5,7.60,8.2,6.52,44.0,7.428571,8.63


# Word Level Features

In [None]:
def _mtldFactorCount(tokens, ttrThreshold):
    """Count MTLD factors in one pass over ``tokens`` (left to right).

    A factor is closed each time the running type-token ratio (TTR) of the
    current segment drops below ``ttrThreshold``. A trailing segment that has
    not dropped below the threshold contributes the partial factor
    ``(1 - ttr) / (1 - ttrThreshold)``.
    """
    factors = 0.0
    types = set()          # lower-cased types seen in the current segment
    segmentLength = 0      # tokens consumed in the current segment
    for token in tokens:
        types.add(token.lower())
        segmentLength += 1
        if len(types) / segmentLength < ttrThreshold:
            # Segment complete: count a full factor and start a new segment.
            factors += 1
            types.clear()
            segmentLength = 0
    if segmentLength > 0:
        # Left-over segment: its TTR is >= the threshold here, which also
        # covers a final TTR exactly equal to the threshold.
        ttr = len(types) / segmentLength
        factors += (1 - ttr) / (1 - ttrThreshold)
    return factors


def getMTLD(tokens, ttrThreshold=0.72):
    """Return the Measure of Textual Lexical Diversity (MTLD) of ``tokens``.

    The score is the mean of ``len(tokens) / factors`` computed once over the
    tokens in their given order and once over the reversed token sequence.

    Parameters
    ----------
    tokens : sequence of str
        Word tokens of the text; comparison is case-insensitive.
    ttrThreshold : float, optional
        TTR value below which a factor is closed (default 0.72). Must lie
        strictly between 0 and 1.

    Returns
    -------
    float
        The MTLD score, or 0.0 when no factor can be counted (empty input or
        a text in which no token repeats), where the score is undefined.

    Raises
    ------
    ValueError
        If ``ttrThreshold`` is not strictly between 0 and 1.
    """
    if not 0 < ttrThreshold < 1:
        raise ValueError("ttrThreshold must be strictly between 0 and 1")

    forwardFactors = _mtldFactorCount(tokens, ttrThreshold)
    backwardFactors = _mtldFactorCount(tokens[::-1], ttrThreshold)

    # Zero factors would mean dividing by zero (an infinite score); keep the
    # original convention of reporting 0.0 for that case.
    if forwardFactors == 0 or backwardFactors == 0:
        return 0.0

    mtldForward = len(tokens) / forwardFactors
    mtldBackward = len(tokens) / backwardFactors
    return float((mtldForward + mtldBackward) / 2)

In [None]:
# Lexical-diversity features computed on lower-cased word tokens.
for index, row in train_text.iterrows():
    essay_low = row['text'].lower()
    # The tokenizer only emits runs of word characters, so punctuation is
    # already excluded. The former str.replace('[^\w\s]', '') call treated the
    # pattern as literal text and therefore changed nothing; it was removed.
    tokens_low_np = tokenizer.tokenize(essay_low)
    num_types = len(set(tokens_low_np))
    num_tokens = len(tokens_low_np)

    train_text.at[index, 'Word_numWords'] = num_tokens

    if num_tokens == 0:
        # No tokens: every ratio below is undefined (log(0), division by
        # zero), so report 0.0 for all of them.
        for column in ('Word_TTR', 'Word_CTTR', 'Word_RTTR', 'Word_BilogTTR',
                       'Word_UberIndex', 'Word_MTLD'):
            train_text.at[index, column] = 0.0
        continue

    train_text.at[index, 'Word_TTR'] = handleDivByZero(num_types, num_tokens)
    train_text.at[index, 'Word_CTTR'] = handleDivByZero(num_types, np.sqrt(2.0*num_tokens))
    train_text.at[index, 'Word_RTTR'] = handleDivByZero(num_types, np.sqrt(num_tokens))
    train_text.at[index, 'Word_BilogTTR'] = handleDivByZero(np.log(num_types), np.log(num_tokens))
    # When every token is distinct the denominator is log(1) == 0 and
    # handleDivByZero reports 0.0.
    train_text.at[index, 'Word_UberIndex'] = handleDivByZero(np.log(num_tokens)**2, np.log(num_tokens/num_types))
    # MTLD is undefined when no token repeats (zero factors); report 0.0
    # instead of calling getMTLD in that case.
    mtld = getMTLD(tokens_low_np) if num_types < num_tokens else 0.0
    train_text.at[index, 'Word_MTLD'] = restrict2TwoDecimals(mtld)

train_text

Unnamed: 0,doc_id,text,native_language,overall_score,fre,fkg,cli,ari,dcrs,dw,lwf,gf,Word_numWords,Word_TTR,Word_CTTR,Word_RTTR,Word_BilogTTR,Word_UberIndex,Word_MTLD
0,doc100.txt,"10 June 2000\r\nDear Manager,\r\nI would like ...",Korean,29,73.10,8.9,8.07,11.3,6.69,45.0,10.166667,10.75,386.0,0.51,7.13,10.08,0.89,53.14,85.61
1,doc1000.txt,DECEMBER 12TH\r\nPRINCIPAL MR. ROBERTSON\r\nDE...,Catalan,28,46.27,17.1,10.69,21.7,7.74,43.0,9.000000,18.89,333.0,0.52,6.66,9.43,0.89,51.06,87.86
2,doc1002.txt,To Mr. Robertson\r\nI am writing to tell you s...,Korean,32,77.37,7.2,7.31,8.8,6.46,57.0,7.142857,9.60,484.0,0.45,6.97,9.86,0.87,47.64,59.51
3,doc1003.txt,"Dear Mrs. Jane Clark,\r\nRecently I was at the...",Russian,29,72.29,9.2,7.43,11.3,6.49,38.0,11.166667,12.10,373.0,0.49,6.63,9.37,0.88,48.49,98.60
4,doc1005.txt,"Dear Sir/Madam,\r\nI was to Circle Theatre to...",Polish,30,85.39,4.2,6.71,5.9,6.20,43.0,5.100000,5.94,358.0,0.55,7.40,10.46,0.90,58.39,98.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1136,doc985.txt,"Dear manager,\r\nI am writing to complain abou...",Russian,29,59.03,10.1,10.50,12.0,7.72,60.0,6.750000,11.99,309.0,0.57,7.12,10.07,0.90,58.99,92.12
1137,doc988.txt,"Dear Mrs Ryan,\r\nI am delighted to answer you...",French,35,26.01,27.0,7.34,32.8,8.23,25.0,14.500000,29.07,349.0,0.47,6.25,8.83,0.87,45.76,73.21
1138,doc992.txt,"Dear Mr Robertson,\r\nI am writing to tell you...",Greek,26,72.09,9.3,7.20,10.9,6.05,28.0,7.833333,10.74,368.0,0.51,6.89,9.75,0.89,51.56,67.38
1139,doc997.txt,"Dear Mr Robertson,\r\nWe would like to thank y...",Polish,27,79.40,6.5,7.60,8.2,6.52,44.0,7.428571,8.63,343.0,0.49,6.38,9.02,0.88,47.35,72.37


# Error features

In [None]:
# Used language_check library implementation for error detection.
# Every sentence is checked separately; each reported match is classified by
# its locqualityissuetype as a spelling, duplication or "other" mistake.
for index, row in train_text.iterrows():
    essay = row['text']
    sentences = sent_tokenize(essay)
    num_sents = len(sentences)
    spelling_mistake = 0
    duplicate_mistake = 0
    other_mistake = 0
    for sentence in sentences:
        for match in tool.check(sentence):
            issue_type = match.locqualityissuetype
            if issue_type == 'misspelling':
                spelling_mistake += 1
            elif issue_type == 'duplication':
                duplicate_mistake += 1
            else:
                other_mistake += 1

    all_mistakes = spelling_mistake + duplicate_mistake + other_mistake

    # handleDivByZero yields 0.0 when the essay has no sentences or no
    # mistakes at all, where a plain division would raise ZeroDivisionError.
    train_text.at[index, 'SpellingErrorsPerSen'] = handleDivByZero(spelling_mistake, num_sents)
    train_text.at[index, 'DuplicateErrorsPerSen'] = handleDivByZero(duplicate_mistake, num_sents)
    train_text.at[index, 'OtherErrorsPerSen'] = handleDivByZero(other_mistake, num_sents)
    train_text.at[index, 'AllErrorsPerSen'] = handleDivByZero(all_mistakes, num_sents)
    train_text.at[index, 'SpellingErrorsWrtAllErrors'] = handleDivByZero(spelling_mistake, all_mistakes)
train_text

Unnamed: 0,doc_id,text,native_language,overall_score,fre,fkg,cli,ari,dcrs,dw,...,Word_CTTR,Word_RTTR,Word_BilogTTR,Word_UberIndex,Word_MTLD,SpellingErrorsPerSen,DuplicateErrorsPerSen,OtherErrorsPerSen,AllErrorsPerSen,SpellingErrorsWrtAllErrors
0,doc100.txt,"10 June 2000\r\nDear Manager,\r\nI would like ...",Korean,29,73.10,8.9,8.07,11.3,6.69,45.0,...,7.13,10.08,0.89,53.14,85.61,0.15,0.0,0.33,0.48,0.31
1,doc1000.txt,DECEMBER 12TH\r\nPRINCIPAL MR. ROBERTSON\r\nDE...,Catalan,28,46.27,17.1,10.69,21.7,7.74,43.0,...,6.66,9.43,0.89,51.06,87.86,0.00,0.0,0.42,0.42,0.00
2,doc1002.txt,To Mr. Robertson\r\nI am writing to tell you s...,Korean,32,77.37,7.2,7.31,8.8,6.46,57.0,...,6.97,9.86,0.87,47.64,59.51,0.09,0.0,0.09,0.17,0.50
3,doc1003.txt,"Dear Mrs. Jane Clark,\r\nRecently I was at the...",Russian,29,72.29,9.2,7.43,11.3,6.49,38.0,...,6.63,9.37,0.88,48.49,98.60,0.17,0.0,0.25,0.42,0.40
4,doc1005.txt,"Dear Sir/Madam,\r\nI was to Circle Theatre to...",Polish,30,85.39,4.2,6.71,5.9,6.20,43.0,...,7.40,10.46,0.90,58.39,98.62,0.30,0.0,0.24,0.55,0.56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1136,doc985.txt,"Dear manager,\r\nI am writing to complain abou...",Russian,29,59.03,10.1,10.50,12.0,7.72,60.0,...,7.12,10.07,0.90,58.99,92.12,0.27,0.0,0.36,0.64,0.43
1137,doc988.txt,"Dear Mrs Ryan,\r\nI am delighted to answer you...",French,35,26.01,27.0,7.34,32.8,8.23,25.0,...,6.25,8.83,0.87,45.76,73.21,0.13,0.0,0.07,0.20,0.67
1138,doc992.txt,"Dear Mr Robertson,\r\nI am writing to tell you...",Greek,26,72.09,9.3,7.20,10.9,6.05,28.0,...,6.89,9.75,0.89,51.56,67.38,0.32,0.0,0.32,0.64,0.50
1139,doc997.txt,"Dear Mr Robertson,\r\nWe would like to thank y...",Polish,27,79.40,6.5,7.60,8.2,6.52,44.0,...,6.38,9.02,0.88,47.35,72.37,0.64,0.0,0.57,1.21,0.53


# Parts Of Speech (POS) Features

In [None]:
# Used NLTK library implementation of POS features
# Run 'nltk.help.upenn_tagset()' to get list of The Penn Treebank's POS tags

# Penn Treebank tag groups counted below. A token has exactly one tag, so the
# groups are mutually exclusive.
PRONOUN_TAGS = ("PRP", "PRP$", "WP", "WP$")
VERB_TAGS = ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")
ADJECTIVE_TAGS = ("JJ", "JJR", "JJS")
ADVERB_TAGS = ("RB", "RBR", "RBS", "RP")
NOUN_TAGS = ("NN", "NNS")
PROPER_NOUN_TAGS = ("NNP", "NNPS")

for index, row in train_text.iterrows():
    essay = row['text']
    tagged = nltk.pos_tag(word_tokenize(essay))

    # TotalWords counts only tokens whose tag belongs to one of the groups
    # handled below; tokens with any other tag (e.g. punctuation) are ignored.
    TotalWords = 0
    numAdj = 0
    numNouns = 0
    numVerbs = 0
    numPronouns = 0
    numConjunct = 0
    numProperNouns = 0
    numPrepositions = 0
    numAdverbs = 0
    numModals = 0
    numInterjections = 0
    perpronouns = 0
    whperpronouns = 0
    numauxverbs = 0
    numFunctionWords = 0
    numDeterminers = 0
    verbTagCounts = dict.fromkeys(VERB_TAGS, 0)
    # Distinct verb surface forms (case-sensitive, as produced by the tokenizer).
    uniqueVerbs = set()

    for word, tag in tagged:
        if tag in PRONOUN_TAGS:
            numPronouns += 1
            if tag == "PRP":
                perpronouns += 1
            elif tag == "WP":
                whperpronouns += 1
            numFunctionWords += 1
        elif tag in VERB_TAGS:
            numVerbs += 1
            verbTagCounts[tag] += 1
            uniqueVerbs.add(word)
        elif tag in ADJECTIVE_TAGS:
            numAdj += 1
        elif tag in ADVERB_TAGS:
            numAdverbs += 1
            numFunctionWords += 1
        elif tag == "IN":
            numPrepositions += 1
            numFunctionWords += 1
        elif tag == "UH":
            numInterjections += 1
            numFunctionWords += 1
        elif tag == "CC":
            numConjunct += 1
            numFunctionWords += 1
        elif tag in NOUN_TAGS:
            numNouns += 1
        elif tag in PROPER_NOUN_TAGS:
            numProperNouns += 1
        elif tag == "MD":
            numModals += 1
            numauxverbs += 1
            numFunctionWords += 1
        elif tag == "DT":
            numDeterminers += 1
            numFunctionWords += 1
        else:
            # Tag outside every counted group: not part of TotalWords.
            continue
        TotalWords += 1

    numLexicals = numAdj + numNouns + numVerbs + numAdverbs + numProperNouns
    # NOTE(review): modals (MD) are subtracted here although MD tokens are
    # never added to numVerbs above — confirm this is the intended definition.
    numVerbsOnly = numVerbs - numauxverbs
    numUniqueVerbs = len(uniqueVerbs)

    # handleDivByZero returns 0.0 for a zero denominator, so essays without
    # any counted word, lexical word or verb no longer raise ZeroDivisionError.
    posFeatures = {
        # Fixed precedence: the original computed numNouns + (numProperNouns/TotalWords).
        'POS_numNouns': handleDivByZero(numNouns + numProperNouns, TotalWords),
        'POS_numProperNouns': handleDivByZero(numProperNouns, TotalWords),
        'POS_numPronouns': handleDivByZero(numPronouns, TotalWords),
        'POS_numConjunct': handleDivByZero(numConjunct, TotalWords),
        'POS_numAdjectives': handleDivByZero(numAdj, TotalWords),
        'POS_numVerbs': handleDivByZero(numVerbs, TotalWords),
        'POS_numAdverbs': handleDivByZero(numAdverbs, TotalWords),
        'POS_numModals': handleDivByZero(numModals, TotalWords),
        'POS_numPrepositions': handleDivByZero(numPrepositions, TotalWords),
        'POS_numInterjections': handleDivByZero(numInterjections, TotalWords),
        'POS_numPerPronouns': handleDivByZero(perpronouns, TotalWords),
        'POS_numWhPronouns': handleDivByZero(whperpronouns, TotalWords),
        'POS_numLexicals': handleDivByZero(numLexicals, TotalWords),
        'POS_numFunctionWords': handleDivByZero(numFunctionWords, TotalWords),
        'POS_numDeterminers': handleDivByZero(numDeterminers, TotalWords),
        'POS_numVerbsVB': handleDivByZero(verbTagCounts["VB"], TotalWords),
        'POS_numVerbsVBD': handleDivByZero(verbTagCounts["VBD"], TotalWords),
        'POS_numVerbsVBG': handleDivByZero(verbTagCounts["VBG"], TotalWords),
        'POS_numVerbsVBN': handleDivByZero(verbTagCounts["VBN"], TotalWords),
        'POS_numVerbsVBP': handleDivByZero(verbTagCounts["VBP"], TotalWords),
        'POS_numVerbsVBZ': handleDivByZero(verbTagCounts["VBZ"], TotalWords),
        'POS_advVar': handleDivByZero(numAdverbs, numLexicals),
        'POS_adjVar': handleDivByZero(numAdj, numLexicals),
        'POS_modVar': handleDivByZero(numAdj + numAdverbs, numLexicals),
        'POS_nounVar': handleDivByZero(numNouns + numProperNouns, numLexicals),
        'POS_verbVar1': handleDivByZero(numVerbsOnly, numUniqueVerbs),
        'POS_verbVar2': handleDivByZero(numVerbsOnly, numLexicals),
        # Guarded by hand: this value can legitimately grow large, and
        # handleDivByZero would zero anything above 10000.
        'POS_squaredVerbVar1': (
            restrict2TwoDecimals((numVerbsOnly*numVerbsOnly)/numUniqueVerbs)
            if numUniqueVerbs else 0.0
        ),
        'POS_correctedVV1': handleDivByZero(numVerbsOnly, np.sqrt(2.0*numUniqueVerbs)),
    }
    for column, value in posFeatures.items():
        train_text.at[index, column] = value

train_text

Unnamed: 0,doc_id,text,native_language,overall_score,fre,fkg,cli,ari,dcrs,dw,...,POS_numVerbsVBP,POS_numVerbsVBZ,POS_advVar,POS_adjVar,POS_modVar,POS_nounVar,POS_verbVar1,POS_verbVar2,POS_squaredVerbVar1,POS_correctedVV1
0,doc100.txt,"10 June 2000\r\nDear Manager,\r\nI would like ...",Korean,29,73.10,8.9,8.07,11.3,6.69,45.0,...,0.03,0.01,0.17,0.14,0.30,0.38,1.28,0.27,75.67,6.15
1,doc1000.txt,DECEMBER 12TH\r\nPRINCIPAL MR. ROBERTSON\r\nDE...,Catalan,28,46.27,17.1,10.69,21.7,7.74,43.0,...,0.02,0.01,0.00,0.00,0.00,0.96,1.00,0.03,10.00,2.24
2,doc1002.txt,To Mr. Robertson\r\nI am writing to tell you s...,Korean,32,77.37,7.2,7.31,8.8,6.46,57.0,...,0.04,0.03,0.09,0.11,0.20,0.43,1.33,0.29,96.89,6.96
3,doc1003.txt,"Dear Mrs. Jane Clark,\r\nRecently I was at the...",Russian,29,72.29,9.2,7.43,11.3,6.49,38.0,...,0.06,0.02,0.13,0.11,0.25,0.36,1.72,0.37,127.35,7.98
4,doc1005.txt,"Dear Sir/Madam,\r\nI was to Circle Theatre to...",Polish,30,85.39,4.2,6.71,5.9,6.20,43.0,...,0.02,0.02,0.16,0.11,0.27,0.32,1.36,0.37,107.60,7.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1136,doc985.txt,"Dear manager,\r\nI am writing to complain abou...",Russian,29,59.03,10.1,10.50,12.0,7.72,60.0,...,0.03,0.03,0.14,0.14,0.28,0.39,1.26,0.30,67.81,5.82
1137,doc988.txt,"Dear Mrs Ryan,\r\nI am delighted to answer you...",French,35,26.01,27.0,7.34,32.8,8.23,25.0,...,0.07,0.02,0.17,0.13,0.29,0.26,1.47,0.39,105.80,7.27
1138,doc992.txt,"Dear Mr Robertson,\r\nI am writing to tell you...",Greek,26,72.09,9.3,7.20,10.9,6.05,28.0,...,0.02,0.01,0.13,0.09,0.21,0.42,1.33,0.33,85.33,6.53
1139,doc997.txt,"Dear Mr Robertson,\r\nWe would like to thank y...",Polish,27,79.40,6.5,7.60,8.2,6.52,44.0,...,0.03,0.04,0.14,0.13,0.27,0.40,1.38,0.25,64.97,5.70


# Syntactic Features

In [None]:
# This code generates a parse tree for every sentence of an essay (via the
# CoreNLP client `nlp`) and derives per-sentence syntactic statistics from it.
WH_PHRASE_LABELS = ("WHNP", "WHPP", "WHADVP", "WHADJP")

for index, row in train_text.iterrows():
    essay = row['text']
    sentences = sent_tokenize(essay)
    numSentences = len(sentences)

    avgParseTreeHeight = 0   # sum of tree heights; divided by numSentences below
    numWords = 0             # sum of leaf counts over all parsed sentences
    numSubtrees = 0
    numNP = 0
    numVP = 0
    numPP = 0
    AvgNPSize = 0            # summed child counts of NP nodes; averaged below
    AvgVPSize = 0
    AvgPPSize = 0
    numWhPhrases = 0
    numConjPhrases = 0
    reducedRelClauses = 0

    for sentence in sentences:
        try:
            tree = Tree.fromstring(nlp.parse(sentence))
            avgParseTreeHeight += tree.height()
            numWords += len(tree.leaves())
            for st in tree.subtrees():
                label = st.label()
                numSubtrees += 1
                if label == "NP":
                    numNP += 1
                    AvgNPSize += len(st)
                elif label == "VP":
                    numVP += 1
                    AvgVPSize += len(st)
                elif label == "PP":
                    numPP += 1
                    AvgPPSize += len(st)
                elif label in WH_PHRASE_LABELS:
                    numWhPhrases += 1
                elif label == "RRC":
                    reducedRelClauses += 1
                elif label == "CONJP":
                    numConjPhrases += 1
        except Exception:
            # Best effort: a sentence that cannot be parsed is skipped but is
            # still counted in numSentences. Catching Exception (not a bare
            # except) lets KeyboardInterrupt / SystemExit stop the run.
            continue

    # handleDivByZero reports 0.0 for essays without sentences (or without
    # NP/VP/PP nodes) instead of raising ZeroDivisionError.
    train_text.at[index, 'SYN_numSentences'] = numSentences
    train_text.at[index, 'SYN_avgSentenceLength'] = handleDivByZero(numWords, numSentences)
    train_text.at[index, 'SYN_avgParseTreeHeightPerSen'] = handleDivByZero(avgParseTreeHeight, numSentences)
    train_text.at[index, 'SYN_numSubtreesPerSen'] = handleDivByZero(numSubtrees, numSentences)
    train_text.at[index, 'SYN_numNPsPerSen'] = handleDivByZero(numNP, numSentences)
    train_text.at[index, 'SYN_numVPsPerSen'] = handleDivByZero(numVP, numSentences)
    train_text.at[index, 'SYN_numPPsPerSen'] = handleDivByZero(numPP, numSentences)
    train_text.at[index, 'SYN_numNPSize'] = handleDivByZero(AvgNPSize, numNP)
    train_text.at[index, 'SYN_numVPSize'] = handleDivByZero(AvgVPSize, numVP)
    train_text.at[index, 'SYN_numPPSize'] = handleDivByZero(AvgPPSize, numPP)
    train_text.at[index, 'SYN_numWHPsPerSen'] = handleDivByZero(numWhPhrases, numSentences)
    train_text.at[index, 'SYN_numRRCsPerSen'] = handleDivByZero(reducedRelClauses, numSentences)
    train_text.at[index, 'SYN_numConjPPerSen'] = handleDivByZero(numConjPhrases, numSentences)

train_text

Unnamed: 0,doc_id,text,native_language,overall_score,fre,fkg,cli,ari,dcrs,dw,...,SYN_numConstituentsPerSen,SYN_numNPsPerSen,SYN_numVPsPerSen,SYN_numPPsPerSen,SYN_numNPSize,SYN_numVPSize,SYN_numPPSize,SYN_numWHPsPerSen,SYN_numRRCsPerSen,SYN_numConjPPerSen
0,doc100.txt,"10 June 2000\r\nDear Manager,\r\nI would like ...",Korean,29,73.10,8.9,8.07,11.3,6.69,45.0,...,0.0,4.67,3.48,1.37,1.70,2.44,1.97,0.26,0.0,0.00
1,doc1000.txt,DECEMBER 12TH\r\nPRINCIPAL MR. ROBERTSON\r\nDE...,Catalan,28,46.27,17.1,10.69,21.7,7.74,43.0,...,0.0,10.67,6.50,2.67,2.04,2.32,2.03,0.00,0.0,0.00
2,doc1002.txt,To Mr. Robertson\r\nI am writing to tell you s...,Korean,32,77.37,7.2,7.31,8.8,6.46,57.0,...,0.0,4.46,3.49,1.37,1.90,2.25,2.04,0.20,0.0,0.00
3,doc1003.txt,"Dear Mrs. Jane Clark,\r\nRecently I was at the...",Russian,29,72.29,9.2,7.43,11.3,6.49,38.0,...,0.0,4.79,4.25,1.58,1.70,2.31,2.03,0.12,0.0,0.00
4,doc1005.txt,"Dear Sir/Madam,\r\nI was to Circle Theatre to...",Polish,30,85.39,4.2,6.71,5.9,6.20,43.0,...,0.0,3.45,3.24,0.61,1.52,2.30,2.05,0.33,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1136,doc985.txt,"Dear manager,\r\nI am writing to complain abou...",Russian,29,59.03,10.1,10.50,12.0,7.72,60.0,...,0.0,5.00,3.18,1.68,1.82,2.30,2.05,0.14,0.0,0.00
1137,doc988.txt,"Dear Mrs Ryan,\r\nI am delighted to answer you...",French,35,26.01,27.0,7.34,32.8,8.23,25.0,...,0.0,6.93,7.80,1.60,1.54,2.30,2.08,0.47,0.0,0.00
1138,doc992.txt,"Dear Mr Robertson,\r\nI am writing to tell you...",Greek,26,72.09,9.3,7.20,10.9,6.05,28.0,...,0.0,5.59,4.41,1.77,1.73,2.36,2.03,0.32,0.0,0.05
1139,doc997.txt,"Dear Mr Robertson,\r\nWe would like to thank y...",Polish,27,79.40,6.5,7.60,8.2,6.52,44.0,...,0.0,4.21,3.07,1.46,1.73,2.33,2.07,0.29,0.0,0.00


# Discourse Features

### Referring Expressions

In [None]:
# This code generates discourse features related to referring expressions:
# pronoun, definite-article and proper-noun counts normalised per sentence,
# per word and per noun.
for index, row in train_text.iterrows():
    essay = row['text']
    numSentences = len(sent_tokenize(essay))
    tagged = nltk.pos_tag(word_tokenize(essay))

    numWords = 0.0
    numPersonalPronouns = 0.0
    numPossessivePronouns = 0.0
    numDefiniteArticles = 0.0
    numProperNouns = 0.0
    numNouns = 0.0
    for word, tag in tagged:
        # Only tokens that start with a letter count as words.
        if not word[0].isalpha():
            continue
        numWords += 1
        if tag == 'DT' and word.lower() == 'the':
            numDefiniteArticles += 1
        elif tag == 'PRP':
            numPersonalPronouns += 1
        elif tag == 'PRP$':
            numPossessivePronouns += 1
        elif tag.startswith('NN'):
            numNouns += 1
            if tag.startswith('NNP'):
                numProperNouns += 1

    numPronouns = numPersonalPronouns + numPossessivePronouns

    # handleDivByZero reports 0.0 when an essay has no nouns, words or
    # sentences, where a plain division would raise ZeroDivisionError.
    train_text.at[index, 'DISC_RefExprPronounsPerNoun'] = handleDivByZero(numPronouns, numNouns)
    train_text.at[index, 'DISC_RefExprPronounsPerSen'] = handleDivByZero(numPronouns, numSentences)
    train_text.at[index, 'DISC_RefExprPronounsPerWord'] = handleDivByZero(numPronouns, numWords)
    train_text.at[index, 'DISC_RefExprPerPronounsPerSen'] = handleDivByZero(numPersonalPronouns, numSentences)
    train_text.at[index, 'DISC_RefExprPerProPerWord'] = handleDivByZero(numPersonalPronouns, numWords)
    train_text.at[index, 'DISC_RefExprPossProPerSen'] = handleDivByZero(numPossessivePronouns, numSentences)
    train_text.at[index, 'DISC_RefExprPossProPerWord'] = handleDivByZero(numPossessivePronouns, numWords)
    train_text.at[index, 'DISC_RefExprDefArtPerSen'] = handleDivByZero(numDefiniteArticles, numSentences)
    train_text.at[index, 'DISC_RefExprDefArtPerWord'] = handleDivByZero(numDefiniteArticles, numWords)
    train_text.at[index, 'DISC_RefExprProperNounsPerNoun'] = handleDivByZero(numProperNouns, numNouns)

train_text

Unnamed: 0,doc_id,text,native_language,overall_score,fre,fkg,cli,ari,dcrs,dw,...,DISC_RefExprPronounsPerNoun,DISC_RefExprPronounsPerSen,DISC_RefExprPronounsPerWord,DISC_RefExprPerPronounsPerSen,DISC_RefExprPerProPerWord,DISC_RefExprPossProPerSen,DISC_RefExprPossProPerWord,DISC_RefExprDefArtPerSen,DISC_RefExprDefArtPerWord,DISC_RefExprProperNounsPerNoun
0,doc100.txt,"10 June 2000\r\nDear Manager,\r\nI would like ...",Korean,29,73.10,8.9,8.07,11.3,6.69,45.0,...,0.52,1.59,0.11,1.33,0.10,0.26,0.02,0.59,0.04,0.17
1,doc1000.txt,DECEMBER 12TH\r\nPRINCIPAL MR. ROBERTSON\r\nDE...,Catalan,28,46.27,17.1,10.69,21.7,7.74,43.0,...,0.02,0.42,0.02,0.42,0.02,0.00,0.00,0.08,0.00,1.00
2,doc1002.txt,To Mr. Robertson\r\nI am writing to tell you s...,Korean,32,77.37,7.2,7.31,8.8,6.46,57.0,...,0.45,1.40,0.10,1.11,0.08,0.29,0.02,0.83,0.06,0.11
3,doc1003.txt,"Dear Mrs. Jane Clark,\r\nRecently I was at the...",Russian,29,72.29,9.2,7.43,11.3,6.49,38.0,...,0.69,2.08,0.14,1.71,0.11,0.38,0.02,0.46,0.03,0.17
4,doc1005.txt,"Dear Sir/Madam,\r\nI was to Circle Theatre to...",Polish,30,85.39,4.2,6.71,5.9,6.20,43.0,...,0.86,1.70,0.16,1.52,0.15,0.18,0.02,0.39,0.04,0.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1136,doc985.txt,"Dear manager,\r\nI am writing to complain abou...",Russian,29,59.03,10.1,10.50,12.0,7.72,60.0,...,0.49,1.59,0.11,1.00,0.07,0.59,0.04,0.86,0.06,0.15
1137,doc988.txt,"Dear Mrs Ryan,\r\nI am delighted to answer you...",French,35,26.01,27.0,7.34,32.8,8.23,25.0,...,0.98,3.20,0.14,2.73,0.12,0.47,0.02,0.80,0.03,0.14
1138,doc992.txt,"Dear Mr Robertson,\r\nI am writing to tell you...",Greek,26,72.09,9.3,7.20,10.9,6.05,28.0,...,0.52,1.95,0.12,1.68,0.10,0.27,0.02,1.14,0.07,0.32
1139,doc997.txt,"Dear Mr Robertson,\r\nWe would like to thank y...",Polish,27,79.40,6.5,7.60,8.2,6.52,44.0,...,0.50,1.32,0.11,1.04,0.08,0.29,0.02,0.71,0.06,0.16


### Content Overlap features

In [None]:
def getGeneralTag(specificTag):
    """Map a fine-grained POS tag to a coarse category name.

    Returns "VERB", "ADJECTIVE", "ADVERB", "PRONOUN" or "NOUN" depending on
    the tag's prefix (plus the exact tag "WRB" for adverbs), and "NOTAG" for
    anything else.
    """
    if specificTag == "WRB":
        return "ADVERB"

    # Checked in order; the first matching prefix wins.
    prefixToGeneral = (
        ("VB", "VERB"),
        ("JJ", "ADJECTIVE"),
        ("RB", "ADVERB"),
        ("PRP", "PRONOUN"),
        ("WP", "PRONOUN"),
        ("NN", "NOUN"),
    )
    for prefix, general in prefixToGeneral:
        if specificTag.startswith(prefix):
            return general
    return "NOTAG"

def lemma_function(word, tag):
    """Return the WordNet lemma of ``word``.

    ``tag`` is accepted but not used: ``lemmatize`` is called with the word
    only, so no part of speech is passed to the lemmatizer.
    """
    return WordNetLemmatizer().lemmatize(word)
    
def returnFormattedSentence(sentences):
    """Tokenize and POS-tag each sentence and describe every token.

    Returns one list per sentence; each token is represented as
    ``[lower-cased word, lemma, POS tag, general tag]``, which is the layout
    the overlap helpers index into (``[1]`` lemma, ``[3]`` general tag).
    """
    return [
        [
            [word.lower(), lemma_function(word, tag), tag, getGeneralTag(tag)]
            for word, tag in nltk.pos_tag(word_tokenize(sentence))
        ]
        for sentence in sentences
    ]

def isThereNounOverlap(sent_1, sent_2):
    """Return True if a NOUN entry of ``sent_1`` also occurs in ``sent_2``.

    Entries are compared as whole token records, so word, lemma, POS tag and
    general tag must all be equal.
    """
    return any(entry[3] == "NOUN" and entry in sent_2 for entry in sent_1)

def isThereArgumentOverlap(sent_1, sent_2):
    """Return True if the two sentences share a noun or pronoun argument.

    True when there is a noun overlap, when a PRONOUN entry of ``sent_1``
    occurs unchanged in ``sent_2``, or when a NOUN/PRONOUN entry of ``sent_1``
    has the same lemma and the same general tag as some entry of ``sent_2``.
    """
    if isThereNounOverlap(sent_1, sent_2):
        return True

    for entry in sent_1:
        generalTag = entry[3]
        if generalTag == "PRONOUN" and entry in sent_2:
            return True
        if generalTag != "NOUN" and generalTag != "PRONOUN":
            continue
        lemma = entry[1]
        for other in sent_2:
            otherLemma, otherTag = other[1], other[3]
            if lemma == otherLemma and generalTag == otherTag:
                return True
    return False

def isThereStemOverlap(sent_1, sent_2):
    """Return True if the sentences overlap on a lemma involving a noun/pronoun.

    True when there is a noun or argument overlap, or when a tagged entry of
    ``sent_1`` (general tag other than "NOTAG") shares its lemma with an entry
    of ``sent_2`` and either the ``sent_1`` entry is a NOUN or PRONOUN or the
    ``sent_2`` entry is a NOUN.
    """
    if isThereNounOverlap(sent_1, sent_2) or isThereArgumentOverlap(sent_1, sent_2):
        return True

    for entry in sent_1:
        tag_1 = entry[3]
        if tag_1 == "NOTAG":
            continue
        lemma_1 = entry[1]
        for other in sent_2:
            lemma_2, tag_2 = other[1], other[3]
            if lemma_1 != lemma_2:
                continue
            if tag_1 == "NOUN" or tag_2 == "NOUN" or tag_1 == "PRONOUN":
                return True
    return False

def contentWordOverlap(sent_1, sent_2):
    """Count lemma matches between content words of sent_1 and words of sent_2.

    Entries of sent_1 whose general tag (index 3) is "NOTAG" or "PRONOUN" are
    skipped.  Each remaining entry contributes one count for every entry of
    sent_2 with the same lemma (index 1); the tags of sent_2 entries are not
    inspected.

    Returns:
        The total number of matching (sent_1 entry, sent_2 entry) pairs.
    """
    skipped_tags = ("NOTAG", "PRONOUN")
    return sum(
        1
        for entry in sent_1
        if entry[3] not in skipped_tags
        for other in sent_2
        if entry[1] == other[1]
    )

In [None]:
# Sentence-to-sentence overlap features.  Every sentence pair (i, j) with i < j
# of an essay is checked: "global" counters cover all pairs, "local" counters
# only adjacent sentences (j - i == 1).  Each counter is then divided by the
# number of sentences in the essay and rounded to two decimals.
for index, row in train_text.iterrows():
    essay = row['text']
    sentences = sent_tokenize(essay)
    formatted_sentences = returnFormattedSentence(sentences)
    localNounOverlapCount = 0
    localArgumentOverlapCount = 0
    localStemOverlapCount = 0
    localContentWordOverlap = 0

    globalNounOverlapCount = 0
    globalArgumentOverlapCount = 0
    globalStemOverlapCount = 0
    globalContentWordOverlap = 0

    totalSentencesSize = len(sentences)
    for i in range(0, totalSentencesSize):
        for j in range(i + 1, totalSentencesSize):
            sent_1, sent_2 = formatted_sentences[i], formatted_sentences[j]
            isAdjacentPair = (j - i) == 1

            # A noun overlap is also counted as argument and stem overlap, and
            # an argument overlap as stem overlap.  `or` short-circuits, so the
            # weaker checks only run when the stronger ones found nothing.
            hasNounOverlap = isThereNounOverlap(sent_1, sent_2)
            hasArgumentOverlap = hasNounOverlap or isThereArgumentOverlap(sent_1, sent_2)
            hasStemOverlap = hasArgumentOverlap or isThereStemOverlap(sent_1, sent_2)
            tempContentOverlap = contentWordOverlap(sent_1, sent_2)

            globalNounOverlapCount += int(hasNounOverlap)
            globalArgumentOverlapCount += int(hasArgumentOverlap)
            globalStemOverlapCount += int(hasStemOverlap)
            globalContentWordOverlap += tempContentOverlap
            if isAdjacentPair:
                localNounOverlapCount += int(hasNounOverlap)
                localArgumentOverlapCount += int(hasArgumentOverlap)
                localStemOverlapCount += int(hasStemOverlap)
                localContentWordOverlap += tempContentOverlap

    train_text.at[index, 'total_sentences'] = totalSentencesSize

    # (column name, raw count) in the order the columns are created.
    overlapFeatures = [
        ('DISC_localNounOverlapCount', localNounOverlapCount),
        ('DISC_localArgumentOverlapCount', localArgumentOverlapCount),
        ('DISC_localStemOverlapCount', localStemOverlapCount),
        ('DISC_localContentWordOverlapCount', localContentWordOverlap),
        ('DISC_globalNounOverlapCount', globalNounOverlapCount),
        ('DISC_globalArgumentOverlapCount', globalArgumentOverlapCount),
        ('DISC_globalStemOverlapCount', globalStemOverlapCount),
        ('DISC_globalContentWordOverlapCount', globalContentWordOverlap),
    ]
    for featureName, overlapCount in overlapFeatures:
        if totalSentencesSize:
            featureValue = restrict2TwoDecimals(overlapCount / totalSentencesSize)
        else:
            # No sentences were detected (e.g. empty text): store 0.0 instead of
            # raising ZeroDivisionError, like handleDivByZero does for a zero
            # denominator.
            featureValue = 0.0
        train_text.at[index, featureName] = featureValue

train_text

Unnamed: 0,doc_id,text,native_language,overall_score,fre,fkg,cli,ari,dcrs,dw,...,DISC_RefExprProperNounsPerNoun,total_sentences,DISC_localNounOverlapCount,DISC_localArgumentOverlapCount,DISC_localStemOverlapCount,DISC_localContentWordOverlapCount,DISC_globalNounOverlapCount,DISC_globalArgumentOverlapCount,DISC_globalStemOverlapCount,DISC_globalContentWordOverlapCount
0,doc100.txt,"10 June 2000\r\nDear Manager,\r\nI would like ...",Korean,29,73.10,8.9,8.07,11.3,6.69,45.0,...,0.17,27.0,0.30,0.59,0.59,0.96,1.00,3.44,3.56,5.07
1,doc1000.txt,DECEMBER 12TH\r\nPRINCIPAL MR. ROBERTSON\r\nDE...,Catalan,28,46.27,17.1,10.69,21.7,7.74,43.0,...,1.00,12.0,0.75,0.83,0.83,8.58,4.67,5.00,5.00,42.67
2,doc1002.txt,To Mr. Robertson\r\nI am writing to tell you s...,Korean,32,77.37,7.2,7.31,8.8,6.46,57.0,...,0.11,35.0,0.26,0.46,0.46,0.86,1.57,3.83,3.83,6.29
3,doc1003.txt,"Dear Mrs. Jane Clark,\r\nRecently I was at the...",Russian,29,72.29,9.2,7.43,11.3,6.49,38.0,...,0.17,24.0,0.08,0.58,0.62,0.62,0.79,6.50,6.58,5.71
4,doc1005.txt,"Dear Sir/Madam,\r\nI was to Circle Theatre to...",Polish,30,85.39,4.2,6.71,5.9,6.20,43.0,...,0.35,33.0,0.09,0.39,0.39,0.36,1.09,3.67,3.70,5.76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1136,doc985.txt,"Dear manager,\r\nI am writing to complain abou...",Russian,29,59.03,10.1,10.50,12.0,7.72,60.0,...,0.15,22.0,0.09,0.36,0.36,0.45,0.36,2.68,2.73,3.14
1137,doc988.txt,"Dear Mrs Ryan,\r\nI am delighted to answer you...",French,35,26.01,27.0,7.34,32.8,8.23,25.0,...,0.14,15.0,0.20,0.87,0.87,2.13,0.73,5.60,5.67,9.33
1138,doc992.txt,"Dear Mr Robertson,\r\nI am writing to tell you...",Greek,26,72.09,9.3,7.20,10.9,6.05,28.0,...,0.32,22.0,0.05,0.59,0.59,0.59,0.82,6.14,6.14,4.45
1139,doc997.txt,"Dear Mr Robertson,\r\nWe would like to thank y...",Polish,27,79.40,6.5,7.60,8.2,6.52,44.0,...,0.16,28.0,0.11,0.32,0.32,0.32,0.57,4.18,4.25,3.96


In [None]:
# path to Discourse output text marked with connective tags.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path = 'C:\\Users\\hp word\\Documents\\University of Bath Labs\\SEM 2\\Research project dissertation\\Codes_Libraries\\My_code\\Feature_engineering\\DiscourseFeaturesData\\DiscOutputTxt'

# For every tagged file, count the lines carrying each connective tag and store
# the counts per sentence on the train_text row whose doc_id equals the file name.
# r=root, d=directories, f = files
for r, d, f in os.walk(path):
    for file in f:
        # Rows of train_text that belong to this file (computed once per file).
        docMask = train_text['doc_id'] == str(file)
        if not docMask.any():
            # A file without a matching doc_id would otherwise fail with an
            # IndexError on `.values[0]` below; report it and move on.
            warnings.warn("No row in train_text with doc_id '%s'; file skipped." % file)
            continue

        # The context manager closes the file; no explicit close() is needed.
        with open(str(os.path.join(r, file)), "r") as myfile:
            data = myfile.readlines()

        # Each count is the number of LINES containing the tag.
        numNonDiscConnectives = len([k for k in data if '#0' in k])
        numCompConnectives = len([k for k in data if '#Comparison' in k])
        numExpConnectives = len([k for k in data if '#Expansion' in k])
        numContConnectives = len([k for k in data if '#Contingency' in k])
        numTempConnectives = len([k for k in data if '#Temporal' in k])

        numDiscConnectives = numCompConnectives + numExpConnectives + numContConnectives + numTempConnectives
        numConnectives = numDiscConnectives + numNonDiscConnectives

        numSentences = train_text.loc[docMask, 'total_sentences'].values[0]

        # NOTE(review): Temporal connectives contribute to the totals above but
        # get no per-sentence column of their own - confirm this is intended.
        connectiveFeatures = [
            ('DISCPlus_00_numConnectivesPerSen', numConnectives),
            ('DISCPlus_01_numDiscConnectivesPerSen', numDiscConnectives),
            ('DISCPlus_02_numNonDiscConnectivesPerSen', numNonDiscConnectives),
            ('DISCPlus_03_numCompConnectivesPerSen', numCompConnectives),
            ('DISCPlus_04_numExpConnectivesPerSen', numExpConnectives),
            ('DISCPlus_05_numContConnectives', numContConnectives),
        ]
        for featureName, connectiveCount in connectiveFeatures:
            train_text.loc[docMask, featureName] = restrict2TwoDecimals(connectiveCount/numSentences)
train_text

Unnamed: 0,doc_id,text,native_language,overall_score,fre,fkg,cli,ari,dcrs,dw,...,DISC_globalNounOverlapCount,DISC_globalArgumentOverlapCount,DISC_globalStemOverlapCount,DISC_globalContentWordOverlapCount,DISCPlus_00_numConnectivesPerSen,DISCPlus_01_numDiscConnectivesPerSen,DISCPlus_02_numNonDiscConnectivesPerSen,DISCPlus_03_numCompConnectivesPerSen,DISCPlus_04_numExpConnectivesPerSen,DISCPlus_05_numContConnectives
0,doc100.txt,"10 June 2000\r\nDear Manager,\r\nI would like ...",Korean,29,73.10,8.9,8.07,11.3,6.69,45.0,...,1.00,3.44,3.56,5.07,1.52,0.67,0.85,0.07,0.26,0.26
1,doc1000.txt,DECEMBER 12TH\r\nPRINCIPAL MR. ROBERTSON\r\nDE...,Catalan,28,46.27,17.1,10.69,21.7,7.74,43.0,...,4.67,5.00,5.00,42.67,2.75,1.17,1.58,0.42,0.33,0.17
2,doc1002.txt,To Mr. Robertson\r\nI am writing to tell you s...,Korean,32,77.37,7.2,7.31,8.8,6.46,57.0,...,1.57,3.83,3.83,6.29,1.26,0.46,0.80,0.11,0.17,0.11
3,doc1003.txt,"Dear Mrs. Jane Clark,\r\nRecently I was at the...",Russian,29,72.29,9.2,7.43,11.3,6.49,38.0,...,0.79,6.50,6.58,5.71,1.25,0.67,0.58,0.17,0.08,0.29
4,doc1005.txt,"Dear Sir/Madam,\r\nI was to Circle Theatre to...",Polish,30,85.39,4.2,6.71,5.9,6.20,43.0,...,1.09,3.67,3.70,5.76,0.67,0.33,0.33,0.06,0.06,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1136,doc985.txt,"Dear manager,\r\nI am writing to complain abou...",Russian,29,59.03,10.1,10.50,12.0,7.72,60.0,...,0.36,2.68,2.73,3.14,1.00,0.36,0.64,0.09,0.23,0.05
1137,doc988.txt,"Dear Mrs Ryan,\r\nI am delighted to answer you...",French,35,26.01,27.0,7.34,32.8,8.23,25.0,...,0.73,5.60,5.67,9.33,2.87,1.47,1.40,0.20,0.33,0.47
1138,doc992.txt,"Dear Mr Robertson,\r\nI am writing to tell you...",Greek,26,72.09,9.3,7.20,10.9,6.05,28.0,...,0.82,6.14,6.14,4.45,1.45,0.45,1.00,0.14,0.00,0.18
1139,doc997.txt,"Dear Mr Robertson,\r\nWe would like to thank y...",Polish,27,79.40,6.5,7.60,8.2,6.52,44.0,...,0.57,4.18,4.25,3.96,1.07,0.21,0.86,0.07,0.07,0.07


# Creating a csv file

In [None]:
# Removing a duplicate feature
# errors="ignore" keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
train_text = train_text.drop(columns="total_sentences", errors="ignore")
train_text.to_csv(r'training_features - Copy.csv', index = False)