# Extract Lexical Features

In [1]:
import os
import pandas as pd
import pickle
import numpy as np
import stanza # Stanford's stanza package
stanza.download('en') # run this once
from nltk.tokenize import SyllableTokenizer
from nltk import word_tokenize
from tqdm import tqdm


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 2.03MB/s]                    
2021-10-04 20:19:00 INFO: Downloading default packages for language: en (English)...
2021-10-04 20:19:02 INFO: File exists: /Users/bl4z3/stanza_resources/en/default.zip.
2021-10-04 20:19:11 INFO: Finished downloading models and saved to /Users/bl4z3/stanza_resources.


# Functions for Lexical Complexity

Code taken from:

This code is the lexical complexity analyzer described in

Lu, Xiaofei (2012). The relationship of lexical richness to the quality 
of ESL speakers' oral narratives. The Modern Language Journal, 96(2), 190-208. 

Version 1.1 Released on February 12, 2013

Which can be found at:

http://www.personal.psu.edu/xxl13/download.html

It has been modified to work with search queries, as it was initially designed for sentences.

In [2]:
import string,re,sys,os,random
from math import sqrt,log

# Minimum sample size (in words); tune it here.
standard = 50

# Keys of dictionary d, ordered by ascending value
def sort_by_value(d):
    """Return the keys of ``d`` ordered from smallest to largest value.

    Keys that share the same value are ordered among themselves by key.
    """
    value_key_pairs = sorted([value, key] for key, value in d.items())
    return [key for _, key in value_key_pairs]

# NDW for first z words in a sample
def getndwfirstz(z,lemmalist):
    """Count the distinct lemmas among the first ``z`` entries of ``lemmalist``."""
    return len(set(lemmalist[:z]))

# NDW expected random z words, 10 trials
def getndwerz(z,lemmalist):
    """Average number of distinct lemmas in a random sample of ``z`` lemmas.

    Draws ``z`` lemmas without replacement via ``random.sample`` ten times
    and returns the mean distinct-lemma count of those draws as a float.
    ``random.sample`` raises ``ValueError`` when ``z`` exceeds the list length.
    """
    distinct_total = 0
    for _ in range(10):
        drawn = random.sample(lemmalist, z)
        distinct_total += len(set(drawn))
    return distinct_total / 10.0

# NDW expected random sequences of z words, 10 trials
def getndwesz(z,lemmalist):
    """Average number of distinct lemmas in a random contiguous run of ``z`` lemmas.

    Ten times, picks a start offset with ``random.randint`` so that the run
    of ``z`` lemmas fits inside ``lemmalist``, counts the distinct lemmas in
    that run, and returns the mean of the ten counts as a float.
    """
    last_start = len(lemmalist) - z
    distinct_total = 0
    for _ in range(10):
        begin = random.randint(0, last_start)
        distinct_total += len(set(lemmalist[begin:begin + z]))
    return distinct_total / 10.0

# MSTTR
def getmsttr(z,lemmalist):
    """Mean type-token ratio over consecutive, non-overlapping segments of ``z`` lemmas.

    ``lemmalist`` is cut into back-to-back segments of exactly ``z`` items;
    a trailing remainder shorter than ``z`` is ignored. For each segment the
    ratio (distinct lemmas / z) is computed, and the mean of those ratios is
    returned.

    Parameters:
        z: segment length; must be a positive integer.
        lemmalist: sequence of hashable lemmas.

    Returns:
        The mean segment ratio as a float, or 0.0 when ``lemmalist`` holds
        fewer than ``z`` items (no complete segment exists).

    Raises:
        ValueError: if ``z`` is not positive.
    """
    if z <= 0:
        # A non-positive segment length can never be consumed from the list.
        raise ValueError("segment length z must be a positive integer")

    # Step through the list by index instead of re-slicing the remainder on
    # every pass; only starts that leave room for a full segment are visited.
    ratios = [
        len(set(lemmalist[start:start + z])) / float(z)
        for start in range(0, len(lemmalist) - z + 1, z)
    ]
    if not ratios:
        # No complete segment: report 0.0 rather than dividing by zero.
        return 0.0
    return sum(ratios) / len(ratios)

def isLetterNumber(character):
    """Return 1 if ``character`` is in ``string.printable`` but not in ``string.punctuation``, else 0.

    Because ``string.printable`` includes whitespace, whitespace characters
    also yield 1.
    """
    is_printable = character in string.printable
    is_punctuation = character in string.punctuation
    return 1 if (is_printable and not is_punctuation) else 0

def isSentence(line):
    """Return 1 if at least one character of ``line`` passes ``isLetterNumber``, else 0."""
    return 1 if any(isLetterNumber(ch) for ch in line) else 0

In [3]:
def getLex(queries, wordlist_path="DataSets/bnc_all_filtered.txt"):
    """Compute lexical density, sophistication and diversity features per query.

    Each query is run through a stanza pipeline; every sentence is turned
    into a line of ``lemma_xpos`` tokens, which is lower-cased and counted.
    A word is treated as "sophisticated" when it is not among the 2000 most
    frequent lemmas of the word list read from ``wordlist_path``.

    Parameters:
        queries: sized iterable of query strings (``len()`` is called on it
            for the progress bar).
        wordlist_path: path of the frequency list. Each non-blank line that
            does not contain "Total words" is split on whitespace into
            lemma, part of speech and integer frequency.

    Returns:
        pandas.DataFrame with one row per query and the columns
        ``query``; ``ld`` (lexical tokens / word tokens);
        ``ls1`` (sophisticated lexical tokens / lexical tokens);
        ``ls2`` (sophisticated word types / word types);
        ``vs1``, ``vs2``, ``cvs1`` (sophisticated verb types over verb
        tokens: plain, squared, and divided by sqrt(2 * verb tokens));
        ``ndw`` (number of word types);
        ``ttr``, ``cttr``, ``rttr``, ``logttr`` (word types over word tokens:
        plain, over sqrt(2N), over sqrt(N), and log/log);
        ``lv`` (lexical types / lexical tokens);
        ``vv1``, ``svv1``, ``cvv1`` (verb types over verb tokens: plain,
        squared, and divided by sqrt(2 * verb tokens));
        ``vv2`` (verb types / lexical tokens);
        ``nv`` (noun types / noun tokens);
        ``adjv`` (adjective types / lexical tokens).
        Any ratio whose denominator would be zero is reported as 0.
    """
    processor_dict = {
        'tokenize': 'gsd',
        'pos': 'bnc',
        'lemma': 'default'
    }
    nlp = stanza.Pipeline('en', processors=processor_dict)

    # Read lemma frequencies from the word list; the context manager closes
    # the file even if a line fails to parse.
    worddict = {}
    adjectives = set()
    with open(wordlist_path, "r") as wordlistfile:
        for entry in wordlistfile:
            wordinfo = entry.strip()
            if not wordinfo or "Total words" in wordinfo:
                continue
            infolist = wordinfo.split()
            lemma = infolist[0]
            pos = infolist[1]
            frequency = int(infolist[2])
            worddict[lemma] = worddict.get(lemma, 0) + frequency
            if pos == "Adj":
                adjectives.add(lemma)

    # sort_by_value orders lemmas by ascending frequency, so the last 2000
    # entries are the most frequent ones. A set makes each lookup O(1)
    # instead of re-slicing and scanning the list for every token.
    frequent_words = set(sort_by_value(worddict)[-2000:])

    columns = ["query", "ld", "ls1", "ls2", "vs1", "vs2", "cvs1", "ndw", "ttr",
               "cttr", "rttr", "logttr", "lv", "vv1", "svv1", "cvv1", "vv2", "nv", "adjv"]
    lexFeat = []

    with tqdm(total=len(queries)) as pbar:
        for query in queries:
            doc = nlp(query)
            # One "lemma_xpos lemma_xpos ..." line per sentence. This must be
            # a list of lines: iterating a single string would walk it
            # character by character, and rebuilding the string inside the
            # sentence loop would keep only the last sentence.
            lemlines = [
                ' '.join('{}_{}'.format(word.lemma, word.xpos) for word in sentence.words)
                for sentence in doc.sentences
            ]

            wordtypes = set()
            wordtokens = 0
            swordtypes = set()
            lextypes = set()
            lextokens = 0
            slextokens = 0
            verbtypes = set()
            verbtokens = 0
            sverbtypes = set()
            adjtypes = set()
            nountypes = set()
            nountokens = 0

            for lemline in lemlines:
                lemline = lemline.strip().lower()
                if not isSentence(lemline):
                    continue
                for lemma in lemline.split():
                    # Split on the last underscore so a lemma that itself
                    # contains "_" is not truncated.
                    word, _, pos = lemma.rpartition("_")
                    if pos in string.punctuation or pos in ("sent", "sym"):
                        continue

                    wordtokens += 1
                    wordtypes.add(word)
                    sophisticated = word not in frequent_words
                    if sophisticated and pos != "cd":
                        swordtypes.add(word)

                    tag = pos[0]
                    if tag == "n":
                        nountypes.add(word)
                        nountokens += 1
                    elif tag == "j":
                        adjtypes.add(word)
                    elif tag == "r" and (word in adjectives
                                         or (word[-2:] == "ly" and word[:-2] in adjectives)):
                        # Adverbs count as lexical words only when they are
                        # (or derive via "-ly" from) a listed adjective.
                        # Plain membership replaces dict.has_key, which does
                        # not exist in Python 3.
                        pass
                    elif tag == "v" and word not in ("be", "have"):
                        verbtypes.add(word)
                        verbtokens += 1
                        if sophisticated:
                            sverbtypes.add(word)
                    else:
                        # Not a lexical (content) word.
                        continue

                    lextypes.add(word)
                    lextokens += 1
                    if sophisticated:
                        slextokens += 1

            ndw = len(wordtypes)
            nverbtypes = len(verbtypes)
            nsverbtypes = len(sverbtypes)

            # 1. lexical density
            ld = float(lextokens) / wordtokens if wordtokens > 0 else 0

            # 2.1 lexical sophistication
            ls1 = slextokens / float(lextokens) if lextokens != 0 else 0
            ls2 = len(swordtypes) / float(ndw) if ndw > 0 else 0

            # 2.2 verb sophistication
            vs1 = vs2 = cvs1 = 0
            if verbtokens > 0:
                vs1 = nsverbtypes / float(verbtokens)
                vs2 = (nsverbtypes * nsverbtypes) / float(verbtokens)
                cvs1 = nsverbtypes / sqrt(2 * verbtokens)

            # 3.2 type-token ratios
            if wordtokens > 0:
                ttr = ndw / float(wordtokens)
                cttr = ndw / sqrt(2 * wordtokens)
                rttr = ndw / sqrt(wordtokens)
            else:
                ttr = cttr = rttr = 0
            # log(1) == 0, so a single-token query would divide by zero;
            # treat it like the empty case and report 0.
            logttr = log(ndw) / log(wordtokens) if wordtokens > 1 else 0

            # 3.3 verb diversity
            vv1 = svv1 = cvv1 = 0
            if verbtokens > 0:
                vv1 = nverbtypes / float(verbtokens)
                svv1 = nverbtypes * nverbtypes / float(verbtokens)
                cvv1 = nverbtypes / sqrt(2 * verbtokens)

            # 3.4 lexical diversity
            if lextokens != 0:
                lv = len(lextypes) / float(lextokens)
                vv2 = nverbtypes / float(lextokens)
                adjv = len(adjtypes) / float(lextokens)
            else:
                lv = vv2 = adjv = 0

            nv = len(nountypes) / float(nountokens) if nountokens != 0 else 0

            lexFeat.append([query, ld, ls1, ls2, vs1, vs2, cvs1, ndw, ttr,
                            cttr, rttr, logttr, lv, vv1, svv1, cvv1, vv2, nv, adjv])
            pbar.update()

    lexical = pd.DataFrame(data=lexFeat, columns=columns)
    return lexical

# Extract Lexical Features

In [4]:
#allSessions = pickle.load( open( "../../thesis/Data/Session/allSessionsProc.p", "rb" ) )
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files from a trusted source.
# Context managers make sure both data files are closed after loading.
with open("../Data/DataSets/SWC/SWC.p", "rb") as swcFile:
    allSessions = pickle.load(swcFile)
with open("../Data/DataSets/SQS/SQS.p", "rb") as sqsFile:
    allSessionsSQS = list(pickle.load(sqsFile))

# Queries from the SWC sessions first, then the SQS queries appended.
allQueries = allSessions['query'].tolist()
print(len(allQueries))
allQueries = allQueries + allSessionsSQS
print(len(allQueries))

# De-duplicate across both sources.
setQueries = set(allQueries)
print(len(setQueries))

230362
231867
70485


In [5]:
# Number of distinct queries in the SWC sessions alone
len(set(allSessions['query']))

70196

In [6]:
#Lexical Characteristics

# Per-query syllable statistics. Values are appended in the iteration order
# of setQueries, so these lists line up with any later list(setQueries) as
# long as the set is not modified in between.
totalSyl = []
avgSyl = []
simWords = []
comWords = []
simWordsAvg = []  # declared for completeness; not populated in this cell
comWordsAvg = []  # declared for completeness; not populated in this cell
mostSyl = []
leastSyl = []
SSP = SyllableTokenizer()

with tqdm(total = len(setQueries) ) as pbar:
    for text in setQueries:
        # Split on runs of whitespace (the same rule used for numWords) so
        # that repeated, leading or trailing spaces do not produce empty
        # "words" that distort the counts.
        syllableCounts = [len(SSP.tokenize(word)) for word in text.split()]
        count = len(syllableCounts)
        running = sum(syllableCounts)

        # Words with fewer than 3 syllables are "simple", the rest "complex".
        complexWords = sum(1 for current in syllableCounts if current >= 3)
        simpleWords = count - complexWords

        totalSyl.append(running)
        # A query without any token gets 0 instead of dividing by zero.
        avgSyl.append(running / count if count else 0)
        simWords.append(simpleWords)
        comWords.append(complexWords)
        # True extremes of the query; no fixed sentinel caps the minimum.
        mostSyl.append(max(syllableCounts, default=0))
        leastSyl.append(min(syllableCounts, default=0))
        pbar.update()

  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
  " assigning as vowel: '{}'".format(c)
 84%|████████▍ | 59538/70485 [11:03<02:01, 89.77it/s]


KeyboardInterrupt: 

In [None]:
# Assemble the per-query surface features. The syllable lists were filled by
# iterating setQueries, so the queries are materialised in that same
# iteration order to keep every row aligned with its statistics.
# Building the frame directly from named columns also works when setQueries
# is empty, and raises ValueError if the lists differ in length (for example
# after an interrupted syllable loop).
textComplex = pd.DataFrame({
    'query': list(setQueries),
    'totalSyl': totalSyl,
    'avgSyl': avgSyl,
    'simWords': simWords,
    'comWords': comWords,
    'greatestSyl': mostSyl,
    'leastSyl': leastSyl,
})
textComplex['numChars'] = textComplex['query'].str.len()
textComplex['numWords'] = textComplex['query'].str.split().str.len()
textComplex['avgLenWord'] = textComplex['numChars']/textComplex['numWords']


In [None]:
# Lexical complexity features, one row per unique query

lexFeats = getLex(queries=setQueries)

In [None]:
# Inner join of the surface features with the lexical-complexity features
# on the columns the two frames share.
lexicalFeatures = pd.merge(textComplex, lexFeats)

In [None]:
# Persist the merged feature table; the context manager flushes and closes
# the file even if pickling fails part-way.
with open("Pickles/LexFeat.p", "wb") as lexFeatFile:
    pickle.dump(lexicalFeatures, lexFeatFile)