This notebook extracts syntactic features from the queries found in SWC and SQS, returning a data frame containing those features.

# Import Libraries

In [1]:
import os
import stanza 
import pickle
import textstat
import nltk
import subprocess
import shlex
import time
import re

import pandas as pd
import numpy as np

from nltk.tokenize import SyllableTokenizer
from nltk import word_tokenize
from tqdm import tqdm

from subprocess import Popen, PIPE


# Declare Functions


In [None]:
def generate_ngrams(s, n):
    """Return the word n-grams of ``s`` as space-joined strings.

    The text is lowercased and every character that is neither ASCII
    alphanumeric nor whitespace is replaced by a space before tokenising.

    Parameters
    ----------
    s : str
        Input text.
    n : int
        Number of tokens per n-gram.

    Returns
    -------
    list of str
        The n-grams in order of appearance; empty when ``s`` has fewer than
        ``n`` tokens or ``n`` is not positive.
    """
    # Lowercase, then blank out anything that is not alphanumeric or whitespace.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', s.lower())

    # The regex above keeps tabs and newlines, so split on any whitespace run
    # (split() with no argument) instead of only on the space character;
    # this also discards empty tokens.
    tokens = cleaned.split()

    # Zipping n progressively shifted views of the token list yields
    # consecutive windows of n tokens.
    windows = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(window) for window in windows]

# Load Data Sets

In [2]:
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
# Context managers make sure both file handles are closed once the data is read.
with open("../Data/DataSets/SWC/SWC.p", "rb") as swc_file:
    allSessions = pickle.load(swc_file)
with open("../Data/DataSets/SQS/SQS.p", "rb") as sqs_file:
    allSessionsSQS = list(pickle.load(sqs_file))

# Combine the SWC 'query' values with the SQS entries, then de-duplicate.
allQueries = allSessions['query'].tolist() + allSessionsSQS
setQueries = set(allQueries)

# Extract D-Level Features

In [4]:
# Tagged sentences are written to input_file; loc_file is the same file as
# seen from the COLLINS-PARSER directory the shell command changes into.
input_file = 'DLA/data/lemmatize_pos_sentences.tagged'
loc_file =  '../../data/lemmatize_pos_sentences.tagged'
dlevel_file = 'DLA/data/dlevel.dla'

processor_dict = {
    'tokenize': 'gsd',
    'pos': 'bnc',
    'lemma': 'default'
}

nlp = stanza.Pipeline('en', processors=processor_dict)

# The shell pipeline is identical for every query, so build it once:
# run the Collins parser on the tagged file, then d-level.py on the parse.
cmd = 'cd DLA/d-level-analyzer/COLLINS-PARSER;'
cmd += ' code/parser {} models/model2/grammar 10000 1 1 1 1 > ../../data/parsed.m2;'.format(loc_file)
cmd += 'cd ..;'
cmd += 'python d-level.py ../data/parsed.m2 > ../data/dlevel.dla;'

# Collect one frame per query and concatenate once at the end; growing a
# DataFrame inside the loop is quadratic and DataFrame.append no longer
# exists in pandas 2.x.
dlevel_frames = []

for text in tqdm(setQueries):
    doc = nlp(text)

    # One line per sentence: "<token count> <lemma> <xpos> <lemma> <xpos> ...".
    with open(input_file, 'w') as out:
        for sentence in doc.sentences:
            # needs to be xpos so it uses Penn Treebank
            pairs = ['{} {}'.format(word.lemma, word.xpos) for word in sentence.words]
            out.write('{} {}\n'.format(len(pairs), ' '.join(pairs).strip()))

    # subprocess.run drains both pipes while waiting, which avoids the
    # deadlock Popen(...).wait() can hit when a pipe buffer fills up.
    result = subprocess.run(cmd, shell=True, stdout=PIPE, stderr=PIPE)
    if result.returncode != 0:
        # The shell reports the status of the last command (d-level.py).
        raise RuntimeError(
            'D-level pipeline failed for query {!r} (exit status {}):\n{}'.format(
                text, result.returncode, result.stderr.decode(errors='replace')
            )
        )

    dl = pd.read_csv(dlevel_file)
    dl['query'] = text
    dlevel_frames.append(dl)

# Row labels are left as read from each file, matching repeated appends.
dLevel = pd.concat(dlevel_frames) if dlevel_frames else pd.DataFrame()

100%|██████████| 70621/70621 [11:40:52<00:00,  1.68it/s]  


# Extract Part of Speech Features

In [7]:
def _pos_ngram_counts(frame, column):
    """Return ``frame['query']`` next to per-row counts of the n-grams in ``column``.

    ``column`` must hold one list of n-gram strings per row. Each distinct
    n-gram becomes a count column. Rows whose list is empty get zeros in
    every count column.
    """
    # explode() gives one n-gram per row while repeating the original row
    # label; an empty list becomes a single NaN, which get_dummies encodes
    # as an all-zero row, so the query is kept with zero counts.
    exploded = frame[column].explode()
    counts = pd.get_dummies(exploded).groupby(level=0).sum()
    return pd.concat([frame[['query']], counts], axis=1)


# Fix the iteration order once so tags and queries stay aligned row by row.
queries = list(setQueries)

# Tag sequence for each query (pos_tag yields (token, tag) pairs).
posData = [
    [tag for _, tag in nltk.pos_tag(nltk.word_tokenize(document))]
    for document in queries
]

# Space-separated tag string per query (each tag is followed by a space).
posMod = ["".join("{} ".format(tag) for tag in tags) for tags in posData]

# generate_ngrams lowercases the tags and removes non-alphanumeric
# characters, so tags made only of punctuation do not appear in the n-grams.
posUni = [generate_ngrams(tag_string, 1) for tag_string in posMod]
posBi = [generate_ngrams(tag_string, 2) for tag_string in posMod]
posTri = [generate_ngrams(tag_string, 3) for tag_string in posMod]

posDF = pd.DataFrame({'query': queries})
posDF['all'] = posMod
posDF['uniPos'] = posUni
posDF['biPos'] = posBi
posDF['triPos'] = posTri

# One frame per n-gram order: 'query' plus one count column per n-gram.
allSessionsuni = _pos_ngram_counts(posDF, 'uniPos')
allSessionsbi = _pos_ngram_counts(posDF, 'biPos')
allSessionstri = _pos_ngram_counts(posDF, 'triPos')

In [11]:
# Fixed subset of bigram count columns to keep, together with the 'query' key.
bi_landing_columns = [
    'nn nn', 'jj nn', 'nn nns', 'to vb', 'jj nns',
    'jj to', 'nn in', 'nns in', 'in nn', 'dt nn',
    'query',
]
allSessionsbiLanding = allSessionsbi[bi_landing_columns]

In [12]:
# Fixed subset of trigram count columns to keep, together with the 'query' key.
tri_landing_columns = [
    'jj nn nn', 'nn nn nn', 'jj to vb', 'nn nn nns', 'to vb nn',
    'query',
]
allSessionstriLanding = allSessionstri[tri_landing_columns]

In [13]:
# Join the unigram, bigram and trigram count frames on their shared 'query'
# column. Each frame is merged exactly once.
synFeats = (
    allSessionsuni
    .merge(allSessionsbiLanding, on='query')
    .merge(allSessionstriLanding, on='query')
)

# Query length in whitespace-separated tokens.
synFeats['length'] = synFeats['query'].str.split().str.len()

# NOTE(review): listCols is not defined in any visible cell; it is assumed
# here to be every POS count column (all columns except 'query' and
# 'length') -- confirm against the original analysis.
listCols = [col for col in synFeats.columns if col not in ('query', 'length')]

# Normalise each count by the query length.
synFeats[listCols] = synFeats[listCols].div(synFeats['length'], axis=0)

# Return Feature Set

In [25]:
# NOTE(review): lexComp is not defined in any visible cell; it must be a
# frame with 'query' and ' Sentences' columns -- presumably derived from the
# D-level output; confirm and define it before this cell.
synFeats = (
    synFeats
    .merge(lexComp, on = 'query')
    .drop(columns = [' Sentences', 'length'])
)

# The context manager flushes and closes the pickle file after the dump.
with open("Pickles/SynFeat.p", "wb") as synfeat_file:
    pickle.dump(synFeats, synfeat_file)