In [1]:
import os
import stanza # Stanford's stanza package
# stanza.download('en') # run this once
import pandas as pd
import pickle
import numpy as np
import textstat
import nltk
from nltk.tokenize import SyllableTokenizer
from nltk import word_tokenize
from tqdm import tqdm
import subprocess
import shlex
from subprocess import Popen, PIPE

In [2]:
def _load_pickle(path):
    """Unpickle the file at `path`, closing the handle once loading is done."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)


allSessions = _load_pickle("../../thesis/Data/Session/allSessionsProc.p")
# Alternative session source: "../Data/DataSets/SWC/SWC.p"
allSessionsSQS = list(_load_pickle("../Data/DataSets/SQS/SQS.p"))

# Pool the session queries with the SQS entries, then de-duplicate them.
allQueries = allSessions['query'].tolist() + allSessionsSQS
setQueries = set(allQueries)

In [3]:
# Model package requested for each stanza processor of the English pipeline.
# NOTE(review): the load log recorded under this cell lists the 'ewt' packages
# (plus depparse, sentiment and ner) rather than the packages named here --
# confirm which configuration the saved results were produced with.
processor_dict = dict(tokenize='gsd', pos='bnc', lemma='default')

nlp = stanza.Pipeline('en', processors=processor_dict)

2021-09-18 15:33:58 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-09-18 15:33:58 INFO: Use device: cpu
2021-09-18 15:33:58 INFO: Loading: tokenize
2021-09-18 15:33:58 INFO: Loading: pos
2021-09-18 15:34:00 INFO: Loading: lemma
2021-09-18 15:34:00 INFO: Loading: depparse
2021-09-18 15:34:03 INFO: Loading: sentiment
2021-09-18 15:34:07 INFO: Loading: ner
2021-09-18 15:34:08 INFO: Done loading processors!


In [4]:
# Compute the D-level analysis for every unique query.
#
# For each query: lemmatise/tag it with stanza, write the "<n> lemma tag ..."
# lines to the tagged file, run the Collins parser and d-level.py through the
# shell, and read the resulting CSV back. The recorded run took about 7.5 hours
# for ~70k queries.
#
# input_file and loc_file point at the same file: the first is relative to the
# notebook's working directory, the second to the COLLINS-PARSER directory the
# shell command changes into.
input_file = 'readability/data/lemmatize_pos_sentences.tagged'
loc_file =  '../../data/lemmatize_pos_sentences.tagged'
dlevel_file = 'readability/data/dlevel.dla'
import time

# The command is identical for every query, so build it once.
cmd = 'cd readability/d-level-analyzer/COLLINS-PARSER;'
cmd += ' code/parser {} models/model2/grammar 10000 1 1 1 1 > ../../data/parsed.m2;'.format(loc_file)
cmd += 'cd ..;'
cmd += 'python d-level.py ../data/parsed.m2 > ../data/dlevel.dla;'

dlevel_frames = []
for text in tqdm(setQueries):
    doc = nlp(text)

    # The context manager guarantees the file is flushed and closed before the
    # parser reads it, even if writing fails part-way.
    with open(input_file, 'w') as out:
        for sentence in doc.sentences:
            # needs to be xpos so it uses Penn Treebank
            pairs = ['{} {}'.format(word.lemma, word.xpos) for word in sentence.words]
            out.write('{} {}\n'.format(len(pairs), ' '.join(pairs).strip()))

    # The tools' console output is discarded. Sending it to DEVNULL instead of
    # an unread PIPE avoids the deadlock that Popen(...).wait() can hit once a
    # pipe buffer fills up.
    # `proc` holds the shell's exit status; it is not checked here.
    proc = subprocess.run(
        cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
    ).returncode

    lc = pd.read_csv(dlevel_file)
    lc['query'] = text
    dlevel_frames.append(lc)

# Concatenate once at the end instead of appending inside the loop (quadratic,
# and DataFrame.append no longer exists in pandas 2.x). Row labels are kept
# as read from each CSV, exactly as the repeated append produced them.
lexComp = pd.concat(dlevel_frames) if dlevel_frames else pd.DataFrame()


# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     print(lexComp)




100%|██████████| 70114/70114 [7:36:37<00:00,  2.56it/s]  


In [23]:
# Frequency of each value in the ' Level0' column (the column name includes a leading space).
lexComp[' Level0'].value_counts()

1    57102
0    12575
2      431
3        6
Name:  Level0, dtype: int64

In [6]:
def _pos_tags(document):
    """Return the POS tags nltk assigns to the tokens of `document`, as an array."""
    tagged = nltk.pos_tag(nltk.word_tokenize(document))
    # Flattening the (token, tag) pairs leaves every tag at an odd position.
    return np.array(tagged).flatten()[1::2]


# One tag array per query, in the iteration order of setQueries.
posData = [_pos_tags(document) for document in setQueries]

In [7]:
import re

def generate_ngrams(s, n):
    """Return the word n-grams of `s` as space-joined strings.

    The text is lower-cased, every character that is not an ASCII letter,
    a digit or whitespace is replaced by a space, and the result is split
    into tokens on any run of whitespace.

    Parameters
    ----------
    s : str
        Text to tokenise.
    n : int
        Number of tokens per n-gram.

    Returns
    -------
    list of str
        The n-grams in order of appearance; empty when `s` has fewer than
        `n` tokens or when `n` is less than 1.
    """
    # Lower-case, then blank out everything except letters, digits and whitespace.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', s.lower())

    # split() with no argument splits on every kind of whitespace the regex
    # above lets through (tabs, newlines, ...) and never yields empty tokens;
    # splitting on " " alone left tabs and newlines glued inside tokens.
    tokens = cleaned.split()

    # Zipping the token list against its own shifted copies yields the n-grams.
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

# One string per query: its POS tags, each followed by a single space.
posMod = ["".join(str(tag) + " " for tag in tags) for tags in posData]

# Uni-, bi- and tri-grams over each query's tag string.
posUni = [generate_ngrams(tag_string, 1) for tag_string in posMod]
posBi = [generate_ngrams(tag_string, 2) for tag_string in posMod]
posTri = [generate_ngrams(tag_string, 3) for tag_string in posMod]

# Materialise the set explicitly so the frame does not depend on how pandas
# handles unordered inputs. The iteration order is the one posData was built
# in, provided setQueries has not been modified since.
posDF = pd.DataFrame(list(setQueries))

In [8]:
# Attach the tag string and the three n-gram lists to each query, and name
# the query column (created as column 0 by the DataFrame constructor).
posDF = posDF.assign(
    all=posMod,
    uniPos=posUni,
    biPos=posBi,
    triPos=posTri,
).rename(columns={0: "query"})

In [9]:
def _ngram_counts(frame, ngram_col):
    """Return `frame` with one count column per distinct n-gram in `ngram_col`.

    Each list in `frame[ngram_col]` is spread over columns and stacked into one
    n-gram per (row, position), one-hot encoded, and summed back per original
    row. The counts are joined to `frame` on the index, and the helper columns
    'uniPos', 'all', 'biPos' and 'triPos' are dropped. Rows whose list is empty
    have no counts and therefore get NaN in the n-gram columns.
    """
    one_hot = pd.get_dummies(frame[ngram_col].apply(pd.Series).stack())
    # groupby(level=0).sum() replaces the deprecated DataFrame.sum(level=0).
    counts = one_hot.groupby(level=0).sum()
    return pd.concat([frame, counts], axis=1).drop(['uniPos', 'all', 'biPos', 'triPos'], axis=1)


allSessionsuni = _ngram_counts(posDF, 'uniPos')
allSessionsbi = _ngram_counts(posDF, 'biPos')
allSessionstri = _ngram_counts(posDF, 'triPos')



In [10]:
# Show the per-query unigram POS count table.
allSessionsuni

Unnamed: 0,query,cc,cd,dt,ex,fw,in,jj,jjr,jjs,...,uh,vb,vbd,vbg,vbn,vbp,vbz,wdt,wp,wrb
0,psp memory stick,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,las vegas bare essentials,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,flpl.lib.al.us,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ingersoll rand pocket watch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,translate german,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70109,briar blues,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70110,convert english to french,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70111,halloween black cat,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70112,designer purses,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Bigram POS columns kept for the feature table; 'query' is carried along too.
biLandingCols = [
    'nn nn', 'jj nn', 'nn nns', 'to vb', 'jj nns',
    'jj to', 'nn in', 'nns in', 'in nn', 'dt nn',
]
allSessionsbiLanding = allSessionsbi[biLandingCols + ['query']]

In [12]:
# Trigram POS columns kept for the feature table; 'query' is carried along too.
triLandingCols = ['jj nn nn', 'nn nn nn', 'jj to vb', 'nn nn nns', 'to vb nn']
allSessionstriLanding = allSessionstri[triLandingCols + ['query']]

In [13]:
# Join the unigram counts with the selected bigram counts. The key is named
# explicitly so the join cannot silently switch to other shared columns.
synFeats = allSessionsuni.merge(allSessionsbiLanding, on='query')

In [14]:
# Add the selected trigram counts, joining explicitly on the query text.
synFeats = synFeats.merge(allSessionstriLanding, on='query')

In [15]:
# The trigram columns were already merged into synFeats by the previous cell;
# merging the same frame again added nothing, so this cell now only verifies
# that those columns are present.
missing = set(allSessionstriLanding.columns) - set(synFeats.columns)
assert not missing, "synFeats is missing columns: {}".format(sorted(missing))

In [16]:
# Snapshot of the current synFeats column names as a plain list.
listCols = list(synFeats.columns)

In [17]:
# Keep only the feature columns. Filtering (rather than list.remove) makes the
# cell safe to re-run: it no longer raises once 'query' is already gone.
listCols = [col for col in listCols if col != 'query']

In [18]:
# Show the feature column names.
listCols

['cc',
 'cd',
 'dt',
 'ex',
 'fw',
 'in',
 'jj',
 'jjr',
 'jjs',
 'ls',
 'md',
 'nn',
 'nnp',
 'nnps',
 'nns',
 'pdt',
 'pos',
 'prp',
 'rb',
 'rbr',
 'rbs',
 'rp',
 'sym',
 'to',
 'uh',
 'vb',
 'vbd',
 'vbg',
 'vbn',
 'vbp',
 'vbz',
 'wdt',
 'wp',
 'wrb',
 'nn nn',
 'jj nn',
 'nn nns',
 'to vb',
 'jj nns',
 'jj to',
 'nn in',
 'nns in',
 'in nn',
 'dt nn',
 'jj nn nn',
 'nn nn nn',
 'jj to vb',
 'nn nn nns',
 'to vb nn']

In [19]:
# Number of whitespace-separated words in each query.
query_words = synFeats['query'].str.split()
synFeats['length'] = query_words.str.len()

In [20]:
# Divide every feature column by the query's word count, row by row.
# NOTE: not idempotent -- re-running this cell divides the values again.
synFeats[listCols] = synFeats[listCols].div(synFeats['length'], axis=0)

In [24]:
# Show the D-level results collected for the queries.
lexComp

Unnamed: 0,Filename,Sentences,Level0,Level1,Level2,Level3,Level4,Level5,Level6,Level7,MeanLevel,query
0,parsed.m2,1,1,0,0,0,0,0,0,0,0.0,psp memory stick
0,parsed.m2,1,1,0,0,0,0,0,0,0,0.0,las vegas bare essentials
0,parsed.m2,1,1,0,0,0,0,0,0,0,0.0,flpl.lib.al.us
0,parsed.m2,1,1,0,0,0,0,0,0,0,0.0,ingersoll rand pocket watch
0,parsed.m2,1,1,0,0,0,0,0,0,0,0.0,translate german
...,...,...,...,...,...,...,...,...,...,...,...,...
0,parsed.m2,1,1,0,0,0,0,0,0,0,0.0,briar blues
0,parsed.m2,1,1,0,0,0,0,0,0,0,0.0,convert english to french
0,parsed.m2,1,1,0,0,0,0,0,0,0,0.0,halloween black cat
0,parsed.m2,1,1,0,0,0,0,0,0,0,0.0,designer purses


In [28]:
# Summarise the feature table: column dtypes and non-null counts.
synFeats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70114 entries, 0 to 70113
Data columns (total 51 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   query      70114 non-null  object 
 1   cc         70110 non-null  float64
 2   cd         70110 non-null  float64
 3   dt         70110 non-null  float64
 4   ex         70110 non-null  float64
 5   fw         70110 non-null  float64
 6   in         70110 non-null  float64
 7   jj         70110 non-null  float64
 8   jjr        70110 non-null  float64
 9   jjs        70110 non-null  float64
 10  ls         70110 non-null  float64
 11  md         70110 non-null  float64
 12  nn         70110 non-null  float64
 13  nnp        70110 non-null  float64
 14  nnps       70110 non-null  float64
 15  nns        70110 non-null  float64
 16  pdt        70110 non-null  float64
 17  pos        70110 non-null  float64
 18  prp        70110 non-null  float64
 19  rb         70110 non-null  float64
 20  rbr   

In [26]:
# Show the syntactic feature table.
synFeats

Unnamed: 0,query,cc,cd,dt,ex,fw,in,jj,jjr,jjs,...,nn in,nns in,in nn,dt nn,jj nn nn,nn nn nn,jj to vb,nn nn nns,to vb nn,length
0,psp memory stick,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,3
1,las vegas bare essentials,0.0,0.0,0.0,0.0,0.0,0.0,0.250000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,4
2,flpl.lib.al.us,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,,,,,,,,,,1
3,ingersoll rand pocket watch,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,4
4,translate german,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,,,,,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70109,briar blues,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,,,,,,2
70110,convert english to french,0.0,0.0,0.0,0.0,0.0,0.0,0.250000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,4
70111,halloween black cat,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,3
70112,designer purses,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,,,,,,2


In [29]:
# Attach the D-level columns from lexComp to each query's syntactic features.
synFeats = pd.merge(synFeats, lexComp, on='query')

In [30]:
# Drop the helper columns. Rebinding instead of inplace=True avoids mutating a
# frame that earlier cells have already displayed.
# NOTE(review): the recorded output still shows a 'Filename' column in the
# final table -- confirm whether it should be dropped here as well.
synFeats = synFeats.drop(columns=[' Sentences', 'length'])

In [31]:
# Persist the feature table. The context manager flushes and closes the file
# as soon as the dump finishes instead of leaving that to garbage collection.
with open("Pickles/SynFeat.p", "wb") as handle:
    pickle.dump(synFeats, handle)

In [33]:
# Show the final feature table that was written to Pickles/SynFeat.p.
synFeats

Unnamed: 0,query,cc,cd,dt,ex,fw,in,jj,jjr,jjs,...,Filename,Level0,Level1,Level2,Level3,Level4,Level5,Level6,Level7,MeanLevel
0,psp memory stick,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,...,parsed.m2,1,0,0,0,0,0,0,0,0.0
1,las vegas bare essentials,0.0,0.0,0.0,0.0,0.0,0.0,0.250000,0.0,0.0,...,parsed.m2,1,0,0,0,0,0,0,0,0.0
2,flpl.lib.al.us,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,parsed.m2,1,0,0,0,0,0,0,0,0.0
3,ingersoll rand pocket watch,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,parsed.m2,1,0,0,0,0,0,0,0,0.0
4,translate german,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,...,parsed.m2,1,0,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70109,briar blues,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,parsed.m2,1,0,0,0,0,0,0,0,0.0
70110,convert english to french,0.0,0.0,0.0,0.0,0.0,0.0,0.250000,0.0,0.0,...,parsed.m2,1,0,0,0,0,0,0,0,0.0
70111,halloween black cat,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,...,parsed.m2,1,0,0,0,0,0,0,0,0.0
70112,designer purses,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,parsed.m2,1,0,0,0,0,0,0,0,0.0
