The following code in this notebook extracts vocabulary features from each query and then returns a data frame of those extracted features.

# Load Libraries

The following block of code loads all libraries needed for this notebook. Numpy has an established to ensure that the random selection of queries drawn to establish certain features, such as top word n-grams; is consistent across this code and future execution.

In [2]:
import pickle
import csv
import string
import nltk
import re
import numpy as np
import pandas as pd

from tqdm import tqdm
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

np.random.seed(20200522)

# Functions for Vocabulary Features

Features used in the following code.

In [3]:
#Converts a list (lst) into a dictionary

def Convert(lst):
    res_dct = {lst[i]: lst[i + 1] for i in range(0, len(lst), 2)}
    return res_dct

# Load Data Sets

This block of code loads the data sets and extracts all unique queries from both.

In [4]:
allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
allSessionsSQS = list(pickle.load( open( "../Data/DataSets/SQS/SQS.p", "rb" ) ))
allQueries = allSessions['query'].tolist() 
allQueries = allQueries + list(allSessionsSQS)
allQueries = set(allQueries)

# Core Vocab

Loads all vocabulary expected to be learned between Kindergarten to Seventh grade based on Common Core Curriculum, before extracting the ratio of words in each query that are, and are not; found in this list.

In [5]:
kd = ['a', 'all', 'am', 'an', 'and', 'are', 'as', 'at', 'away', 'back', 'ball', 'bell', 'big', 'bird', 'blue', 'book', 'boot', 'box', 'boy', 'brown', 'but', 'by', 'can', 'car', 'cat', 'come', 'cow', 'day', 'do', 'dog', 'down', 'end', 'fall', 'fan', 'fish', 'fly', 'food', 'for', 'from', 'fun', 'get', 'go', 'good', 'gray', 'green', 'groundhog', 'hat', 'he', 'here', 'hill', 'I', 'in', 'into', 'is', 'it', 'inside', 'kitten', 'little', 'look', 'mad', 'me', 'mud', 'my', 'name', 'no', 'not', 'of', 'on', 'orange', 'out', 'paint', 'pet', 'pin', 'play', 'put', 'rain', 'red', 'run', 'sad', 'say', 'see', 'she', 'sing', 'sit', 'so', 'stay', 'stop', 'story', 'sun', 'take', 'that', 'the', 'them', 'then', 'there', 'they', 'this', 'to', 'too', 'up', 'we', 'wet', 'what', 'where', 'who', 'will', 'with', 'work', 'yellow', 'yes', 'you', 'zoo', 'orange', 'white', 'black', 'monday', 'tuesday', 'wednesday','thursday','friday', 'saturday','sunday']
oned = ['a', 'all', 'am', 'and', 'at', 'ball', 'be', 'bed', 'big', 'book', 'box', 'boy', 'but', 'came', 'can', 'car', 'cat', 'come', 'cow', 'dad', 'day', 'did', 'do', 'dog', 'fat', 'for', 'fun', 'get', 'go', 'good', 'got', 'had', 'hat', 'he', 'hen', 'here', 'him', 'his', 'home', 'hot', 'I', 'if', 'in', 'into', 'is', 'it', 'its', 'let', 'like', 'look', 'man', 'may', 'me', 'mom', 'my', 'no', 'not', 'of', 'oh', 'old', 'on', 'one', 'out', 'pan', 'pet', 'pig', 'play', 'ran', 'rat', 'red', 'ride', 'run', 'sat', 'see', 'she', 'sit', 'six', 'so', 'stop', 'sun', 'ten', 'the', 'this', 'to', 'top', 'toy', 'two', 'up', 'us', 'was', 'we', 'will', 'yes', 'you' ]
twod = ['about', 'add', 'after', 'ago', 'an ', 'any', 'apple', 'are ', 'as', 'ask', 'ate', 'away', 'baby ', 'back', 'bad', 'bag', 'base', 'bat', 'bee', 'been', 'before', 'being', 'best', 'bike', 'bill', 'bird', 'black', 'blue', 'boat', 'both', 'bring', 'brother ', 'brown', 'bus', 'buy ', 'by', 'cake', 'call', 'candy', 'change', 'child', 'city', 'clean', 'club', 'coat', 'cold', 'coming ', 'corn', 'could', 'cry', 'cup', 'cut', 'daddy ', 'dear', 'deep', 'deer', 'doing', 'doll', 'door', 'down ', 'dress', 'drive', 'drop', 'dry', 'duck', 'each', 'eat', 'eating', 'egg', 'end', 'fall', 'far', 'farm', 'fast', 'father ', 'feed', 'feel', 'feet', 'fell ', 'find', 'fine ', 'fire', 'first ', 'fish', 'five', 'fix', 'flag', 'floor', 'fly', 'food', 'foot', 'four', 'fox', 'from ', 'full', 'funny', 'game', 'gas', 'gave', 'girl', 'give', 'glad', 'goat', 'goes ', 'going ', 'gold', 'gone', 'grade ', 'grass', 'green', 'grow', 'hand', 'happy', 'hard', 'has ', 'have ', 'hear ', 'help', 'here ', 'hill', 'hit', 'hold', 'hole', 'hop', 'hope ', 'horse', 'house ', 'how ', 'ice', 'inch', 'inside ', 'job', 'jump', 'just ', 'keep', 'king', 'know ', 'lake', 'land', 'last', 'late', 'lay', 'left', 'leg', 'light', 'line', 'little ', 'live', 'lives', 'long', 'looking', 'lost', 'lot', 'love', 'mad', 'made ', 'make ', 'many ', 'meat', 'men', 'met', 'mile', 'milk', 'mine', 'miss', 'moon', 'more', 'most', 'mother ', 'move', 'much ', 'must', 'myself ', 'nail', 'name ', 'need', 'new ', 'next', 'nice ', 'night', 'nine', 'north', 'now ', 'nut', 'off ', 'only', 'open', 'or ', 'other', 'our', 'outside ', 'over', 'page', 'park', 'part', 'pay', 'pick', 'plant', 'playing', 'pony', 'post', 'pull', 'put', 'rabbit', 'rain', 'read', 'rest', 'riding', 'road', 'rock', 'room', 'said ', 'same', 'sang', 'saw ', 'say', 'school ', 'sea', 'seat', 'seem', 'seen', 'send', 'set', 'seven', 'sheep', 'ship', 'shoe', 'show ', 'sick', 'side', 'sing', 'sky', 'sleep', 'small', 'snow', 'some ', 'soon ', 'spell', 'start', 'stay', 'still', 'store ', 'story', 'take', 'talk', 'tall', 'teach', 'tell', 'than ', 'thank', 'that', 'them ', 'then ', 'there ', 'they ', 'thing', 'think ', 'three', 'time ', 'today ', 'told', 'too ', 'took', 'train ', 'tree', 'truck', 'try', 'use', 'very ', 'walk', 'want ', 'warm', 'wash', 'way', 'week', 'well ', 'went ', 'were ', 'wet', 'what', 'when ', 'while ', 'white', 'who', 'why', 'wind', 'wish', 'with ', 'woke', 'wood', 'work', 'yellow', 'yet', 'your', 'zoo']
threed = ['able', 'above', 'afraid', 'afternoon', 'again', 'age', 'air', 'airplane', 'almost', 'alone', 'along', 'already', 'also', 'always', 'animal', 'another', 'anything', 'around', 'art', 'aunt', 'balloon', 'bark', 'barn', 'basket', 'beach', 'bear', 'because', 'become', 'began', 'begin', 'behind', 'believe', 'below', 'belt', 'better', 'birthday', 'body', 'bones', 'born', 'bought', 'bread', 'bright', 'broke', 'brought', 'busy', 'cabin', 'cage', 'camp', 'can\'t', 'care', 'carry', 'catch', 'cattle', 'cave', 'children', 'class', 'close', 'cloth', 'coal', 'color', 'corner', 'cotton', 'cover', 'dark', 'desert', 'didn\'t', 'dinner', 'dishes', 'does', 'done', 'don\'t', 'dragon', 'draw', 'dream', 'drink', 'early', 'earth', 'east', 'eight', 'even', 'ever', 'every', 'everyone', 'everything', 'eyes', 'face', 'family', 'feeling', 'felt', 'few', 'fight', 'fishing', 'flower', 'flying', 'follow', 'forest', 'forgot', 'form', 'found', 'fourth', 'free', 'Friday', 'friend', 'front', 'getting', 'given', 'grandmother', 'great', 'grew', 'ground', 'guess', 'hair', 'half', 'having', 'head', 'heard', 'he\'s', 'heat', 'hello', 'high', 'himself', 'hour', 'hundred', 'hurry', 'hurt', 'I\'d', 'I\'ll', 'I\'m', 'inches', 'isn\'t', 'it\'s', 'I\'ve', 'kept', 'kids', 'kind', 'kitten', 'knew', 'knife', 'lady', 'large', 'largest', 'later', 'learn', 'leave', 'let\'s', 'letter', 'life', 'list', 'living', 'lovely', 'loving', 'lunch', 'mail', 'making', 'maybe', 'mean', 'merry', 'might', 'mind', 'money', 'month', 'morning', 'mouse', 'mouth', 'Mr.', 'Mrs.', 'Ms.', 'music', 'near', 'nearly', 'never', 'news', 'noise', 'nothing', 'number', 'o\'clock', 'often', 'oil', 'once', 'orange', 'order', 'own', 'pair', 'paint', 'paper', 'party', 'pass', 'past', 'penny', 'people', 'person', 'picture', 'place', 'plan', 'plane', 'please', 'pocket', 'point', 'poor', 'race', 'reach', 'reading', 'ready', 'real', 'rich', 'right', 'river', 'rocket', 'rode', 'round', 'rule', 'running', 'salt', 'says', 'sending', 'sent', 'seventh', 'sew', 'shall', 'short', 'shot', 'should', 'sight', 'sister', 'sitting', 'sixth', 'sled', 'smoke', 'soap', 'someone', 'something', 'sometime', 'song', 'sorry', 'sound', 'south', 'space', 'spelling', 'spent', 'sport', 'spring', 'stairs', 'stand', 'state', 'step', 'stick', 'stood', 'stopped', 'stove', 'street', 'strong', 'study', 'such', 'sugar', 'summer', 'Sunday', 'supper', 'table', 'taken', 'taking', 'talking', 'teacher', 'team', 'teeth', 'tenth', 'that\'s', 'their', 'these', 'thinking', 'third', 'those', 'thought', 'throw', 'tonight', 'trade', 'trick', 'trip', 'trying', 'turn', 'twelve', 'twenty', 'uncle', 'under', 'upon', 'wagon', 'wait', 'walking', 'wasn\'t', 'watch', 'water', 'weather', 'we\'re', 'west', 'wheat', 'where', 'which', 'wife', 'wild', 'win', 'window', 'winter', 'without', 'woman', 'won', 'won\'t', 'wool', 'word', 'working', 'world', 'would', 'write', 'wrong', 'yard', 'year', 'yesterday', 'you\'re'  ]
fourd = ['across', 'against', 'answer', 'awhile', 'between', 'board', 'bottom', 'breakfast', 'broken', 'build', 'building', 'built', 'captain', 'carried', 'caught', 'charge', 'chicken', 'circus', 'cities', 'clothes', 'company', 'couldn\'t', 'country', 'discover', 'doctor', 'doesn\'t', 'dollar', 'during', 'eighth', 'else', 'enjoy', 'enough', 'everybody', 'example', 'except', 'excuse', 'field', 'fifth', 'finish', 'following', 'good-by', 'group', 'happened', 'harden', 'haven\'t', 'heavy', 'held', 'hospital', 'idea', 'instead', 'known', 'laugh', 'middle', 'minute', 'mountain', 'ninth', 'ocean', 'office', 'parent', 'peanut', 'pencil', 'picnic', 'police', 'pretty', 'prize', 'quite', 'radio', 'raise', 'really', 'reason', 'remember', 'return', 'Saturday', 'scare', 'second', 'since', 'slowly', 'stories', 'student', 'sudden', 'suit', 'sure', 'swimming', 'though', 'threw', 'tired', 'together', 'tomorrow', 'toward', 'tried', 'trouble', 'truly', 'turtle', 'until', 'village', 'visit', 'wear', 'we\'ll', 'whole', 'whose', 'women', 'wouldn\'t', 'writing', 'written', 'wrote', 'yell', 'young']
fived = ['although', 'America', 'among', 'arrive', 'attention', 'beautiful', 'countries', 'course', 'cousin', 'decide', 'different', 'evening', 'favorite', 'finally', 'future', 'happiest', 'happiness', 'important', 'interest', 'piece', 'planet', 'present', 'president', 'principal', 'probably', 'problem', 'receive', 'sentence', 'several', 'special', 'suddenly', 'suppose', 'surely', 'surprise', 'they\'re', 'through', 'usually', 'action', 'actor', 'actually', 'addition', 'agreed', 'allowed', 'aloud', 'amendment', 'amount', 'amusement', 'annual', 'appointed', 'arrange', 'attention', 'awhile', 'beginning', 'bruise', 'business', 'calves', 'capital', 'capitol', 'captain', 'carefully', 'caught', 'cause', 'celebrate', 'century', 'chemical', 'chocolate', 'circle', 'climate', 'climbed', 'collar', 'column', 'company', 'condition', 'consider', 'consonant', 'constant', 'continent', 'continued', 'country', 'course', 'crystal', 'current', 'curtain', 'daughter', 'daytime', 'decided', 'decimal', 'delicious', 'desert', 'dessert', 'details', 'determine', 'dictionary', 'difference', 'different', 'difficult', 'direction', 'disappoint', 'division', 'eighth', 'election', 'elements', 'energy', 'enjoyment', 'equal', 'equation', 'errands', 'exact', 'except', 'expect', 'explain', 'explode', 'express', 'factory', 'fault', 'favorite', 'finally', 'finished', 'forward', 'fought', 'fraction', 'furniture', 'future', 'general', 'government', 'graceful', 'graph', 'grasp', 'grease', 'grown-ups', 'guest', 'guide', 'happened', 'happily', 'harvest', 'healthy', 'height', 'hoarse', 'human', 'idea', 'imagine', 'include', 'increase', 'indicate', 'information', 'instrument', 'intention', 'interesting', 'inventor', 'island', 'jewel', 'journey', 'jungle', 'knives', 'known', 'language', 'laughter', 'length', 'limb', 'located', 'lumber', 'major', 'mammal', 'manufacture', 'material', 'mayor', 'measure', 'melody', 'members', 'memories', 'message', 'method', 'million', 'minor', 'modern', 'mountain', 'music', 'natural', 'necessary', 'neither', 'newspaper', 'northern', 'notebook', 'notice', 'noun', 'numeral', 'object', 'observe', 'opposite', 'orphan', 'ought', 'outside', 'oxygen', 'paid', 'paint', 'paragraph', 'pattern', 'pause', 'payment', 'perhaps', 'period', 'permit', 'phone', 'phrase', 'pleasant', 'pleasure', 'plural', 'poison', 'position', 'possible', 'practice', 'prepared', 'president', 'probably', 'problem', 'process', 'produce', 'program', 'promise', 'property', 'protection', 'provide', 'puzzle', 'quickly', 'quietly', 'radio', 'raise', 'rarely', 'rather', 'reached', 'receive', 'record', 'region', 'relax', 'remain', 'remove', 'repay', 'repeat', 'report', 'represent', 'respond', 'result', 'rhythm', 'rising', 'ruin', 'salad', 'sandal', 'scale', 'scent', 'schedule', 'science', 'section', 'separate', 'service', 'settled', 'several', 'shadow', 'shelter', 'shoulder', 'shouted', 'shower', 'signal', 'similar', 'sincerely', 'single', 'size', 'slippery', 'soar', 'soil', 'solution', 'solve', 'southern', 'split', 'spoiled', 'sports', 'square', 'squeeze', 'stain', 'state', 'statement', 'station', 'steer', 'stomach', 'stopping', 'straight', 'straighten', 'stream', 'stretched', 'suggest', 'suitcase', 'sunset', 'supply', 'sure', 'surface', 'surprise', 'surround', 'sweater', 'syllable', 'syrup', 'tablet', 'tasty', 'teaspoon', 'terrible', 'though', 'thoughtful', 'thrown', 'tornado', 'toward', 'traffic', 'trail', 'treasure', 'treatment', 'triangle', 'trouble', 'tunnel', 'type', 'understood', 'unknown', 'usually', 'value', 'various', 'warn', 'weigh', 'weight', 'weird', 'western', 'whisper', 'whoever', 'whole', 'whose', 'wives', 'women', 'wonderful', 'wound', 'wreck', 'x-ray', 'yesterday']
sixd = ['Abandon', 'abundant', 'access', 'accommodate', 'accumulate', 'adapt', 'adhere', 'agony', 'allegiance', 'ambition', 'ample', 'anguish', 'anticipate', 'anxious', 'apparel', 'appeal', 'apprehensive', 'arid', 'arrogant', 'awe', 'Barren', 'beacon', 'beneficial', 'blunder', 'boisterous', 'boycott', 'burden', 'Campaign', 'capacity', 'capital', 'chronological', 'civic', 'clarity', 'collaborate', 'collide', 'commend', 'commentary', 'compact', 'composure', 'concise', 'consent', 'consequence', 'conserve', 'conspicuous', 'constant', 'contaminate', 'context', 'continuous', 'controversy', 'convenient', 'cope', 'cordial', 'cultivate', 'cumulative', '', 'Declare', 'deluge', 'dense', 'deplete', 'deposit', 'designate', 'desperate', 'deteriorate', 'dialogue', 'diligent', 'diminish', 'discretion', 'dissent', 'dissolve', 'distinct', 'diversity', 'domestic', 'dominate', 'drastic', 'duration', 'dwell', 'Eclipse', 'economy', 'eerie', 'effect', 'efficient', 'elaborate', 'eligible', 'elude', 'encounter', 'equivalent', 'erupt', 'esteem', 'evolve', 'exaggerate', 'excel', 'exclude', 'expanse', 'exploit', 'extinct', 'extract', 'Factor', 'former', 'formulates', 'fuse', 'futile', 'Generate', 'genre', 'Habitat', 'hazardous', 'hoax', 'hostile', 'Idiom', 'ignite', 'immense', 'improvises', 'inept', 'inevitable', 'influence', 'ingenious', 'innovation', 'intimidate', 'Jovial', 'Knack', 'Leeway', 'legislation', 'leisure', 'liberate', 'likeness', 'linger', 'literal', 'loathe', 'lure', 'Majority', 'makeshift', 'manipulate', 'marvel', 'massive', 'maximum', 'meager', 'mere', 'migration', 'mimic', 'minute', 'monotonous', 'Negotiate', 'Objective', 'obstacle', 'omniscient', 'onset', 'optimist', 'originate', 'Painstaking', 'paraphrase', 'parody', 'persecute', 'plummet', 'possess', 'poverty', 'precise', 'predicament', 'predict', 'prejudice', 'preliminary', 'primitive', 'priority', 'prominent', 'propel', 'prosecute', 'prosper', 'provoke', 'pursue', 'Quest', 'Recount', 'refuge', 'reinforce', 'reluctant', 'remorse', 'remote', 'resolute', 'restrain', 'retaliate', 'retrieve', 'rigorous', 'rural', 'Salvage', 'sanctuary', 'siege', 'significant', 'solar', 'soothe', 'stationary', 'stifle', 'strive', 'subordinate', 'subsequent', 'superior', 'supplement', 'swarm', 'Tangible', 'terminate', 'terrain', 'trait', 'transform', 'transport', 'treacherous', 'Unanimous', 'unique', 'unruly', 'urban', 'Vacate', 'verdict', 'verge', 'vibrant', 'vital', 'vow', 'accept', 'accidentally', 'acquire', 'ambulance', 'ancient', 'appearance', 'appointment', 'arithmetic', 'audience', 'autumn', 'beautifully', 'beliefs', 'blown', 'bough', 'bows', 'calendar', 'canyon', 'capable', 'capacity', 'caution', 'ceiling', 'champion', 'choir', 'cleanse', 'combination', 'comfortable', 'community', 'complain', 'concentration', 'concern', 'connection', 'constitution', 'contagious', 'conversation', 'cooperation', 'correct', 'coupon', 'creative', 'creature', 'crisis', 'culture', 'curious', 'dangerous', 'decision', 'demonstrate', 'denominator', 'department', 'departure', 'depth', 'descendant', 'disagreement', 'disastrous', 'discussion', 'distance', 'distributed', 'earliest', 'echoes', 'edition', 'educate', 'electricity', 'element', 'elevator', 'emergency', 'employer', 'emptiness', 'encouragement', 'encyclopedia', 'entire', 'entrance', 'envelope', 'equator', 'especially', 'establish', 'example', 'excellent', 'excitement', 'exercise', 'experience', 'exterior', 'familiar', 'faucet', 'fierce', 'fireproof', 'following', 'forgetting', 'forgiveness', 'fossil', 'freight', 'frighten', 'fuel', 'further', 'gallon', 'gaze', 'gesture', 'governor', 'graduation', 'grateful', 'grief', 'halves', 'hamburger', 'hangar', 'hanger', 'happiness', 'headache', 'heroes', 'history', 'honorable', 'horizon', 'hunger', 'hyphen', 'ignore', 'imagination', 'immediate', 'importance', 'improvement', 'independence', 'ingredient', 'injury', 'inquire', 'instead', 'instruction', 'intermission', 'interview', 'invisible', 'invitation', 'involve', 'jealous', 'junior', 'knowledge', 'lawyer', 'league', 'legal', 'liberty', 'liquid', 'listening', 'loaves', 'location', 'luggage', 'manager', 'manner', 'manor', 'marriage', 'meant', 'mechanic', 'medicine', 'mention', 'minus', 'minute', 'mistaken', 'misunderstand', 'mixture', 'mourn', 'multiple', 'muscle', 'museum', 'musician', 'mute', 'myth', 'nationality', 'negative', 'noisy', 'noticeable', 'novel', 'numerator', 'obtain', 'occur', 'official', 'operate', 'original', 'outline', 'partial', 'passenger', 'patient', 'penalty', 'penguin', 'percent', 'performance', 'personal', 'persuade', 'physical', 'piano', 'plumber', 'poem', 'poet', 'policy', 'pollute', 'pollution', 'positive', 'potatoes', 'predict', 'prefer', 'pressure', 'prevent', 'principal', 'private', 'project', 'pumpkins', 'purchase', 'purse', 'quote', 'radius', 'rapid', 'ratio', 'realize', 'recently', 'recycle', 'reduce', 'referred', 'regardless', 'regular', 'rehearse', 'relief', 'relieve', 'remarkable', 'remind', 'remote', 'replacement', 'replied', 'reply', 'requirement', 'rescue', 'resident', 'resources', 'respectful', 'review', 'roam', 'routine', 'rumor', 'rural', 'safety', 'sailor', 'salute', 'satisfy', 'scarcely', 'scientific', 'scissors', 'selection', 'senior', 'sentence', 'separately', 'serious', 'session', 'shampoo', 'shelves', 'shorten', 'silent', 'simply', 'sketch', 'skillful', 'solar', 'sought', 'spaghetti', 'sponge', 'squawk', 'storage', 'strain', 'strategy', 'strength', 'strive', 'struggle', 'studios', 'success', 'suggestion', 'support', 'surrounded', 'sword', 'system', 'telephone', 'television', 'temperature', 'theme', 'themselves', 'therefore', 'thicken', 'thousand', 'threat', 'tomatoes', 'trophies', 'tutor', 'unbelievable', 'underneath', 'unite', 'vacuum', 'vain', 'variety', 'vary', 'vault', 'vegetable', 'vein', 'violence', 'visible', 'vision', 'waste', 'who\'s', 'whose', 'wrestle', 'wrinkle', 'yield']
sevend = ['abbreviation', 'absence', 'absolutely', 'absorb', 'abundant', 'accessible', 'accompanied', 'accomplishment', 'accurate', 'achievement', 'acres', 'adequate', 'adjustable', 'admit', 'admittance', 'advice', 'advise', 'afghan', 'alternate', 'alternative', 'amusement', 'analysis', 'analyze', 'ancestor', 'anniversary', 'appreciate', 'artificial', 'assistance', 'association', 'athlete', 'atmosphere', 'attendance', 'authority', 'bacteria', 'bagel', 'baggage', 'benefited', 'benefiting', 'bicycle', 'biscuit', 'bizarre', 'boulevard', 'boundary', 'bouquet', 'brilliant', 'brochure', 'bulletin', 'bureau', 'campaign', 'cancellation', 'candidate', 'capable', 'capital', 'capitol', 'category', 'celery', 'cemetery', 'changeable', 'chaperone', 'character', 'cinnamon', 'civilize', 'commercial', 'committed', 'committee', 'commotion', 'companion', 'competent', 'competition', 'complement', 'complex', 'compliment', 'compressor', 'concentrate', 'concentration', 'conductor', 'confetti', 'congratulations', 'consequently', 'controlling', 'cringe', 'culminate', 'culprit', 'deceive', 'delayed', 'democracy', 'deodorant', 'descendent', 'description', 'diameter', 'diamond', 'discourage', 'disgraceful', 'dismissal', 'distinguished', 'dreadful', 'economics', 'economy', 'elementary', 'embarrass', 'emotion', 'emphasize', 'encircle', 'enclosing', 'encounter', 'endurance', 'engineer', 'environment', 'episode', 'erosion', 'eruption', 'evident', 'exchange', 'executive', 'exhibit', 'expensive', 'extinct', 'extinguish', 'extraordinary', 'extremely', 'fabricate', 'failure', 'fascinating', 'fatigue', 'flagrant', 'foreign', 'forfeit', 'frequently', 'fundamental', 'genuine', 'ghetto', 'gossiping', 'gradual', 'graffiti', 'grammar', 'grievance', 'guarantee', 'harass', 'havoc', 'heroic', 'hesitate', 'horrify', 'hospital', 'humid', 'humility', 'hygiene', 'identical', 'idle', 'idol', 'illegal', 'illustration', 'imaginary', 'immediately', 'immobilize', 'impossibility', 'inconvenient', 'incredible', 'individual', 'infamous', 'influence', 'informant', 'inhabit', 'inherit', 'innocence', 'innocent', 'instructor', 'intelligent', 'interruption', 'introduction', 'involvement', 'irate', 'irresistible', 'jealousy', 'judgment', 'juvenile', 'kettle', 'knitting', 'laboratory', 'language', 'legibly', 'liquidation', 'management', 'maneuver', 'media', 'mileage', 'miniature', 'misbehaved', 'morale', 'mortgage', 'movement', 'murmur', 'musician', 'mysterious', 'negotiate', 'nervous', 'nuisance', 'nurture', 'oases', 'oasis', 'obedient', 'obstacle', 'obviously', 'occasion', 'ordinarily', 'ordinary', 'organization', 'pamphlet', 'panic', 'panicked', 'panicky', 'parallel', 'paralysis', 'paralyze', 'penicillin', 'pedestrian', 'phantom', 'pheasant', 'phrase', 'politely', 'popular', 'precipitation', 'principal', 'principle', 'privilege', 'procedure', 'pronunciation', 'psychology', 'puny', 'qualified', 'qualifying', 'quotation', 'raspberry', 'reasonable', 'receipt', 'receiving', 'recipe', 'recognition', 'recommend', 'recruit', 'reddest', 'reprimand', 'resigned', 'restaurant', 'rotten', 'sandwich', 'scarcity', 'scenery', 'secretary', 'securing', 'significance', 'simile', 'sincerely', 'sincerity', 'situation', 'skeptical', 'slumber', 'smudge', 'solemn', 'souvenir', 'spacious', 'specific', 'stationary', 'stationery', 'statistics', 'subscription', 'substitute', 'superintendent', 'supervisor', 'supposedly', 'threatening', 'tolerate', 'tongue', 'tournament', 'tragedy', 'traitor', 'transferred', 'transferring', 'transmitted', 'traveled', 'traveling', 'unfortunately', 'uniform', 'university', 'unnecessary', 'valuable', 'various', 'vehicle', 'version', 'vertical', 'victim', 'vigorously', 'violation', 'visualize', 'volcano', 'voyage', 'wealthy', 'weapon', 'wheeze', 'wilderness', 'Abate', 'abnormal', 'abode', 'abrupt', 'accelerate', 'acclaim', 'acknowledge', 'acquire', 'aspire', 'acrid', 'addict', 'adjacent', 'admonish', 'affliction', 'agitate', 'ajar', 'akin', 'allege', 'annihilate', 'anonymous', 'antagonize', 'apathy', 'arbitrate', 'astute', 'authentic', 'avert', 'Bellow', 'beseech', 'bestow', 'bewilder', 'bigot', 'blatant', 'bleak', 'braggart', 'brawl', 'browse', 'bystander', 'Candid', 'canine', 'canny', 'capricious', 'capsize', 'casual', 'casualty', 'catastrophe', 'cater', 'chorus', 'citrus', 'clamber', 'climax', 'compromise', 'concur', 'confront', 'congested', 'conjure', 'consult', 'corrupt', 'counterfeit', 'covet', 'customary', 'Debut', 'deceased', 'dependent', 'despondent', 'detach', 'devour', 'dishearten', 'dismal', 'dismantle', 'distraught', 'docile', 'downright', 'drone', 'dumbfound', 'Emblem', 'endure', 'ensue', 'enthrall', 'epidemic', 'erode', 'exuberant', 'Fathom', 'feud', 'figment', 'firebrand', 'flabbergast', 'flagrant', 'flaw', 'fruitless', 'Gaudy', 'geography', 'gratify', 'gravity', 'grim', 'grimy', 'grueling', 'gruesome', 'Haggle', 'headlong', 'hilarious', 'homage', 'homicide', 'hospitable', 'hurtle', 'hybrid', 'Illiterate', 'impede', 'implore', 'incident', 'incredulous', 'infamous', 'infuriate', 'insinuate', 'intensified', 'inundate', 'irate', 'Lavish', 'legacy', 'legitimate', 'lethal', 'loath', 'lurk', 'Magnetic', 'mirth', 'quench', 'magnitude', 'maternal', 'maul', 'melancholy', 'mellow', 'momentum', 'mortify', 'mull', 'murky', 'Narrative', 'negligent', 'nimble', 'nomadic', 'noteworthy', 'notify', 'notorious', 'nurture', 'Obnoxious', 'oration', 'orthodox', 'overwhelm', 'Pamper', 'patronize', 'peevish', 'pelt', 'pending', 'perceived', 'perjury', 'permanent', 'persist', 'perturb', 'pique', 'pluck', 'poised', 'ponder', 'potential', 'predatory', 'presume', 'preview', 'prior', 'prowess', 'Radiant', 'random', 'rant', 'recede', 'reprimand', 'resume', 'retort', 'robust', 'rupture', 'Saga', 'sequel', 'sham', 'shirk', 'simultaneously', 'snare', 'species', 'status', 'stodgy', 'substantial', 'subtle', 'sullen', 'supervise', 'Tamper', 'throb', 'toxic', 'tragedy', 'trickle', 'trivial', 'Uncertainty', 'unscathed', 'upright', 'urgent', 'utmost', 'Vengeance', 'vicious', 'vindictive', 'vista', 'vocation', 'void', 'Wary', 'whim', 'wince', 'wrath', 'Yearn']

coreVocab = []

st = WordNetLemmatizer()
for word in kd:
    coreVocab.append(" ".join([st.lemmatize(i.lower()) for i in word.split()]))
for word in oned:
    coreVocab.append(" ".join([st.lemmatize(i.lower()) for i in word.split()]))
for word in twod:
    coreVocab.append(" ".join([st.lemmatize(i.lower()) for i in word.split()]))
for word in threed:
    coreVocab.append(" ".join([st.lemmatize(i.lower()) for i in word.split()]))
for word in fourd:
    coreVocab.append(" ".join([st.lemmatize(i.lower()) for i in word.split()]))
for word in fived:
    coreVocab.append(" ".join([st.lemmatize(i.lower()) for i in word.split()]))
for word in sixd:
    coreVocab.append(" ".join([st.lemmatize(i.lower()) for i in word.split()]))
for word in sevend:
    coreVocab.append(" ".join([st.lemmatize(i.lower()) for i in word.split()]))


In [14]:
vocab = []
nonVocab = []
with tqdm(total=len(allQueries)) as pbar:
    for query in allQueries:

        splitQuery = [st.lemmatize(i.lower()) for i in query.split(' ')]

        queryVocab = 0
        nonqueryVocab = 0
        totalVocab = 0

        for word in splitQuery:
            if word in coreVocab:
                queryVocab  +=1
                totalVocab  +=1
            else:
                nonqueryVocab +=1
                totalVocab  +=1

        vocab.append(queryVocab/totalVocab) 
        nonVocab.append(nonqueryVocab/totalVocab) 
        pbar.update()


100%|██████████| 70485/70485 [00:14<00:00, 4914.76it/s]


In [15]:
Vocab = pd.DataFrame(data=vocab, columns = ['coreVocab'])
Vocab['query'] = allQueries
Vocab['nonCoreVocab'] = nonVocab
Vocab = Vocab.set_index('query')

# Age of Acquisition features

In this block of code we first load up the Age of Acquistion data set and process it into a dictionary where the key is the word, and the value is AoA rating. We then find the AoA rating for each word in the query, extracting the min, max, average (known as query complexity), and ratio of words expected to be learned by the age of 12.

In [10]:
AoAvocab = []

with open('DataSets/AoA/AoA_51715_words.csv') as csvFile:
    csvReader = csv.reader(csvFile)
    lineCount = 0
    for row in csvReader:
        if lineCount == 0:
            lineCount += 1
        else:
            AoAvocab.append(row[7])
            AoAvocab.append(row[10])
            
AoAVConv = Convert(AoAvocab)

In [11]:
minAoA = []
maxAoA = []
averageVocab = []
ratioAoA = []

st = WordNetLemmatizer()


with tqdm(total=len(allQueries)) as pbar:
    for query in allQueries:
        count = 0
        vocab = []

        for word in query.split(' '):
            word = word.lower().strip()
            word = re.sub(r'[^\w\s]','',word)
            word = st.lemmatize(word)
            if word in AoAVConv:
                vocab.append(float(AoAVConv[word]))
            else:
                vocab.append(0)


        vocab = np.array(vocab)
        
        if vocab.size == 0:
            minAoA.append(-1) 
            maxAoA.append(-1) 
            averageVocab.append(-1)
            ratioAoA.append(0)
        elif vocab.size > 0:
            minAoA.append(np.min(vocab))
            maxAoA.append(np.max(vocab))
            averageVocab.append(np.mean(vocab))
            for entry in vocab:
                if entry < 13 and entry > 0:
                    count +=1
            ratioAoA.append(count/len(vocab))
        
        pbar.update()

100%|██████████| 70485/70485 [00:10<00:00, 6535.57it/s]


In [16]:
Vocab['minAoA'] = minAoA
Vocab['maxAoA'] = maxAoA
Vocab['ratioAoA'] = ratioAoA
Vocab['queryComplexity'] = averageVocab

# Sven Features

In [17]:
SVENwords = []
st = WordNetLemmatizer()
with open('DataSets/Sven/ChildrenDict.tsv') as csvFile:
    csvReader = csv.reader(csvFile, delimiter = '\t')
    lineCount = 0
    for row in csvReader:
        if lineCount == 0:
            lineCount +=1
        else:
            SVENwords.append((row[1]))

In [19]:
SVENcount = []
with tqdm(total=len(allQueries)) as pbar:
    for query in allQueries:
        vocab = []
        countWord = 0
        wordCount = 0
        for word in query.split(' '):
            wordCount +=1
            if word in SVENwords:
                countWord +=1

        SVENcount.append(countWord/wordCount)
        pbar.update()

100%|██████████| 70485/70485 [05:51<00:00, 200.70it/s]


In [None]:
Vocab['SVEN'] = SVENcount

# Top Stereotype Uni-Grams

In [20]:
allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
allSessionsQ = allSessions.loc[allSessions['type']=='Q']
allSessionsQ = allSessionsQ[allSessionsQ['class'] == 1]
sID = allSessionsQ['sID'].unique()
corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
allSessionsQ = allSessionsQ[allSessionsQ['sID'].isin(corpus)]
queries = allSessionsQ['query'].tolist()

In [21]:
text = ''
for query in queries:
    text += query.lower() + " "

In [22]:
top250Arch = stopwords.words()
querywords = text.split()

resultwords  = [word for word in querywords if word.lower() not in top250Arch]
text = ' '.join(resultwords)
text1 = text.split(' ')
fdist1 = nltk.FreqDist(text1)
top250 = []

for x in fdist1.most_common(250):
    top250.append(x[0])

In [23]:
top250count = []
top250avg = []
for query in allQueries:
    vocab = []
    countWord = 0
    wordCount = 0
    for word in query.split(' '):
        wordCount +=1
        if word in top250:
            countWord +=1
        else:
            pass
    top250count.append(countWord)
    top250avg.append(countWord/wordCount)

In [24]:
Vocab['top250All'] = top250count
Vocab['top250avgAll'] = top250avg
Vocab['top250avgNotAll'] = 1-Vocab['top250avgAll']


# Top Non-Stereotype Unigrams

In [25]:
allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
allSessionsQ = allSessions.loc[allSessions['type']=='Q']
allSessionsQ = allSessionsQ[allSessionsQ['class'] == 0]
sID = allSessionsQ['sID'].unique()
corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
allSessionsQ = allSessionsQ[allSessionsQ['sID'].isin(corpus)]
queries = allSessionsQ['query'].tolist()

In [26]:
text = ''
for query in queries:
    text += query.lower() + " "

In [27]:
top250Arch = stopwords.words()
querywords = text.split()

resultwords  = [word for word in querywords if word.lower() not in top250Arch]
text = ' '.join(resultwords)
text1 = text.split(' ')
fdist1 = nltk.FreqDist(text1)
top250 = []

for x in fdist1.most_common(250):
    top250.append(x[0])

In [28]:
top250count = []
top250avg = []
for query in allQueries:
    vocab = []
    countWord = 0
    wordCount = 0
    for word in query.split(' '):
        wordCount +=1
        if word in top250:
            countWord +=1
        else:
            pass
    top250count.append(countWord)
    top250avg.append(countWord/wordCount)

In [29]:
Vocab['top250AllNA'] = top250count
Vocab['top250avgAllNA'] = top250avg
Vocab['top250avgNotAllNA'] = 1-Vocab['top250avgAllNA']


# Top Stereotype Bi-Grams

In [30]:
allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
allQueries = list(set(allQueries))
allSessionsQ = allSessions.loc[allSessions['type']=='Q']
allSessionsQ = allSessionsQ[allSessionsQ['class'] == 1]
sID = allSessionsQ['sID'].unique()
corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
allSessionsQ = allSessionsQ[allSessionsQ['sID'].isin(corpus)]
queries = allSessionsQ['query'].tolist()

queries2 = []
for query in queries:
    queries2.append(query.lower())
queries = queries2

bigrams = [b for l in queries for b in zip(l.split(" ")[:-1], l.split(" ")[1:])]

fdist1 = nltk.FreqDist(bigrams)

top50 = []

for x in fdist1.most_common(10):
     top50.append(x[0])
        
top50count = []
top50avg = []

for query in allQueries:
    vocab = []
    countWord = 0
    wordCount = 0
    query = query.lower()
    query = query.split(" ")
    split = nltk.bigrams(query)
    for word in split:
        wordCount +=1
        if word in top50:
            countWord +=1
        else:
            pass
    top50count.append(countWord)
    if wordCount > 0:
        top50avg.append(countWord/wordCount)
    else:
        top50avg.append(-1)

In [31]:
Vocab['top50bi'] = top50count
Vocab['top50biAvg'] = top50avg
Vocab['top50biNot'] = 1-Vocab['top50biAvg']

# Top Stereotype Bi-Grams

In [32]:
allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
allQueries = list(set(allQueries))
allSessionsQ = allSessions.loc[allSessions['type']=='Q']
allSessionsQ = allSessionsQ[allSessionsQ['class'] == 0]
sID = allSessionsQ['sID'].unique()
corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
allSessionsQ = allSessionsQ[allSessionsQ['sID'].isin(corpus)]
queries = allSessionsQ['query'].tolist()

queries2 = []
for query in queries:
    queries2.append(query.lower())
queries = queries2

bigrams = [b for l in queries for b in zip(l.split(" ")[:-1], l.split(" ")[1:])]

fdist1 = nltk.FreqDist(bigrams)

top50 = []

for x in fdist1.most_common(10):
     top50.append(x[0])
        
top50count = []
top50avg = []

for query in allQueries:
    vocab = []
    countWord = 0
    wordCount = 0
    query = query.lower()
    query = query.split(" ")
    split = nltk.bigrams(query)
    for word in split:
        wordCount +=1
        if word in top50:
            countWord +=1
        else:
            pass
    top50count.append(countWord)
    if wordCount > 0:
        top50avg.append(countWord/wordCount)
    else:
        top50avg.append(-1)

In [33]:
Vocab['top50biNA'] = top50count
Vocab['top50biavgNA'] = top50avg
Vocab['top50biNotNA'] = 1-Vocab['top50biavgNA']

# TF-IDF All

In [34]:
allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
allQueries = list(set(allQueries))
allSessionsQ = allSessions.loc[allSessions['type']=='Q']
sID = allSessionsQ['sID'].unique()
corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
allSessionsQ = allSessionsQ[allSessionsQ['sID'].isin(corpus)]
queries = allSessionsQ['query'].tolist()

text = ''
for query in queries:
    text += query + " "
    
stopwords = stopwords.words()
querywords = text.split()

resultwords  = [word for word in querywords if word.lower() not in stopwords]
text = ' '.join(resultwords)

queries = allSessionsQ['query'].tolist()

vectorizer = TfidfVectorizer()
vector = vectorizer.fit(queries)
vectors = vector.transform(allQueries)

listTFIDF = []
for m in vectors:
    listTFIDF.append(m.sum() / m.count_nonzero())



In [35]:
VocabTFIDFAll = pd.DataFrame(data=listTFIDF, columns = ['tfidfAll']).fillna(-1)
VocabTFIDFAll['query'] = allQueries
Vocab = Vocab.merge(VocabTFIDFAll, on = 'query')

# TF-IDF Stereotype

In [36]:
allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
allQueries = list(set(allQueries))
allSessionsQ = allSessions.loc[allSessions['type']=='Q']
allSessionsQ = allSessionsQ[allSessionsQ['class'] == 1]
sID = allSessionsQ['sID'].unique()
corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
allSessionsQ = allSessionsQ[allSessionsQ['sID'].isin(corpus)]
queries = allSessionsQ['query'].tolist()

text = ''
for query in queries:
    text += query + " "
    
querywords = text.split()

resultwords  = [word for word in querywords if word.lower() not in stopwords]
text = ' '.join(resultwords)

queries = allSessionsQ['query'].tolist()

vectorizer = TfidfVectorizer()
vector = vectorizer.fit(queries)
vectors = vector.transform(allQueries)

listTFIDF = []
for m in vectors:
    listTFIDF.append(m.sum() / m.count_nonzero())




In [37]:
VocabTFIDF = pd.DataFrame(data=listTFIDF, columns = ['tfidf']).fillna(-1)
VocabTFIDF['query'] = allQueries
Vocab = Vocab.merge(VocabTFIDF, on = 'query')

# TF-IDF Non-Stereotype


In [38]:
allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
allQueries = list(set(allQueries))
allSessionsQ = allSessions.loc[allSessions['type']=='Q']
allSessionsQ = allSessionsQ[allSessionsQ['class'] == 0]
sID = allSessionsQ['sID'].unique()
corpus = np.random.choice(sID,int((len(sID)*.8)), replace=False)
allSessionsQ = allSessionsQ[allSessionsQ['sID'].isin(corpus)]
queries = allSessionsQ['query'].tolist()

text = ''
for query in queries:
    text += query + " "
    
querywords = text.split()

resultwords  = [word for word in querywords if word.lower() not in stopwords]
text = ' '.join(resultwords)

queries = allSessionsQ['query'].tolist()

vectorizer = TfidfVectorizer()
vector = vectorizer.fit(queries)
vectors = vector.transform(allQueries)

listTFIDF = []
for m in vectors:
    listTFIDF.append(m.sum() / m.count_nonzero())




In [39]:
VocabTFIDFNA = pd.DataFrame(data=listTFIDF, columns = ['tfidfNA']).fillna(-1)
VocabTFIDFNA['query'] = allQueries
Vocab = Vocab.merge(VocabTFIDFNA, on = 'query')

# Stopwords

In [40]:
stopWords = []
st = WordNetLemmatizer()
with open('DataSets/stopwords.txt') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        if (row):
            stopWords.append(st.lemmatize(row[0]))
        else:
            pass

In [41]:
stopCount = []
stopAverage= []
st = WordNetLemmatizer()
import re


for query in allQueries:
    count = 0
    for word in query.split(' '):
        word = word.lower().strip()
        word = re.sub(r'[^\w\s]','',word)
        word = st.lemmatize(word)
        if word in stopWords:
            count +=1
        else:
            pass
    stopCount.append(count)
    stopAverage.append(count/len(query.split(' ')))

In [42]:
VocabStop = pd.DataFrame(data=stopCount, columns = ['stopCount'])
VocabStop['query'] = allQueries
Vocab = Vocab.merge(VocabStop, on = 'query')

# Net Vocab

In [43]:
www = []
com = []
net = []
org = []
gov = []
edu = []
http = []

for query in allQueries:



    if "www." in query:
        www.append(1)
    else:
        www.append(0)
        
    if ".com" in query:
        com.append(1)
    else:
        com.append(0)
                
    if ".net" in query:
        net.append(1)
    else:
        net.append(0)
            
    if ".org" in query:
        org.append(1)
    else:
        org.append(0)
            
    if ".edu" in query:
        edu.append(1)
    else:
        edu.append(0)
            
    if ".gov" in query:
        gov.append(1)
    else:
        gov.append(0)
            
    if "http" in query:
        http.append(1)
    else:
        http.append(0)

In [44]:
VocabNet = pd.DataFrame(data=com, columns = ['com'])
VocabNet['net'] = net
VocabNet['org'] = org
VocabNet['edu'] = edu
VocabNet['gov'] = gov
VocabNet['http'] = http
VocabNet['query'] = allQueries
Vocab = Vocab.merge(VocabNet, on = 'query')

# Search Operators

In [45]:
AND = []
OR = []
quotes = []
for query in allQueries:
    
    if "AND" in query:
        AND.append(1)
    else:
        AND.append(0)
        
    if "OR" in query:
        OR.append(1)
    else:
        OR.append(0)
        
    if "\"" in query:
        quotes.append(1) 
    else:
        quotes.append(0)

"pest control" how to
define:"cosmetic laser treatment*"
long term care insurance "united kingdom" OR britain OR england
long term care insurance "united kingdom" OR britain OR england
"what's good about internet phoning"
"pest control" courses
Isabelle "face transplants"
RHEUMATOLOGY SPORTS
us law prohibiting "redbull"
"Pseudocyesis advances"
won NCAA "march madness" 2012
research culture OR race infant OR child development stages OR milestones
"jasper fforde" chronology
"skagit valley herald" founded
infant OR child development "cultural differences"
infant OR child development "cultural differences"
"world snooker" tournaments
"connecticut fire academy"
infant development "cultural effects"
NCAA tournament "march madness" wikipedia
first "full face transplant" Oscar
"Charles Basil"
"non-extinct" marsupials
hawaii real estate family resale value house OR condo news
"SUNY albany hospital" location
"cosmetic laser treatment*" and safe*
"jeopardy game show"
Red Bull AND health
infant de

In [46]:
VocabOP = pd.DataFrame(data=AND, columns = ['AND'])
VocabOP['OR'] = OR
VocabOP['quotes'] = quotes
VocabOP['query'] = allQueries
Vocab = Vocab.merge(VocabOP, on = 'query')

# Interogatives 

In [47]:
inter = []
VocabInter = pd.DataFrame(data=vocab, columns = ['coreVocab'])

x = len(allQueries)
for num in range(x):
    query = allQueries[num]

    if re.match(r"who( |'re|'s|s)", query, flags=re.IGNORECASE):
        inter.append(1)

    elif re.match(r"what( |'re|re|'s|s)", query, flags=re.IGNORECASE):
        inter.append(1)

    elif re.match(r"when( |'re|re|'s|s)", query, flags=re.IGNORECASE):
        inter.append(1)

    elif re.match(r"where( |'re|re|'s|s)", query, flags=re.IGNORECASE):
        inter.append(1)
   
    elif re.match(r"why( |'re|re|'s|s)", query, flags=re.IGNORECASE):
        inter.append(1)

    elif re.match(r"how( |'re|re|'s|s)", query, flags=re.IGNORECASE):
        inter.append(1)
        
    elif re.match(r"is ", query, flags=re.IGNORECASE):
        inter.append(1)
    elif re.match(r"are ", query, flags=re.IGNORECASE):
        inter.append(1)
    elif re.match(r"can ", query, flags=re.IGNORECASE):
        inter.append(1)

    elif re.match(r"could ", query, flags=re.IGNORECASE):
        inter.append(1)

    elif re.match(r"should ", query, flags=re.IGNORECASE):
        inter.append(1)

    elif re.match(r"would ", query, flags=re.IGNORECASE):
        inter.append(1)

    else:
        inter.append(0)

VocabInter = pd.DataFrame(data=inter, columns = ['inter'])
VocabInter['query'] = allQueries
Vocab = Vocab.merge(VocabInter, on = 'query')

In [49]:
pickle.dump(Vocab, open( "Pickles/VocabFeat.p", "wb" ) )