This notebook is intended to extract all search related features from sessions. Notably different than the other feature extraction notebooks, as it works with entire sessions rather than solo queries. Most feature lists maintain the value for the entire session, and the mean of this value is aggregated later in the notebook. For example, if a session has 3 unique queries, we mark the number of unique queries as 3 for each query in the dataframe, and then take the average of that value in feature extraction main (for this example, it would still be 3). However, for the sections were we count number of repeat clicks/queries; we do not take the mean of these features; but instead the count.

# Load Libraries

In [1]:
import pickle

import pandas as pd
import numpy as np
#import re

from Levenshtein import distance as levenshtein_distance

# Load Data Sets

Load both data sets, process the SWC so it can be handled on a session by session basis. Not required for SQS, as each session in this data set is only one query.

In [2]:
allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
sessions = allSessions.groupby('sID').apply(pd.Series.tolist).tolist()

allSessionsSQS = list(pickle.load( open( "../Data/DataSets/SQS/SQS.p", "rb" ) ))

# Distance Between Queries

In the following block of code we measure the levenhsten distances between queries in a session.




In [3]:
distances = []
for session in sessions:
    distance = []
    currentQuery = ""
    for query in session:
        if query[5] == 'Q':
            if currentQuery =="":
                currentQuery = query[1]
                distance.append(0)
            else:
                distance.append(levenshtein_distance(currentQuery,query[1]))
                currentQuery = query[1]
        else:
            distance.append(-1)
    distances.append(distance)
    distance = []

# Time Between Queries

In the following block of code we measure the time between each query.

In [5]:
timeQueries = []
for session in sessions:
    distance = []
    currentTime = -1
    for query in session:
        if query[5] == 'Q':
            if currentTime ==- 1:
                currentTime = query[2]
                distance.append(0.0)
            else:
                distance.append(float(query[2])-float(currentTime))
                currentTime = query[2]
        else:
            distance.append(-1)
    timeQueries.append(distance)
    distance = []

# Number of Unique Queries and If All Queries Are The Same

In the following block of code marks whether counts the number of unique queries and determines if all queries are the same query by comparing the length of the set of queries with total number of queries. If a session contains more than one query but only has a set of one unique query, we know that session contains all the same query and is marked as such.

In [24]:
uniqueQueries = []
allSameQueries = []

for session in sessions:
    
    queries = []
    unique = []
    same = []
    
    for query in session:
        if query[5] == 'Q':
            queries.append(query[1])
    numUniqQueries = len(set(queries))
    
    for query in session:
        unique.append(numUniqQueries)
        if (numUniqQueries == 1): 
            if len(queries) >1:
                same.append(1)
            else:
                same.append(-1)
        else:
            same.append(0)
            
    uniqueQueries.append(unique)
    allSameQueries.append(same)


# Number of Repeat Queries

In the following block of code we count the number of repeated queries, not including the first instance of the query.

In [7]:
repeatQueries = []

for session in sessions:
    
    repeat = []
    
    for query in session:
        
        if query[5] == 'Q':
            for query2 in session:
                if query2[5] == 'Q':
                    if query == query2:
                        repeat.append(0)
                        break
                    if query[1] == query2[1]:
                        repeat.append(1)
                        break

        else:
            repeat.append(0)
    
    repeatQueries.append(repeat)


# Number of Unique Clicks and If All Clicks Are The Same

In the following block of code counts the number of unique clicks and determines if all queries are the same clicks by comparing the length of the set of clicks with total number of clicks. If a session contains more than one click but only has a set of one unique click, we know that session contains all the same clicks and is marked as such.

In [8]:
uniqueClicks = []
allSameClicks = []

for session in sessions:
    
    clicks = []
    unique = []
    same = []
  
    for query in session:
        if query[5] == 'C':
            clicks.append(query[4])

    numUniqClicks = len(set(clicks))
    
    for query in session:
        unique.append(numUniqClicks)
        if (numUniqClicks == 1): 
            if len(clicks) >1:
                same.append(1)
            else:
                same.append(-1)
        else:
            same.append(0)
            
    uniqueClicks.append(unique)
    allSameClicks.append(same)


# Number of Repeat Clicks

In the following block of code we count the number of repeated clicks, not including the first instance of the click.

In [25]:
repeatClicks = []

for session in sessions:
    
    repeat = []

    for entry in session:
        if entry[5] == 'C':
            for entryLoop in session:
                if entryLoop[5] == 'C':
                    if entry == entryLoop:
                        repeat.append(0)
                        break
                    if entry[4] == entryLoop[4]:
                        repeat.append(1)
                        break

        else:
            repeat.append(0)

    repeatClicks.append(repeat)


In [10]:
timeClicks = []
for session in sessions:
    distance = []
    currentTime = -1
    check = False
    for query in session:
        if query[5] == 'C':
            if currentTime ==- 1:
                currentTime = query[2]
                distance.append(0.0)
            elif check == True:
                distance.append(-1)
            else:
                distance.append(float(query[2])-float(currentTime))
                currentTime = query[2]
                check = True
        else:
            distance.append(-1)
            check = False
    timeClicks.append(distance)
    distance = []

In [11]:
clickDistance = []
for session in sessions:
    distance = []
    for query in session:
        if query[5] == 'C':
            distance.append(levenshtein_distance(query[4], query[1]))
        else:
            distance.append(-1)
    clickDistance.append(distance)


In [12]:
timeClicks

[[-1, -1, 0.0, -1, 246.0, -1],
 [-1, 0.0, -1, 157.0],
 [-1, -1, 0.0, 0.0, -1, 131.0],
 [-1, 0.0, 0.0, -1, 102.0],
 [-1, 0.0, 0.0, -1, -1, -1],
 [-1, -1, -1, 0.0, 0.0, -1, 291.0, -1, -1],
 [-1, -1, 0.0, -1, -1, 55.0],
 [-1, -1, 0.0, 0.0, -1, -1, 73.0, -1, -1, -1, -1, -1],
 [-1, 0.0, 0.0, -1],
 [-1, 0.0, 0.0, -1, -1, -1, 145.0],
 [-1, -1, 0.0, -1, 335.0, -1, -1],
 [-1, -1, -1, -1, 0.0, 0.0],
 [-1, 0.0, -1, 200.0],
 [-1, -1, 0.0, 0.0, -1],
 [-1, -1, -1, 0.0, -1, -1, 169.0],
 [-1, 0.0, 0.0, -1, -1],
 [-1, 0.0, 0.0, -1, -1, -1, -1, -1, -1],
 [-1, -1, -1, -1, -1, 0.0, 0.0],
 [-1, 0.0, 0.0, -1],
 [-1, 0.0, 0.0],
 [-1, 0.0, 0.0, -1, -1, -1],
 [-1, 0.0, -1, 47.0],
 [-1, 0.0, 0.0, -1],
 [-1, -1, 0.0, 0.0, -1],
 [-1, 0.0, 0.0, -1, 203.0],
 [-1, 0.0, 0.0],
 [-1, 0.0, -1, 41.0, -1, 47.0, -1, -1, -1, 110.0, -1],
 [-1, 0.0, 0.0, -1],
 [-1, 0.0, 0.0, -1],
 [-1, -1, 0.0, 0.0, -1, -1, -1, -1, -1],
 [-1, 0.0, 0.0, -1, -1, -1, -1, -1, -1, -1, 237.0],
 [-1, 0.0, 0.0, -1, -1, -1],
 [-1, 0.0, 0.0],
 [-1, -1,

In [13]:
distancesFlat = [item for sublist in distances for item in sublist]
allSessions['queryDistance'] = distancesFlat

In [14]:
timeQueriesFlat = [item for sublist in timeQueries for item in sublist]
allSessions['timeQueries'] = timeQueriesFlat

In [15]:
repeatQueriesFlat = [item for sublist in repeatQueries for item in sublist]
allSessions['repeatQueries'] = repeatQueriesFlat

In [16]:
uniqueQueriesFlat = [item for sublist in uniqueQueries for item in sublist]
allSessions['uniqueQueries'] = uniqueQueriesFlat

In [17]:
allSameQueriesFlat = [item for sublist in allSameQueries for item in sublist]
allSessions['allSameQueries'] = allSameQueriesFlat

In [18]:
repeatClicksFlat = [item for sublist in repeatClicks for item in sublist]
allSessions['repeatClicks'] = repeatClicksFlat

In [19]:
uniqueClicksFlat = [item for sublist in uniqueClicks for item in sublist]
allSessions['uniqueClicks'] = uniqueClicksFlat

In [20]:
allSameClicksFlat = [item for sublist in allSameClicks for item in sublist]
allSessions['allSameClicks'] = allSameClicksFlat

In [21]:
timeClicksFlat = [item for sublist in timeClicks for item in sublist]
allSessions['timeClicks'] = timeClicksFlat

In [22]:
clickDistanceFlat = [item for sublist in clickDistance for item in sublist]
allSessions['clickDistance'] = clickDistanceFlat

In [23]:
allSessions['repeatQueries']

0         0
1         1
2         0
3         1
4         0
         ..
196916    1
196917    0
196918    0
196919    1
196920    0
Name: repeatQueries, Length: 230362, dtype: int64

In [30]:
numQ = pd.DataFrame(data = allSessions[allSessions['type']=='Q'].groupby('sID')['type'].count())
numQ = numQ.rename(columns = {'type': 'numQueries'})
numC = pd.DataFrame(data = allSessions[allSessions['type']=='C'].groupby('sID')['type'].count())
numC = numC.rename(columns = {'type': 'numClicks'})
allSessionsC = allSessions[allSessions['type']=='C']
allSessionsC = allSessionsC.astype({'click': 'int32'})
avgC = pd.DataFrame(data = allSessionsC.groupby('sID').mean()['click'])
avgC = avgC.rename(columns = {'click': 'meanClickPosition'})

In [None]:
# SWCQ = SWCAll[SWCAll['type'] == 'Q'].groupby('sID')[['queryDistance', 'timeQueries',  
#                     'uniqueQueries', 'allSameQueries', 'numQueries']].mean()
# SWCQRQ = SWCAll[SWCAll['type'] == 'Q'].groupby('sID')[['repeatQueries']].count()

# SWCC = SWCAll[SWCAll['type'] == 'C'].groupby('sID')[['clickDistance','meanClickPosition',
#    'numClicks', 'numClicksPerQuery', 'timeClicks', 'allSameClicks','uniqueClicks',]].mean()
# SWCRC = SWCAll[SWCAll['type'] == 'C'].groupby('sID')[['repeatClicks']].count()

In [31]:
allSessions = allSessions.join(numQ, on = 'sID')
allSessions = allSessions.join(numC, on = 'sID')
allSessions = allSessions.join(avgC, on = 'sID')
allSessions['numClicksPerQuery'] = allSessions['numQueries']/allSessions['numClicks']

In [32]:
sessionsClick = allSessions.loc[allSessions['timeClicks'] != -1]
allSessions['timeClicks'] = sessionsClick.groupby('sID')['timeClicks'].mean().fillna(0)


In [33]:
allSessionsSearch = allSessions

In [34]:
allSessions.columns

Index(['sID', 'query', 'timestamp', 'click', 'website', 'type', 'order',
       'class', 'queryDistance', 'timeQueries', 'repeatQueries',
       'uniqueQueries', 'allSameQueries', 'repeatClicks', 'uniqueClicks',
       'allSameClicks', 'timeClicks', 'clickDistance', 'numQueries',
       'numClicks', 'meanClickPosition', 'numClicksPerQuery'],
      dtype='object')

In [35]:
pickle.dump( allSessionsSearch, open( "Pickles/SearchFeatSWC.p", "wb" ) )

In [37]:
allSessionsSQS = pd.DataFrame(data= allSessionsSQS, columns = ['query'])

In [38]:
allSessionsSQS['queryDistance'] = -1
allSessionsSQS['timeQueries'] = -1
allSessionsSQS['repeatQueries'] = -1
allSessionsSQS['repeatClicks'] = -1
allSessionsSQS['clickDistance'] = -1
allSessionsSQS['meanClickPosition'] = -1
allSessionsSQS['numClicks'] = -1
allSessionsSQS['numClicksPerQuery'] = -1
allSessionsSQS['numQueries'] = 1
allSessionsSQS['timeClicks'] = -1
allSessionsSQS['uniqueQueries'] = 1
allSessionsSQS['allSameClicks'] = -1
allSessionsSQS['uniqueClicks'] = -1
allSessionsSQS['allSameQueries'] = 0

In [39]:
allSessions.loc[allSessions['numQueries'] == 0]

Unnamed: 0,sID,query,timestamp,click,website,type,order,class,queryDistance,timeQueries,...,allSameQueries,repeatClicks,uniqueClicks,allSameClicks,timeClicks,clickDistance,numQueries,numClicks,meanClickPosition,numClicksPerQuery


In [40]:
pickle.dump( allSessionsSQS, open( "Pickles/SearchFeatSQS.p", "wb" ) )