This notebook describes the key attributes of the SWC and SQS data sets.

# Load Libraries

In [30]:
import pickle
import numpy as np
import pandas as pd

# Load And Preprocess Data Sets

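Load the data sets we will describe and preprocess them for that description: SWC is split by `class` and grouped by `sID` into lists of sessions, and SQS is split by `class` into lists of queries.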

In [31]:
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# these files if they come from a trusted source.
# Context managers make sure both file handles are closed after loading.
with open("DataSets/SWC/SWC.p", "rb") as swcFile:
    SWC = pickle.load(swcFile)
with open("DataSets/SQS/SQS.p", "rb") as sqsFile:
    SQS = pickle.load(sqsFile)

# Column order of every row inside a session. printStatsSWC reads rows by
# position (1 = query, 2 = timestamp, 3 = click), so this order must not change.
sessionColumns = ['sID','query', 'timestamp', 'click', 'website', 'type']

# SWC: one list of rows per sID, split by class (1 and 0).
# Assumes SWC and SQS unpickle to pandas DataFrames - TODO confirm.
SWCS = SWC[SWC['class'] == 1].groupby('sID')[sessionColumns].apply(pd.Series.tolist).tolist()
SWCNS = SWC[SWC['class'] == 0].groupby('sID')[sessionColumns].apply(pd.Series.tolist).tolist()

# SQS: a flat list of query values per class.
SQSS = SQS[SQS['class'] == 1]['query'].tolist()
SQSNS = SQS[SQS['class'] == 0]['query'].tolist()

# Declare Functions

In the following block of code we declare the functions used to describe the data sets.

In [65]:
def printStatsSWC(dataSet):

    """

    Print summary statistics for a list of SWC sessions.

    Each session is a non-empty sequence of rows; only the query (index 1),
    the timestamp (index 2) and the click flag (index 3) of a row are read.

    The primary complexity of printStatsSWC() lies in the representation of
    queries and clicks in the AOL Query logs, which can be broken down into
    a query with no clicks, a query with a click, and a query with multiple clicks.
    Every row with a truthy click flag counts as one click. A row counts as a
    new query unless it is a click row repeating the query of the row directly
    before it (such a row is a further click on an already counted query).

    Printed values:
      * mean number of queries per session
      * number of distinct queries among the rows without a click, divided by
        the total number of counted queries (a fraction, not a percentage)
      * mean number of clicks per session
      * mean over sessions of the per-session mean word count per counted query
      * mean over sessions of the last row's timestamp, divided by 60

    :param dataSet: list of sessions (this notebook passes SWCS and SWCNS)
    :raises IndexError: if a session contains no rows
    :raises ZeroDivisionError: if dataSet contains no sessions (raised when the
        average duration is computed, after the earlier lines were printed)

    """

    queriesPerSession = []
    clicksPerSession = []
    meanWordsPerSession = []
    for session in dataSet:
        queryTotal = 0
        clickTotal = 0
        wordCounts = []
        previousQuery = session[0][1]
        for position, row in enumerate(session):
            queryText = row[1]
            clicked = bool(row[3])
            if clicked:
                clickTotal += 1
            # Only a click row that repeats the preceding row's query is not a
            # new query; the first row of a session always starts a new query.
            repeatsPrevious = position > 0 and queryText == previousQuery
            if not (clicked and repeatsPrevious):
                queryTotal += 1
                wordCounts.append(len(queryText.split()))
            previousQuery = queryText
        meanWordsPerSession.append(np.array(wordCounts).mean())
        queriesPerSession.append(queryTotal)
        clicksPerSession.append(clickTotal)

    queriesPerSession = np.array(queriesPerSession)
    print("Average number of queries per session: " + str(queriesPerSession.mean()))

    # NOTE(review): only rows without a click contribute to the distinct-query
    # count, so a query that appears solely on click rows is not counted here
    # although it is part of the denominator - confirm this is intended.
    unclickedQueries = {row[1] for session in dataSet for row in session if not row[3]}
    print("Percentage of unique queries: " + str((len(unclickedQueries)/queriesPerSession.sum())))

    clicksPerSession = np.array(clicksPerSession)
    print("Average number of clicks per session: " + str(clicksPerSession.mean()))

    # Mean of the per-session means, so every session weighs the same
    # regardless of how many queries it contains.
    avgNumWordsQuery = np.array(meanWordsPerSession)
    print("Average number of words per query: " + str(avgNumWordsQuery.mean()))

    # Presumably the timestamp is the number of seconds elapsed since the
    # session started, making the last row's value the duration - TODO confirm.
    totalDuration = 0
    for session in dataSet:
        totalDuration += float(session[-1][2])
    print("Average duration of session in minutes: " + str((totalDuration/len(dataSet))/60))

    
def printStatsSQS(dataSet):

    """

    Print summary statistics for a list of SQS queries.

    Specifically for SQS, as this dataset has no clicks, single queries, no duration:
    the queries-per-session, clicks-per-session and duration lines are printed
    as the constants 1, 0 and 0. Only the share of distinct queries (a
    fraction, not a percentage) and the mean number of whitespace-separated
    words per query are computed from the data.

    :param dataSet: list of query strings (this notebook passes SQSS and SQSNS)
    :raises ZeroDivisionError: if dataSet contains no queries

    """

    queries = list(dataSet)
    wordCount = np.array([len(query.split()) for query in queries])
    # A set gives the distinct queries without a linear scan per query.
    uniqueQueries = set(queries)
    print("Average number of queries per session: " + str(1))
    print("Percentage of unique queries: " + str((len(uniqueQueries)/len(queries))))
    print("Average number of clicks per session: " + str(0))
    print("Average number of words per query: " + str(wordCount.mean()))
    print("Average duration of session in minutes: " + str(0))

# Describe Data Sets

In the following blocks of code we use the defined functions to describe our data sets.

In [44]:
# Describe the SWC sessions whose class label is 1 (SWCS).
print("Stats of Stereotype sessions found in SWC")
printStatsSWC(SWCS)

Stats of Stereotype sessions found in SWC
Average number of queries per session: 1.9906733103360164
Percentage of unique queries: 0.4801360631538412
Average number of clicks per session: 2.305225501469273
Average number of words per query: 2.3981486972959964
Average duration of session in minutes: 3.7958051190324094


In [45]:
# Describe the SWC sessions whose class label is 0 (SWCNS).
print("Stats of Non-Stereotype sessions found in SWC")
printStatsSWC(SWCNS)

Stats of Non-Stereotype sessions found in SWC
Average number of queries per session: 3.303497125432773
Percentage of unique queries: 0.6074670204992115
Average number of clicks per session: 2.9309786233840485
Average number of words per query: 2.3558116717514674
Average duration of session in minutes: 12.067323810680362


In [66]:
# Describe the SQS queries whose class label is 1 (SQSS).
print("Stats of Stereotype sessions found in SQS")
printStatsSQS(SQSS)

Stats of Stereotype sessions found in SQS
Average number of queries per session: 1
Percentage of unique queries: 0.9767441860465116
Average number of clicks per session: 0
Average number of words per query: 4.777408637873754
Average duration of session in minutes: 0


In [67]:
# Describe the SQS queries whose class label is 0 (SQSNS).
print("Stats of Non-Stereotype sessions found in SQS")
printStatsSQS(SQSNS)

Stats of Non-Stereotype sessions found in SQS
Average number of queries per session: 1
Percentage of unique queries: 1.0
Average number of clicks per session: 0
Average number of words per query: 3.722591362126246
Average duration of session in minutes: 0
