The code in this notebook generates the data sets for our experiments by drawing on several data sources. Processing the AOL query logs can take 6-10 hours, so the processed sessions are saved as pickles.

# Load Libraries

In [1]:
import csv
import glob
import pickle
import xmltodict
import re

import pandas as pd
import numpy as np

from tqdm import tqdm
from datetime import datetime

# Declare Functions

In [2]:
def splitQueryClicksAOL(data):
    """Split AOL log rows into separate query ('Q') and click ('C') records.

    Rows are read positionally: ``row[1]`` is compared as the query text,
    ``row[2]`` as the timestamp, and ``row[3]`` is tested for truthiness to
    decide whether the row carries a click (``row[3]`` and ``row[4]`` are
    copied into the click record).

    For every row:
      * a 'Q' record (click fields blanked) is emitted, unless the row is a
        further click on the submission recorded by the previous row (same
        query text and same timestamp, and the row carries a click);
      * a 'C' record is emitted whenever the row carries a click.

    Args:
        data: iterable of sessions, each a sequence of rows with at least
            five positional fields.

    Returns:
        A list with one entry per input session; each entry is a list of
        six-element records ``[row[0], row[1], row[2], rank, url, tag]``
        where ``tag`` is 'Q' or 'C'. An empty session yields an empty list.
    """
    sample = []
    for session in data:
        newSession = []
        previousQuery = None
        previousTime = None
        for position, query in enumerate(session):
            hasClick = bool(query[3])
            # The first row of a session can never continue an earlier one.
            sameSubmission = (
                position > 0
                and query[1] == previousQuery
                and query[2] == previousTime
            )
            if not (sameSubmission and hasClick):
                newSession.append([query[0], query[1], query[2], '', '', 'Q'])
            if hasClick:
                newSession.append([query[0], query[1], query[2], query[3], query[4], 'C'])
            previousQuery = query[1]
            previousTime = query[2]
        sample.append(newSession)
    return sample

def splitQueryClicksTREC(data):
    """Split TREC log rows into separate query ('Q') and click ('C') records.

    Rows are read positionally: ``row[1]`` is compared as the query text,
    ``row[2]`` as the timestamp, and ``row[3]`` is tested for truthiness to
    decide whether the row carries a click (``row[3]`` and ``row[4]`` are
    copied into the click record).

    For every row:
      * if it repeats the previous row's query text and timestamp, only a
        'C' record is emitted, carrying ``row[3]`` and ``row[4]`` as they are;
      * otherwise a 'Q' record (click fields blanked) is emitted, followed by
        a 'C' record when the row carries a click.

    NOTE(review): unlike the AOL variant, a repeated query/timestamp row is
    tagged 'C' even when its click fields are empty. That is the existing
    behaviour and is kept unchanged here - confirm it is intended.

    Args:
        data: iterable of sessions, each a sequence of rows with at least
            five positional fields.

    Returns:
        A list with one entry per input session; each entry is a list of
        six-element records ``[row[0], row[1], row[2], rank, url, tag]``
        where ``tag`` is 'Q' or 'C'. An empty session yields an empty list.
    """
    sample = []
    for session in data:
        newSession = []
        previousQuery = None
        previousTime = None
        for position, query in enumerate(session):
            # The first row of a session can never continue an earlier one.
            sameSubmission = (
                position > 0
                and query[1] == previousQuery
                and query[2] == previousTime
            )
            if sameSubmission:
                newSession.append([query[0], query[1], query[2], query[3], query[4], 'C'])
            else:
                newSession.append([query[0], query[1], query[2], '', '', 'Q'])
                if query[3]:
                    newSession.append([query[0], query[1], query[2], query[3], query[4], 'C'])
            previousQuery = query[1]
            previousTime = query[2]
        sample.append(newSession)
    return sample

# Load DMOZ

This loads the DMOZ tags, which are used to determine whether a website should be tagged as "designed for children".

In [3]:
# Collect the second column of every DMOZ row whose third column is 'Kids'.
sterSites = []
with open('DataSources/DMOZ/URL Classification.csv') as dmozFile:
    sterSites.extend(
        entry[1]
        for entry in csv.reader(dmozFile, delimiter = ',')
        if entry[2] == 'Kids'
    )


# Load and Process AOL

This block of code loads the AOL query logs and separates the sessions into two types: sessions that contain websites designed for our Stereotype, and sessions that do not. This process can take 40-60 minutes for each query log, for a total of 400-600 minutes; hence the pickle files.

In [None]:
count = 1

# Hashed copy of the DMOZ list. Testing `url in <list>` is a linear scan and
# is performed for every logged row, so a set is used for the lookups instead.
sterSiteSet = set(sterSites)

timeFormat = "%Y-%m-%d %H:%M:%S"
sessionLength = 60 * 60  # seconds a session may span, measured from its first row

# Sorted so the numbering of the output pickles is the same on every run
# (glob returns files in an unspecified order).
for filename in sorted(glob.glob("DataSources/AOL/*.txt")):
    # loads one query log, skipping its header line
    with open(filename) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter = '\t')
        next(csv_reader, None)
        queryLog = list(csv_reader)

    sessions = []
    session = []
    currentUser = None
    currentTime = None

    # separates the query log into sessions: a new session starts when the
    # user (row[0]) changes, or when a row's timestamp (row[2]) is more than
    # an hour after the first row of the current session
    for row in queryLog:
        tempUser = row[0]
        tempTime = datetime.timestamp(datetime.strptime(row[2], timeFormat))
        if session and (tempUser != currentUser or tempTime > currentTime + sessionLength):
            sessions.append(session)
            session = []
        if not session:
            # first row of a session fixes its user and its start time
            currentUser = tempUser
            currentTime = tempTime
        session.append(row)
    # A session is only closed when the next one begins, so the last session
    # of the file must be flushed here or it would be dropped.
    if session:
        sessions.append(session)

    sterSessions = []
    notSterSessions = []

    # separates sessions into those that do, or do not, contain a click
    # (row[4]) on a website for our archetype
    for session in tqdm(sessions):
        if any(query[4] in sterSiteSet for query in session):
            sterSessions.append(session)
        else:
            notSterSessions.append(session)

    with open("Pickles/SterPickles/Ster"+ str(count) +".p", "wb") as pickleFile:
        pickle.dump(sterSessions, pickleFile)
    with open("Pickles/NotSterPickles/NotSter"+ str(count) +".p", "wb") as pickleFile:
        pickle.dump(notSterSessions, pickleFile)
    count += 1

# Concatenate Stereotype Sessions

Opens all pickles that contain sessions with clicks on websites designated as being for our Stereotype, and then further separates them into sessions we label as generated by our Stereotype and sessions that aren't.

In [4]:
def _loadSessions(path):
    """Unpickle the sessions stored at *path* into an object array, closing the file."""
    # NOTE: pickle.load can run arbitrary code; only load pickles produced by this notebook.
    with open(path, "rb") as pickleFile:
        return np.asarray(pickle.load(pickleFile), dtype=object)

ster1 = _loadSessions("Pickles/SterPickles/Ster1.p")
ster2 = _loadSessions("Pickles/SterPickles/Ster2.p")
ster3 = _loadSessions("Pickles/SterPickles/Ster3.p")
ster4 = _loadSessions("Pickles/SterPickles/Ster4.p")
ster5 = _loadSessions("Pickles/SterPickles/Ster5.p")
ster6 = _loadSessions("Pickles/SterPickles/Ster6.p")
ster7 = _loadSessions("Pickles/SterPickles/Ster7.p")
ster8 = _loadSessions("Pickles/SterPickles/Ster8.p")
ster9 = _loadSessions("Pickles/SterPickles/Ster9.p")
ster10 = _loadSessions("Pickles/SterPickles/Ster10.p")

ster = np.concatenate((ster1, ster2, ster3, ster4, ster5, ster6, ster7, ster8, ster9, ster10), axis=0)

# Keeps only the sessions that have fewer than 200 entries
sterRefined = [session for session in ster if len(session) < 200]

# Hashed copy of the DMOZ list so each click lookup is not a linear scan.
sterSiteSet = set(sterSites)
timeFormat = "%Y-%m-%d %H:%M:%S"

prefSter = [] # Sessions that exclusively click on sites designed for kids
duraSter = [] # Remaining sessions whose first and last rows are less than 6 minutes apart
notSter = [] # Sessions that belong to neither of the previous two

for session in tqdm(sterRefined):
    clickedSites = [query[4] for query in session if query[4]]
    clicks = len(clickedSites)
    kClicks = sum(1 for site in clickedSites if site in sterSiteSet)
    # `clicks` is checked first so a session without any non-empty click URL
    # falls through to the duration test instead of dividing by zero.
    if clicks and kClicks == clicks:
        prefSter.append(session)
        continue
    startTimeStamp = datetime.timestamp(datetime.strptime(session[0][2], timeFormat))
    endTimeStamp = datetime.timestamp(datetime.strptime(session[-1][2], timeFormat))
    if (endTimeStamp - startTimeStamp) < (60*6):
        duraSter.append(session)
    else:
        notSter.append(session)

with open("Pickles/NotSterPickles/NotSter11.p", "wb") as pickleFile:
    pickle.dump(notSter, pickleFile)

100%|██████████| 19139/19139 [01:35<00:00, 201.31it/s]


# Concatenate all sessions that are not our Archetype

This block of code concatenates all sessions that are labeled as not belonging to our Archetype, and removes all sessions that don't have any clicks from that set.

In [5]:
def _loadSessions(path):
    """Unpickle the sessions stored at *path* into an object array, closing the file."""
    # NOTE: pickle.load can run arbitrary code; only load pickles produced by this notebook.
    with open(path, "rb") as pickleFile:
        return np.asarray(pickle.load(pickleFile), dtype=object)

notSter1 = _loadSessions("Pickles/NotSterPickles/NotSter1.p")
notSter2 = _loadSessions("Pickles/NotSterPickles/NotSter2.p")
notSter3 = _loadSessions("Pickles/NotSterPickles/NotSter3.p")
notSter4 = _loadSessions("Pickles/NotSterPickles/NotSter4.p")
notSter5 = _loadSessions("Pickles/NotSterPickles/NotSter5.p")
notSter6 = _loadSessions("Pickles/NotSterPickles/NotSter6.p")
notSter7 = _loadSessions("Pickles/NotSterPickles/NotSter7.p")
notSter8 = _loadSessions("Pickles/NotSterPickles/NotSter8.p")
notSter9 = _loadSessions("Pickles/NotSterPickles/NotSter9.p")
notSter10 = _loadSessions("Pickles/NotSterPickles/NotSter10.p")
notSter11 = _loadSessions("Pickles/NotSterPickles/NotSter11.p")

notSter = np.concatenate((notSter1, notSter2, notSter3,notSter4,notSter5,notSter6,notSter7,notSter8,notSter9,notSter10, notSter11), axis=0)

# Remove all sessions with no clicks: a session is kept as soon as one of its
# rows has a non-empty fourth field (row[3]).
notSterClick = [
    session for session in tqdm(notSter)
    if any(query[3] for query in session)
]

100%|██████████| 10333596/10333596 [01:25<00:00, 120563.12it/s]


# Process TREC

This loads and processes the TREC session track query logs.

In [22]:
def _appendInteraction(log, sessionId, interaction):
    """Append the query row, then one row per click, for one TREC interaction.

    Args:
        log: list that receives rows shaped
            ``[sessionId, query, starttime, rank, url]``; the query row has
            empty strings for rank and url.
        sessionId: value stored in the first field of every row.
        interaction: mapping parsed by xmltodict; ``'query'`` and
            ``'@starttime'`` are read, plus ``'clicked'`` and ``'results'``
            when the interaction has clicks.
    """
    query = interaction['query']
    log.append([sessionId, query, interaction['@starttime'], '', ''])
    clicked = interaction.get('clicked')
    if not clicked:
        # No <clicked> element, or one that parsed to None: nothing to add.
        return
    clickList = clicked['click']
    if not isinstance(clickList, list):
        # A single click is parsed as a bare mapping rather than a list.
        clickList = [clickList]
    results = interaction['results']['result']
    for click in clickList:
        rank = click['rank']
        # The rank is folded back to a position inside a page of ten results.
        # The previous if/elif ladder subtracted 10, 20, ... by hand, stopped
        # at different ranks for single and multiple clicks, and indexed past
        # the page for higher ranks.
        # NOTE(review): assumes every interaction lists ten results per page.
        log.append([sessionId, query, click['@starttime'], rank, results[(int(rank) - 1) % 10]['url']])

queryLog = []
with open('DataSources/TREC/sessiontrack2014.xml') as fd:
    doc = xmltodict.parse(fd.read())
for x, trecSession in enumerate(doc['sessiontrack2014']['session']):
    interactions = trecSession['interaction']
    if not isinstance(interactions, list):
        # A session with one interaction is parsed as a bare mapping.
        interactions = [interactions]
    for entry in interactions:
        if isinstance(entry, str):
            # Not a parsed interaction; show it instead of logging it.
            print(entry)
        else:
            _appendInteraction(queryLog, x, entry)
                    

In [23]:
def _appendInteraction(log, sessionId, interaction):
    """Append the query row, then one row per click, for one TREC interaction.

    Args:
        log: list that receives rows shaped
            ``[sessionId, query, starttime, rank, url]``; the query row has
            empty strings for rank and url.
        sessionId: value stored in the first field of every row.
        interaction: mapping parsed by xmltodict; ``'query'`` and
            ``'@starttime'`` are read, plus ``'clicked'`` and ``'results'``
            when the interaction has clicks.
    """
    query = interaction['query']
    log.append([sessionId, query, interaction['@starttime'], '', ''])
    clicked = interaction.get('clicked')
    if not clicked:
        # No <clicked> element, or one that parsed to None: nothing to add.
        return
    clickList = clicked['click']
    if not isinstance(clickList, list):
        # A single click is parsed as a bare mapping rather than a list.
        clickList = [clickList]
    results = interaction['results']['result']
    for click in clickList:
        rank = click['rank']
        # The rank is folded back to a position inside a page of ten results.
        # The previous if/elif ladder subtracted 10, 20, ... by hand, stopped
        # at different ranks for single and multiple clicks, and indexed past
        # the page for higher ranks.
        # NOTE(review): assumes every interaction lists ten results per page.
        log.append([sessionId, query, click['@starttime'], rank, results[(int(rank) - 1) % 10]['url']])

with open('DataSources/TREC/sessiontrack2013.xml') as fd:
    doc = xmltodict.parse(fd.read())
for x, trecSession in enumerate(doc['sessiontrack2013']['session']):
    interactions = trecSession['interaction']
    if not isinstance(interactions, list):
        # A session with one interaction is parsed as a bare mapping.
        interactions = [interactions]
    for entry in interactions:
        if isinstance(entry, str):
            # Previously the raw string was appended to queryLog, where every
            # other element is a five-field row; it is printed instead, as
            # the cells for the other years do.
            print(entry)
        else:
            _appendInteraction(queryLog, x, entry)
                    

In [24]:
def _appendInteraction(log, sessionId, interaction):
    """Append the query row, then one row per click, for one TREC interaction.

    Args:
        log: list that receives rows shaped
            ``[sessionId, query, starttime, rank, url]``; the query row has
            empty strings for rank and url.
        sessionId: value stored in the first field of every row.
        interaction: mapping parsed by xmltodict; ``'query'`` and
            ``'@starttime'`` are read, plus ``'clicked'`` and ``'results'``
            when the interaction has clicks.
    """
    query = interaction['query']
    log.append([sessionId, query, interaction['@starttime'], '', ''])
    clicked = interaction.get('clicked')
    if not clicked:
        # No <clicked> element, or one that parsed to None. The earlier guard
        # `type(entry['clicked']) is None` could never be true, and its
        # `break` would have skipped the rest of the session; an empty
        # element is now simply treated as "no clicks".
        return
    clickList = clicked['click']
    if not isinstance(clickList, list):
        # A single click is parsed as a bare mapping rather than a list.
        clickList = [clickList]
    results = interaction['results']['result']
    for click in clickList:
        rank = click['rank']
        # The rank is folded back to a position inside a page of ten results.
        # The previous if/elif ladder subtracted 10, 20, ... by hand, stopped
        # at different ranks for single and multiple clicks, and indexed past
        # the page for higher ranks.
        # NOTE(review): assumes every interaction lists ten results per page.
        log.append([sessionId, query, click['@starttime'], rank, results[(int(rank) - 1) % 10]['url']])

with open('DataSources/TREC/sessiontrack2012.xml') as fd:
    doc = xmltodict.parse(fd.read())
for x, trecSession in enumerate(doc['sessiontrack2012']['session']):
    interactions = trecSession['interaction']
    if not isinstance(interactions, list):
        # A session with one interaction is parsed as a bare mapping.
        interactions = [interactions]
    for entry in interactions:
        if isinstance(entry, str):
            # Not a parsed interaction; show it instead of logging it.
            print(entry)
        else:
            _appendInteraction(queryLog, x, entry)
                    

In [25]:
def _resultIndex(rank):
    """Return the 0-based position of a clicked result for a 1-based click rank.

    The rank is folded back into the range 0-9, which is what the previous
    if/elif ladder did by subtracting 1, 11, 21, ... per block of ten ranks.
    NOTE(review): this assumes each interaction lists a single page of ten
    results - confirm against the TREC session track files.
    """
    return (int(rank) - 1) % 10


def _logInteraction(log, sessionID, interaction):
    """Append the rows describing one interaction to ``log``.

    One row ``[sessionID, query, starttime, '', '']`` is always appended for
    the query itself, followed by one row
    ``[sessionID, query, click starttime, rank, url]`` per recorded click.
    """
    query = interaction['query']
    log.append([sessionID, query, interaction['@starttime'], '', ''])

    # An absent or empty <clicked> element means there is nothing more to log.
    # (The previous guard compared type(...) to None, which is never true.)
    clicked = interaction.get('clicked')
    if clicked is None:
        return

    # A single click is parsed as a mapping, several clicks as a list of them.
    clicks = clicked['click']
    if not isinstance(clicks, list):
        clicks = [clicks]

    results = interaction['results']['result']
    for click in clicks:
        rank = click['rank']
        log.append([sessionID, query, click['@starttime'], rank, results[_resultIndex(rank)]['url']])


with open('DataSources/TREC/sessiontrack2011.xml') as fd:
    doc = xmltodict.parse(fd.read())

for x, trecSession in enumerate(doc['sessiontrack2011']['session']):
    interactions = trecSession['interaction']
    if isinstance(interactions, list):
        for entry in interactions:
            if isinstance(entry, str):
                # Not a parsed interaction; surface it instead of logging it
                print(entry)
            else:
                _logInteraction(queryLog, x, entry)
    else:
        # Session with a single interaction
        _logInteraction(queryLog, x, interactions)
                    

In [47]:
from itertools import groupby

# Group consecutive queryLog rows that share a session id (column 0) into one
# list per session. groupby also emits the final run, which the previous
# hand-written loop never appended, so the last session is no longer lost.
TRECS = [list(rows) for _, rows in groupby(queryLog, key=lambda row: row[0])]

# Preprocess timestamp on TREC sessions to match AOL logs:
# every "HH:MM:SS[.fraction]" timestamp becomes seconds elapsed since the
# first row of its session. Sessions whose first timestamp has no ':' are
# left untouched.
for session in TRECS:
    if ':' not in str(session[0][2]):
        continue
    sessionStart = datetime.strptime(session[0][2].split('.')[0], "%H:%M:%S")
    for query in session:
        moment = datetime.strptime(query[2].split('.')[0], "%H:%M:%S")
        # Subtract the datetimes directly: strptime without a date yields
        # year 1900, and datetime.timestamp() can fail for such dates on
        # some platforms.
        query[2] = (moment - sessionStart).total_seconds()

# Only add sessions that have a click (a row with a non-empty rank column)
newTRECS = [session for session in TRECS if any(query[3] for query in session)]

TRECS = np.array(newTRECS, dtype = 'object')

# Create Sessions With Clicks

Creates the Sessions With Clicks data set by combining sessions from TREC with those we labeled as belonging to users who are, and are not, our Stereotype.

In [48]:
def _toSecondsSinceStart(sessions):
    """Rewrite column 2 of every row, in place, as seconds since the session's first row.

    Timestamps are expected as "%Y-%m-%d %H:%M:%S" strings. The datetimes are
    subtracted directly, so the result does not depend on the local timezone
    or DST rules of the machine running the notebook.
    """
    timeFormat = "%Y-%m-%d %H:%M:%S"
    for session in sessions:
        # Read the start before the loop: the first row is overwritten below.
        sessionStart = datetime.strptime(session[0][2], timeFormat)
        for query in session:
            moment = datetime.strptime(query[2], timeFormat)
            query[2] = (moment - sessionStart).total_seconds()


# Sample enough non-stereotype sessions that, together with the TREC sessions,
# they number four times the stereotype sessions.
randomNotSter = np.random.choice(notSterClick, size=(((len(duraSter) + len(prefSter))*4)-len(TRECS)), replace=False)
_toSecondsSinceStart(randomNotSter)

# The sampled sessions come first and the TREC sessions last
notSterComplete =  np.concatenate((randomNotSter, TRECS), axis=0)
sterComplete = np.concatenate((duraSter, prefSter), axis=0)
_toSecondsSinceStart(sterComplete)

  """Entry point for launching an IPython kernel.


# Single Query Data Set

Creates the Single Query data set from the Sven data source and by extracting single queries from the TREC sessions.

In [49]:
# Stereotype queries: the first CSV column of every row, minus its last two characters
with open('DataSources/Sven/ChildrenQueries.csv') as csv_file:
    sterSQS = [row[0][:-2] for row in csv.reader(csv_file)]

# Non-stereotype queries: the distinct query strings found in the TREC sessions
trecQueries = [query[1] for session in TRECS for query in session]
notSterSQS = list(set(trecQueries))

# Draw, without replacement, four times as many of them as there are stereotype queries
notSterSQS = np.random.choice(notSterSQS, size=(len(sterSQS)*4), replace=False)


# Further Preprocessing

There is an issue with how clicks and queries are represented in the AOL query logs; the following steps clearly separate the two, allowing for experimentation.

In [50]:
# Stereotype sessions all go through the AOL splitter
sterCompleteProc = splitQueryClicksAOL(sterComplete)

# notSterComplete holds the sampled sessions first and the TREC sessions last,
# so the boundary between the two sits len(TRECS) entries before the end.
trecStart = len(notSterComplete) - len(TRECS)
notSterCompleteProc = np.concatenate((
    splitQueryClicksAOL(notSterComplete[:trecStart]),
    splitQueryClicksTREC(notSterComplete[trecStart:]),
))

# Further Preprocessing Pt. 2

These next steps remove some bad queries, as the AOL query logs replace some queries with '-'. Furthermore, some punctuation is represented in its ASCII format; we replace that too. If this causes a session to contain no clicks, we remove that session, as one of our ground rules for these kinds of sessions is that they must contain at least one click. Finally, we convert all sessions into a dataframe.

In [51]:
newSter = []  # not used in this cell; kept in case another cell refers to it
notSterQL = []
sterQL = []
count = 0

# ' 20' followed by anything other than a digit, '.' or ' '. The lookahead
# keeps the following character out of the match, so only these occurrences
# are replaced; tokens such as ' 2005' in the same query are left alone.
# (The previous re.sub(' 20', ...) rewrote every ' 20' once one occurrence matched.)
pattern = re.compile(r' 20(?=[^0-9. ])')

sessionColumns = ["sID", "query", "timestamp","click","website","type","order"]


def _collectRows(sessions, rows, nextID):
    """Flatten ``sessions`` into ``rows`` and return the next unused session id.

    Rows whose query is '-' are skipped. Every kept row is modified in place:
    its query is cleaned with ``pattern``, column 0 is set to the session id,
    and its position among the kept rows of the session is appended as a
    final column. The id advances once per session, even if all of that
    session's rows were skipped.
    """
    for session in tqdm(sessions, total=len(sessions)):
        order = 0
        for query in session:
            if query[1] == '-':
                continue
            query[1] = pattern.sub(' ', query[1])
            query[0] = nextID
            query.append(order)
            rows.append(query)
            order += 1
        nextID += 1
    return nextID


count = _collectRows(sterCompleteProc, sterQL, count)
sterPD = pd.DataFrame(sterQL, columns = sessionColumns)
sterPD['class'] = 1

# Ids keep counting on from the stereotype sessions, so they stay unique
count = _collectRows(notSterCompleteProc, notSterQL, count)
notSterPD = pd.DataFrame(notSterQL, columns = sessionColumns)
notSterPD['class'] = 0

allSessions = pd.concat([sterPD, notSterPD])

# Keep only the sessions whose 'type' column has two distinct values,
# i.e. both a Click and a Query
hasBothTypes = allSessions.groupby('sID')['type'].nunique() == 2
toKeep = hasBothTypes[hasBothTypes].index

allSessions = allSessions[allSessions['sID'].isin(toKeep)]

# Context manager so the pickle file is flushed and closed deterministically
with open("DataSets/SWC/SWC.p", "wb") as swcFile:
    pickle.dump(allSessions, swcFile)

100%|██████████| 7980/7980 [00:00<00:00, 52469.24it/s]
100%|██████████| 31920/31920 [00:00<00:00, 48419.74it/s]


# Further Preprocessing Pt. 3

In the following block of code we preprocess SQS into a usable data frame.

In [52]:
# Continue numbering after the highest session id already used in allSessions.
# Starting at max() itself, as before, gave the first single query the same
# sID as an existing session.
count = allSessions['sID'].max() + 1

# One row per query: [query, sID, class], with class 0 = not stereotype
notSterList = [[entry, count + offset, 0] for offset, entry in enumerate(notSterSQS)]
count += len(notSterList)
notSterSQS = pd.DataFrame(data = notSterList, columns = ['query','sID','class'])

# Stereotype queries (class 1) take the ids that follow
sterList = [[entry, count + offset, 1] for offset, entry in enumerate(sterSQS)]
count += len(sterList)
sterSQS = pd.DataFrame(data = sterList, columns = ['query','sID','class'])

allSessionsSQS = pd.concat([notSterSQS, sterSQS])

# Context manager so the pickle file is flushed and closed deterministically
with open("DataSets/SQS/SQS.p", "wb") as sqsFile:
    pickle.dump(allSessionsSQS, sqsFile)