In this notebook we run our baselines on the SQS data set.

# Load Libraries

In the following block of code we import the libraries used in this notebook. 

In [1]:
import pickle
import pandas as pd
import numpy as np
import nltk
import re
import gensim
import csv
import collections

from collections import defaultdict
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from nltk.util import ngrams
from gensim import corpora, models
from sklearn.model_selection import KFold

pd.options.mode.chained_assignment = None 



# Declare Functions

In the following block of code we declare the functions we use.

In [2]:
def rowIndex(sentence):
    """Return the LDA output for a single sentence.

    The sentence is lower-cased and tokenized, converted to a bag-of-words
    with the notebook-level ``dictionary`` and then looked up in the
    notebook-level ``ldaModel``. Both globals must be assigned before this
    function is called.
    """
    # Lower-case first so the tokens match a lower-cased vocabulary
    bagOfWords = dictionary.doc2bow(word_tokenize(sentence.lower()))

    # Indexing the model with a bag-of-words yields its LDA values
    return ldaModel[bagOfWords]

def generateNgrams(s, n):
    """Return the word n-grams of ``s`` as a list of space-joined strings.

    The text is lower-cased, every character that is neither alphanumeric
    nor whitespace is replaced by a space, and the result is split into
    tokens on any run of whitespace.

    Parameters:
        s: the text to break into n-grams.
        n: the number of tokens per n-gram.

    Returns:
        A list with one string per n-gram, in order of appearance. The list
        is empty when the text holds fewer than ``n`` tokens.
    """
    # Convert to lowercase
    s = s.lower()

    # Replace all non-alphanumeric, non-whitespace characters with spaces
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)

    # Split on any whitespace run: the regex above keeps tabs and newlines,
    # so splitting on " " alone would leave them glued inside a token
    tokens = s.split()

    # Zip n shifted views of the token list to slide a window of size n
    windows = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(window) for window in windows]

# Load Data Sets and Preprocess

The following block of code loads and preprocesses our data sets.

In [3]:
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# were produced by this project.
# The files are opened in `with` blocks so the handles are closed after loading.
with open("Pickles/SWCNoTune.p", "rb") as sessionFile:
    allSessions = pickle.load(sessionFile)

# Keep only the rows whose 'type' is 'Q'
allSessionsQ = allSessions.loc[allSessions['type']=='Q']

# One list of queries per session id (sID), split by class label
allSessionsS = allSessionsQ.loc[allSessionsQ['class']==1].groupby('sID')['query'].apply(pd.Series.tolist).tolist()
allSessionsNS = allSessionsQ.loc[allSessionsQ['class']==0].groupby('sID')['query'].apply(pd.Series.tolist).tolist()

with open("../Data/DataSets/SQS/SQS.p", "rb") as sqsFile:
    allSessionsSQS = pickle.load(sqsFile)

# SQS queries as flat lists, split by class label
SQSS = allSessionsSQS[allSessionsSQS['class']==1]['query'].tolist()
SQSNS = allSessionsSQS[allSessionsSQS['class']==0]['query'].tolist()

# Global Variables

In the following block of code we declare our global variables.

In [4]:
# Number of cross-validation folds; also used to average the per-fold metrics
nSplits = 5

# Seeded, shuffled splitter that the baselines below iterate over
kfold = KFold(n_splits=nSplits, shuffle=True, random_state=20210530)

# Each baseline appends one summary row to this list
ovaResults = []

# Majority

In the following block of code we run our first baseline, the Majority classifier. This classifier labels every user as the majority class, i.e. as not belonging to our stereotype. We employ this as a baseline because of the imbalance between the number of users who do and do not belong to our stereotype.

In [5]:
# Ground-truth labels: the stereotype queries come first (1), then the rest (0)
newList = {'class': [1] * len(SQSS) + [0] * len(SQSNS)}

data = pd.DataFrame(newList)

# The majority baseline predicts class 0 for every row
data['prediction'] = 0

# Confusion-matrix cells and accuracy accumulated over all folds
tnOva = 0
fpOva = 0
fnOva = 0
tpOva = 0

testAccOva = 0

# Per-fold predictions, pickled at the end of the cell
outputMaj = []

# Train / test indices of every fold, kept for later inspection
trI = []
tI = []

for train_index, test_index in kfold.split(data):

    trI.append(train_index)
    tI.append(test_index)

    test = data.iloc[test_index]

    outputMaj.append(test['prediction'].tolist())

    testAccOva += accuracy_score(test['class'], test['prediction'])

    # labels=[0, 1] keeps the matrix 2x2 even when a fold holds one class
    # only, so the four-way unpacking below cannot fail
    tn, fp, fn, tp = confusion_matrix(test['class'], test['prediction'], labels=[0, 1]).ravel()

    tnOva += tn
    fpOva += fp
    fnOva += fn
    tpOva += tp

# Row layout: name, accuracy, TN, FP, TNR, FN, TP, TPR (averaged per fold)
majorityResults = [
    'Majority',
    round(testAccOva/nSplits,3),
    tnOva/nSplits,
    fpOva/nSplits,
    round(((tnOva/nSplits)/((tnOva/nSplits)+ (fpOva/nSplits))),3),
    fnOva/nSplits,
    tpOva/nSplits,
    round(((tpOva/nSplits)/((tpOva/nSplits)+ (fnOva/nSplits))),3),
]

ovaResults.append(majorityResults)

# Use a context manager so the output file is flushed and closed
with open("Pickles/OutputMajoritySQS.p", "wb") as outputFile:
    pickle.dump(outputMaj, outputFile)

# Rule Based

The following block of code runs our second baseline. Drawing inspiration from "An analysis of queries intended to search information for children" (https://dl.acm.org/doi/10.1145/1840784.1840819), we classify a session as belonging to our stereotype if it is under a certain length of time and has a click on a site designated for children based on the DMOZ tags.

In [6]:
ster = SQSS
notSter = SQSNS

# Every entry gets an empty list of visited sites and a duration of 0;
# only the class label differs between the two groups
numSter = len(ster)
numNotSter = len(notSter)

documents = [[] for _ in range(numSter + numNotSter)]
sessionDuration = [0] * (numSter + numNotSter)
seshClass = [1] * numSter + [0] * numNotSter

# Join the site columns (all but the first) of each row into one string
siteFrame = pd.DataFrame(data=documents)
joinedSites = siteFrame[siteFrame.columns[1:]].apply(
    lambda row: ' '.join(row.dropna().astype(str)),
    axis=1
)
data = pd.DataFrame(joinedSites, columns=['sites'])

data['duration'] = sessionDuration
data['class'] = seshClass

In [7]:
sites = data['sites'].tolist()
duration = data['duration'].tolist()

# Longest duration that still satisfies the rule (60*30, i.e. 30 minutes if
# the durations are expressed in seconds)
maxDuration = 60*30

durationClassification = []

for sD in duration:
    try:
        withinLimit = float(sD) <= maxDuration
    except (TypeError, ValueError):
        # A duration that cannot be parsed cannot satisfy the rule. A label is
        # still emitted so the list stays aligned with the rows of `data`
        withinLimit = False
    durationClassification.append(1 if withinLimit else 0)

kidsSites = []

# newline='' is what the csv module asks for when reading from a file object
with open('../Data/DataSources/DMOZ/URL Classification.csv', newline='') as csv_file:
    csvReader = csv.reader(csv_file)
    for row in csvReader:
        # Keep the value in column 1 when column 2 mentions 'Kids'
        # (presumably URL and DMOZ category; verify against the csv file).
        # Short rows are skipped, and so are empty values, because an empty
        # string is a substring of every session and would match everything
        if len(row) > 2 and 'Kids' in row[2] and row[1]:
            kidsSites.append(row[1])

siteClassification = []

with tqdm(total=len(sites)) as pbar:
    for visitedSites in sites:
        # 1 when any kids site occurs as a substring of the visited sites
        visitedKidsSite = any(kidsSite in visitedSites for kidsSite in kidsSites)
        siteClassification.append(1 if visitedKidsSite else 0)
        pbar.update()

100%|██████████| 1505/1505 [00:11<00:00, 135.52it/s]


In [8]:
data['durationClassification'] = durationClassification
data['siteClassification'] = siteClassification

# A row is predicted as stereotype only when both rules fire
data['prediction'] = data['durationClassification'] & data['siteClassification']

# Keep only the rows whose index label appears among the sID values of
# allSessions.
# NOTE(review): `data` is built from the SQS queries while allSessions holds
# the session data set; confirm that this filter is intended here.
data = data[data.index.isin(allSessions.sID)]

# Confusion-matrix cells and accuracy accumulated over all folds
tnOva = 0
fpOva = 0
fnOva = 0
tpOva = 0

testAccOva = 0

# Per-fold predictions, pickled at the end of the cell
outputDT = []

for train_index, test_index in kfold.split(data):
    test = data.iloc[test_index]
    outputDT.append(test['prediction'].tolist())
    testAccOva += accuracy_score(test['class'], test['prediction'])
    # labels=[0, 1] keeps the matrix 2x2 even when a fold holds one class
    # only, so the four-way unpacking below cannot fail
    tn, fp, fn, tp = confusion_matrix(test['class'], test['prediction'], labels=[0, 1]).ravel()
    tnOva += tn
    fpOva += fp
    fnOva += fn
    tpOva += tp

# Row layout: name, accuracy, TN, FP, TNR, FN, TP, TPR (averaged per fold)
ruleResults = [
    'Rule',
    round(testAccOva/nSplits,3),
    tnOva/nSplits,
    fpOva/nSplits,
    round(((tnOva/nSplits)/((tnOva/nSplits)+ (fpOva/nSplits))),3),
    fnOva/nSplits,
    tpOva/nSplits,
    round(((tpOva/nSplits)/((tpOva/nSplits)+ (fnOva/nSplits))),3),
]

ovaResults.append(ruleResults)

# Use a context manager so the output file is flushed and closed
with open("Pickles/OutputRuleSQS.p", "wb") as outputFile:
    pickle.dump(outputDT, outputFile)

# Text Based

In the following blocks of code we implement the text based classifier proposed in "Age detection in chat" (https://ieeexplore.ieee.org/document/5298540), concatenating the queries of all our sessions and performing text based classification to recognize whether or not a session was generated by a user who belongs to our stereotype.

In [9]:
# One text entry per session: every query followed by a single space
stereotypeEntries = [
    "".join(query + " " for query in session) for session in allSessionsS
]
otherEntries = [
    "".join(query + " " for query in session) for session in allSessionsNS
]

# Stereotype sessions (tag 1) come first, then the remaining sessions (tag 0)
newList = pd.DataFrame(data = {
    'entry': stereotypeEntries + otherEntries,
    'tag': [1] * len(stereotypeEntries) + [0] * len(otherEntries),
})

In [10]:
# SQS queries with their tags: stereotype queries (1) first, then the rest (0)
sqsEntries = list(SQSS) + list(SQSNS)
sqsTags = [1] * len(SQSS) + [0] * len(SQSNS)

newList2 = {'entry': sqsEntries, 'tag': sqsTags}

data2 = pd.DataFrame(data = newList2)

In [11]:
rng = np.random.RandomState(20210414)

# Confusion-matrix cells and accuracy accumulated over all folds
tnOva = 0
fpOva = 0
fnOva = 0
tpOva = 0

testAccOva = 0

# Per-fold predictions, pickled at the end of the cell
outputTam = []

randomNumber = 20210530

for trainIndex, testIndex in kfold.split(data2):

    # Index with the variables bound by this loop so that every fold uses
    # its own split of the SQS data
    trainX, testX = data2.iloc[trainIndex]['entry'], data2.iloc[testIndex]['entry']
    trainY, testY = data2.iloc[trainIndex]['tag'], data2.iloc[testIndex]['tag']

    # Draw 20% of the session data and keep the complement as well
    test = newList.sample(frac=0.20, random_state=rng)
    trainMask = pd.Series(True, index=newList.index)
    trainMask[test.index] = False
    train = newList[trainMask].copy()

    # NOTE(review): the 20% sample (`test`) is what gets added to the
    # training data, while the 80% complement (`train`) is never used.
    # Confirm this is intended.
    trainX = pd.concat([trainX, test['entry']])
    trainY = pd.concat([trainY, test['tag']])
    Encoder = LabelEncoder()

    # Fit the encoder on the training labels only and reuse that mapping for
    # the test labels, so both sides share one encoding
    trainY = Encoder.fit_transform(trainY)
    testY = Encoder.transform(testY)

    # The trigram vocabulary is learnt from the session entries; only the
    # fitted vocabulary is needed here, not the transformed matrix
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(3, 3))
    vectorizer.fit(newList['entry'])

    # Despite the names these are raw counts (CountVectorizer), not tf-idf
    trainXTfidf = vectorizer.transform(trainX)
    testXTfidf = vectorizer.transform(testX)

    SVM = svm.SVC(C=1.0, kernel='linear', gamma='auto')

    SVM.fit(trainXTfidf,trainY)

    # Predict the labels of this fold's test queries
    predictionsSVM = SVM.predict(testXTfidf)

    outputTam.append(predictionsSVM)

    testAccOva += accuracy_score(testY, predictionsSVM)

    # labels=[0, 1] keeps the matrix 2x2 even when a fold holds one class only
    tn, fp, fn, tp = confusion_matrix(testY, predictionsSVM, labels=[0, 1]).ravel()

    tnOva += tn
    fpOva += fp
    fnOva += fn
    tpOva += tp

    randomNumber+=1

# Row layout: name, accuracy, TN, FP, TNR, FN, TP, TPR (averaged per fold)
textBasedResults = [
    'Text',
    round(testAccOva/nSplits,3),
    tnOva/nSplits,
    fpOva/nSplits,
    round(((tnOva/nSplits)/((tnOva/nSplits)+ (fpOva/nSplits))),3),
    fnOva/nSplits,
    tpOva/nSplits,
    round(((tpOva/nSplits)/((tpOva/nSplits)+ (fnOva/nSplits))),3),
]

ovaResults.append(textBasedResults)

# Use a context manager so the output file is flushed and closed
with open("Pickles/OutputTextSQS.p", "wb") as outputFile:
    pickle.dump(outputTam, outputFile)

# MultiFeature

In the following blocks of code we re-implement the strategy proposed in "Author profiling: Predicting age and gender from blogs" (https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.667.4496&rep=rep1&type=pdf). This process involves extracting a variety of features from the queries found in sessions, training on some of those features and using the rest for classification. We don't break up the large block of code that performs the classification due to the nature of kfold split.

In [12]:
documents = []
classList = []
numQ = []

# One document per session: its queries, each preceded by a single space.
# Stereotype sessions (class 1) come first, then the others (class 0)
for label, sessions in ((1, allSessionsS), (0, allSessionsNS)):
    for session in sessions:
        documents.append("".join(" " + query for query in session))
        classList.append(label)
        numQ.append(len(session))

data = pd.DataFrame(documents, columns = ['text'])

# Part-of-speech tags of every document. pos_tag yields (token, tag) pairs,
# so after flattening the tags sit at the odd positions
posData = []
for document in documents:
    text = nltk.word_tokenize(document)
    tags = np.array(nltk.pos_tag(text)).flatten()
    posData.append(tags[1::2])

data['pos'] = posData
data['numQ'] = numQ

# Word uni-, bi- and tri-grams of every document
docUni = [generateNgrams(document, 1) for document in documents]
docBi = [generateNgrams(document, 2) for document in documents]
docTri = [generateNgrams(document, 3) for document in documents]

data['uniWord'] = docUni
data['biWord'] = docBi
data['triWord']= docTri

# The tags of each document as one string, every tag followed by a space
posMod = ["".join(str(entry) + " " for entry in pos) for pos in posData]

# Part-of-speech uni-, bi- and tri-grams of every document
posUni = [generateNgrams(document, 1) for document in posMod]
posBi = [generateNgrams(document, 2) for document in posMod]
posTri = [generateNgrams(document, 3) for document in posMod]

data['uniPos'] = posUni
data['biPos'] = posBi
data['triPos']= posTri
data['posMod'] = posMod
data['class'] = classList

# Tokenize the concatenation of all documents to count corpus-wide n-grams
text = "".join(data['text'].tolist())
token = word_tokenize(text)

# The "stop words" are the n-grams left over once the 40000 most common ones
# are set aside, listed from least to most common. max(..., 0) yields an empty
# list when there are at most 40000 distinct n-grams; a negative count would
# make the slice return almost the entire vocabulary.
# NOTE(review): every entry is an (n-gram tuple, count) pair as returned by
# Counter.most_common(), not a plain string; confirm the consumers of these
# lists expect that shape.
maxVocabulary = 40000

unigrams = list(ngrams(token, 1))
uniwords = collections.Counter(unigrams)
n = max(len(uniwords) - maxVocabulary, 0)
stopWordsUni = uniwords.most_common()[::-1][:n]

bigrams = list(ngrams(token, 2))
biwords = collections.Counter(bigrams)
n = max(len(biwords) - maxVocabulary, 0)
stopWordsBi = biwords.most_common()[::-1][:n]

trigrams = list(ngrams(token, 3))
triwords = collections.Counter(trigrams)
n = max(len(triwords) - maxVocabulary, 0)
stopWordsTri = triwords.most_common()[::-1][:n]

In [13]:
# One document per SQS query, each preceded by a single space. Stereotype
# queries (class 1) come first, then the others (class 0)
documents = [" " + query for query in SQSS] + [" " + query for query in SQSNS]
classList = [1] * len(SQSS) + [0] * len(SQSNS)

data2 = pd.DataFrame(documents, columns = ['text'])

# Part-of-speech tags of every document. pos_tag yields (token, tag) pairs,
# so after flattening the tags sit at the odd positions
posData = [
    np.array(nltk.pos_tag(nltk.word_tokenize(document))).flatten()[1::2]
    for document in documents
]

data2['pos'] = posData

# Word uni-, bi- and tri-grams of every document
for column, order in (('uniWord', 1), ('biWord', 2), ('triWord', 3)):
    data2[column] = [generateNgrams(document, order) for document in documents]

# The tags of each document as one string, every tag followed by a space
posMod = ["".join(str(entry) + " " for entry in pos) for pos in posData]

# Part-of-speech uni-, bi- and tri-grams of every document
for column, order in (('uniPos', 1), ('biPos', 2), ('triPos', 3)):
    data2[column] = [generateNgrams(document, order) for document in posMod]

data2['posMod'] = posMod
data2['class'] = classList

# Every SQS entry is a single query
data2['numQ'] = 1

FlatSingle = pd.DataFrame(data= data2)

In [None]:
def _svmPredictions(vectorizer, column, fitFrame, trainFrame, trainLabels, evalFrames):
    """Train a linear SVM on one text column and predict for several frames.

    The vectorizer vocabulary is learnt from ``fitFrame[column]``, the SVM is
    trained on ``trainFrame[column]`` with ``trainLabels``, and one array of
    predictions is returned for every frame in ``evalFrames``.
    """
    vectorizer.fit(fitFrame[column])
    model = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    model.fit(vectorizer.transform(trainFrame[column]), trainLabels)
    return [model.predict(vectorizer.transform(frame[column])) for frame in evalFrames]


def _padTopics(topicLists, numTopics):
    """Expand sparse (topic id, weight) lists into dense lists of ``numTopics`` values."""
    padded = []
    for entry in topicLists:
        # A fresh list per row; topics that are not listed keep the weight 0
        row = [0] * numTopics
        for topicId, weight in entry:
            row[topicId] = weight
        padded.append(row)
    return padded


# Seed of the per-fold random state; incremented after every fold
rn = 20210530
nSplits = 5

# Confusion-matrix cells and accuracy accumulated over all folds
tnOva = 0
fpOva = 0
fnOva = 0
tpOva = 0

testAccOva = 0

# Fold counter
x = 0

accByNum = []
outputQ = []

# Per-fold predictions of the final classifier
outputSan = []

# Names of the nine base-model outputs that feed the final classifier
feat_cols = [
    'SVMUni',
    'SVMBi',
    'SVMTri',
    'SVMUniPOS',
    'SVMBiPOS',
    'SVMTriPOS',
    'SVMUniLDA',
    'SVMBiLDA',
    'SVMTriLDA',
]

out_col = 'class'

for trainIndex, testIndex in kfold.split(data2):

    print('start')

    # NOTE(review): the SQS rows selected by testIndex are used for training
    # and the rows selected by trainIndex for testing; confirm that this
    # swap of the fold roles is intended.
    train = data2.iloc[testIndex].copy()
    test = data2.iloc[trainIndex].copy()

    # Set 20% of the session data aside; the remaining 80% joins the training data
    rng = np.random.RandomState(rn)
    testSing = data.sample(frac=0.20, random_state=rng)
    trainMask = pd.Series(True, index=data.index)
    trainMask[testSing.index] = False
    trainSing = data[trainMask].copy()

    # data2 and data each carry their own 0-based index, so the index is
    # rebuilt here. With duplicated labels the label-based masks below would
    # also drop rows that were never sampled.
    train = pd.concat([train, trainSing], ignore_index=True)

    # 20% of the training data (subtest) is held out to train the final
    # classifier; the base models are trained on the rest (subtrain)
    subtest = train.sample(frac=0.20, random_state=rng)
    subtrainMask = pd.Series(True, index=train.index)
    subtrainMask[subtest.index] = False
    subtrain = train[subtrainMask].copy()

    # Base-model predictions for the held-out rows and for the test rows
    subtestPredictions = {}
    testPredictions = {}

    ## word-Gram Model Training
    print('start word-Gram training')

    Encoder = LabelEncoder()

    trainY = Encoder.fit_transform(subtrain['class'])

    # NOTE(review): stopWordsUni/Bi/Tri are passed to stop_words exactly as
    # they were built; confirm they have the shape CountVectorizer expects.
    wordVectorizers = [
        ('SVMUni', CountVectorizer(stop_words=stopWordsUni)),
        ('SVMBi', CountVectorizer(analyzer='word', ngram_range=(2, 2),stop_words=stopWordsBi)),
        ('SVMTri', CountVectorizer(analyzer='word', ngram_range=(3, 3), stop_words=stopWordsTri)),
    ]

    for feature, vectorizer in wordVectorizers:
        subtestPredictions[feature], testPredictions[feature] = _svmPredictions(
            vectorizer, 'text', train, subtrain, trainY, (subtest, test))

    ## PoS Gram Model Training

    print('start PoS-Gram training')

    posVectorizers = [
        ('SVMUniPOS', CountVectorizer()),
        ('SVMBiPOS', CountVectorizer(analyzer='word', ngram_range=(2, 2))),
        ('SVMTriPOS', CountVectorizer(analyzer='word', ngram_range=(3, 3))),
    ]

    for feature, vectorizer in posVectorizers:
        subtestPredictions[feature], testPredictions[feature] = _svmPredictions(
            vectorizer, 'posMod', train, subtrain, trainY, (subtest, test))

    ## LDA

    print('start LDA training')

    # 60% of the training data fits the topic models; the other 40% (LDAtest)
    # trains the logistic regressions on the topic features
    LDAtest = train.sample(frac=0.4, random_state=rng)
    LDAtrain_mask = pd.Series(True, index=train.index)
    LDAtrain_mask[LDAtest.index] = False
    LDAtrain = train[LDAtrain_mask].copy()

    # Three topic models: all documents (200 topics), class 1 documents only
    # and class 0 documents only (100 topics each)
    ldaRuns = [
        ('ldaOneOverallPad', LDAtrain['uniWord'], 200),
        ('ldaOneTrue', LDAtrain.loc[LDAtrain['class'] ==1]['uniWord'], 100),
        ('ldaOneFalse', LDAtrain.loc[LDAtrain['class'] ==0]['uniWord'], 100),
    ]

    for column, processedDocs, numTopics in ldaRuns:
        # rowIndex reads the notebook-level names `dictionary` and `ldaModel`,
        # so both must be rebound before it is applied below.
        # NOTE(review): no random_state is passed to LdaModel, so the topics
        # are presumably not reproducible between runs; confirm.
        dictionary = gensim.corpora.Dictionary(processedDocs)
        dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
        corpus = [dictionary.doc2bow(doc) for doc in processedDocs]
        ldaModel = models.LdaModel(corpus, num_topics=numTopics,
                                   id2word=dictionary,
                                   passes=4, alpha=[0.01]*numTopics,
                                   eta=[0.01]*len(dictionary.keys()))

        for frame in (subtest, test, LDAtest):
            frame[column] = _padTopics(frame['text'].apply(rowIndex).tolist(), numTopics)

    # Adding two columns of lists concatenates the lists row by row
    for frame in (LDAtest, test, subtest):
        frame['LDAClassOne'] = frame['ldaOneTrue'] + frame['ldaOneFalse']
        frame['LDAallOne'] = frame['ldaOneOverallPad'] + frame['LDAClassOne']

    # One logistic regression per topic-feature set, trained on LDAtest
    ldaFeatures = [
        ('SVMUniLDA', 'ldaOneOverallPad'),
        ('SVMBiLDA', 'LDAClassOne'),
        ('SVMTriLDA', 'LDAallOne'),
    ]

    for feature, column in ldaFeatures:
        clf = LogisticRegression(random_state=0, solver = 'liblinear').fit(list(LDAtest[column]),LDAtest['class'])
        subtestPredictions[feature] = clf.predict(list(subtest[column]))
        testPredictions[feature] = clf.predict(list(test[column]))

    print('start model testing')

    # The base-model predictions become the features of the final classifier
    for feature in feat_cols:
        subtest[feature] = subtestPredictions[feature]
        test[feature] = testPredictions[feature]

    trainX = subtest[feat_cols]
    trainY = subtest[out_col]
    testX = test[feat_cols]
    testY = test[out_col]

    pipeLine = Pipeline([
        ('standardize', StandardScaler()),
        ('classify', DecisionTreeClassifier(criterion='entropy',random_state= 20210518, class_weight = 'balanced'))
        ])

    pipeLine.fit(trainX, trainY)

    prediction = pipeLine.predict(testX)

    # Accuracy of this fold; it has to be computed before it is accumulated
    acc = accuracy_score(testY, prediction)
    testAccOva += acc
    outputSan.append(prediction)

    # labels=[0, 1] keeps the matrix 2x2 even when a fold holds one class only
    SanAQtn, SanAQfp, SanAQfn, SanAQtp = confusion_matrix(testY, prediction, labels=[0, 1]).ravel()
    tnOva += SanAQtn
    fpOva += SanAQfp
    fnOva += SanAQfn
    tpOva += SanAQtp

    rn+=1
    x+=1


start
start word-Gram training
start PoS-Gram training


In [None]:
# Row layout: name, accuracy, TN, FP, TNR, FN, TP, TPR (averaged per fold)
multiFeatureResults = [
    'MultiFeature',
    round(testAccOva/nSplits,3),
    tnOva/nSplits,
    fpOva/nSplits,
    round(((tnOva/nSplits)/((tnOva/nSplits)+ (fpOva/nSplits))),3),
    fnOva/nSplits,
    tpOva/nSplits,
    round(((tpOva/nSplits)/((tpOva/nSplits)+ (fnOva/nSplits))),3),
]

ovaResults.append(multiFeatureResults)

# Use a context manager so the output file is flushed and closed
with open("Pickles/OutputMultiFeatureSQS.p", "wb") as outputFile:
    pickle.dump(outputSan, outputFile)

# MultiModel

In the following block of code we implement the classification strategy found in "Gender and Age Prediction Multilingual Author Profiles Based on Comments" (http://ceur-ws.org/Vol-2266/T4-4.pdf). This strategy relies on an ensemble classifier to recognize users based on the text found in their sessions.

In [None]:
# Session data: one text entry per session (every query followed by a space).
# Stereotype sessions (class 1) come first, then the others (class 0)
newList = {}
newList['entry'] = (
    ["".join(query + " " for query in session) for session in allSessionsS]
    + ["".join(query + " " for query in session) for session in allSessionsNS]
)
newList['class'] = [1] * len(allSessionsS) + [0] * len(allSessionsNS)

data = pd.DataFrame(newList)

# SQS data: stereotype queries (class 1) first, then the others (class 0).
# Plain list concatenation keeps the labels and queries aligned whatever the
# number of stereotype queries is, with no hard-coded offset
SQSY = [1] * len(SQSS) + [0] * len(SQSNS)
SQSX = list(SQSS) + list(SQSNS)

# Two rows (labels, queries) transposed into one row per query
single = pd.DataFrame(data =[SQSY,SQSX]).T
single.rename(columns ={0:'class', 1:'entry'}, inplace = True)

In [None]:
# Stand-alone vectorizer; the pipelines below each build their own
vectorizer = CountVectorizer()

# Every member of the ensemble is a vectorizer followed by a classifier
lrPipeline = make_pipeline(
    CountVectorizer(),
    LogisticRegressionCV(solver = 'liblinear', max_iter=1000),
)
nbPipeline = make_pipeline(
    TfidfVectorizer(),
    MultinomialNB(alpha = .13),
)
gbPipeline = make_pipeline(
    CountVectorizer(),
    GradientBoostingClassifier(learning_rate = .2, max_depth = 2),
)
mlpPipeline = make_pipeline(
    CountVectorizer(),
    MLPClassifier(hidden_layer_sizes= 21, max_iter=1500, shuffle=False, tol=0.012 ),
)

estimators = [
    ('lr', lrPipeline),
    ('nb', nbPipeline),
    ('gb', gbPipeline),
    ('mlp', mlpPipeline),
]

# Hard voting: the ensemble is configured to combine the predicted labels
eclf1 = VotingClassifier(estimators=estimators, voting='hard')

In [None]:
# Confusion-matrix cells and accuracy accumulated over all folds
tnOva = 0
fpOva = 0
fnOva = 0
tpOva = 0

testAccOva = 0

accByNum = []
outputQ = []

# Per-fold predictions, pickled at the end of the cell
outputNem = []

for trainIndex, testIndex in kfold.split(single):

    # `rn` is the seed counter set up by the MultiFeature cell; it keeps
    # being incremented here so that every fold draws a different sample
    rng = np.random.RandomState(rn)
    test = data.sample(frac=0.20, random_state=rng)
    trainMask = pd.Series(True, index=data.index)
    trainMask[test.index] = False
    train = data[trainMask].copy()

    # Index with the variables bound by this loop so that every fold uses
    # its own split of the SQS data.
    # NOTE(review): the rows selected by trainIndex are used for testing and
    # the rows selected by testIndex for training; confirm that this swap of
    # the fold roles is intended.
    testX, trainX = single.iloc[trainIndex]['entry'], single.iloc[testIndex]['entry']
    testY, trainY = single.iloc[trainIndex]['class'], single.iloc[testIndex]['class']

    # Add the 80% of the session data that was not sampled above
    trainX = pd.concat([trainX, train['entry']])
    trainY = pd.concat([trainY, train['class']])

    eclf1 = eclf1.fit(trainX.tolist(), trainY.tolist())

    prediction = eclf1.predict(testX)

    outputNem.append(prediction)

    testAccOva += accuracy_score(testY.tolist(), prediction)

    # labels=[0, 1] keeps the matrix 2x2 even when a fold holds one class only
    tn, fp, fn, tp = confusion_matrix(testY.tolist(), prediction, labels=[0, 1]).ravel()

    tnOva += tn
    fpOva += fp
    fnOva += fn
    tpOva += tp
    rn+=1

# Row layout: name, accuracy, TN, FP, TNR, FN, TP, TPR (averaged per fold)
multiModelResults = [
    'MultiModel',
    round(testAccOva/nSplits,3),
    tnOva/nSplits,
    fpOva/nSplits,
    round(((tnOva/nSplits)/((tnOva/nSplits)+ (fpOva/nSplits))),3),
    fnOva/nSplits,
    tpOva/nSplits,
    round(((tpOva/nSplits)/((tpOva/nSplits)+ (fnOva/nSplits))),3),
]

ovaResults.append(multiModelResults)

# Use a context manager so the output file is flushed and closed
with open("Pickles/OutputMultiModelSQS.p", "wb") as outputFile:
    pickle.dump(outputNem, outputFile)


# Results

In the following block of code we aggregate and then save the results.

In [None]:
# One row per baseline, in the order in which the baselines were run
allResults = pd.DataFrame(data = ovaResults, columns = ["Type","Acc", "TN", "FP", "TNR", "FN", "TP", "TPR"])

# Use a context manager so the output file is flushed and closed
with open("Pickles/BaselineSQS.p", "wb") as outputFile:
    pickle.dump(allResults, outputFile)