In this notebook we perform our ablation study on the SQS data set.

# Load Libraries

In the following block of code we import the libraries used in this notebook. 

In [1]:
import pickle
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Turn off pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None 

# Define Functions

In the following block of code we define the function that trains and evaluates the classifier for a given feature set in our ablation study.

In [2]:
def recognizeSearcher(params, feats, featType):
    """Classify whether a user belongs to our stereotype using one feature set.

    A random forest (optionally preceded by a scaler) is fitted once per K-fold
    split of the module-level ``SWC`` frame. Each fold's training data is the
    SWC training part plus a fixed 20% sample of the module-level ``SQS``
    frame; every fold is evaluated on the remaining 80% of ``SQS``.

    Parameters
    ----------
    params : dict
        Best hyperparameters. Keys read here: ``'scaler'``, ``'classWeight'``,
        ``'numEstimators'``, ``'bootStrap'`` and ``'criterion'``. A falsy
        ``'scaler'`` skips the standardize step; a falsy ``'classWeight'``
        leaves the forest's ``class_weight`` at its default.
    feats : list
        Column names to train and test on.
    featType : str
        Label of the feature set; used in the output file name and as the
        first element of the returned list.

    Returns
    -------
    list
        ``[featType, Acc, TN, FP, TNR, FN, TP, TPR]`` where accuracy and the
        confusion-matrix counts are averaged over the folds, and Acc/TNR/TPR
        are rounded to three decimals.

    Side effects
    ------------
    Writes the per-fold prediction arrays to
    ``Pickles/OutputAll<featType>SQS.p``.
    """
    nSplits = 5
    randomNumber = 20210530

    tnOva = 0
    fpOva = 0
    fnOva = 0
    tpOva = 0
    testAccOva = 0
    outputAll = []

    kfold = KFold(n_splits=nSplits, random_state=randomNumber, shuffle=True)

    X = SWC[feats]
    y = SWC['class']

    # Build the pipeline once; optional pieces are added only when configured.
    steps = []
    if params['scaler']:
        steps.append(('standardize', params['scaler']))

    forestArgs = {
        'n_estimators': params['numEstimators'],
        'bootstrap': params['bootStrap'],
        'criterion': params['criterion'],
        'random_state': randomNumber,
        'n_jobs': -1,
    }
    if params['classWeight']:
        forestArgs['class_weight'] = params['classWeight']
    steps.append(('classify', RandomForestClassifier(**forestArgs)))
    tunePipe = Pipeline(steps)

    # The SQS sample is drawn from a generator seeded with the same value, so
    # it is the same for every fold; compute the split once, outside the loop.
    trainSQS = SQS.sample(frac=0.20, random_state=np.random.RandomState(randomNumber))
    test = SQS.loc[~SQS.index.isin(trainSQS.index)].copy()

    subTrainX = trainSQS[feats]
    subTrainY = trainSQS['class']
    # NOTE(review): only the training features are NaN-filled below; the test
    # features are passed to predict() as-is -- confirm SQS has no missing values.
    testX = test[feats]
    testY = test['class']

    for train_index, _ in kfold.split(X):
        trainX = pd.concat([X.iloc[train_index], subTrainX]).fillna(0)
        trainY = pd.concat([y.iloc[train_index], subTrainY])
        tunePipe.fit(trainX, trainY)

        # Predict once per fold and reuse the result for every metric.
        predictions = tunePipe.predict(testX)

        testAccOva += accuracy_score(testY, predictions)
        # Unpacking into four values assumes a binary (2x2) confusion matrix.
        tn, fp, fn, tp = confusion_matrix(testY, predictions).ravel()
        tnOva += tn
        fpOva += fp
        fnOva += fn
        tpOva += tp
        outputAll.append(predictions)

    # Context manager guarantees the pickle is flushed and the handle closed.
    with open("Pickles/OutputAll" + str(featType) + "SQS.p", "wb") as outFile:
        pickle.dump(outputAll, outFile)

    meanTn = tnOva / nSplits
    meanFp = fpOva / nSplits
    meanFn = fnOva / nSplits
    meanTp = tpOva / nSplits

    results = [
        featType,
        round(testAccOva / nSplits, 3),
        meanTn,
        meanFp,
        round(meanTn / (meanTn + meanFp), 3),
        meanFn,
        meanTp,
        round(meanTp / (meanTp + meanFn), 3),
    ]
    return results

# Load Data Set and Parameters

The following block of code loads the features of the SWC and SQS data sets and the best hyperparameters.

In [3]:
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
# Each file is opened in a context manager so its handle is closed right after loading.
with open("Pickles/SWCFeatNoTune.p", "rb") as swcFile:
    SWC = pickle.load(swcFile)

with open("../FeatureExtraction/DataSets/SQSFeatures/SQSFeat.p", "rb") as sqsFile:
    SQS = pickle.load(sqsFile)

with open("Pickles/BestParam.p", "rb") as paramFile:
    bestParameters = pickle.load(paramFile)

# Define Feature Sets

The following blocks of code define the feature sets on which we perform our ablation study. We chose to separate this code into several blocks for the sake of legibility.

In [4]:
# Full feature set: every SQS column except the target column 'class'.
features = SQS.columns.tolist()
del features[features.index('class')]

In [5]:
# Column names making up the 'P3' feature set (order preserved).
P3Feats = [
    'ld', 'ls1', 'ls2', 'vs1', 'vs2', 'cvs1',
    'ndw', 'ttr', 'cttr', 'rttr', 'logttr', 'lv',
    'vv1', 'svv1', 'cvv1', 'vv2', 'nv', 'adjv',
    'numSpellingErrors', 'offByOne', 'kidsError',
    'coreVocab', 'nonCoreVocab',
    'minAoA', 'maxAoA', 'ratioAoA',
    'queryComplexity', 'SVEN',
    'top250SterCount', 'top250SterRatAnt', 'top250SterRatCon',
    'top250NonSterCount', 'top250NonSterRatAnt', 'top250NonSterRatCon',
    'top50SterCount', 'top50SterRatAnt', 'top50SterAntCon',
    'top50NonSterCount', 'top50NonSterRatAnt', 'top50NonSterAntCon',
    'tfidfAll', 'tfidfS', 'tfidfNS',
]

In [6]:
# Column names making up the 'DC1' feature set (order preserved).
# The ' Level*' / ' MeanLevel' names carry a leading space -- presumably this
# matches the column names in the feature frames, so do not strip it.
DC1Feats = [
    'ndw', 'ttr', 'cttr', 'rttr', 'logttr', 'lv',
    'vv1', 'svv1', 'cvv1', 'vv2', 'nv', 'adjv',
    'totalSyl', 'avgSyl', 'simWords', 'comWords', 'greatestSyl', 'leastSyl',
    'numChars', 'numWords', 'avgLenWord',
    'minAoA', 'maxAoA', 'queryComplexity', 'stopCount',
    'com', 'net', 'org', 'edu', 'gov', 'http',
    'AND', 'OR', 'quotes', 'inter',
    'numSpellingErrors', 'offByOne', 'kidsError', 'punct', 'casing',
    ' Level0', ' Level1', ' Level2', ' Level3',
    ' Level4', ' Level5', ' Level6', ' Level7', ' MeanLevel',
    'cc', 'cd', 'dt', 'ex', 'fw', 'in', 'jj', 'jjr', 'jjs', 'md',
    'nn', 'nnp', 'nnps', 'nns', 'pdt', 'pos', 'prp',
    'rb', 'rbr', 'rbs', 'rp', 'sym', 'to', 'uh',
    'vb', 'vbd', 'vbg', 'vbn', 'vbp', 'vbz',
    'wdt', 'wp', 'wrb',
    'nn nn', 'jj nn', 'nn nns', 'to vb', 'jj nns',
    'jj to', 'nn in', 'nns in', 'in nn', 'dt nn',
    'jj nn nn', 'nn nn nn', 'jj to vb', 'nn nn nns', 'to vb nn',
    'repeatClicks', 'clickDistance', 'meanClickPosition',
    'numClicks', 'numClicksPerQuery', 'numQueries', 'timeClicks',
    'uniqueQueries', 'allSameClicks', 'uniqueClicks', 'allSameQueries',
    'queryDistance', 'timeQueries', 'repeatQueries',
]

In [7]:
# Column names making up the text-based feature set (order preserved).
# The ' Level*' / ' MeanLevel' names carry a leading space -- presumably this
# matches the column names in the feature frames, so do not strip it.
TextFeat = [
    'cc', 'cd', 'dt', 'ex', 'fw', 'in', 'jj', 'jjr', 'jjs', 'md',
    'nn', 'nnp', 'nnps', 'nns', 'pdt', 'pos', 'prp',
    'rb', 'rbr', 'rbs', 'rp', 'sym', 'to', 'uh',
    'vb', 'vbd', 'vbg', 'vbn', 'vbp', 'vbz',
    'wdt', 'wp', 'wrb',
    'nn nn', 'jj nn', 'nn nns', 'to vb', 'jj nns',
    'jj to', 'nn in', 'nns in', 'in nn', 'dt nn',
    'jj nn nn', 'nn nn nn', 'jj to vb', 'nn nn nns', 'to vb nn',
    ' Level0', ' Level1', ' Level2', ' Level3',
    ' Level4', ' Level5', ' Level6', ' Level7', ' MeanLevel',
    'totalSyl', 'avgSyl', 'simWords', 'comWords', 'greatestSyl', 'leastSyl',
    'numChars', 'numWords', 'avgLenWord',
    'ld', 'ls1', 'ls2', 'vs1', 'vs2', 'cvs1',
    'ndw', 'ttr', 'cttr', 'rttr', 'logttr', 'lv',
    'vv1', 'svv1', 'cvv1', 'vv2', 'nv', 'adjv',
    'numSpellingErrors', 'offByOne', 'kidsError', 'punct', 'casing',
    'coreVocab', 'nonCoreVocab',
    'minAoA', 'maxAoA', 'ratioAoA',
    'queryComplexity', 'SVEN',
    'top250SterCount', 'top250SterRatAnt', 'top250SterRatCon',
    'top250NonSterCount', 'top250NonSterRatAnt', 'top250NonSterRatCon',
    'top50SterCount', 'top50SterRatAnt', 'top50SterAntCon',
    'top50NonSterCount', 'top50NonSterRatAnt', 'top50NonSterAntCon',
    'tfidfAll', 'tfidfS', 'tfidfNS',
    'stopCount',
    'com', 'net', 'org', 'edu', 'gov', 'http',
    'AND', 'OR', 'quotes', 'inter',
]

In [8]:
# Column names making up the session-based feature set (order preserved).
SessionFeats = [
    'repeatClicks', 'clickDistance', 'meanClickPosition',
    'numClicks', 'numClicksPerQuery', 'numQueries', 'timeClicks',
    'uniqueQueries', 'allSameClicks', 'uniqueClicks', 'allSameQueries',
    'queryDistance', 'timeQueries', 'repeatQueries',
]

# Perform Ablation Study

In the following block of code we perform the ablation study and store all the results in a dataframe.

In [9]:
# One (feature list, label) pair per ablation run, evaluated in this order.
ablationRuns = [
    (features, 'RYSe'),
    (P3Feats, 'P3'),
    (DC1Feats, 'DC1'),
    (TextFeat, 'TextBased'),
    (SessionFeats, 'SessionBased'),
]

RYSe, P3Results, DC1Results, textResults, sessionResults = [
    recognizeSearcher(bestParameters, runFeats, runLabel)
    for runFeats, runLabel in ablationRuns
]

# Collect one row per feature set into a single results table.
resultColumns = ["Type", "Acc", "TN", "FP", "TNR", "FN", "TP", "TPR"]
allResults = pd.DataFrame(
    data=[RYSe, P3Results, DC1Results, textResults, sessionResults],
    columns=resultColumns,
)

# Save Results

In the following block of code we save those results.

In [11]:
# Persist the ablation table; the context manager flushes and closes the file.
with open("Pickles/AblationSQS.p", "wb") as resultsFile:
    pickle.dump(allResults, resultsFile)