# Comparing Accuracy of TF-IDF tool to DocSim using Abjayon Procedures

#### Neel Datta
#### August 2021

This notebook looks further into the differences between TF-IDF and DocSim. This time, a malware-related search query is run against four Abjayon procedure documents, using their actual text rather than text extracted from HTML.

In [8]:
import json
import docsim
import re
import csv
import nltk
nltk.download('wordnet')
from urllib.request import urlopen
from urllib.error import HTTPError
import pandas as pd
from bs4 import BeautifulSoup
from tfidf import rank_documents

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/neeldatta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
%%time

# Build the module-level DocSim instance that the DocSim-based helper functions
# defined later in this notebook read as `docsim_obj`.
docsim_obj = docsim.DocSim(verbose=True)
# Alternative threaded implementation, left disabled:
# docsim_obj = docsim.DocSim_threaded(verbose=True)
# Print the model_ready flag so the cell output shows whether the model loaded.
print(f'Model ready: {docsim_obj.model_ready}')

Loading default GloVe word vector model: glove-wiki-gigaword-50
Model loaded
Model ready: True
CPU times: user 21.7 s, sys: 212 ms, total: 21.9 s
Wall time: 21.9 s


In [13]:
# Function that takes in xml file of a list of URLs and converts to string list where each string is a URL
def xmlToList(xml):
    """Return the contents of every <loc>...</loc> element in an XML file.

    xml: path of the XML file to read.

    Returns a list of strings in the order the elements appear in the file.
    An element whose content spans several lines is not matched, because
    '.' in the pattern does not match a newline.
    """
    with open(xml, 'r') as handle:
        contents = handle.read()
    tagged = re.findall("<loc>.*?</loc>", contents)
    # Each match still carries its tags: drop the 5-character "<loc>" prefix
    # and the 6-character "</loc>" suffix.
    return [match[5:-6] for match in tagged]
    
# Function that converts each url into a title + Data node in JSON
def htmlToJSON(htmlIn, JSONout):
    """Download each URL, extract its text, and save the results as JSON.

    htmlIn: iterable of URL strings to fetch.
    JSONout: path of the JSON file to write.

    The output file holds {"data": [[url, text], ...]} with one entry per URL
    that was fetched. A URL that raises HTTPError is reported on stdout and
    left out of the output; other exceptions propagate to the caller.
    """
    data = {'data': []}
    for url in htmlIn:
        try:
            # The with-block closes the HTTP response as soon as it is read.
            with urlopen(url) as page:
                html = page.read().decode("ISO-8859-1")
        except HTTPError:
            print("HTTPError at url: " + url)
            continue
        # NOTE(review): no parser is named here, so BeautifulSoup chooses one
        # itself; consider pinning one so results do not depend on which
        # parsers are installed.
        soup = BeautifulSoup(html)
        data['data'].append([url, soup.get_text()])
    with open(JSONout, 'w') as outfile:
        json.dump(data, outfile)
        
def txtToJson(txtIn, JSONout):
    """Fetch each entry of txtIn, extract its text, and save the results as JSON.

    txtIn: iterable of strings; each one is passed to urlopen.
    JSONout: path of the JSON file to write.

    The output file holds {"data": [[entry, text], ...]} with one item per
    entry that was fetched. An entry that raises HTTPError is reported on
    stdout and left out of the output; other exceptions propagate.

    NOTE(review): despite the name, entries are opened with urlopen exactly as
    htmlToJSON does, so they must be URLs rather than raw text -- confirm that
    this is the intended behavior.
    """
    data = {'data': []}
    # Iterate over the txtIn argument (not an unrelated global name).
    for url in txtIn:
        try:
            # The with-block closes the response as soon as it is read.
            with urlopen(url) as page:
                html = page.read().decode("ISO-8859-1")
        except HTTPError:
            print("HTTPError at url: " + url)
            continue
        soup = BeautifulSoup(html)
        data['data'].append([url, soup.get_text()])
    with open(JSONout, 'w') as outfile:
        json.dump(data, outfile)
    

# Function that converts the list of controls/policies from a JupiterOne PDF into a list of strings
def JSONToList(JSONin):
    """Flatten a policy JSON file into one descriptive string per requirement.

    JSONin: passed straight to pandas.read_json; the loaded frame must have a
    'sections' column whose items each carry 'title' and 'requirements', and
    each requirement must carry 'ref', 'title' and 'summary'.

    Returns a list of strings shaped like
    "<section title> <ref> : <requirement title> : <summary>".
    """
    frame = pd.read_json(JSONin)
    return [
        section['title'] + ' ' + ' : '.join(
            (requirement['ref'], requirement['title'], requirement['summary'])
        )
        for section in frame['sections']
        for requirement in section['requirements']
    ]


# Function that takes the first n controls, compares each one against the JSON documents using docsim,
# and writes a CSV file with the top 5 matches for each control.
# It takes in an XML list of all URLs to compare, loads their text into the urlJSON file,
# and outputs a CSV of each control and the 5 URLs most similar to it.
    
def finalMatches(xmlIn, urlJSON, policyJSON, CSVOut, n):
    """Score the first n policies against every URL document and save the top hits.

    xmlIn: path of the XML file listing the URLs (read by xmlToList).
    urlJSON: path of the intermediate JSON file; htmlToJSON writes the fetched
        pages there and it is read back here.
    policyJSON: path of the policy JSON file (read by JSONToList).
    CSVOut: path of the CSV file to write.
    n: number of policies, counted from the start of the list, to evaluate.

    The CSV gets a header row (Policy, Score, URL) followed by up to five rows
    per policy, highest score first.
    """
    htmlToJSON(xmlToList(xmlIn), urlJSON)

    # Only the first n policies are scored, to keep runtime manageable.
    policies = JSONToList(policyJSON)[:n]

    # Load the fetched documents; item[0] is the URL, item[1] its text.
    with open(urlJSON) as in_file:
        urldata = json.load(in_file)
    titles = [item[0] for item in urldata['data']]
    documents = [item[1] for item in urldata['data']]
    print(f'{len(documents)} documents')

    # newline='' leaves line endings to the csv module, which otherwise can
    # produce blank rows on platforms that translate '\n' on write.
    with open(CSVOut, mode='w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['Policy', 'Score', 'URL'])
        for query_string in policies:
            # Assumes similarity_query returns one score per document, in
            # document order -- the indices are used to look up titles.
            similarities = docsim_obj.similarity_query(query_string, documents)
            ranked = sorted(enumerate(similarities), reverse=True, key=lambda x: x[1])
            for idx, score in ranked[:5]:
                writer.writerow([query_string, str(score), titles[idx]])


#Function that tests inputted policy string(s) against inputted list of URLs.
def testMatchesDocSim(queries, urls, testJSON):
    """Fetch the given URLs, then print their top DocSim matches for a query.

    queries: the query string handed to docsim_obj.similarity_query.
    urls: URLs to download; htmlToJSON stores their text in testJSON.
    testJSON: path of the JSON file that is written and then read back.

    Prints the document count, then up to five lines of
    "index, score, title", highest score first. The scoring step mirrors
    testDocsim, which works on an already existing JSON file.
    """
    htmlToJSON(urls, testJSON)

    with open(testJSON) as json_file:
        entries = json.load(json_file)['data']
    titles = [entry[0] for entry in entries]
    documents = [entry[1] for entry in entries]
    print(f'{len(documents)} documents')

    # Assumes one similarity score per document, in document order.
    similarities = docsim_obj.similarity_query(queries, documents)

    ranked = sorted(enumerate(similarities), key=lambda pair: pair[1], reverse=True)
    for idx, score in ranked[:5]:
        print(f'{idx} \t {score:0.3f} \t {titles[idx]}')
    

def testMatchesTFIDF(queries, urls, testJSON):
    """Fetch the given URLs, then print their top TF-IDF matches for a query.

    queries: the query handed to rank_documents.
    urls: URLs to download; htmlToJSON stores their text in testJSON.
    testJSON: path of the JSON file that is written and then read back.

    Prints the document count, then up to five "score, title" lines,
    highest score first.
    """
    htmlToJSON(urls, testJSON)

    with open(testJSON) as json_file:
        entries = json.load(json_file)['data']
    titles = [entry[0] for entry in entries]
    documents = [entry[1] for entry in entries]
    print(f'{len(documents)} documents')

    # Assumes rank_documents yields one score per document, in document
    # order, so scores can be paired with titles positionally.
    document_scores = rank_documents(queries, documents)
    scored = list(zip(document_scores, titles))

    best_first = sorted(scored, key=lambda pair: pair[0], reverse=True)
    for score, title in best_first[:5]:
        print(f'{score:0.3f} \t {title}')

def testTFIDF(queries, testJSON):
    """Print the top TF-IDF matches for a query among documents stored as JSON.

    queries: the query handed to rank_documents.
    testJSON: path of a JSON file shaped like {"data": [[title, text], ...]}.

    Prints the document count, then up to five "score, title" lines,
    highest score first.
    """
    with open(testJSON) as json_file:
        loaded = json.load(json_file)

    titles = []
    documents = []
    for entry in loaded['data']:
        titles.append(entry[0])
        documents.append(entry[1])
    print(f'{len(documents)} documents')

    # Assumes rank_documents yields one score per document, in document
    # order, so scores can be paired with titles positionally.
    document_scores = rank_documents(queries, documents)
    pairs = list(zip(document_scores, titles))
    pairs = sorted(pairs, reverse=True, key=lambda pair: pair[0])

    for score, title in pairs[:5]:
        print(f'{score:0.3f} \t {title}')

def testDocsim(queries, testJSON):
    """Print the top DocSim matches for a query among documents stored as JSON.

    queries: the query string handed to docsim_obj.similarity_query.
    testJSON: path of a JSON file shaped like {"data": [[title, text], ...]}.

    Prints the document count, then up to five lines of
    "index, score, title", highest score first.
    """
    with open(testJSON) as json_file:
        loaded = json.load(json_file)

    titles = []
    documents = []
    for entry in loaded['data']:
        titles.append(entry[0])
        documents.append(entry[1])
    print(f'{len(documents)} documents')

    # Assumes one similarity score per document, in document order; the
    # enumerate index is what maps a score back to its title.
    scores = docsim_obj.similarity_query(queries, documents)
    by_score = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    for idx, score in by_score[:5]:
        print(f'{idx} \t {score:0.3f} \t {titles[idx]}')

In [14]:
%%time
# Rank the documents stored in test.json against the malware-control query
# using TF-IDF; the cell output lists each document's score and title.
testTFIDF("Detection, prevention and recovery controls to protect against malware shall be implemented, combined with appropriate user awareness.",
         "test.json")

4 documents
0.042 	 Antivirus Policy
0.028 	 Wireless Security Policy
0.013 	 Password Policy
0.010 	 Backup Policy
CPU times: user 34.8 ms, sys: 2.05 ms, total: 36.9 ms
Wall time: 36 ms


In [15]:
%%time
# Run the same malware-control query against test.json with DocSim, so the
# ranking can be compared with the TF-IDF cell above.
testDocsim("Detection, prevention and recovery controls to protect against malware shall be implemented, combined with appropriate user awareness.",
         "test.json")

4 documents
4 documents loaded into corpus
1 	 0.468 	 Wireless Security Policy
3 	 0.437 	 Backup Policy
0 	 0.401 	 Antivirus Policy
2 	 0.323 	 Password Policy
CPU times: user 18.3 s, sys: 1.52 s, total: 19.8 s
Wall time: 5.02 s


## The results for TF-IDF were overall more accurate than the results when using DocSim. 
### For a query revolving around malware prevention/detection, the Antivirus policy should rank first: 
   - With TF-IDF, the test correctly assigned the Antivirus policy the highest score, significantly greater than the scores of the other three policies. 
   - With DocSim, the test incorrectly ranked the Wireless Security policy as the highest-scoring document, with the Antivirus policy coming in third, and without much discrimination between the score values for the top 3 results (i.e. they all received relatively similar scores).