In [None]:
# Future improvements
# Separate dir for each step
# Create JSON array to assign IDs, 
#     keep track of PDF files process (each step?) etc.
# Remove numbers, urls
# Change variable names of jsonDoc, jsonInv, jsonGram to avoid redefining the arguments

In [2]:
# Imports
import logging
import os # For file/directory interaction
import time, sys
from datetime import datetime, date # For log data
import re # For text replacement
import spacy # Pipeline processes (stopword and punctuation removal, lemmatization)
from nltk.stem.snowball import SnowballStemmer # Pipeline process for stemming
import json
# spaCy English pipeline, loaded once and passed to rmvStopWords
nlp = spacy.load("en_core_web_sm")
# Default sub-directory names used by the pipeline stages below
# (each is joined onto the absPath argument of the stage functions)
txtFilesDir = 'Text Files'
rtnFilesDir = 'n Removed'
spaceFilesDir = 'No Spaces'
swFilesDir = 'Stop Words'
engFilesDir = 'English Words'
stemFilesDir = 'Stemmed'
# Default file names of the JSON indexes written next to those directories
jsonDocIndex = 'doc_dictionary.json'
jsonInvIndex = 'inverted_index.json'
jsonGramIndex = 'gram_index.json'
jsonTFMatrix = 'tf_matrix.json'
# NOTE(review): this sets an attribute on the logging *module*; `propagate` is
#     documented as a Logger attribute, so this line probably has no effect - confirm intent
logging.propagate = False 
# Only messages at ERROR level or above are emitted by the root logger
logging.getLogger().setLevel(logging.ERROR)

# Machine-specific base directory passed as absPath to every stage call in this notebook
absolute = 'C:/Users/micah/Documents/IWU/CIS Practicum/Files'

In [10]:
# Pre-condition: All PDF files to be processed are in the sub-directory
#     pdfDir, and pdfDir is in absPath. absPath is by default the 
#     directory in which the program is executed
# Post-condition: All PDF files processed without error are converted to
#     text files which are placed in a new sub-directory 'Text Files'
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf#process_pdf
from pdfminer.pdfparser import PDFParser
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams

from io import StringIO

# Utilizes PDFminer3k to extract text from PDF documents
def getText(pdfPath):
    """Extract the text of one PDF file with pdfminer and return it as a string.

    pdfPath: path of the PDF file to read (opened in binary mode).
    Returns: everything the TextConverter wrote to its in-memory buffer.
    Raises: whatever open() or process_pdf() raise; the converter and the
        buffer are closed in every case.
    """
# Sets up necessary objects
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, sio, laparams=laparams)

    try:
    # Reads file to the text converter
        with open(pdfPath, 'rb') as pdfFile:
            process_pdf(rsrcmgr, device, pdfFile)

    # Retrieves text result (must happen before the buffer is closed)
        return sio.getvalue()
    finally:
    # Runs on success and on failure so neither object is leaked
        device.close()
        sio.close()

def pdfToText(pdfDir, absPath = os.getcwd(), txtDir = txtFilesDir, stopAt = -1):
    """Convert every '.pdf' file in absPath/pdfDir to a '.txt' file in absPath/txtDir.

    pdfDir: name of the sub-directory of absPath holding the PDF files.
    absPath: base directory. NOTE: the default is evaluated once, when the
        function is defined, not on every call.
    txtDir: name of the output sub-directory; created if it does not exist.
    stopAt: stop after this many new conversions were attempted; values <= 0
        mean "no limit".

    PDFs whose text file already exists are skipped. Conversion errors are
    appended to absPath/log.txt together with a reference number and do not
    abort the run. Returns None.
    """
    pdfPath = absPath+'/'+pdfDir
    txtPath = absPath+'/'+txtDir
    if pdfDir not in os.listdir(absPath):
        print('The specified directory "' + pdfPath + '" does not exist')
        return
# Creates 'Text Files' directory for converted PDFs
    if txtDir not in os.listdir(absPath):
        os.mkdir(txtPath)

    maxShown = 10 # Longest file name prefix echoed for already-converted files
    docNum = 0
    with open(absPath+'/'+'log.txt', 'a+', encoding="utf-8") as log:
    # One header per run, matching the layout checkSpaces uses for Spaces.txt
        log.write("PDF to Text\n" + date.today().strftime("%m/%d/%y") +
                  " at " + datetime.now().strftime("%H:%M:%S") + "\n\n")
        for entity in os.scandir(pdfPath):
        # Moves on to next entity if the current entity is not a PDF
            if not entity.name.endswith('.pdf'):
                continue
        # Remove '.pdf' from file name when creating '.txt' file
            fileName = entity.name[:-len('.pdf')]+'.txt'
            print("Now on '"+entity.name+"'. . . ", end='')

        # Some documents are protected, corrupted, etc. and text cannot be extracted
        # Exceptions are recorded in log.txt
            if not os.path.exists(txtPath+'/'+fileName):
                docNum += 1
                try:
                    text = getText(pdfPath+'/'+entity.name)
                # The with block closes the text file even if the write fails
                    with open(txtPath+'/'+fileName, 'w+', encoding="utf-8") as txtFile:
                        txtFile.write(text)
                except Exception as e:
                    log.write(str(docNum)+": " + entity.name + ": \n\t" + str(e)+"\n")
                    print("there was an error reading this document. See log for details. Reference number "+str(docNum)+".\n")
                else:
                    print("done")
            else:
                if len(fileName) > maxShown:
                    print('"'+fileName[:maxShown]+'...txt"', end='')
                else:
                    print('"'+fileName+'"', end='')
                print(' already exists')
            if docNum >= stopAt and stopAt > 0:
                print("PDF to Text was stopped after "+str(docNum)+" documents.")
                break
        log.write("\n\n")
pdfToText('PDF', absPath = absolute, stopAt = 10)

Now on '00023.pdf'. . . "00023.txt" already exists
Now on '0028_504_uchida_s.pdf'. . . "0028_504_u...txt" already exists
Now on '0044_559_ishitani_y.pdf'. . . "0044_559_i...txt" already exists
Now on '00708543.pdf'. . . "00708543.t...txt" already exists
Now on '00823977.pdf'. . . "00823977.t...txt" already exists
Now on '00863988.pdf'. . . "00863988.t...txt" already exists
Now on '009.pdf'. . . "009.txt" already exists
Now on '00969115.pdf'. . . "00969115.t...txt" already exists
Now on '00a.pdf'. . . "00a.txt" already exists
Now on '01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.pdf'. . . "01-Altamur...txt" already exists
Now on '01046104.pdf'. . . "01046104.t...txt" already exists
Now on '01217605.pdf'. . . "01217605.t...txt" already exists
Now on '01237554.pdf'. . . "01237554.t...txt" already exists
Now on '01308672.pdf'. . . "01308672.t...txt" already exists
Now on '01673370.pdf'. . . "01673370.t...txt" already exists
Now on '01_AMI_Alcaniz.

Now on '2016HealthcareBenchmarks_Care_Coordination_preview.pdf'. . . "2016Health...txt" already exists
Now on '21.pdf'. . . "21.txt" already exists
Now on '213.pdf'. . . "213.txt" already exists
Now on '21851407.pdf'. . . "21851407.t...txt" already exists
Now on '22.pdf'. . . "22.txt" already exists
Now on '23.pdf'. . . "23.txt" already exists
Now on '2393216.2393281.pdf'. . . "2393216.23...txt" already exists
Now on '24.pdf'. . . "24.txt" already exists
Now on '25.pdf'. . . "25.txt" already exists
Now on '2553062.2553065.pdf'. . . "2553062.25...txt" already exists
Now on '2666795.2666817.pdf'. . . "2666795.26...txt" already exists
Now on '280-p276-winberg.pdf'. . . "280-p276-w...txt" already exists
Now on '2891868.pdf'. . . "2891868.tx...txt" already exists
Now on '3.pdf'. . . "3.txt" already exists
Now on '30 Years of Minix.pdf'. . . "30 Years o...txt" already exists
Now on '30.pdf'. . . "30.txt" already exists
Now on '3178_18_1.pdf'. . . "3178_18_1....txt" already exists
Now on '331

In [10]:
# Work on 10 (good) files at a time until pipeline works
#   then incrementally add files and clean up errors

# Function to remove \n
def rmvN(txtDir = txtFilesDir, rtnDir = rtnFilesDir, absPath = os.getcwd()):
    """Copy every file of absPath/txtDir to absPath/rtnDir with newlines removed.

    A hyphen directly followed by a newline is removed together with the
    newline; every remaining newline is removed as well. The output directory
    is created when missing. Prints progress per file and returns None.
    """
    srcDir = absPath+'/'+txtDir
    if txtDir not in os.listdir(absPath):
        print('The specified directory "' + srcDir + '" does not exist')
        return
    dstDir = absPath+'/'+rtnDir
    if rtnDir not in os.listdir(absPath):
        os.mkdir(dstDir)

    for entry in os.scandir(srcDir):
        print("Now on '"+entry.name+"'. . . ", end='')
        with open(srcDir+'/'+entry.name, 'r+', encoding='utf-8') as srcFile, \
             open(dstDir+'/'+entry.name, 'w+', encoding='utf-8') as dstFile:
        # Hyphenated line breaks go first so the hyphen is dropped with the newline
            flattened = srcFile.read().replace('-\n', '').replace('\n', '')
            dstFile.write(flattened)
            dstFile.truncate()
        print("done")
rmvN(absPath = absolute)

Now on '00023.txt'. . . done
Now on '0028_504_uchida_s.txt'. . . done
Now on '0044_559_ishitani_y.txt'. . . done
Now on '00708543.txt'. . . done
Now on '00823977.txt'. . . done
Now on '00863988.txt'. . . done
Now on '009.txt'. . . done
Now on '00969115.txt'. . . done
Now on '00a.txt'. . . done
Now on '01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt'. . . done
Now on '01046104.txt'. . . done
Now on '01217605.txt'. . . done
Now on '01237554.txt'. . . done
Now on '01308672.txt'. . . done
Now on '01673370.txt'. . . done
Now on '01_AMI_Alcaniz.txt'. . . done
Now on '02_AMI_Riva.txt'. . . done
Now on '03_AMI_Gaggioli.txt'. . . done
Now on '04100664.txt'. . . done
Now on '04359339.txt'. . . done
Now on '04407722.txt'. . . done
Now on '04_AMI_istag.txt'. . . done
Now on '05_AMI_Cortese.txt'. . . done
Now on '06_AMI__Piva.txt'. . . done
Now on '07_AMI_Kameas.txt'. . . done
Now on '08_AMI_Kleiner.txt'. . . done
Now on '09.txt'. . . done
Now on '09_AMI

In [11]:
# Function to move files that contain (almost) no spaces to the 'No Spaces' directory
def checkSpaces(txtDir = rtnFilesDir, spacesDir = spaceFilesDir, absPath = os.getcwd()):
    """Move text files with too few spaces (or too little text) out of absPath/txtDir.

    A file is rejected when splitting its text on ' ' yields fewer than
    len(text)/10 pieces, or when the text is shorter than 100 characters
    (which includes empty files). Rejected files are listed in
    absPath/Spaces.txt and moved to absPath/spacesDir; if a file of the same
    name is already there, the rejected file is deleted instead.
    Prints progress per file and returns None.
    """
    txtPath = absPath+'/'+txtDir
    if txtDir not in os.listdir(absPath):
        print('The specified directory "' + txtPath + '" does not exist')
        return
    spacesPath = absPath+'/'+spacesDir
    if spacesDir not in os.listdir(absPath):
        os.mkdir(spacesPath)

    with open(absPath+'/'+'Spaces.txt', 'a+', encoding='utf-8') as spaces:
        spaces.write("Check Spaces\n" + date.today().strftime("%m/%d/%y") +
                  " at " + datetime.now().strftime("%H:%M:%S") + "\n\n")
        for entity in os.scandir(txtPath):
            print("Now on '"+entity.name+"'. . . ", end='')
            srcFile = txtPath+'/'+entity.name
        # The file is closed before it is moved/removed, and also when it is kept
            with open(srcFile, 'r', encoding='utf-8') as txtFile:
                text = txtFile.read()
            if len(text.split(' ')) < len(text)/10 or len(text) < 100:
                spaces.write(entity.name+'\n')
                dstFile = spacesPath+'/'+entity.name
                if not os.path.exists(dstFile):
                    os.rename(srcFile, dstFile)
                else:
                    os.remove(srcFile)
            print("done")
        spaces.write('\n\n')
checkSpaces(absPath = absolute) 

Now on '00023.txt'. . . done
Now on '0028_504_uchida_s.txt'. . . done
Now on '0044_559_ishitani_y.txt'. . . done
Now on '00708543.txt'. . . done
Now on '00823977.txt'. . . done
Now on '00863988.txt'. . . done
Now on '009.txt'. . . done
Now on '00969115.txt'. . . done
Now on '00a.txt'. . . done
Now on '01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt'. . . done
Now on '01046104.txt'. . . done
Now on '01217605.txt'. . . done
Now on '01237554.txt'. . . done
Now on '01308672.txt'. . . done
Now on '01673370.txt'. . . done
Now on '01_AMI_Alcaniz.txt'. . . done
Now on '02_AMI_Riva.txt'. . . done
Now on '03_AMI_Gaggioli.txt'. . . done
Now on '04100664.txt'. . . done
Now on '04359339.txt'. . . done
Now on '04407722.txt'. . . done
Now on '04_AMI_istag.txt'. . . done
Now on '05_AMI_Cortese.txt'. . . done
Now on '06_AMI__Piva.txt'. . . done
Now on '07_AMI_Kameas.txt'. . . done
Now on '08_AMI_Kleiner.txt'. . . done
Now on '09.txt'. . . done
Now on '09_AMI

In [12]:
# Function to remove stopwords
# NLTK or SpaCy
# Inverted File: gram:[doc1, doc3] or gram:[[doc1,freq], [doc3,freq]]
def rmvStopWords(nlp, txtDir = rtnFilesDir, swDir = swFilesDir, absPath = os.getcwd()):
    """Write a lemmatized, stop-word-free copy of every file in absPath/txtDir.

    nlp: callable applied to the file text; the tokens it yields must expose
        is_stop, is_punct, text and lemma_ (the notebook passes a spaCy pipeline).
    Tokens flagged as stop words or punctuation, and tokens whose text is
    numeric, are dropped; the lower-cased lemmas of the rest are joined with
    single spaces and written to absPath/swDir under the same file name.
    Prints progress per file and returns None.
    """
    srcDir = absPath+'/'+txtDir
    if txtDir not in os.listdir(absPath):
        print('The specified directory "' + srcDir + '" does not exist')
        return
    dstDir = absPath+'/'+swDir
    if swDir not in os.listdir(absPath):
        os.mkdir(dstDir)

    def isContent(token):
        return not (token.is_stop or token.is_punct or token.text.isnumeric())

    for entry in os.scandir(srcDir):
        print("Now on '"+entry.name+"'. . . ", end='')
        with open(srcDir+'/'+entry.name, 'r+', encoding='utf-8') as srcFile, \
             open(dstDir+'/'+entry.name, 'w+', encoding='utf-8') as dstFile:
            parsed = nlp(srcFile.read())
            lemmas = []
            for token in parsed:
                if isContent(token):
                    lemmas.append(token.lemma_.lower())
            dstFile.write(" ".join(lemmas))
            dstFile.truncate()
        print("done")

rmvStopWords(nlp, absPath = absolute)

Now on '00023.txt'. . . done
Now on '0028_504_uchida_s.txt'. . . done
Now on '0044_559_ishitani_y.txt'. . . done
Now on '00708543.txt'. . . done
Now on '00823977.txt'. . . done
Now on '00863988.txt'. . . done
Now on '009.txt'. . . done
Now on '00969115.txt'. . . done
Now on '00a.txt'. . . done
Now on '01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt'. . . done
Now on '01046104.txt'. . . done
Now on '01217605.txt'. . . done
Now on '01237554.txt'. . . done
Now on '01308672.txt'. . . done
Now on '01673370.txt'. . . done
Now on '01_AMI_Alcaniz.txt'. . . done
Now on '02_AMI_Riva.txt'. . . done
Now on '03_AMI_Gaggioli.txt'. . . done
Now on '04100664.txt'. . . done
Now on '04359339.txt'. . . done
Now on '04407722.txt'. . . done
Now on '04_AMI_istag.txt'. . . done
Now on '05_AMI_Cortese.txt'. . . done
Now on '06_AMI__Piva.txt'. . . done
Now on '07_AMI_Kameas.txt'. . . done
Now on '08_AMI_Kleiner.txt'. . . done
Now on '09.txt'. . . done
Now on '09_AMI

In [39]:
# Remove non-english words
def rmvNonEng(txtDir = swFilesDir, engDir = engFilesDir, absPath = os.getcwd()):
    """Keep only known English words in every file of absPath/txtDir.

    The word list is loaded from absPath/words_dictionary.json; a token is
    kept when it is contained in that object and is not a one- or two-letter
    combination of lower-case a-z. Tokens are obtained by splitting on single
    spaces, and the survivors are written, space-separated, to
    absPath/engDir under the same file name.
    Prints progress per file and returns None.
    """
    txtPath = absPath+'/'+txtDir
    if txtDir not in os.listdir(absPath):
        print('The specified directory "' + txtPath + '" does not exist')
        return
    engPath = absPath+'/'+engDir
    if engDir not in os.listdir(absPath):
        os.mkdir(engPath)
    with open(absPath+'/'+'words_dictionary.json') as json_file:
        words = json.load(json_file)

# All one- and two-letter strings over a-z, kept in a set so the per-token
#     membership test does not scan 702 entries
    alph = 'abcdefghijklmnopqrstuvwxyz'
    lets = set(alph)
    lets.update(first+second for first in alph for second in alph)

    for entity in os.scandir(txtPath):
        print("Now on '"+entity.name+"'. . . ", end='')
    # The input is only read, so it is opened read-only
        with open(txtPath+'/'+entity.name, 'r', encoding='utf-8') as txtFile:
            with open(engPath+'/'+entity.name, 'w', encoding='utf-8') as engFile:
                text = txtFile.read().split(' ')
                engWords = [word for word in text if word in words and word not in lets]
                engFile.write(" ".join(engWords))
        print("done")
rmvNonEng(absPath = absolute)

Now on '00023.txt'. . . done
Now on '0028_504_uchida_s.txt'. . . done
Now on '0044_559_ishitani_y.txt'. . . done
Now on '00708543.txt'. . . done
Now on '00823977.txt'. . . done
Now on '00863988.txt'. . . done
Now on '009.txt'. . . done
Now on '00969115.txt'. . . done
Now on '00a.txt'. . . done
Now on '01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt'. . . done
Now on '01046104.txt'. . . done
Now on '01217605.txt'. . . done
Now on '01237554.txt'. . . done
Now on '01308672.txt'. . . done
Now on '01673370.txt'. . . done
Now on '01_AMI_Alcaniz.txt'. . . done
Now on '02_AMI_Riva.txt'. . . done
Now on '03_AMI_Gaggioli.txt'. . . done
Now on '04100664.txt'. . . done
Now on '04359339.txt'. . . done
Now on '04407722.txt'. . . done
Now on '04_AMI_istag.txt'. . . done
Now on '05_AMI_Cortese.txt'. . . done
Now on '06_AMI__Piva.txt'. . . done
Now on '07_AMI_Kameas.txt'. . . done
Now on '08_AMI_Kleiner.txt'. . . done
Now on '09.txt'. . . done
Now on '09_AMI

In [40]:
# Stem words in all documents
def stem(txtDir = engFilesDir, stemDir = stemFilesDir, absPath = os.getcwd()):
    """Write a Snowball-stemmed copy of every file in absPath/txtDir to absPath/stemDir.

    Each file is split on single spaces, every piece is passed through an
    English SnowballStemmer, and the results are joined with single spaces.
    The output directory is created when missing.
    Prints progress per file and returns None.
    """
    srcDir = absPath+'/'+txtDir
    if txtDir not in os.listdir(absPath):
        print('The specified directory "' + srcDir + '" does not exist')
        return
    dstDir = absPath+'/'+stemDir
    if stemDir not in os.listdir(absPath):
        os.mkdir(dstDir)

    snowball = SnowballStemmer(language='english')
    for entry in os.scandir(srcDir):
        print("Now on '"+entry.name+"'. . . ", end='')
        with open(srcDir+'/'+entry.name, 'r+', encoding='utf-8') as srcFile, \
             open(dstDir+'/'+entry.name, 'w+', encoding='utf-8') as dstFile:
            tokens = srcFile.read().split(' ')
            dstFile.write(" ".join(map(snowball.stem, tokens)))
            dstFile.truncate()
        print("done")
stem(absPath = absolute)

Now on '00023.txt'. . . done
Now on '0028_504_uchida_s.txt'. . . done
Now on '0044_559_ishitani_y.txt'. . . done
Now on '00708543.txt'. . . done
Now on '00823977.txt'. . . done
Now on '00863988.txt'. . . done
Now on '009.txt'. . . done
Now on '00969115.txt'. . . done
Now on '00a.txt'. . . done
Now on '01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt'. . . done
Now on '01046104.txt'. . . done
Now on '01217605.txt'. . . done
Now on '01237554.txt'. . . done
Now on '01308672.txt'. . . done
Now on '01673370.txt'. . . done
Now on '01_AMI_Alcaniz.txt'. . . done
Now on '02_AMI_Riva.txt'. . . done
Now on '03_AMI_Gaggioli.txt'. . . done
Now on '04100664.txt'. . . done
Now on '04359339.txt'. . . done
Now on '04407722.txt'. . . done
Now on '04_AMI_istag.txt'. . . done
Now on '05_AMI_Cortese.txt'. . . done
Now on '06_AMI__Piva.txt'. . . done
Now on '07_AMI_Kameas.txt'. . . done
Now on '08_AMI_Kleiner.txt'. . . done
Now on '09.txt'. . . done
Now on '09_AMI

In [41]:
# Create JSON with doc_id : file_name
def update_doc_index(jsonDoc = jsonDocIndex, txtDir = stemFilesDir, absPath = os.getcwd()):
    """Create or extend the document index {doc_id: file_name} in absPath/jsonDoc.

    Every file name in absPath/txtDir that is not yet a value of the index
    receives the next free numeric ID. IDs are stored as strings, which is
    what they become after a JSON round trip anyway, so freshly added and
    previously loaded entries use the same key type. Returns None.
    """
    txtPath = absPath+'/'+txtDir
    if txtDir not in os.listdir(absPath):
        print('The specified directory "' + txtPath + '" does not exist')
        return
    jsonPath = absPath+'/'+jsonDoc
    if jsonDoc in os.listdir(absPath):
        with open(jsonPath, 'r') as jsonFile:
            docIndex = json.load(jsonFile)
    else:
        docIndex = {}

# Continue after the highest existing ID so a gap in the index can never
#     cause an existing entry to be overwritten
    nextID = max((int(docID) for docID in docIndex), default=0) + 1
    known = set(docIndex.values())
    for fileName in os.listdir(txtPath):
        if fileName not in known:
            docIndex[str(nextID)] = fileName
            known.add(fileName)
            nextID += 1
    with open(jsonPath, 'w') as jsonFile:
        json.dump(docIndex, jsonFile, indent=4)

update_doc_index(absPath = absolute)

In [42]:
# Create JSON with {doc_id1 : {n : {"gram1":freq, "gram2":freq}}, doc_id2 : {n : {"gram1":freq}}}
def update_inv_index(ngrams, jsonDoc = jsonDocIndex, jsonInv = jsonInvIndex, txtDir = stemFilesDir, absPath = os.getcwd()):
    """Create or extend the per-document n-gram counts stored in absPath/jsonInv.

    ngrams: iterable of n-gram lengths (integers >= 1) to index.
    The result has the shape {doc_id: {str(n): {term: count}}}. A term is n
    consecutive pieces of the file text (split on single spaces) joined by
    single spaces. A (document, n) combination that is already present in the
    stored index is not recomputed.

    Raises ValueError if an n-gram length is smaller than 1. Returns None.
    """
    ngrams = list(ngrams)
    if any(ngram < 1 for ngram in ngrams):
        raise ValueError("n-gram lengths must be >= 1")

    txtPath = absPath+'/'+txtDir
    jsonDocPath = absPath+'/'+jsonDoc
    if txtDir not in os.listdir(absPath):
        print('The specified directory "' + txtPath + '" does not exist')
        return
    if jsonDoc not in os.listdir(absPath):
        print('FILE NOT FOUND: The specified file "' + jsonDocPath + '" does not exist')
        return
    jsonInvPath = absPath+'/'+jsonInv
    if jsonInv in os.listdir(absPath):
        with open(jsonInvPath, 'r') as jsonFile:
            invIndex = json.load(jsonFile)
    else:
        invIndex = {}
#     Loads document index
    with open(jsonDocPath, 'r') as jsonFile:
        docIndex = json.load(jsonFile)

    for ngram in ngrams:
    # JSON object keys are always strings, so the length is stored and looked
    #     up as a string; an int key would never match a reloaded index
        gramKey = str(ngram)
        for docID in docIndex:
            docGrams = invIndex.setdefault(docID, {})
            if gramKey in docGrams:
                continue
            with open(txtPath+'/'+docIndex[docID], 'r', encoding='utf-8') as txtFile:
                tokens = txtFile.read().split(' ')
        # Sliding window over the token list (no repeated pop(0))
            counts = {}
            for start in range(len(tokens) - ngram + 1):
                term = " ".join(tokens[start:start+ngram])
                counts[term] = counts.get(term, 0) + 1
            docGrams[gramKey] = counts
    with open(jsonInvPath, 'w') as jsonFile:
        json.dump(invIndex, jsonFile, indent=4)

update_inv_index(list(range(2,4)), absPath = absolute)

In [43]:
# Create JSON with {"gram1" : {doc_id1:freq, doc_id2:freq}, "gram2" : {doc_id1:freq}}
def update_gram_index(minFreq = 1, jsonInv = jsonInvIndex, jsonGram = jsonGramIndex, txtDir = stemFilesDir, absPath = os.getcwd()):
    """Create or extend the gram index {term: {doc_id: freq}} in absPath/jsonGram.

    The index is derived from the inverted index in absPath/jsonInv; only
    (term, document) pairs whose frequency is at least minFreq are recorded.
    Entries already present in an existing gram index file are kept and
    overwritten per (term, document) pair. txtDir is accepted but not used.
    Returns None.
    """
#     The inverted index is required
    invPath = absPath+'/'+jsonInv
    if jsonInv not in os.listdir(absPath):
        print('FILE NOT FOUND: The specified file "' + invPath + '" does not exist')
        return
#     Start from the stored gram index when there is one
    gramPath = absPath+'/'+jsonGram
    gramIndex = {}
    if jsonGram in os.listdir(absPath):
        with open(gramPath, 'r') as gramFile:
            gramIndex = json.load(gramFile)
    with open(invPath, 'r') as invFile:
        invIndex = json.load(invFile)

    for docID, byLength in invIndex.items():
        for termFreqs in byLength.values():
            for term, freq in termFreqs.items():
                if freq >= minFreq:
                    gramIndex.setdefault(term, {})[docID] = freq

    with open(gramPath, 'w') as gramFile:
        json.dump(gramIndex, gramFile, indent=4)

update_gram_index(absPath = absolute)              

In [44]:
# Print grams found in 3 or more documents
def getGrams(numDocs = 3, jsonDoc = jsonDocIndex, jsonGram = jsonGramIndex, txtDir = stemFilesDir, absPath = os.getcwd()):
    """Print every gram of the gram index that occurs in at least numDocs documents.

    For each such gram one summary line is printed, followed by one
    tab-indented line per document giving the file name from the document
    index. The directory txtDir and both JSON files must exist in absPath;
    otherwise a message is printed and nothing else happens. Returns None.
    """
#     Existence checks, in the order: text directory, document index, gram index
    txtPath = absPath+'/'+txtDir
    if txtDir not in os.listdir(absPath):
        print('The specified directory "' + txtPath + '" does not exist')
        return
    required = [absPath+'/'+jsonDoc, absPath+'/'+jsonGram]
    for name, path in zip([jsonDoc, jsonGram], required):
        if name not in os.listdir(absPath):
            print('FILE NOT FOUND: The specified file "' + path + '" does not exist')
            return

    loaded = []
    for path in required:
        with open(path, 'r') as jsonFile:
            loaded.append(json.load(jsonFile))
    docIndex, gramIndex = loaded

    for gram, docs in gramIndex.items():
        if len(docs) < numDocs:
            continue
        print('"{}" found in {} documents'.format(gram, len(docs)))
        for docID in docs:
            print("\t"+ str(docIndex[docID]))
                
getGrams(10, absPath = absolute)

"natur languag process" found in 14 documents
	LearningExecutableSemanticParsers.txt
	Text-Analytics-and-Natural-Language-Processing--.txt
	Workshop on Robust Methods in Analysis of Natural Language Data.txt
	p123-zhang.txt
	QueryEffectiveness.txt
	00a.txt
	09_AMI_Schmidt.txt
	1.txt
	10.txt
	11.txt
	12.txt
	14.txt
	1412.2306v2.txt
	15.txt
"proceed intern confer" found in 10 documents
	QueryEffectiveness.txt
	Workshop on Robust Methods in Analysis of Natural Language Data.txt
	p123-zhang.txt
	01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt
	05_AMI_Cortese.txt
	06_AMI__Piva.txt
	08_AMI_Kleiner.txt
	10.1.1.115.7110.txt
	14.txt
	1406.2661v1.txt
"pattern analysi machin" found in 10 documents
	Text Clustering Algorithms.txt
	00969115.txt
	01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt
	01217605.txt
	04359339.txt
	04407722.txt
	10.1.1.130.6691.txt
	10.1.1.72.8127.txt
	11.txt
	1506.01497v3.txt
"human com

	1311.2524v5.txt
	1311.2901v3.txt
	13_AMI_Laso.txt
	1406.2661v1.txt
	1409.1556.txt
	1412.2306v2.txt
	1506.02025.txt
"year ago" found in 10 documents
	Text-Analytics-and-Natural-Language-Processing--.txt
	Workshop on Robust Methods in Analysis of Natural Language Data.txt
	00708543.txt
	0e.txt
	10.1.1.72.8127.txt
	12.txt
	13_AMI_Laso.txt
	142541.txt
	14_AMI_Cabrera.txt
	15.txt
"new technolog" found in 12 documents
	Text-Analytics-and-Natural-Language-Processing--.txt
	01673370.txt
	01_AMI_Alcaniz.txt
	02_AMI_Riva.txt
	03_AMI_Gaggioli.txt
	04_AMI_istag.txt
	07_AMI_Kameas.txt
	0e.txt
	12628324.txt
	12_AMI_Bettiol.txt
	142541.txt
	14_AMI_Cabrera.txt
"long term" found in 18 documents
	Text-Analytics-and-Natural-Language-Processing--.txt
	Workshop on Robust Methods in Analysis of Natural Language Data.txt
	00708543.txt
	02_AMI_Riva.txt
	04_AMI_istag.txt
	06_AMI__Piva.txt
	09_AMI_Schmidt.txt
	0e.txt
	1.txt
	10.1.1.115.7110.txt
	10.1.1.130.6691.txt
	10.1.1.65.7635.txt
	10_AMI_Simplicity.txt
	1

In [60]:
# Create vectors for all n-grams (with freq>1?) between two docs, multiply them,
#     then divide by the product of their Euclidean norms
# Print out similar phrases
def freqMatrices(ngrams, jsonDoc = jsonDocIndex, jsonInv = jsonInvIndex, jsonTFM = jsonTFMatrix, absPath = os.getcwd(), maxDocs = 60):
    """Collect, for pairs of documents, the frequencies of the n-grams they share.

    ngrams: iterable of n-gram lengths (ints) to take into account.
    jsonDoc / jsonInv: file names (inside absPath) of the document index and
        of the inverted index {doc_id: {str(n): {term: freq}}}.
    jsonTFM: file name of the result; an existing file is loaded first and
        pairs already stored in it are not recomputed.
    maxDocs: only the first maxDocs documents of the inverted index are paired.

    For every processed pair the result stores
        tfMatrix[doc1][doc2]["like"][doc] = {n: [freq, ...]}
    where the frequency lists of both documents are aligned: position i
    refers to the same shared term (shared terms sorted alphabetically).
    A term counts as shared when both documents contain it under the same
    n-gram length. Statistics for each pair are printed. Returns None.
    """
    ngrams = list(ngrams)
    jsonDocPath = absPath+'/'+jsonDoc
    if jsonDoc not in os.listdir(absPath):
        print('FILE NOT FOUND: The specified file "' + jsonDocPath + '" does not exist')
        return
    jsonInvPath = absPath+'/'+jsonInv
    if jsonInv not in os.listdir(absPath):
        print('FILE NOT FOUND: The specified file "' + jsonInvPath + '" does not exist')
        return

#     File handles get their own name so the arguments are not rebound
    with open(jsonDocPath, 'r') as jsonFile:
        docIndex = json.load(jsonFile)
    with open(jsonInvPath, 'r') as jsonFile:
        invIndex = json.load(jsonFile)

#     jsonTFM is created if it does not exist
    jsonTFMPath = absPath+'/'+jsonTFM
    if jsonTFM in os.listdir(absPath):
        with open(jsonTFMPath, 'r') as jsonFile:
            tfMatrix = json.load(jsonFile)
    else:
        tfMatrix = {}

    docIDs = list(invIndex)[:max(maxDocs, 0)]
    for doc1 in docIDs:
        pairs = tfMatrix.setdefault(doc1, {})
    #     (length key, term) pairs of doc1; a set keeps the lookups below cheap
        doc1Terms = {(gramKey, term)
                     for gramKey, terms in invIndex[doc1].items()
                     if int(gramKey) in ngrams
                     for term in terms}
        for doc2 in docIDs:
        #     IDs are JSON keys, i.e. strings, so this is a string comparison;
        #     it still visits every unordered pair exactly once
            if doc2 < doc1 or doc2 in pairs:
                continue
            print(docIndex[doc1]+"({}) and ".format(doc1)+docIndex[doc2]+"({})".format(doc2))
            likeTerms = []
            doc2Only = 0
            for gramKey, terms in invIndex[doc2].items():
                if int(gramKey) not in ngrams:
                    continue
                for term in terms:
                    if (gramKey, term) in doc1Terms:
                        likeTerms.append((term, gramKey))
                    else:
                        doc2Only += 1
            likeTerms.sort()

            doc1like = {gramLen: [] for gramLen in ngrams}
            doc2like = {gramLen: [] for gramLen in ngrams}
        #     The length comes from the index key instead of re-splitting the
        #     term, so both lookups are guaranteed to succeed
            for term, gramKey in likeTerms:
                doc1like[int(gramKey)].append(invIndex[doc1][gramKey][term])
                doc2like[int(gramKey)].append(invIndex[doc2][gramKey][term])
            pairs[doc2] = {"like": {doc1: doc1like, doc2: doc2like}}

            numLike = len(likeTerms)
            totalTerms = len(doc1Terms) + doc2Only
            print("Doc1 has "+str(len(doc1Terms)-numLike)+" unique terms")
            print("Doc2 has "+str(doc2Only)+" unique terms")
            for gramLength in ngrams:
                print("{}-grams: {}".format(gramLength, len(doc1like[gramLength])))
            print("Total like terms: "+str(numLike))
            print("Total unlike terms: "+str(totalTerms-numLike), end="\n\n")

#     Writes tfMatrix to term-freq matrix JSON file
    with open(jsonTFMPath, 'w') as jsonFile:
        json.dump(tfMatrix, jsonFile, indent=4)
grams = [2, 3]
freqMatrices(grams, absPath = absolute)
    

Lafferty_pcfg-notes.txt(1) and 12_steps_paper.txt(56)
Doc1 has 2045 unique terms
Doc2 has 6309 unique terms
2-grams: 5
3-grams: 1
Total like terms: 6
Total unlike terms: 8354

Lafferty_pcfg-notes.txt(1) and 1306267704-OptimalExperienceinWorkandLeisure.txt(57)
Doc1 has 2049 unique terms
Doc2 has 4351 unique terms
2-grams: 2
3-grams: 0
Total like terms: 2
Total unlike terms: 6400

Lafferty_pcfg-notes.txt(1) and 1311.2524v5.txt(58)
Doc1 has 2045 unique terms
Doc2 has 7598 unique terms
2-grams: 6
3-grams: 0
Total like terms: 6
Total unlike terms: 9643

Lafferty_pcfg-notes.txt(1) and 1311.2901v3.txt(59)
Doc1 has 2043 unique terms
Doc2 has 3923 unique terms
2-grams: 8
3-grams: 0
Total like terms: 8
Total unlike terms: 5966

Lafferty_pcfg-notes.txt(1) and 13_AMI_Laso.txt(60)
Doc1 has 2047 unique terms
Doc2 has 10993 unique terms
2-grams: 4
3-grams: 0
Total like terms: 4
Total unlike terms: 13040

LearningExecutableSemanticParsers.txt(2) and 12_steps_paper.txt(56)
Doc1 has 5352 unique terms
Do

Doc1 has 3974 unique terms
Doc2 has 4343 unique terms
2-grams: 10
3-grams: 0
Total like terms: 10
Total unlike terms: 8317

00969115.txt(16) and 1311.2524v5.txt(58)
Doc1 has 3940 unique terms
Doc2 has 7560 unique terms
2-grams: 44
3-grams: 0
Total like terms: 44
Total unlike terms: 11500

00969115.txt(16) and 1311.2901v3.txt(59)
Doc1 has 3953 unique terms
Doc2 has 3900 unique terms
2-grams: 31
3-grams: 0
Total like terms: 31
Total unlike terms: 7853

00969115.txt(16) and 13_AMI_Laso.txt(60)
Doc1 has 3974 unique terms
Doc2 has 10987 unique terms
2-grams: 10
3-grams: 0
Total like terms: 10
Total unlike terms: 14961

00a.txt(17) and 12_steps_paper.txt(56)
Doc1 has 3739 unique terms
Doc2 has 6297 unique terms
2-grams: 18
3-grams: 0
Total like terms: 18
Total unlike terms: 10036

00a.txt(17) and 1306267704-OptimalExperienceinWorkandLeisure.txt(57)
Doc1 has 3745 unique terms
Doc2 has 4341 unique terms
2-grams: 11
3-grams: 1
Total like terms: 12
Total unlike terms: 8086

00a.txt(17) and 1311.

Doc1 has 6406 unique terms
Doc2 has 7594 unique terms
2-grams: 10
3-grams: 0
Total like terms: 10
Total unlike terms: 14000

02_AMI_Riva.txt(25) and 1311.2901v3.txt(59)
Doc1 has 6408 unique terms
Doc2 has 3923 unique terms
2-grams: 8
3-grams: 0
Total like terms: 8
Total unlike terms: 10331

02_AMI_Riva.txt(25) and 13_AMI_Laso.txt(60)
Doc1 has 6314 unique terms
Doc2 has 10895 unique terms
2-grams: 93
3-grams: 9
Total like terms: 102
Total unlike terms: 17209

03_AMI_Gaggioli.txt(26) and 12_steps_paper.txt(56)
Doc1 has 3381 unique terms
Doc2 has 6290 unique terms
2-grams: 25
3-grams: 0
Total like terms: 25
Total unlike terms: 9671

03_AMI_Gaggioli.txt(26) and 1306267704-OptimalExperienceinWorkandLeisure.txt(57)
Doc1 has 3284 unique terms
Doc2 has 4231 unique terms
2-grams: 81
3-grams: 41
Total like terms: 122
Total unlike terms: 7515

03_AMI_Gaggioli.txt(26) and 1311.2524v5.txt(58)
Doc1 has 3394 unique terms
Doc2 has 7592 unique terms
2-grams: 12
3-grams: 0
Total like terms: 12
Total unl

Doc1 has 5112 unique terms
Doc2 has 10930 unique terms
2-grams: 61
3-grams: 6
Total like terms: 67
Total unlike terms: 16042

09.txt(35) and 12_steps_paper.txt(56)
Doc1 has 6415 unique terms
Doc2 has 6292 unique terms
2-grams: 23
3-grams: 0
Total like terms: 23
Total unlike terms: 12707

09.txt(35) and 1306267704-OptimalExperienceinWorkandLeisure.txt(57)
Doc1 has 6430 unique terms
Doc2 has 4345 unique terms
2-grams: 8
3-grams: 0
Total like terms: 8
Total unlike terms: 10775

09.txt(35) and 1311.2524v5.txt(58)
Doc1 has 6421 unique terms
Doc2 has 7587 unique terms
2-grams: 17
3-grams: 0
Total like terms: 17
Total unlike terms: 14008

09.txt(35) and 1311.2901v3.txt(59)
Doc1 has 6429 unique terms
Doc2 has 3922 unique terms
2-grams: 9
3-grams: 0
Total like terms: 9
Total unlike terms: 10351

09.txt(35) and 13_AMI_Laso.txt(60)
Doc1 has 6407 unique terms
Doc2 has 10966 unique terms
2-grams: 31
3-grams: 0
Total like terms: 31
Total unlike terms: 17373

09_AMI_Schmidt.txt(36) and 12_steps_paper

Doc1 has 4240 unique terms
Doc2 has 4334 unique terms
2-grams: 19
3-grams: 0
Total like terms: 19
Total unlike terms: 8574

10.1.1.65.7635.txt(44) and 1311.2524v5.txt(58)
Doc1 has 4234 unique terms
Doc2 has 7579 unique terms
2-grams: 25
3-grams: 0
Total like terms: 25
Total unlike terms: 11813

10.1.1.65.7635.txt(44) and 1311.2901v3.txt(59)
Doc1 has 4248 unique terms
Doc2 has 3920 unique terms
2-grams: 11
3-grams: 0
Total like terms: 11
Total unlike terms: 8168

10.1.1.65.7635.txt(44) and 13_AMI_Laso.txt(60)
Doc1 has 4231 unique terms
Doc2 has 10969 unique terms
2-grams: 28
3-grams: 0
Total like terms: 28
Total unlike terms: 15200

10.1.1.72.8127.txt(45) and 12_steps_paper.txt(56)
Doc1 has 10269 unique terms
Doc2 has 6269 unique terms
2-grams: 45
3-grams: 1
Total like terms: 46
Total unlike terms: 16538

10.1.1.72.8127.txt(45) and 1306267704-OptimalExperienceinWorkandLeisure.txt(57)
Doc1 has 10300 unique terms
Doc2 has 4338 unique terms
2-grams: 15
3-grams: 0
Total like terms: 15
Total

Doc1 has 7641 unique terms
Doc2 has 6288 unique terms
2-grams: 27
3-grams: 0
Total like terms: 27
Total unlike terms: 13929

12628324.txt(54) and 1306267704-OptimalExperienceinWorkandLeisure.txt(57)
Doc1 has 7658 unique terms
Doc2 has 4343 unique terms
2-grams: 10
3-grams: 0
Total like terms: 10
Total unlike terms: 12001

12628324.txt(54) and 1311.2524v5.txt(58)
Doc1 has 7631 unique terms
Doc2 has 7567 unique terms
2-grams: 36
3-grams: 1
Total like terms: 37
Total unlike terms: 15198

12628324.txt(54) and 1311.2901v3.txt(59)
Doc1 has 7640 unique terms
Doc2 has 3903 unique terms
2-grams: 26
3-grams: 2
Total like terms: 28
Total unlike terms: 11543

12628324.txt(54) and 13_AMI_Laso.txt(60)
Doc1 has 7625 unique terms
Doc2 has 10954 unique terms
2-grams: 43
3-grams: 0
Total like terms: 43
Total unlike terms: 18579

12_AMI_Bettiol.txt(55) and 12_steps_paper.txt(56)
Doc1 has 5446 unique terms
Doc2 has 6294 unique terms
2-grams: 21
3-grams: 0
Total like terms: 21
Total unlike terms: 11740

12

In [62]:
import math
def calcCosSim(list1, list2):
    """Return the cosine similarity of two equal-length numeric sequences.

    Parameters:
        list1, list2: sequences of numbers with the same number of elements.

    Returns:
        -1 (after printing a message) if the sequences differ in length,
        0 if both sequences are empty or if either one has zero magnitude
        (every element is 0), otherwise
        dot(list1, list2) / (|list1| * |list2|).
    """
    if len(list1) != len(list2):
        print("Lists do not have the same number of elements")
        return -1
    if len(list1) == 0:
        return 0
    sumProd = sum(n1*n2 for n1, n2 in zip(list1, list2))
    sumL1 = sum(n**2 for n in list1)
    sumL2 = sum(n**2 for n in list2)

    # A vector made up entirely of zeros has no direction, so the similarity
    # is undefined; report 0 (same convention as the empty case above)
    # instead of dividing by zero.
    if sumL1 == 0 or sumL2 == 0:
        return 0

    return sumProd/(math.sqrt(sumL1)*math.sqrt(sumL2))

def cossim(gramWeights, jsonDoc = jsonDocIndex, jsonTFM = jsonTFMatrix, absPath = os.getcwd()):
    """Print unweighted and weighted cosine similarity for every document pair.

    Parameters:
        gramWeights: dict mapping an n-gram length to the multiplier applied
            to that length's values when building the weighted vectors.
        jsonDoc: file name of the JSON document index inside absPath; the
            loaded object is indexed by document id and yields the name
            printed for that document.
        jsonTFM: file name of the JSON matrix inside absPath; read as
            matrix[doc1][doc2]["like"][docId][gramLen] -> list of numbers.
        absPath: directory containing both JSON files.
            NOTE: the default is evaluated once, when this function is
            defined, not on each call.

    Returns:
        None. Results are printed. If either JSON file is missing, a
        FILE NOT FOUND message is printed and the function returns early.
    """
    jsonDocPath = absPath+'/'+jsonDoc
    # isfile() also covers a missing absPath and rejects a directory that
    # happens to share the file's name.
    if not os.path.isfile(jsonDocPath):
        print('FILE NOT FOUND: The specified file "' + jsonDocPath + '" does not exist')
        return
    jsonTFMPath = absPath+'/'+jsonTFM
    if not os.path.isfile(jsonTFMPath):
        print('FILE NOT FOUND: The specified file "' + jsonTFMPath + '" does not exist')
        return

    # File handles get their own names so the jsonDoc/jsonTFM arguments are
    # not overwritten.
    with open(jsonDocPath, 'r') as docFile:
        docIndex = json.load(docFile)
    with open(jsonTFMPath, 'r') as tfmFile:
        tfMatrix = json.load(tfmFile)

    for doc1, pairs in tfMatrix.items():
        for doc2, values in pairs.items():
            print(docIndex[doc1]+"({}) and ".format(doc1)+docIndex[doc2]+"({})".format(doc2))
            doc1like = values["like"][doc1]
            doc2like = values["like"][doc2]
            doc1freq = []
            doc2freq = []
            doc1weighted = []
            doc2weighted = []
            # Concatenate the per-gram-length lists into one vector per
            # document, once as stored and once scaled by the gram weight.
            for gramLen, weight in gramWeights.items():
                gramKey = str(gramLen)  # keys loaded from JSON are strings
                doc1freq += doc1like[gramKey]
                doc2freq += doc2like[gramKey]
                doc1weighted += [ val*weight for val in doc1like[gramKey] ]
                doc2weighted += [ val*weight for val in doc2like[gramKey] ]

            print("Cosine Similarity: ", calcCosSim(doc1freq, doc2freq))
            print("Weighted: ", calcCosSim(doc1weighted, doc2weighted), end="\n\n")
            
            
# Multiplier per n-gram length used for the "Weighted" similarity:
# values for gram length 2 are scaled by 1, values for gram length 3 by 5
gramWeight = {2:1, 3:5}
# Compare every document pair using the JSON files found in `absolute`
cossim(gramWeight, absPath = absolute)

Lafferty_pcfg-notes.txt(1) and Lafferty_pcfg-notes.txt(1)
Cosine Similarity:  1.0
Weighted:  1.0

Lafferty_pcfg-notes.txt(1) and LearningExecutableSemanticParsers.txt(2)
Cosine Similarity:  0.2030132572821122
Weighted:  0.23272391323219224

Lafferty_pcfg-notes.txt(1) and p123-zhang.txt(3)
Cosine Similarity:  0.8356716504931925
Weighted:  0.8356716504931925

Lafferty_pcfg-notes.txt(1) and Partial Parsing Finite-State Cascades.txt(4)
Cosine Similarity:  0.5849962754207856
Weighted:  0.5849962754207856

Lafferty_pcfg-notes.txt(1) and QueryEffectiveness.txt(5)
Cosine Similarity:  0.9561828874675149
Weighted:  0.9561828874675149

Lafferty_pcfg-notes.txt(1) and Text Clustering Algorithms.txt(6)
Cosine Similarity:  0.6375436308731434
Weighted:  0.6375436308731434

Lafferty_pcfg-notes.txt(1) and Text-Analytics-and-Natural-Language-Processing--.txt(7)
Cosine Similarity:  0.5636214801906779
Weighted:  0.5636214801906779

Lafferty_pcfg-notes.txt(1) and Workshop on Robust Methods in Analysis of Na

Cosine Similarity:  0.7568410568331935
Weighted:  0.7693779318129578

p123-zhang.txt(3) and 11.txt(49)
Cosine Similarity:  0.7230210236376229
Weighted:  0.8381665946408542

p123-zhang.txt(3) and 11_AMI_Cantoni.txt(50)
Cosine Similarity:  0.7295372041400853
Weighted:  0.7295372041400853

p123-zhang.txt(3) and 12.txt(51)
Cosine Similarity:  0.5510773761549316
Weighted:  0.5815790732295568

p123-zhang.txt(3) and 1203.txt(52)
Cosine Similarity:  0.7817359599705717
Weighted:  0.7817359599705717

p123-zhang.txt(3) and 122808.txt(53)
Cosine Similarity:  0
Weighted:  0

p123-zhang.txt(3) and 12628324.txt(54)
Cosine Similarity:  0.903696114115064
Weighted:  0.903696114115064

p123-zhang.txt(3) and 12_AMI_Bettiol.txt(55)
Cosine Similarity:  0.968962790249909
Weighted:  0.968962790249909

p123-zhang.txt(3) and 12_steps_paper.txt(56)
Cosine Similarity:  0.9999999999999999
Weighted:  0.9999999999999999

p123-zhang.txt(3) and 1306267704-OptimalExperienceinWorkandLeisure.txt(57)
Cosine Similarity:  0

Cosine Similarity:  0.6440906402680618
Weighted:  0.6440906402680618

Text-Analytics-and-Natural-Language-Processing--.txt(7) and 14_AMI_Cabrera.txt(72)
Cosine Similarity:  0.820412654142367
Weighted:  0.8606034498375779

Text-Analytics-and-Natural-Language-Processing--.txt(7) and 15.txt(73)
Cosine Similarity:  0.8380429976542928
Weighted:  0.7469902418718567

Text-Analytics-and-Natural-Language-Processing--.txt(7) and 1504.08083.txt(74)
Cosine Similarity:  0.5043619793543149
Weighted:  0.5043619793543149

Text-Analytics-and-Natural-Language-Processing--.txt(7) and 1506.01497v3.txt(75)
Cosine Similarity:  0.7072908037742268
Weighted:  0.7072908037742268

Text-Analytics-and-Natural-Language-Processing--.txt(7) and 1506.02025.txt(76)
Cosine Similarity:  0.4811403776364295
Weighted:  0.4811403776364295

Text-Analytics-and-Natural-Language-Processing--.txt(7) and 1512.03385v1.txt(77)
Cosine Similarity:  0.5079365079365079
Weighted:  0.5079365079365079

Text-Analytics-and-Natural-Language-P

Cosine Similarity:  0.9999999999999999
Weighted:  1.0

0044_559_ishitani_y.txt(11) and 00708543.txt(12)
Cosine Similarity:  0.8701395618766321
Weighted:  0.8933506694223418

0044_559_ishitani_y.txt(11) and 00823977.txt(13)
Cosine Similarity:  0.734891670044831
Weighted:  0.7739331030895461

0044_559_ishitani_y.txt(11) and 00863988.txt(14)
Cosine Similarity:  0.8221921916437785
Weighted:  0.9530729253345639

0044_559_ishitani_y.txt(11) and 009.txt(15)
Cosine Similarity:  0.587605960347241
Weighted:  0.587605960347241

0044_559_ishitani_y.txt(11) and 00969115.txt(16)
Cosine Similarity:  0.5036133395789492
Weighted:  0.490451443803563

0044_559_ishitani_y.txt(11) and 00a.txt(17)
Cosine Similarity:  0.9166666666666669
Weighted:  0.9166666666666669

0044_559_ishitani_y.txt(11) and 01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt(18)
Cosine Similarity:  0.4981496407206247
Weighted:  0.3912268639258096

0044_559_ishitani_y.txt(11) and 01046104.txt(1

00823977.txt(13) and 01217605.txt(20)
Cosine Similarity:  0.6353795214179134
Weighted:  0.7011166167308396

00823977.txt(13) and 01237554.txt(21)
Cosine Similarity:  0.8733132635761511
Weighted:  0.8733132635761511

00823977.txt(13) and 01308672.txt(22)
Cosine Similarity:  0.7786486411043855
Weighted:  0.8224338973607243

00823977.txt(13) and 01673370.txt(23)
Cosine Similarity:  0.6973779695320637
Weighted:  0.7507567703750052

00823977.txt(13) and 01_AMI_Alcaniz.txt(24)
Cosine Similarity:  0.8527151384298505
Weighted:  0.8960818751560127

00823977.txt(13) and 02_AMI_Riva.txt(25)
Cosine Similarity:  0.848528137423857
Weighted:  0.848528137423857

00823977.txt(13) and 03_AMI_Gaggioli.txt(26)
Cosine Similarity:  0.9864400504156211
Weighted:  0.9864400504156211

00823977.txt(13) and 04100664.txt(27)
Cosine Similarity:  0.6834604285811093
Weighted:  0.6051703320569299

00823977.txt(13) and 04359339.txt(28)
Cosine Similarity:  0.5358156892956704
Weighted:  0.5688030232035766

00823977.txt(1

Cosine Similarity:  1.0
Weighted:  1.0000000000000002

00a.txt(17) and 01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt(18)
Cosine Similarity:  0.9086895100729543
Weighted:  0.8912374332033544

00a.txt(17) and 01046104.txt(19)
Cosine Similarity:  0.4576620428378843
Weighted:  0.4576620428378843

00a.txt(17) and 01217605.txt(20)
Cosine Similarity:  1.0000000000000002
Weighted:  1.0000000000000002

00a.txt(17) and 01237554.txt(21)
Cosine Similarity:  0.6014167670256413
Weighted:  0.6014167670256413

00a.txt(17) and 01308672.txt(22)
Cosine Similarity:  0.9525793444156805
Weighted:  0.9525793444156805

00a.txt(17) and 01673370.txt(23)
Cosine Similarity:  0.7969722925954422
Weighted:  0.7969722925954422

00a.txt(17) and 01_AMI_Alcaniz.txt(24)
Cosine Similarity:  0.8230047070091902
Weighted:  0.8709270531385193

00a.txt(17) and 02_AMI_Riva.txt(25)
Cosine Similarity:  0.553638285434996
Weighted:  0.553638285434996

00a.txt(17) and 03_AMI_Gaggioli.tx

Cosine Similarity:  0.9384356103005076
Weighted:  0.9384356103005076

01217605.txt(20) and 1203.txt(52)
Cosine Similarity:  0.6963106238227914
Weighted:  0.6963106238227914

01217605.txt(20) and 122808.txt(53)
Cosine Similarity:  0
Weighted:  0

01217605.txt(20) and 12628324.txt(54)
Cosine Similarity:  0.9209109202723808
Weighted:  0.9594032236002469

01217605.txt(20) and 12_AMI_Bettiol.txt(55)
Cosine Similarity:  0.9999999999999998
Weighted:  0.9999999999999998

01217605.txt(20) and 12_steps_paper.txt(56)
Cosine Similarity:  0.9013878188659974
Weighted:  0.9013878188659974

01217605.txt(20) and 1306267704-OptimalExperienceinWorkandLeisure.txt(57)
Cosine Similarity:  1.0
Weighted:  1.0

01217605.txt(20) and 1311.2524v5.txt(58)
Cosine Similarity:  0.5827715174143586
Weighted:  0.5827715174143586

01217605.txt(20) and 1311.2901v3.txt(59)
Cosine Similarity:  0.9053662845289709
Weighted:  0.9053662845289709

01217605.txt(20) and 13_AMI_Laso.txt(60)
Cosine Similarity:  0.944911182523068
Wei

01_AMI_Alcaniz.txt(24) and 02_AMI_Riva.txt(25)
Cosine Similarity:  0.4543028679569389
Weighted:  0.559607264218168

01_AMI_Alcaniz.txt(24) and 03_AMI_Gaggioli.txt(26)
Cosine Similarity:  0.47755392036556654
Weighted:  0.5425775396629502

01_AMI_Alcaniz.txt(24) and 04100664.txt(27)
Cosine Similarity:  0.7324220137800287
Weighted:  0.7917405507680266

01_AMI_Alcaniz.txt(24) and 04359339.txt(28)
Cosine Similarity:  0.6423906871066664
Weighted:  0.6423906871066664

01_AMI_Alcaniz.txt(24) and 04407722.txt(29)
Cosine Similarity:  0.7954673856179907
Weighted:  0.7954673856179907

01_AMI_Alcaniz.txt(24) and 04_AMI_istag.txt(30)
Cosine Similarity:  0.43422109146704413
Weighted:  0.48351878063399784

01_AMI_Alcaniz.txt(24) and 05_AMI_Cortese.txt(31)
Cosine Similarity:  0.44333957046751543
Weighted:  0.5301685516922172

01_AMI_Alcaniz.txt(24) and 06_AMI__Piva.txt(32)
Cosine Similarity:  0.6584112852165059
Weighted:  0.6239176907377969

01_AMI_Alcaniz.txt(24) and 07_AMI_Kameas.txt(33)
Cosine Simil

Cosine Similarity:  1.0
Weighted:  1.0000000000000002

04100664.txt(27) and 04359339.txt(28)
Cosine Similarity:  0.6777118380219448
Weighted:  0.7152191702817127

04100664.txt(27) and 04407722.txt(29)
Cosine Similarity:  0.6105686838080208
Weighted:  0.6105686838080208

04100664.txt(27) and 04_AMI_istag.txt(30)
Cosine Similarity:  0.8295150620062532
Weighted:  0.8295150620062532

04100664.txt(27) and 05_AMI_Cortese.txt(31)
Cosine Similarity:  0.6680144904669488
Weighted:  0.6680144904669488

04100664.txt(27) and 06_AMI__Piva.txt(32)
Cosine Similarity:  0.7113412137959383
Weighted:  0.7856732912353833

04100664.txt(27) and 07_AMI_Kameas.txt(33)
Cosine Similarity:  0.9298956298171
Weighted:  0.9298956298171

04100664.txt(27) and 08_AMI_Kleiner.txt(34)
Cosine Similarity:  0.8381163549234938
Weighted:  0.8381163549234938

04100664.txt(27) and 09.txt(35)
Cosine Similarity:  1.0000000000000002
Weighted:  1.0000000000000002

04100664.txt(27) and 09_AMI_Schmidt.txt(36)
Cosine Similarity:  0.67

Weighted:  1.0000000000000002

04407722.txt(29) and 04_AMI_istag.txt(30)
Cosine Similarity:  0.7569781192451159
Weighted:  0.7569781192451159

04407722.txt(29) and 05_AMI_Cortese.txt(31)
Cosine Similarity:  0.8888888888888888
Weighted:  0.8888888888888888

04407722.txt(29) and 06_AMI__Piva.txt(32)
Cosine Similarity:  0.7174960033341753
Weighted:  0.830780403254297

04407722.txt(29) and 07_AMI_Kameas.txt(33)
Cosine Similarity:  0.8001322641986387
Weighted:  0.8001322641986387

04407722.txt(29) and 08_AMI_Kleiner.txt(34)
Cosine Similarity:  0.2408551192023351
Weighted:  0.2408551192023351

04407722.txt(29) and 09.txt(35)
Cosine Similarity:  0.9375
Weighted:  0.9375

04407722.txt(29) and 09_AMI_Schmidt.txt(36)
Cosine Similarity:  0.7723749163823411
Weighted:  0.7723749163823411

04407722.txt(29) and 0e.txt(37)
Cosine Similarity:  0.5508226327552436
Weighted:  0.5508226327552436

04407722.txt(29) and 1.txt(38)
Cosine Similarity:  0.691094740465088
Weighted:  0.8340576562282991

04407722.tx

Cosine Similarity:  0.44908642597185827
Weighted:  0.4893359055302826

05_AMI_Cortese.txt(31) and 1306267704-OptimalExperienceinWorkandLeisure.txt(57)
Cosine Similarity:  0.9185586535436917
Weighted:  0.9185586535436917

05_AMI_Cortese.txt(31) and 1311.2524v5.txt(58)
Cosine Similarity:  0.7814678814950932
Weighted:  0.7814678814950932

05_AMI_Cortese.txt(31) and 1311.2901v3.txt(59)
Cosine Similarity:  0.6123724356957945
Weighted:  0.6123724356957945

05_AMI_Cortese.txt(31) and 13_AMI_Laso.txt(60)
Cosine Similarity:  0.3220638909720264
Weighted:  0.41014078184272246

06_AMI__Piva.txt(32) and Partial Parsing Finite-State Cascades.txt(4)
Cosine Similarity:  0.6875436785532983
Weighted:  0.6875436785532983

06_AMI__Piva.txt(32) and QueryEffectiveness.txt(5)
Cosine Similarity:  0.6417763105978397
Weighted:  0.6241579547584742

06_AMI__Piva.txt(32) and Text Clustering Algorithms.txt(6)
Cosine Similarity:  0.6295840766367962
Weighted:  0.7019352048923426

06_AMI__Piva.txt(32) and Text-Analyti

Weighted:  0.5193448648642688

09_AMI_Schmidt.txt(36) and 12.txt(51)
Cosine Similarity:  0.6796919458447578
Weighted:  0.7462731443456728

09_AMI_Schmidt.txt(36) and 1203.txt(52)
Cosine Similarity:  0.4111446986794089
Weighted:  0.4111446986794089

09_AMI_Schmidt.txt(36) and 122808.txt(53)
Cosine Similarity:  0
Weighted:  0

09_AMI_Schmidt.txt(36) and 12628324.txt(54)
Cosine Similarity:  0.7255220489327023
Weighted:  0.7922384660264228

09_AMI_Schmidt.txt(36) and 12_AMI_Bettiol.txt(55)
Cosine Similarity:  0.9576412229920795
Weighted:  0.9596866706515367

09_AMI_Schmidt.txt(36) and 12_steps_paper.txt(56)
Cosine Similarity:  0.8623748005545344
Weighted:  0.8623748005545344

09_AMI_Schmidt.txt(36) and 1306267704-OptimalExperienceinWorkandLeisure.txt(57)
Cosine Similarity:  0.7699607172920183
Weighted:  0.7699607172920183

09_AMI_Schmidt.txt(36) and 1311.2524v5.txt(58)
Cosine Similarity:  0.7656633137858437
Weighted:  0.7656633137858437

09_AMI_Schmidt.txt(36) and 1311.2901v3.txt(59)
Cosin

Cosine Similarity:  1.0
Weighted:  1.0000000000000002

10.1.1.130.6691.txt(41) and 10.1.1.32.4689.txt(42)
Cosine Similarity:  0.6413725092738025
Weighted:  0.6413725092738025

10.1.1.130.6691.txt(41) and 10.1.1.62.889.txt(43)
Cosine Similarity:  0.40613542361574634
Weighted:  0.4374934831278582

10.1.1.130.6691.txt(41) and 10.1.1.65.7635.txt(44)
Cosine Similarity:  0.4892961080977772
Weighted:  0.6066726071999665

10.1.1.130.6691.txt(41) and 10.1.1.72.8127.txt(45)
Cosine Similarity:  0.40325413956480033
Weighted:  0.4330361347745447

10.1.1.130.6691.txt(41) and 10.1.1.91.9906.txt(46)
Cosine Similarity:  0.7805652127029818
Weighted:  0.835721388470872

10.1.1.130.6691.txt(41) and 10.txt(47)
Cosine Similarity:  0.5435046346204712
Weighted:  0.5954250458263362

10.1.1.130.6691.txt(41) and 10_AMI_Simplicity.txt(48)
Cosine Similarity:  0.5048140126348761
Weighted:  0.5779877324170204

10.1.1.130.6691.txt(41) and 11.txt(49)
Cosine Similarity:  0.3843368832999644
Weighted:  0.4271666429202335

Cosine Similarity:  1.0000000000000002
Weighted:  1.0

10.txt(47) and 10_AMI_Simplicity.txt(48)
Cosine Similarity:  0.8537860945941769
Weighted:  0.8800973696536624

10.txt(47) and 11.txt(49)
Cosine Similarity:  0.39375864632999874
Weighted:  0.5429993062253342

10.txt(47) and 11_AMI_Cantoni.txt(50)
Cosine Similarity:  0.3760228120742469
Weighted:  0.37614435991043654

10.txt(47) and 12.txt(51)
Cosine Similarity:  0.47211744478793355
Weighted:  0.6692019988660819

10.txt(47) and 1203.txt(52)
Cosine Similarity:  0.7268237938336418
Weighted:  0.7268237938336418

10.txt(47) and 122808.txt(53)
Cosine Similarity:  0
Weighted:  0

10.txt(47) and 12628324.txt(54)
Cosine Similarity:  0.5234279873715308
Weighted:  0.5234279873715308

10.txt(47) and 12_AMI_Bettiol.txt(55)
Cosine Similarity:  0.9141379262169076
Weighted:  0.9141379262169076

10.txt(47) and 12_steps_paper.txt(56)
Cosine Similarity:  0.4707980924789909
Weighted:  0.5804496964248881

10.txt(47) and 1306267704-OptimalExperienceinWork

Weighted:  0.763065034982721

12_AMI_Bettiol.txt(55) and 00023.txt(9)
Cosine Similarity:  0.7905694150420948
Weighted:  0.7905694150420948

12_AMI_Bettiol.txt(55) and 12_AMI_Bettiol.txt(55)
Cosine Similarity:  1.0
Weighted:  1.0

12_AMI_Bettiol.txt(55) and 12_steps_paper.txt(56)
Cosine Similarity:  0.5113099925649136
Weighted:  0.5113099925649136

12_AMI_Bettiol.txt(55) and 1306267704-OptimalExperienceinWorkandLeisure.txt(57)
Cosine Similarity:  0.8871662490696838
Weighted:  0.9476425042121494

12_AMI_Bettiol.txt(55) and 1311.2524v5.txt(58)
Cosine Similarity:  0.9363821838346237
Weighted:  0.9363821838346237

12_AMI_Bettiol.txt(55) and 1311.2901v3.txt(59)
Cosine Similarity:  0.8819171036881969
Weighted:  0.8819171036881969

12_AMI_Bettiol.txt(55) and 13_AMI_Laso.txt(60)
Cosine Similarity:  0.2963820186532697
Weighted:  0.44271178291056323

12_steps_paper.txt(56) and Text Clustering Algorithms.txt(6)
Cosine Similarity:  0.7990424657095319
Weighted:  0.8482092383586946

12_steps_paper.tx

In [None]:
# Trigram parser-based inverted file 
# (TF-IDF to remove trigrams common to most or all documents)

In [None]:
# Clustering algorithm based on trigram inverted file

In [None]:
# Add bigram parser-based info to inverted file

In [None]:
# Implement clustering on bigram inverted file