In [1]:
import logging
import os # For file/directory interaction
import time, sys
from datetime import datetime, date # For log data
import re # For text replacement
import spacy # Pipeline processes (stopword and punctuation removal, lemmatization)
from nltk.stem.snowball import SnowballStemmer # Pipeline process for stemming
import json

# Root of the pipeline: every input, intermediate and output file lives under
# '<current working directory>/Files'
workingDir = os.getcwd()
commonDir = os.path.join(workingDir,'Files')
# Stage directories, listed in the order the pipeline functions use them by default
pdfDir = os.path.join(commonDir,'PDF')  # default input of pdfToText
txtFilesDir = os.path.join(commonDir,'Text Files')  # default output of pdfToText, input of rmvN
rtnFilesDir = os.path.join(commonDir,'n Removed')  # default output of rmvN, input of checkSpaces/rmvStopWords
spaceFilesDir = os.path.join(commonDir,'No Spaces')  # default destination for files flagged by checkSpaces
swFilesDir = os.path.join(commonDir,'Stop Words')  # default output of rmvStopWords, input of rmvNonEng
engFilesDir = os.path.join(commonDir,'English Words')  # default output of rmvNonEng, input of stem
stemFilesDir = os.path.join(commonDir,'Stemmed')  # default output of stem, input of the index builders
# JSON index files
jsonDocIndex = os.path.join(commonDir,'doc_dictionary.json')  # default target of update_doc_index
jsonInvIndex = os.path.join(commonDir,'inverted_index.json')  # default target of update_inv_index
jsonGramIndex = os.path.join(commonDir,'gram_index.json')  # default target of update_gram_index
jsonTFMatrix = os.path.join(commonDir,'tf_matrix.json')  # not referenced by any function in the cells shown here
# NOTE(review): this assigns an attribute on the `logging` module itself, not on a
#     logger object; `propagate` is a Logger attribute, so this line probably has
#     no effect -- confirm what was intended
logging.propagate = False 
# Only messages of level ERROR and above pass the root logger
logging.getLogger().setLevel(logging.ERROR)

In [17]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf#process_pdf
from pdfminer.pdfparser import PDFParser
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO

# Uses PDFminer3k to extract text from PDF documents
def getText(pdfPath):
    """Extract the text of one PDF file through pdfminer's TextConverter.

    Args:
        pdfPath: Path of the PDF file to read.

    Returns:
        Everything the converter wrote to its in-memory buffer, as one string.

    Raises:
        Any exception raised by open() or by pdfminer while processing the
        file; nothing is caught here, callers decide how to handle failures.
    """
    # Sets up necessary objects
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    device = TextConverter(rsrcmgr, sio, laparams=LAParams())
    try:
        # Reads file to the text converter
        with open(pdfPath, 'rb') as pdfFile:
            process_pdf(rsrcmgr, device, pdfFile)
        # Retrieves text result (evaluated before the buffer is closed below)
        return sio.getvalue()
    finally:
        # Runs on failure too, so a protected or corrupt PDF does not leak
        # the converter and its buffer
        device.close()
        sio.close()

# Pre-condition: All PDF files to be processed are in the sub-directory
#     pdfDir, and pdfDir is in workingDir. workingDir is by default the
#     'Files' directory under the directory in which the program is executed
# Post-condition: All PDF files processed without error are converted to
#     text files which are placed in a sub-directory 'Text Files'
# NOTE: This will process all documents by default. Change the value of
#     'limit' to limit the number of documents processed at once
def pdfToText(pdfPath = pdfDir, txtPath = txtFilesDir, workingDir = commonDir, limit = -1):
    """Convert every '.pdf' file in pdfPath to a '.txt' file in txtPath.

    PDFs whose text file already exists are skipped. Extraction failures are
    not raised: they are appended to 'log.txt' in workingDir together with a
    reference number that is also printed.

    Args:
        pdfPath: Directory containing the PDF files.
        txtPath: Directory receiving the text files; created if missing.
        workingDir: Directory in which 'log.txt' is appended to.
        limit: Stop after this many successful conversions. Values <= 0
            mean "no limit".

    Returns:
        None. Progress is printed to stdout.
    """
    if not os.path.exists(pdfPath):
        print('The specified directory "' + pdfPath + '" does not exist')
        return
    # Creates 'Text Files' directory for converted PDFs
    if not os.path.exists(txtPath):
        os.mkdir(txtPath)

    pdfNames = [entity.name for entity in os.scandir(pdfPath) if entity.name.endswith('.pdf')]
    totalNum = len(pdfNames)
    # Denominator of the progress message: the limit when one is set,
    #     otherwise the number of PDFs found
    target = limit if limit > 0 else totalNum
    maxShown = 10 # Longest file-name prefix echoed for already-converted files

    docNum = 0
    docErr = 1
    with open(os.path.join(workingDir,'log.txt'), 'a+', encoding="utf-8") as log:
        # One header per run; the numbered error entries follow it
        log.write("PDF to Text\n" + date.today().strftime("%m/%d/%y") +
                  " at " + datetime.now().strftime("%H:%M:%S") + "\n\n")
        for pdfName in pdfNames:
            index = -4 # Remove '.pdf' from file name when creating '.txt' file
            fileName = pdfName[:index]+'.txt'
            txtFilePath = os.path.join(txtPath,fileName)
            print("Now on '"+pdfName+"'. . . ", end='')

            if os.path.exists(txtFilePath):
                if len(fileName) > maxShown:
                    print('"'+fileName[:maxShown]+'...txt"', end='')
                else:
                    print('"'+fileName+'"', end='')
                print(' already exists')
            else:
                # Some documents are protected, corrupted, etc. and text cannot
                #     be extracted. This is deliberately best-effort: the
                #     exception is recorded in log.txt and the loop continues
                try:
                    text = getText(os.path.join(pdfPath,pdfName))
                    # 'with' guarantees the text file is flushed and closed
                    with open(txtFilePath, 'w', encoding="utf-8") as txtFile:
                        txtFile.write(text)
                except Exception as e:
                    log.write(str(docErr)+": " + pdfName + ": \n\t" + str(e)+"\n")
                    print("there was an error reading this document. See log for details. Reference number "+str(docErr)+".\n")
                    docErr += 1
                else:
                    docNum += 1
                    print("done ({}/{})".format(docNum, target))
            if docNum >= limit and limit > 0:
                print("PDF to Text was stopped after "+str(docNum)+" documents.")
                break
        log.write("\n\n")
pdfToText(limit = 10)

Now on '00023.pdf'. . . "00023.txt" already exists
Now on '0028_504_uchida_s.pdf'. . . "0028_504_u...txt" already exists
Now on '0044_559_ishitani_y.pdf'. . . "0044_559_i...txt" already exists
Now on '00708543.pdf'. . . "00708543.t...txt" already exists
Now on '00823977.pdf'. . . "00823977.t...txt" already exists
Now on '00863988.pdf'. . . "00863988.t...txt" already exists
Now on '009.pdf'. . . "009.txt" already exists
Now on '00969115.pdf'. . . "00969115.t...txt" already exists
Now on '00a.pdf'. . . "00a.txt" already exists
Now on '01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.pdf'. . . "01-Altamur...txt" already exists
Now on '01046104.pdf'. . . "01046104.t...txt" already exists
Now on '01217605.pdf'. . . "01217605.t...txt" already exists
Now on '01237554.pdf'. . . "01237554.t...txt" already exists
Now on '01308672.pdf'. . . "01308672.t...txt" already exists
Now on '01673370.pdf'. . . "01673370.t...txt" already exists
Now on '01_AMI_Alcaniz.

Now on '9-Planning.pdf'. . . there was an error reading this document. See log for details. Reference number 3.

Now on '9.pdf'. . . "9.txt" already exists
Now on '90.pdf'. . . there was an error reading this document. See log for details. Reference number 4.

Now on 'A  cognitive affective model of organizational communication for designing IT.pdf'. . . "A  cogniti...txt" already exists
Now on 'A Bible for the Disability Field.pdf'. . . "A Bible fo...txt" already exists
Now on 'A Comparisoon of Binarization Methods for Historical Archive Documents.pdf'. . . "A Comparis...txt" already exists
Now on 'A light-weight text image processing method for handheld embedded cameras.pdf'. . . "A light-we...txt" already exists
Now on 'A Mathematical Theory of Communication.pdf'. . . "A Mathemat...txt" already exists
Now on 'A Model for Types and Levels of Human Interaction with Automation.pdf'. . . "A Model fo...txt" already exists
Now on 'A Parser for Real-Time Speech Synthesis of Conversational 

In [19]:
# THIS MUST BE AFTER TEXT CONVERSION BEFORE ANY OTHER FUNCTIONS
# Function to remove \n
def rmvN(txtPath = txtFilesDir, rtnPath = rtnFilesDir):
    """Copy every file of txtPath to rtnPath with line breaks removed.

    A hyphen directly followed by a line break is removed together with the
    break; every remaining line break is removed as well. Files that already
    exist in rtnPath are skipped.

    Args:
        txtPath: Directory holding the text files to clean.
        rtnPath: Directory receiving the cleaned files; created if missing.

    Returns:
        None. Progress is printed to stdout.
    """
    # Checks that text file directory exists/is correct
    if not os.path.exists(txtPath):
        print('The specified directory "' + txtPath + '" does not exist')
        print('Check that the path is correct. You must call pdfToText first and the directory will be created')
        return
    if not os.path.exists(rtnPath):
        os.mkdir(rtnPath)

    for entity in os.scandir(txtPath):
        # Sub-directories cannot be opened as text; leave them alone
        if not entity.is_file():
            continue
        rtnFilePath = os.path.join(rtnPath,entity.name)
        if os.path.exists(rtnFilePath):
            continue
        print("Now on '"+entity.name+"'. . . ", end='')
        # Read (read-only) before the output file is created, so a failed read
        #     does not leave an empty file that later runs would skip
        with open(os.path.join(txtPath,entity.name), 'r', encoding='utf-8') as txtFile:
            text = txtFile.read()
        # Substitutes returns and hyphens at the end of each line with empty strings
        # NOTE(review): removing '\n' without inserting a space joins the last
        #     word of a line to the first word of the next one -- confirm this
        #     is what the later stages expect
        text = text.replace('-\n', '').replace('\n', '')
        with open(rtnFilePath, 'w', encoding='utf-8') as rtnFile:
            rtnFile.write(text)
        print("done")
rmvN()

Now on '103583991.txt'. . . done
Now on '124972.txt'. . . done
Now on '124973.txt'. . . done
Now on '124974.txt'. . . done
Now on '124977.txt'. . . done
Now on '124978.txt'. . . done
Now on '124979.txt'. . . done
Now on '13.txt'. . . done
Now on '130771.txt'. . . done
Now on '135706.txt'. . . done
Now on '136540.txt'. . . done
Now on '136542.txt'. . . done
Now on '136543.txt'. . . done
Now on '136546.txt'. . . done
Now on '136644.txt'. . . done
Now on '139679.txt'. . . done
Now on '139681.txt'. . . done
Now on '139682.txt'. . . done
Now on '139684.txt'. . . done
Now on '139973.txt'. . . done
Now on '139978.txt'. . . done
Now on '139979.txt'. . . done
Now on '14-18436.txt'. . . done
Now on '140594.txt'. . . done
Now on '140595.txt'. . . done
Now on '140597.txt'. . . done
Now on '140599.txt'. . . done
Now on '142709.txt'. . . done
Now on '150752.txt'. . . done
Now on '156665.txt'. . . done
Now on '159814.txt'. . . done
Now on '159815.txt'. . . done
Now on '175922.txt'. . . done
Now on '1

In [24]:
# Function to move files without (enough) spaces to the 'No Spaces' directory
def checkSpaces(txtPath = rtnFilesDir, spacesPath = spaceFilesDir, workingDir = commonDir):
    """Move text files that look unusable out of txtPath.

    A file is flagged when splitting its text on single spaces yields fewer
    pieces than a tenth of its character count, or when the text is shorter
    than 100 characters (which includes empty files). The name of every
    flagged file is appended to 'Spaces.txt' in workingDir. The file is then
    moved to spacesPath, or deleted from txtPath when a file of the same name
    is already in spacesPath.

    Args:
        txtPath: Directory holding the files to inspect.
        spacesPath: Directory receiving flagged files; created if missing.
        workingDir: Directory in which 'Spaces.txt' is appended to.

    Returns:
        None. Progress is printed to stdout.
    """
    # Checks that text file directory exists
    if not os.path.exists(txtPath):
        print('The specified directory "' + txtPath + '" does not exist')
        print('Check that the path is correct. You must call rmvN first and the directory will be created')
        return
    if not os.path.exists(spacesPath):
        os.mkdir(spacesPath)

    with open(os.path.join(workingDir,'Spaces.txt'), 'a+', encoding='utf-8') as spaces:
        spaces.write("Check Spaces\n" + date.today().strftime("%m/%d/%y") +
                  " at " + datetime.now().strftime("%H:%M:%S") + "\n\n")
        # Snapshot the names first: files are moved/removed inside the loop and
        #     the directory should not change underneath a live scandir iterator
        fileNames = [entity.name for entity in os.scandir(txtPath) if entity.is_file()]
        for fileName in fileNames:
            print("Now on '"+fileName+"'. . . ", end='')
            srcPath = os.path.join(txtPath,fileName)
            # The handle is closed before any rename/remove (required on
            #     Windows) and also for files that pass the check
            with open(srcPath, 'r', encoding='utf-8') as txtFile:
                text = txtFile.read()
            pieces = text.split(' ')
            if len(pieces) < len(text)/10 or len(text) < 100:
                spaces.write(fileName+'\n')
                dstPath = os.path.join(spacesPath,fileName)
                if not os.path.exists(dstPath):
                    os.rename(srcPath, dstPath)
                else:
                    os.remove(srcPath)
            print("done")
        spaces.write('\n\n')
# The default workingDir (commonDir) is used here: the name previously passed
#     for it is not defined anywhere in this notebook
checkSpaces()

Now on '00023.txt'. . . done
Now on '0028_504_uchida_s.txt'. . . done
Now on '0044_559_ishitani_y.txt'. . . done
Now on '00708543.txt'. . . done
Now on '00823977.txt'. . . done
Now on '00863988.txt'. . . done
Now on '009.txt'. . . done
Now on '00969115.txt'. . . done
Now on '00a.txt'. . . done
Now on '01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt'. . . done
Now on '01046104.txt'. . . done
Now on '01217605.txt'. . . done
Now on '01237554.txt'. . . done
Now on '01308672.txt'. . . done
Now on '01673370.txt'. . . done
Now on '01_AMI_Alcaniz.txt'. . . done
Now on '02_AMI_Riva.txt'. . . done
Now on '03_AMI_Gaggioli.txt'. . . done
Now on '04100664.txt'. . . done
Now on '04359339.txt'. . . done
Now on '04407722.txt'. . . done
Now on '04_AMI_istag.txt'. . . done
Now on '05_AMI_Cortese.txt'. . . done
Now on '06_AMI__Piva.txt'. . . done
Now on '07_AMI_Kameas.txt'. . . done
Now on '08_AMI_Kleiner.txt'. . . done
Now on '09.txt'. . . done
Now on '09_AMI

In [26]:
# Function to remove stopwords
def rmvStopWords(txtPath = rtnFilesDir, swPath = swFilesDir):
    """Write a stop-word-free, lemmatized copy of every file of txtPath to swPath.

    Each document is run through the spaCy pipeline "en_core_web_sm". Tokens
    flagged as stop words or punctuation, and tokens whose text is numeric,
    are dropped; the lower-cased lemmas of the remaining tokens are written
    separated by single spaces. Files that already exist in swPath are skipped.

    Args:
        txtPath: Directory holding the input text files.
        swPath: Directory receiving the processed files; created if missing.

    Returns:
        None. Progress is printed to stdout.
    """
    if not os.path.exists(txtPath):
        print('The specified directory "' + txtPath + '" does not exist')
        print('Check that the path is correct. You must call rmvN first and the directory will be created')
        return
    if not os.path.exists(swPath):
        os.mkdir(swPath)

    nlp = spacy.load("en_core_web_sm")
    for entity in os.scandir(txtPath):
        swFilePath = os.path.join(swPath,entity.name)
        if os.path.exists(swFilePath):
            continue
        print("Now on '"+entity.name+"'. . . ", end='')
        with open(os.path.join(txtPath,entity.name), 'r', encoding='utf-8') as txtFile:
            doc = nlp(txtFile.read())
        noStopWords = [
            token.lemma_.lower()
            for token in doc
            if not (token.is_stop or token.is_punct or token.text.isnumeric())
        ]
        # The output file is created only after the pipeline succeeded. If it
        #     were created first, a failure (e.g. spaCy rejecting a text longer
        #     than nlp.max_length) would leave an empty file behind and every
        #     later run would skip the document
        with open(swFilePath, 'w', encoding='utf-8') as swFile:
            swFile.write(" ".join(noStopWords))
        print("done")

rmvStopWords()

Now on 'Abrahamian_E.txt'. . . done
Now on 'AccessibilitySoHard.txt'. . . done
Now on 'AchievingKaiserPermanenteQuality2016McHugh.txt'. . . done
Now on 'AcquiringMastery.txt'. . . done
Now on 'Acquity_Group_Whitepaper_-_The_Emerging_Retailer_Guide.txt'. . . done
Now on 'acsc99.txt'. . . done
Now on 'Acting_to_know_improving_creativity_in_t.txt'. . . done
Now on 'ActionInResearch.txt'. . . done
Now on 'ActorModelOfComputation.txt'. . . done


In [32]:
# Remove non-english words
def rmvNonEng(txtPath = swFilesDir, engPath = engFilesDir, workingDir = commonDir):
    """Write a copy of every file of txtPath to engPath keeping only dictionary words.

    A word is kept when it is a key of 'words_dictionary.json' (loaded from
    workingDir) and is not a one- or two-letter combination of the lower-case
    letters a-z. Words are taken by splitting the text on single spaces and
    are written back separated by single spaces. Files that already exist in
    engPath are skipped.

    Args:
        txtPath: Directory holding the input text files.
        engPath: Directory receiving the filtered files; created if missing.
        workingDir: Directory containing 'words_dictionary.json'.

    Returns:
        None. Progress is printed to stdout.
    """
    if not os.path.exists(txtPath):
        print('The specified directory "' + txtPath + '" does not exist')
        print('Check that the path is correct. You must call rmvStopWords first and the directory will be created')
        return
    # Reported the same way as the other missing prerequisites instead of
    #     surfacing as a traceback
    dictPath = os.path.join(workingDir,'words_dictionary.json')
    if not os.path.exists(dictPath):
        print('FILE NOT FOUND: The specified file "' + dictPath + '" does not exist')
        print('Check that the path is correct. The word dictionary must be placed there before calling rmvNonEng')
        return
    if not os.path.exists(engPath):
        os.mkdir(engPath)
    with open(dictPath) as json_file:
        words = json.load(json_file)

    # Every one- and two-letter combination of a-z; a set keeps the per-word
    #     membership test constant-time
    alph = 'abcdefghijklmnopqrstuvwxyz'
    lets = set(alph) | {let+char for let in alph for char in alph}

    for entity in os.scandir(txtPath):
        engFilePath = os.path.join(engPath,entity.name)
        if os.path.exists(engFilePath):
            continue
        print("Now on '"+entity.name+"'. . . ", end='')
        with open(os.path.join(txtPath,entity.name), 'r', encoding='utf-8') as txtFile:
            text = txtFile.read().split(' ')
        engChars = [word for word in text if word in words and word not in lets]
        # Created only once the content is ready, so a failed read does not
        #     leave an empty file that later runs would skip
        with open(engFilePath, 'w', encoding='utf-8') as engFile:
            engFile.write(" ".join(engChars))
        print("done")
rmvNonEng()

In [34]:
# Stem words in all documents
def stem(txtPath = engFilesDir, stemPath = stemFilesDir, workingDir = commonDir):
    """Write a stemmed copy of every file of txtPath to stemPath.

    The text is split on single spaces, each piece is passed through the
    English Snowball stemmer, and the results are written back separated by
    single spaces. Files that already exist in stemPath are skipped.

    Args:
        txtPath: Directory holding the input text files.
        stemPath: Directory receiving the stemmed files; created if missing.
        workingDir: Not used by this function; kept for call compatibility.

    Returns:
        None. Progress is printed to stdout.
    """
    if not os.path.exists(txtPath):
        print('The specified directory "' + txtPath + '" does not exist')
        print('Check that the path is correct. You must call rmvNonEng first and the directory will be created')
        return
    if not os.path.exists(stemPath):
        os.mkdir(stemPath)

    stemmer = SnowballStemmer(language='english')
    for entity in os.scandir(txtPath):
        stemFilePath = os.path.join(stemPath, entity.name)
        if os.path.exists(stemFilePath):
            continue
        print("Now on '"+entity.name+"'. . . ", end='')
        with open(os.path.join(txtPath,entity.name), 'r', encoding='utf-8') as txtFile:
            text = txtFile.read().split(' ')
        stemmed = [stemmer.stem(word) for word in text]
        # Created only once the content is ready, so a failed read does not
        #     leave an empty file that later runs would skip
        with open(stemFilePath, 'w', encoding='utf-8') as stemFile:
            stemFile.write(" ".join(stemmed))
        print("done")
stem()

Now on 'Abrahamian_E.txt'. . . done
Now on 'AccessibilitySoHard.txt'. . . done
Now on 'AchievingKaiserPermanenteQuality2016McHugh.txt'. . . done
Now on 'AcquiringMastery.txt'. . . done
Now on 'Acquity_Group_Whitepaper_-_The_Emerging_Retailer_Guide.txt'. . . done
Now on 'acsc99.txt'. . . done
Now on 'Acting_to_know_improving_creativity_in_t.txt'. . . done
Now on 'ActionInResearch.txt'. . . done
Now on 'ActorModelOfComputation.txt'. . . done


In [35]:
# Create JSON with doc_id : file_name
def update_doc_index(jsonPath = jsonDocIndex, txtPath = stemFilesDir):
    """Add every file of txtPath that is not indexed yet to the document index.

    The index is a JSON object mapping a document ID to a file name. Existing
    entries are kept; each new file name receives the ID
    `number of entries + 1`.

    Args:
        jsonPath: Path of the document-index JSON file; created if missing.
        txtPath: Directory whose file names are indexed.

    Returns:
        None. The updated index is written to jsonPath.
    """
    if not os.path.exists(txtPath):
        print('The specified directory "' + txtPath + '" does not exist')
        print('Check that the path is correct. You must call stem first and the directory will be created')
        return
    if os.path.exists(jsonPath):
        with open(jsonPath, 'r') as jsonFile:
            docIndex = json.load(jsonFile)
    else:
        docIndex = {}
    # Set lookup instead of scanning docIndex.values() once per file
    indexed = set(docIndex.values())
    # os.listdir returns names in arbitrary order; sorting makes the IDs
    #     given to new files reproducible
    for fileName in sorted(os.listdir(txtPath)):
        if fileName in indexed:
            continue
        # IDs are stored as strings: that is what json.load gives back for the
        #     existing keys, so all keys in memory have one type
        docIndex[str(len(docIndex)+1)] = fileName
        indexed.add(fileName)
    with open(jsonPath, 'w') as jsonFile:
        json.dump(docIndex, jsonFile, indent=4)

update_doc_index()

In [36]:
# Create JSON with {doc_id1 : {"2" : {"gram1":freq, "gram2":freq}, "3" : {"gram1":freq}}, doc_id2 : {...}}
def update_inv_index(ngrams, jsonDocPath = jsonDocIndex, jsonInvPath = jsonInvIndex, txtPath = stemFilesDir, workingDir = commonDir):
    """Count the n-grams of every indexed document and store them per document.

    For each size n in `ngrams` and each document of the document index, the
    text is split on single spaces and every run of n consecutive pieces
    (joined by one space) is counted. Combinations of document and n that are
    already present in the saved index are not recomputed.

    Args:
        ngrams: Iterable of n-gram sizes (ints) to index.
        jsonDocPath: Path of the document index ({doc_id: file_name}).
        jsonInvPath: Path of the inverted-index JSON file; created if missing.
        txtPath: Directory containing the files named in the document index.
        workingDir: Not used by this function; kept for call compatibility.

    Returns:
        None. The updated index is written to jsonInvPath.
    """
    if not os.path.exists(txtPath):
        print('The specified directory "' + txtPath + '" does not exist')
        print('Check that the path is correct. You must call stem first and the directory will be created')
        return
    if not os.path.exists(jsonDocPath):
        print('FILE NOT FOUND: The specified file "' + jsonDocPath + '" does not exist')
        print('Check that the path is correct. You must call update_doc_index first and the file will be created')
        return
    if os.path.exists(jsonInvPath):
        with open(jsonInvPath, 'r') as jsonFile:
            invIndex = json.load(jsonFile)
    else:
        invIndex = {}
    # Loads document index
    with open(jsonDocPath, 'r') as jsonFile:
        docIndex = json.load(jsonFile)

    for ngram in ngrams:
        # JSON object keys are always strings. Looking the size up as a string
        #     is what lets a reloaded index recognise work that is already
        #     done; an int key would never match and would be dumped as a
        #     second, duplicate key
        sizeKey = str(ngram)
        for docID in docIndex:
            docGrams = invIndex.setdefault(docID, {})
            if sizeKey in docGrams:
                continue
            with open(os.path.join(txtPath,docIndex[docID]), 'r', encoding='utf-8') as txtFile:
                text = txtFile.read().split(' ')
            # Sliding window by index; no element is removed from the list
            counts = {}
            for start in range(len(text) - ngram + 1):
                term = " ".join(text[start:start+ngram])
                counts[term] = counts.get(term, 0) + 1
            docGrams[sizeKey] = counts
    with open(jsonInvPath, 'w') as jsonFile:
        json.dump(invIndex, jsonFile, indent=4)

update_inv_index(list(range(2,4)))

In [38]:
# Create JSON with {"gram1" : {doc_id1:freq, doc_id2:freq}, "gram2" : {doc_id1:freq}}
def update_gram_index(minFreq = 1, jsonInvPath = jsonInvIndex, jsonGramPath = jsonGramIndex, txtPath = stemFilesDir):
    """Regroup the inverted index by term and save it as the gram index.

    Every term of the inverted index whose frequency in a document is at
    least minFreq is recorded as gramIndex[term][doc_id] = frequency. Entries
    already stored in the gram index file are loaded first and kept.

    Args:
        minFreq: Smallest per-document frequency a term needs to be recorded.
        jsonInvPath: Path of the inverted-index JSON file (must exist).
        jsonGramPath: Path of the gram-index JSON file; created if missing.
        txtPath: Not used by this function; kept for call compatibility.

    Returns:
        None. The updated index is written to jsonGramPath.
    """
    # The inverted index is the only required input
    if not os.path.exists(jsonInvPath):
        print('FILE NOT FOUND: The specified file "' + jsonInvPath + '" does not exist')
        print('Check that the path is correct. You must call update_inv_index first and the file will be created')
        return

    # Starts from the saved gram index when there is one
    gramIndex = {}
    if os.path.exists(jsonGramPath):
        with open(jsonGramPath, 'r') as savedFile:
            gramIndex = json.load(savedFile)
    with open(jsonInvPath, 'r') as invFile:
        invIndex = json.load(invFile)

    for docID in invIndex:
        # One dictionary of term counts per indexed n-gram size
        for termCounts in invIndex[docID].values():
            for term in termCounts:
                count = termCounts[term]
                if count < minFreq:
                    continue
                gramIndex.setdefault(term, {})[docID] = count

    # Writes gramIndex to gram JSON file
    with open(jsonGramPath, 'w') as outFile:
        json.dump(gramIndex, outFile, indent=4)

update_gram_index()

In [39]:
# Print grams found in at least numDocs documents (3 by default)
def getGrams(numDocs = 3, jsonDocPath = jsonDocIndex, jsonGramPath = jsonGramIndex, txtPath = stemFilesDir, workingDir = commonDir):
    """Print every gram that occurs in at least numDocs documents.

    For each such gram one summary line is printed, followed by one
    tab-indented line per document giving the file name stored for that
    document ID in the document index.

    Args:
        numDocs: Minimum number of documents a gram must occur in.
        jsonDocPath: Path of the document index ({doc_id: file_name}).
        jsonGramPath: Path of the gram index ({gram: {doc_id: freq}}).
        txtPath: Only checked for existence.
        workingDir: Not used by this function; kept for call compatibility.

    Returns:
        None. Results are printed to stdout.
    """
    # (path, start of the error message, advice), checked in this order
    prerequisites = (
        (txtPath, 'The specified directory "',
         'Check that the path is correct. You must call stem first and the directory will be created'),
        (jsonDocPath, 'FILE NOT FOUND: The specified file "',
         'Check that the path is correct. You must call update_doc_index first and the file will be created'),
        (jsonGramPath, 'FILE NOT FOUND: The specified file "',
         'Check that the path is correct. You must call update_gram_index first and the file will be created'),
    )
    for requiredPath, problemStart, advice in prerequisites:
        if os.path.exists(requiredPath):
            continue
        print(problemStart + requiredPath + '" does not exist')
        print(advice)
        return

    # Loads document index
    with open(jsonDocPath, 'r') as docFile:
        docIndex = json.load(docFile)
    # Loads gram index
    with open(jsonGramPath, 'r') as gramFile:
        gramIndex = json.load(gramFile)

    for gram in gramIndex:
        docs = gramIndex[gram]
        hitCount = len(docs)
        if hitCount < numDocs:
            continue
        print("\""+gram+"\" found in "+str(hitCount)+" documents")
        for docID in docs:
            print("\t"+ str(docIndex[docID]))

getGrams(10)

"context free grammar" found in 14 documents
	Lafferty_pcfg-notes.txt
	LearningExecutableSemanticParsers.txt
	Workshop on Robust Methods in Analysis of Natural Language Data.txt
	00a.txt
	1.txt
	11.txt
	12.txt
	14.txt
	15.txt
	16.txt
	17.txt
	18.txt
	24.txt
	25.txt
"natur languag process" found in 34 documents
	LearningExecutableSemanticParsers.txt
	Text-Analytics-and-Natural-Language-Processing--.txt
	Workshop on Robust Methods in Analysis of Natural Language Data.txt
	p123-zhang.txt
	QueryEffectiveness.txt
	00a.txt
	09_AMI_Schmidt.txt
	1.txt
	10.txt
	11.txt
	12.txt
	14.txt
	1412.2306v2.txt
	15.txt
	16.txt
	17.txt
	18.txt
	19.txt
	20.txt
	21.txt
	22.txt
	23.txt
	24.txt
	25.txt
	3.txt
	4.txt
	5.txt
	51121C Business Analytics & Enterprise Software Publishing in the US Industry Report.txt
	6.txt
	7.txt
	8.txt
	9.txt
	A Parser for Real-Time Speech Synthesis of Conversational Texts.txt
	a2-jeong.txt
"languag process comput" found in 19 documents
	LearningExecutableSemanticParsers.txt
	00a.

	10_AMI_Simplicity.txt
	173-pap0297-bateman.txt
	1900008.1900088.txt
	2393216.2393281.txt
	2553062.2553065.txt
	280-p276-winberg.txt
	a2-jeong.txt
	Acting_to_know_improving_creativity_in_t.txt
"deep understand" found in 10 documents
	LearningExecutableSemanticParsers.txt
	Text-Analytics-and-Natural-Language-Processing--.txt
	Workshop on Robust Methods in Analysis of Natural Language Data.txt
	03_AMI_Gaggioli.txt
	04_AMI_istag.txt
	06_AMI__Piva.txt
	0e.txt
	16_58_10_733_Lead_Magazine_feature.txt
	427-1101-2-PB.txt
	A  cognitive affective model of organizational communication for designing IT.txt
"languag process" found in 39 documents
	LearningExecutableSemanticParsers.txt
	p123-zhang.txt
	QueryEffectiveness.txt
	Text-Analytics-and-Natural-Language-Processing--.txt
	Workshop on Robust Methods in Analysis of Natural Language Data.txt
	0044_559_ishitani_y.txt
	00a.txt
	04100664.txt
	04359339.txt
	09_AMI_Schmidt.txt
	1.txt
	10.txt
	11.txt
	12.txt
	14.txt
	1412.2306v2.txt
	15.txt
	16.txt
	1

	16.txt
	1606.03507.txt
	20.txt
	2009BookChapter.txt
	2009_trouva.txt
	2015.10.12_Millenson_Berenson.txt
	21.txt
	24.txt
	25.txt
	3.txt
	5.txt
	52411B Health & Medical Insurance in the US Industry Report.txt
	54151 IT Consulting in the US Industry Report.txt
	56160540.txt
	831-828-1-PB.txt
	9.txt
	AACOpenNewWorld.txt
	aar.txt
	Acquity_Group_Whitepaper_-_The_Emerging_Retailer_Guide.txt
"continu grow" found in 11 documents
	LearningExecutableSemanticParsers.txt
	06_AMI__Piva.txt
	16_58_10_733_Lead_Magazine_feature.txt
	33163615.txt
	33451B Medical Device Manufacturing in the US Industry Report.txt
	51121C Business Analytics & Enterprise Software Publishing in the US Industry Report.txt
	52411B Health & Medical Insurance in the US Industry Report.txt
	54151 IT Consulting in the US Industry Report.txt
	62211 Hospitals in the US Industry Report.txt
	62231 Specialty Hospitals in the US Industry Report.txt
	Acquity_Group_Whitepaper_-_The_Emerging_Retailer_Guide.txt
"end user" found in 11 docu

"inform servic" found in 13 documents
	QueryEffectiveness.txt
	05_AMI_Cortese.txt
	10.1.1.115.7110.txt
	10_AMI_Simplicity.txt
	13_AMI_Laso.txt
	14_AMI_Cabrera.txt
	1st Stop Checklist- Business_Secretarial_Consulting Service.txt
	2393216.2393281.txt
	2553062.2553065.txt
	33163615.txt
	54151 IT Consulting in the US Industry Report.txt
	62151 Diagnostic & Medical Laboratories in the US Industry Report.txt
	a2-jeong.txt
"servic provid" found in 25 documents
	QueryEffectiveness.txt
	00863988.txt
	05_AMI_Cortese.txt
	06_AMI__Piva.txt
	07_AMI_Kameas.txt
	10_AMI_Simplicity.txt
	13_AMI_Laso.txt
	16_58_10_733_Lead_Magazine_feature.txt
	2009_trouva.txt
	2015.10.12_Millenson_Berenson.txt
	2393216.2393281.txt
	2666795.2666817.txt
	33163615.txt
	33451B Medical Device Manufacturing in the US Industry Report.txt
	3_Embedded_Care_Coordination_Models_To_Manage_Diverse.txt
	42345 Medical Supplies Wholesaling in the US Industry Report.txt
	51121C Business Analytics & Enterprise Software Publishing in the 

	23.txt
	25.txt
	2553062.2553065.txt
	2666795.2666817.txt
	37_ExampleBased_JMSP_SpecialIsuue.txt
	4.txt
	42345 Medical Supplies Wholesaling in the US Industry Report.txt
	5.txt
	54151 IT Consulting in the US Industry Report.txt
	6.txt
	62211 Hospitals in the US Industry Report.txt
	7-1-13_SR_GivingUSA.txt
	7007809.txt
	8.txt
	9.txt
	A Model for Types and Levels of Human Interaction with Automation.txt
	A Review of 326 Children with Developmental and Physical Disabilities, Consecutively Taught at the Movement Development Clinic.txt
	A Review of Quasi-Linear Pilot Models.txt
	AACOpenNewWorld.txt
	aar.txt
	Acquity_Group_Whitepaper_-_The_Emerging_Retailer_Guide.txt
	ActorModelOfComputation.txt
"total number" found in 26 documents
	Text Clustering Algorithms.txt
	Workshop on Robust Methods in Analysis of Natural Language Data.txt
	00023.txt
	01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt
	04359339.txt
	0e.txt
	1.txt
	10.1.1.130.6691.txt
	10.1.1.

	42345 Medical Supplies Wholesaling in the US Industry Report.txt
	4824-imagenet-classification-with-deep-convolutional-neural-networks.txt
	54151 IT Consulting in the US Industry Report.txt
	5CsofAgileManagement.txt
	6.txt
	62211 Hospitals in the US Industry Report.txt
	7.txt
	A Study of Design Requirements for Mobile Learning Environments.txt
	a12.txt
	Acquity_Group_Whitepaper_-_The_Emerging_Retailer_Guide.txt
	ActorModelOfComputation.txt
"knowledg manag" found in 11 documents
	Text-Analytics-and-Natural-Language-Processing--.txt
	0044_559_ishitani_y.txt
	01_AMI_Alcaniz.txt
	04100664.txt
	04_AMI_istag.txt
	05_AMI_Cortese.txt
	12_AMI_Bettiol.txt
	13_AMI_Laso.txt
	1606.03507.txt
	21.txt
	A  cognitive affective model of organizational communication for designing IT.txt
"product market" found in 10 documents
	Text-Analytics-and-Natural-Language-Processing--.txt
	12_steps_paper.txt
	33451B Medical Device Manufacturing in the US Industry Report.txt
	42345 Medical Supplies Wholesaling in th

	2008 Neuroprosthetics.txt
	2009_trouva.txt
	21.txt
	22.txt
	23.txt
	2393216.2393281.txt
	24.txt
	420-1094-2-PB.txt
	427-1101-2-PB.txt
	A Comparisoon of Binarization Methods for Historical Archive Documents.txt
	A Study of Design Requirements for Mobile Learning Environments.txt
"univers press" found in 35 documents
	Workshop on Robust Methods in Analysis of Natural Language Data.txt
	00a.txt
	02_AMI_Riva.txt
	03_AMI_Gaggioli.txt
	0e.txt
	1.txt
	10.1.1.115.7110.txt
	10.1.1.130.6691.txt
	10.1.1.62.889.txt
	11.txt
	12.txt
	12_AMI_Bettiol.txt
	1306267704-OptimalExperienceinWorkandLeisure.txt
	142541.txt
	16.txt
	17.txt
	1708.02924.txt
	18.txt
	19.txt
	20.txt
	2003_CSE_approach_to_decision_support_systems - Smith and Geddes.txt
	2015.10.12_Millenson_Berenson.txt
	21.txt
	21851407.txt
	24.txt
	25.txt
	3.txt
	3_Toms.txt
	4.txt
	5.txt
	7.txt
	8.txt
	aar.txt
	Acting_to_know_improving_creativity_in_t.txt
	ActorModelOfComputation.txt
"new direct" found in 10 documents
	Workshop on Robust Methods

	4.txt
	6.txt
	7.txt
	8.txt
	831-828-1-PB.txt
	9.txt
	A  cognitive affective model of organizational communication for designing IT.txt
	A Bible for the Disability Field.txt
	A Comparisoon of Binarization Methods for Historical Archive Documents.txt
	A Model for Types and Levels of Human Interaction with Automation.txt
	a12.txt
	aar.txt
	ActorModelOfComputation.txt
"univers california" found in 21 documents
	Workshop on Robust Methods in Analysis of Natural Language Data.txt
	01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt
	0e.txt
	10.1.1.62.889.txt
	10.txt
	12.txt
	1203.txt
	13_AMI_Laso.txt
	16.txt
	17.txt
	18.txt
	19.txt
	24.txt
	30 Years of Minix.txt
	4.txt
	420-1094-2-PB.txt
	6.txt
	62211 Hospitals in the US Industry Report.txt
	A Model for Types and Levels of Human Interaction with Automation.txt
	aar.txt
	ActorModelOfComputation.txt
"high accuraci" found in 11 documents
	Workshop on Robust Methods in Analysis of Natural Language Data.t

"provid addit" found in 10 documents
	Workshop on Robust Methods in Analysis of Natural Language Data.txt
	04_AMI_istag.txt
	05_AMI_Cortese.txt
	10.1.1.115.7110.txt
	10.1.1.123.2158.txt
	10_AMI_Simplicity.txt
	173-pap0297-bateman.txt
	2003_CSE_approach_to_decision_support_systems - Smith and Geddes.txt
	3_Embedded_Care_Coordination_Models_To_Manage_Diverse.txt
	62211 Hospitals in the US Industry Report.txt
"provid altern" found in 11 documents
	Workshop on Robust Methods in Analysis of Natural Language Data.txt
	01217605.txt
	10.1.1.32.4689.txt
	10_AMI_Simplicity.txt
	15_AMI_Morganti.txt
	1606.03507.txt
	2008 Neuroprosthetics.txt
	42345 Medical Supplies Wholesaling in the US Industry Report.txt
	62211 Hospitals in the US Industry Report.txt
	A  cognitive affective model of organizational communication for designing IT.txt
	A Model for Types and Levels of Human Interaction with Automation.txt
"good result" found in 23 documents
	Workshop on Robust Methods in Analysis of Natural Language

	2009_trouva.txt
	2393216.2393281.txt
	2666795.2666817.txt
	280-p276-winberg.txt
	30 Years of Minix.txt
	4824-imagenet-classification-with-deep-convolutional-neural-networks.txt
	56160540.txt
	8.txt
	831-828-1-PB.txt
	A light-weight text image processing method for handheld embedded cameras.txt
	acsc99.txt
	ActorModelOfComputation.txt
"document analysi recognit" found in 11 documents
	0028_504_uchida_s.txt
	0044_559_ishitani_y.txt
	00969115.txt
	01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt
	04359339.txt
	04407722.txt
	10.1.1.72.8127.txt
	10.1.1.91.9906.txt
	156469.txt
	2009_trouva.txt
	A Comparisoon of Binarization Methods for Historical Archive Documents.txt
"markup languag" found in 11 documents
	0044_559_ishitani_y.txt
	00863988.txt
	01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt
	01237554.txt
	01_AMI_Alcaniz.txt
	05_AMI_Cortese.txt
	06_AMI__Piva.txt
	10.1.1.91.9906.txt
	12.txt
	24.txt
	331

	2006ICAD-WalkerNanceLindsay.txt
	25.txt
	3.txt
	4.txt
	5.txt
	7.txt
	aar.txt
"general model" found in 13 documents
	00a.txt
	04100664.txt
	10.1.1.123.2158.txt
	10.1.1.130.6691.txt
	1412.2306v2.txt
	142482.txt
	213.txt
	24.txt
	6.txt
	9.txt
	A  cognitive affective model of organizational communication for designing IT.txt
	A Model for Types and Levels of Human Interaction with Automation.txt
	aar.txt
"estim probabl" found in 14 documents
	00a.txt
	10.txt
	14.txt
	1406.2661v1.txt
	1506.01497v3.txt
	20.txt
	25.txt
	4.txt
	5.txt
	6.txt
	8.txt
	9.txt
	a2-jeong.txt
	AchievingKaiserPermanenteQuality2016McHugh.txt
"sourc inform" found in 13 documents
	00a.txt
	06_AMI__Piva.txt
	1.txt
	14_AMI_Cabrera.txt
	2003_CSE_approach_to_decision_support_systems - Smith and Geddes.txt
	21851407.txt
	22.txt
	33451B Medical Device Manufacturing in the US Industry Report.txt
	419.full.txt
	427-1101-2-PB.txt
	5845-15287-1-PB.txt
	7-Things-Health-Insurance-Customers-Not-Telling-You.txt
	A Model for Types and L

	2009_trouva.txt
	33163615.txt
	427-1101-2-PB.txt
	9.txt
	A Parser for Real-Time Speech Synthesis of Conversational Texts.txt
"system help" found in 12 documents
	01_AMI_Alcaniz.txt
	12.txt
	13_AMI_Laso.txt
	14_AMI_Cabrera.txt
	2003_CSE_approach_to_decision_support_systems - Smith and Geddes.txt
	2015.10.12_Millenson_Berenson.txt
	2393216.2393281.txt
	24.txt
	3_Embedded_Care_Coordination_Models_To_Manage_Diverse.txt
	4.txt
	54151 IT Consulting in the US Industry Report.txt
	62211 Hospitals in the US Industry Report.txt
"user perform" found in 11 documents
	01_AMI_Alcaniz.txt
	06_AMI__Piva.txt
	10.1.1.62.889.txt
	10.1.1.65.7635.txt
	2003_CSE_approach_to_decision_support_systems - Smith and Geddes.txt
	2009_trouva.txt
	5845-15287-1-PB.txt
	831-828-1-PB.txt
	AACOpenNewWorld.txt
	AcquiringMastery.txt
	Acting_to_know_improving_creativity_in_t.txt
"complex inform" found in 10 documents
	01_AMI_Alcaniz.txt
	0e.txt
	10.1.1.65.7635.txt
	13_AMI_Laso.txt
	15.txt
	24.txt
	33163615.txt
	A  cognitiv

	aar.txt
	Acting_to_know_improving_creativity_in_t.txt
"provid high" found in 15 documents
	05_AMI_Cortese.txt
	06_AMI__Piva.txt
	08_AMI_Kleiner.txt
	10.1.1.130.6691.txt
	10.1.1.72.8127.txt
	142541.txt
	1492645.txt
	2003_CSE_approach_to_decision_support_systems - Smith and Geddes.txt
	2016HealthcareBenchmarks_Care_Coordination_preview.txt
	3_Embedded_Care_Coordination_Models_To_Manage_Diverse.txt
	42345 Medical Supplies Wholesaling in the US Industry Report.txt
	62151 Diagnostic & Medical Laboratories in the US Industry Report.txt
	62211 Hospitals in the US Industry Report.txt
	62231 Specialty Hospitals in the US Industry Report.txt
	AchievingKaiserPermanenteQuality2016McHugh.txt
"includ use" found in 12 documents
	05_AMI_Cortese.txt
	10.1.1.72.8127.txt
	12.txt
	17.txt
	2003_CSE_approach_to_decision_support_systems - Smith and Geddes.txt
	23.txt
	25.txt
	3.txt
	62151 Diagnostic & Medical Laboratories in the US Industry Report.txt
	8.txt
	A Comparisoon of Binarization Methods for Histor

	23.txt
	24.txt
	3.txt
	37_ExampleBased_JMSP_SpecialIsuue.txt
	9.txt
	A Comparisoon of Binarization Methods for Historical Archive Documents.txt
"carnegi mellon" found in 10 documents
	1.txt
	10.1.1.130.6691.txt
	10.txt
	16.txt
	18.txt
	213.txt
	420-1094-2-PB.txt
	7.txt
	8.txt
	9.txt
"mellon univers" found in 10 documents
	1.txt
	10.1.1.130.6691.txt
	10.txt
	16.txt
	18.txt
	213.txt
	420-1094-2-PB.txt
	7.txt
	8.txt
	9.txt
"system exampl" found in 11 documents
	1.txt
	10.1.1.123.2158.txt
	142541.txt
	16.txt
	2009_trouva.txt
	24.txt
	25.txt
	2553062.2553065.txt
	4.txt
	A  cognitive affective model of organizational communication for designing IT.txt
	ActorModelOfComputation.txt
"chapter introduc" found in 19 documents
	1.txt
	11.txt
	12.txt
	14.txt
	15.txt
	16.txt
	17.txt
	19.txt
	20.txt
	2009_trouva.txt
	23.txt
	24.txt
	25.txt
	3.txt
	4.txt
	5.txt
	6.txt
	7.txt
	9.txt
"stor note" found in 20 documents
	1.txt
	10.txt
	11.txt
	12.txt
	14.txt
	15.txt
	16.txt
	19.txt
	20.txt
	21.txt
	23.txt


	3_Embedded_Care_Coordination_Models_To_Manage_Diverse.txt
	42345 Medical Supplies Wholesaling in the US Industry Report.txt
	52411B Health & Medical Insurance in the US Industry Report.txt
	62151 Diagnostic & Medical Laboratories in the US Industry Report.txt
	62211 Hospitals in the US Industry Report.txt
	62231 Specialty Hospitals in the US Industry Report.txt
	AchievingKaiserPermanenteQuality2016McHugh.txt
"primari care" found in 10 documents
	1411416708528.txt
	2015.10.12_Millenson_Berenson.txt
	2016HealthcareBenchmarks_Care_Coordination_preview.txt
	3_Embedded_Care_Coordination_Models_To_Manage_Diverse.txt
	419.full.txt
	42345 Medical Supplies Wholesaling in the US Industry Report.txt
	52411B Health & Medical Insurance in the US Industry Report.txt
	62151 Diagnostic & Medical Laboratories in the US Industry Report.txt
	62211 Hospitals in the US Industry Report.txt
	62231 Specialty Hospitals in the US Industry Report.txt
"tend use" found in 10 documents
	142482.txt
	17.txt
	19.txt


In [19]:
# Create vectors for all n-grams (with freq>1?) between two docs, multiply them,
#     then divide by the product of their Euclidean norms
# Print out similar phrases
def freqMatrices(ngrams, jsonDocPath = jsonDocIndex, jsonInvPath = jsonInvIndex, jsonTFMPath = jsonTFMatrix, limit = -1):
    """Build term-frequency vectors for pairs of documents and save them as JSON.

    For every processed pair (doc1, doc2) two groups of vectors are stored in
    the matrix under tfMatrix[doc1][doc2]:
      "all"  - frequencies of every term found in either document
      "like" - frequencies of the terms found in both documents
    Each group maps a document id to {gram length: [frequencies]}, where the
    frequency lists follow the alphabetically sorted term lists.

    Parameters:
        ngrams      - collection of gram lengths (ints) to include
        jsonDocPath - path of the JSON document index (doc id -> file name)
        jsonInvPath - path of the JSON inverted index
                      (doc id -> gram length -> term -> frequency)
        jsonTFMPath - path of the JSON term-frequency matrix; it is loaded if
                      it exists (pairs already present are skipped) and is
                      rewritten after every processed pair
        limit       - when > 0, pairing for the first document stops once
                      docNum reaches this value; the document reached at that
                      point becomes a cut-off for all later iterations.
                      With the default (-1) no cut-off is applied.

    Returns None. Prints a message and returns early if either index file
    is missing.
    """
    if not os.path.exists(jsonDocPath):
        print('FILE NOT FOUND: The specified file "' + jsonDocPath + '" does not exist')
        print('Check that the path is correct. You must call update_doc_index first and the file will be created')
        return
    if not os.path.exists(jsonInvPath):
        print('FILE NOT FOUND: The specified file "' + jsonInvPath + '" does not exist')
        print('Check that the path is correct. You must call update_inv_index first and the file will be created')
        return

    with open(jsonDocPath, 'r') as jsonDoc:
        docIndex = json.load(jsonDoc)
    with open(jsonInvPath, 'r') as jsonInv:
        invIndex = json.load(jsonInv)

    # jsonTFM is created if it does not exist
    if os.path.exists(jsonTFMPath):
        with open(jsonTFMPath, 'r') as jsonFile:
            tfMatrix = json.load(jsonFile)
    else:
        tfMatrix = {}

    lastDoc = -1        # cut-off document id; set while handling the first document
    docsProcessed = 0
    for doc1 in invIndex:
        docNum = 1 + docsProcessed
        docsProcessed += 1
        if doc1 not in tfMatrix:
            tfMatrix[doc1] = {}
        # Collect every term of doc1 whose gram length was requested
        doc1Terms = []
        for ngram, terms in invIndex[doc1].items():
            if int(ngram) in ngrams:
                doc1Terms.extend(terms)
        if int(doc1) == lastDoc:
            break
        for doc2 in invIndex:
            if int(doc2) == lastDoc and lastDoc > 0:
                break
            # Only the first document can establish the cut-off
            if docsProcessed == 1 and docNum >= limit and limit > 0:
                lastDoc = int(doc2)
                break
            # NOTE: ids are JSON keys (strings), so this comparison is lexicographic
            if doc2 > doc1 and doc2 not in tfMatrix[doc1]:
                docNum += 1
                print(docIndex[doc1]+" and "+docIndex[doc2])
                tfMatrix[doc1][doc2] = {"all": {}, "like": {}}
                allTerms = doc1Terms.copy()
                # A set mirrors allTerms so membership tests stay O(1)
                #     instead of scanning the whole list for every term
                seenTerms = set(allTerms)
                likeTerms = []
                for ngram, terms in invIndex[doc2].items():
                    if int(ngram) in ngrams:
                        for term in terms:
                            if term not in seenTerms:
                                allTerms.append(term)
                                seenTerms.add(term)
                            else:
                                likeTerms.append(term)
                allTerms.sort()
                likeTerms.sort()
                doc1all = {gramLen: [] for gramLen in ngrams}
                doc2all = {gramLen: [] for gramLen in ngrams}
                doc1like = {gramLen: [] for gramLen in ngrams}
                doc2like = {gramLen: [] for gramLen in ngrams}
                for term in allTerms:
                    length = len(term.split())
                    ngram = str(length)
                    # A document may have no terms at all for a gram length;
                    #     treat that as a frequency of 0 rather than raising KeyError
                    doc1all[length].append(invIndex[doc1].get(ngram, {}).get(term, 0))
                    doc2all[length].append(invIndex[doc2].get(ngram, {}).get(term, 0))
                for term in likeTerms:
                    length = len(term.split())
                    ngram = str(length)
                    doc1like[length].append(invIndex[doc1][ngram].get(term))
                    doc2like[length].append(invIndex[doc2][ngram].get(term))
                tfMatrix[doc1][doc2]["all"][doc1] = doc1all
                tfMatrix[doc1][doc2]["all"][doc2] = doc2all
                tfMatrix[doc1][doc2]["like"][doc1] = doc1like
                tfMatrix[doc1][doc2]["like"][doc2] = doc2like
                numLike = len(likeTerms)
                print("Doc1 has "+str(len(doc1Terms)-numLike)+" unique terms")
                print("Doc2 has "+str(len(allTerms)-len(doc1Terms))+" unique terms")
                for gramLength in ngrams:
                    print("{}-grams: {}".format(gramLength, len(doc1like[gramLength])))
                print("Total like terms: "+str(numLike))
                print("Total unlike terms: "+str(len(allTerms)-numLike), end="\n\n")
                # The matrix is saved after each pair, so finished pairs are
                #     already on disk if a later pair fails
                with open(jsonTFMPath, 'w') as jsonFile:
                    json.dump(tfMatrix, jsonFile, indent=4)
# Build the term-frequency matrices from the 2-gram and 3-gram entries,
#     passing limit=4 to freqMatrices
grams = [2, 3]
freqMatrices(grams, limit=4)
    

1
1
1
1
1
Lafferty_pcfg-notes.txt and QueryEffectiveness.txt
Doc1 has 2046 unique terms
Doc2 has 1846 unique terms
2-grams: 5
3-grams: 0
Total like terms: 5
Total unlike terms: 3892

2
Lafferty_pcfg-notes.txt and Text Clustering Algorithms.txt
Doc1 has 2007 unique terms
Doc2 has 13311 unique terms
2-grams: 44
3-grams: 0
Total like terms: 44
Total unlike terms: 15318

3
Lafferty_pcfg-notes.txt and Text-Analytics-and-Natural-Language-Processing--.txt
Doc1 has 2047 unique terms
Doc2 has 4462 unique terms
2-grams: 4
3-grams: 0
Total like terms: 4
Total unlike terms: 6509

4
8
2
2
2
2
2
LearningExecutableSemanticParsers.txt and QueryEffectiveness.txt
Doc1 has 5369 unique terms
Doc2 has 1841 unique terms
2-grams: 9
3-grams: 1
Total like terms: 10
Total unlike terms: 7210

3
LearningExecutableSemanticParsers.txt and Text Clustering Algorithms.txt
Doc1 has 5299 unique terms
Doc2 has 13275 unique terms
2-grams: 77
3-grams: 3
Total like terms: 80
Total unlike terms: 18574

4
LearningExecutableSe

In [None]:
import math
def calcCosSim(list1, list2):
    """Return the cosine similarity of two equal-length numeric sequences.

    Parameters:
        list1, list2 - sequences of numbers with the same number of elements

    Returns:
        The dot product divided by the product of the Euclidean norms.
        -1 (after printing a message) if the lengths differ.
        0.0 if either sequence is empty or all zeros, since the similarity
        is undefined there and dividing would raise ZeroDivisionError.
    """
    if len(list1) != len(list2):
        print("Lists do not have the same number of elements")
        return -1
    sumProd = sum(n1*n2 for n1, n2 in zip(list1, list2))
    sumL1 = sum(n**2 for n in list1)
    sumL2 = sum(n**2 for n in list2)
    normProd = math.sqrt(sumL1)*math.sqrt(sumL2)
    # A zero norm means there is nothing to compare (e.g. no shared terms)
    if normProd == 0:
        return 0.0
    return sumProd/normProd

def cossim(gramWeights, jsonDocPath = jsonDocIndex, jsonTFMPath = jsonTFMatrix, workingDir = commonDir):
    """Print plain and weighted cosine similarity for every stored document pair.

    Only the "like" vectors of the term-frequency matrix (terms shared by both
    documents) are used. For the weighted score every frequency is multiplied
    by the weight assigned to its gram length before the similarity is taken.

    Parameters:
        gramWeights - dict mapping gram length to its weight; each gram length
                      must be present in the stored "like" vectors
        jsonDocPath - path of the JSON document index (doc id -> file name)
        jsonTFMPath - path of the JSON term-frequency matrix
        workingDir  - not used by this function; kept for compatibility

    Returns None. Prints a message and returns early if either file is missing.
    """
    if not os.path.exists(jsonDocPath):
        print('FILE NOT FOUND: The specified file "' + jsonDocPath + '" does not exist')
        print('Check that the path is correct. You must call update_doc_index first and the file will be created')
        # Stop here; opening the missing file below would raise FileNotFoundError
        return
    if not os.path.exists(jsonTFMPath):
        print('FILE NOT FOUND: The specified file "' + jsonTFMPath + '" does not exist')
        print('Check that the path is correct. You must call freqMatrices first and the file will be created')
        return

    with open(jsonDocPath, 'r') as jsonDoc:
        docIndex = json.load(jsonDoc)
    with open(jsonTFMPath, 'r') as jsonTFM:
        tfMatrix = json.load(jsonTFM)

    for doc1 in tfMatrix:
        for doc2, values in tfMatrix[doc1].items():
            print(docIndex[doc1]+"({}) and ".format(doc1)+docIndex[doc2]+"({})".format(doc2))
            doc1freq = []
            doc2freq = []
            doc1weighted = []
            doc2weighted = []
            for gramLen, weight in gramWeights.items():
                # Keys loaded from JSON are strings, so convert before the lookup
                gramLen = str(gramLen)
                doc1freq += values["like"][doc1][gramLen]
                doc2freq += values["like"][doc2][gramLen]
                doc1weighted += [ val*weight for val in values["like"][doc1][gramLen] ]
                doc2weighted += [ val*weight for val in values["like"][doc2][gramLen] ]
            print("Cosine Similarity: ", calcCosSim(doc1freq, doc2freq))
            print("Weighted: ", calcCosSim(doc1weighted, doc2weighted), end="\n\n")
            
# Keys are gram lengths and values are the weights cossim multiplies
#     the matching frequencies by
gramWeight = {2:1, 3:5}
cossim(gramWeight)