In [None]:
import logging
import os # For file/directory interaction
import time, sys
from datetime import datetime, date # For log data
import re # For text replacement
import spacy # Pipeline processes (stopword and punctuation removal, lemmatization)
from nltk.stem.snowball import SnowballStemmer # Pipeline process for stemming
import json

# All pipeline data lives under '<current working directory>/Files'
workingDir = os.getcwd()
commonDir = os.path.join(workingDir,'Files')
# One sub-directory per pipeline stage (used as default arguments by the functions below)
pdfDir = os.path.join(commonDir,'PDF')                  # input PDFs read by pdfToText
txtFilesDir = os.path.join(commonDir,'Text Files')      # output of pdfToText
rtnFilesDir = os.path.join(commonDir,'n Removed')       # output of rmvN (newlines removed)
spaceFilesDir = os.path.join(commonDir,'No Spaces')     # files moved aside by checkSpaces
swFilesDir = os.path.join(commonDir,'Stop Words')       # output of rmvStopWords
engFilesDir = os.path.join(commonDir,'English Words')   # output of rmvNonEng
stemFilesDir = os.path.join(commonDir,'Stemmed')        # output of stem
# JSON artefacts built from the stemmed documents
jsonDocIndex = os.path.join(commonDir,'doc_dictionary.json')   # written by update_doc_index
jsonInvIndex = os.path.join(commonDir,'inverted_index.json')   # written by update_inv_index
jsonGramIndex = os.path.join(commonDir,'gram_index.json')      # written by update_gram_index
jsonTFMatrix = os.path.join(commonDir,'tf_matrix.json')        # written by freqMatrices
# NOTE(review): this assigns an attribute named 'propagate' on the logging *module*,
#     not on a Logger object; presumably meant to quiet library logging - confirm intent
logging.propagate = False 
# Only messages of level ERROR and above pass the root logger
logging.getLogger().setLevel(logging.ERROR)

In [None]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfparser import PDFParser
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO

# Uses PDFminer3k to extract text from PDF documents
def getText(pdfPath):
    """Return the text that pdfminer extracts from the PDF at pdfPath.

    Args:
        pdfPath: Path of the PDF file to read.

    Returns:
        Everything the TextConverter wrote for the document, as one string.

    Raises:
        Any exception raised by open() or by pdfminer while processing the
        file is propagated to the caller. The converter and the text buffer
        are closed in every case.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The StringIO buffer receives the converter's output
    with StringIO() as sio:
        device = TextConverter(rsrcmgr, sio, laparams=laparams)
        try:
            # Reads file to the text converter
            with open(pdfPath, 'rb') as pdfFile:
                process_pdf(rsrcmgr, device, pdfFile)
            # Read the buffer before it is closed by the enclosing 'with'
            return sio.getvalue()
        finally:
            # Runs on success and on failure, so the device is never leaked
            device.close()

# Pre-condition: All PDF files to be processed are in the sub-directory
#     pdfDir, and pdfDir is in workingDir. workingDir is by default the
#     'Files' directory under the directory in which the program is executed
# Post-condition: All PDF files processed without error are converted to
#     text files which are placed in a sub-directory 'Text Files'
# NOTE: This will process all documents by default. Change the value of
#     'limit' to limit the number of documents processed at once
def pdfToText(pdfPath = pdfDir, txtPath = txtFilesDir, workingDir = commonDir, limit = -1):
    """Convert every '.pdf' file in pdfPath to a UTF-8 '.txt' file in txtPath.

    PDFs whose text file already exists in txtPath are skipped, so the
    function can be re-run to continue an earlier, limited run.

    Args:
        pdfPath: Directory containing the PDF files.
        txtPath: Directory that receives the text files (created if missing).
        workingDir: Directory holding 'log.txt'; conversion errors are
            appended to that file.
        limit: Maximum number of documents converted by this call. Values
            <= 0 mean "no limit".

    Returns:
        None. Progress is printed; failures are logged with a reference number.
    """
    if not os.path.exists(pdfPath):
        print('The specified directory "' + pdfPath + '" does not exist')
        return
    # Creates 'Text Files' directory for converted PDFs
    os.makedirs(txtPath, exist_ok=True)

    pdfNames = [entry.name for entry in os.scandir(pdfPath) if entry.name.endswith('.pdf')]
    totalNum = len(pdfNames)
    # Denominator of the progress display: the limit when one is set,
    #     otherwise the number of PDFs found
    target = limit if limit > 0 else totalNum
    docNum = 0
    docErr = 1
    with open(os.path.join(workingDir,'log.txt'), 'a+', encoding="utf-8") as log:
        log.write("PDF to Text\n" + date.today().strftime("%m/%d/%y") +
                  " at " + datetime.now().strftime("%H:%M:%S") + "\n\n")
        for pdfName in pdfNames:
            # Remove '.pdf' from file name when creating '.txt' file
            fileName = pdfName[:-len('.pdf')] + '.txt'
            outPath = os.path.join(txtPath, fileName)
            if os.path.exists(outPath):
                continue

            print("Now on '"+pdfName+"'. . . ", end='')
            # Some documents are protected, corrupted, etc. and text cannot
            #     be extracted. Exceptions are recorded in log.txt and the
            #     run continues with the next document.
            try:
                text = getText(os.path.join(pdfPath, pdfName))
                with open(outPath, 'w+', encoding="utf-8") as txtFile:
                    txtFile.write(text)
            except Exception as e:
                log.write(str(docErr)+": " + pdfName + ": \n\t" + str(e)+"\n")
                print("there was an error reading this document. See log for details. Reference number "+str(docErr)+".\n")
                docErr += 1
            else:
                docNum += 1
                print("done ({}/{})".format(docNum, target))

            if docNum >= limit and limit > 0:
                print("PDF to Text was stopped after "+str(docNum)+" documents.")
                break
        log.write("\n\n")
pdfToText(limit = 5)

In [None]:
# THIS MUST BE AFTER TEXT CONVERSION BEFORE ANY OTHER FUNCTIONS
# Function to remove \n
def rmvN(txtPath = txtFilesDir, rtnPath = rtnFilesDir):
    """Copy each text file to rtnPath with its line breaks removed.

    A hyphen directly followed by a newline is deleted (re-joining words
    split across lines); every remaining newline becomes a single space.
    Files that already exist in rtnPath are skipped.

    Args:
        txtPath: Directory of text files produced by pdfToText.
        rtnPath: Directory that receives the cleaned files (created if missing).

    Returns:
        None.
    """
    # Checks that text file directory exists/is correct
    if not os.path.exists(txtPath):
        print('The specified directory "' + txtPath + '" does not exist')
        print('Check that the path is correct. You must call pdfToText first and the directory will be created')
        return
    os.makedirs(rtnPath, exist_ok=True)

    for entity in os.scandir(txtPath):
        rtnFilePath = os.path.join(rtnPath, entity.name)
        if os.path.exists(rtnFilePath):
            continue
        print("Now on '"+entity.name+"'. . . ", end='')
        # Read and transform first: the output file is only created once the
        #     input was read successfully, so a failure cannot leave behind
        #     an empty file that later runs would skip
        with open(os.path.join(txtPath, entity.name), 'r', encoding='utf-8') as txtFile:
            text = txtFile.read()
        text = text.replace('-\n', '').replace('\n', ' ')
        with open(rtnFilePath, 'w', encoding='utf-8') as rtnFile:
            rtnFile.write(text)
        print("done")
rmvN()

In [None]:
# Function to move files without (enough) spaces to the 'No Spaces' directory
def checkSpaces(txtPath = rtnFilesDir, spacesPath = spaceFilesDir, workingDir = commonDir):
    """Move documents that look unusable out of txtPath.

    A document is moved when splitting it on single spaces yields fewer than
    one piece per ten characters, or when it has fewer than 100 characters
    (which includes empty files). Its name is appended to 'Spaces.txt'. If a
    file of the same name already exists in spacesPath, the copy in txtPath
    is deleted instead of moved.

    Args:
        txtPath: Directory of files produced by rmvN.
        spacesPath: Directory that receives rejected files (created if missing).
        workingDir: Directory holding the 'Spaces.txt' report.

    Returns:
        None.
    """
    # Checks that text file directory exists
    if not os.path.exists(txtPath):
        print('The specified directory "' + txtPath + '" does not exist')
        print('Check that the path is correct. You must call rmvN first and the directory will be created')
        return
    os.makedirs(spacesPath, exist_ok=True)

    with open(os.path.join(workingDir,'Spaces.txt'), 'a+', encoding='utf-8') as spaces:
        spaces.write("Check Spaces\n" + date.today().strftime("%m/%d/%y") +
                  " at " + datetime.now().strftime("%H:%M:%S") + "\n\n")
        # os.listdir returns a complete list up front, so moving or deleting
        #     files below cannot disturb the iteration
        for name in os.listdir(txtPath):
            print("Now on '"+name+"'. . . ", end='')
            srcPath = os.path.join(txtPath, name)
            # The file is closed before it is renamed or removed
            with open(srcPath, 'r', encoding='utf-8') as txtFile:
                text = txtFile.read()
            pieces = len(text.split(' '))
            if pieces < len(text)/10 or len(text) < 100:
                spaces.write(name+'\n')
                dstPath = os.path.join(spacesPath, name)
                if not os.path.exists(dstPath):
                    os.rename(srcPath, dstPath)
                else:
                    os.remove(srcPath)
            print("done")
        spaces.write('\n\n')
checkSpaces()

In [None]:
# Function to remove stopwords
def rmvStopWords(txtPath = rtnFilesDir, swPath = swFilesDir):
    """Write a lemmatized, stop-word-free copy of each document to swPath.

    Each file is run through the spaCy 'en_core_web_sm' pipeline. Tokens that
    are stop words, punctuation, or numeric are dropped; the lower-cased
    lemmas of the remaining tokens are written separated by single spaces.
    Files that already exist in swPath are skipped.

    Args:
        txtPath: Directory of files produced by rmvN.
        swPath: Directory that receives the processed files (created if missing).

    Returns:
        None.
    """
    if not os.path.exists(txtPath):
        print('The specified directory "' + txtPath + '" does not exist')
        print('Check that the path is correct. You must call rmvN first and the directory will be created')
        return
    os.makedirs(swPath, exist_ok=True)

    nlp = spacy.load("en_core_web_sm")
    for entity in os.scandir(txtPath):
        swFilePath = os.path.join(swPath, entity.name)
        if os.path.exists(swFilePath):
            continue
        print("Now on '"+entity.name+"'. . . ", end='')
        with open(os.path.join(txtPath, entity.name), 'r', encoding='utf-8') as txtFile:
            doc = nlp(txtFile.read())
        noStopWords = [token.lemma_.lower() for token in doc
                       if not token.is_stop and not token.is_punct and not token.text.isnumeric()]
        # The output file is created only after processing succeeded, so a
        #     failure in the pipeline cannot leave an empty file that later
        #     runs would skip
        with open(swFilePath, 'w', encoding='utf-8') as swFile:
            swFile.write(" ".join(noStopWords))
        print("done")

rmvStopWords()

In [None]:
# Remove non-english words
def rmvNonEng(txtPath = swFilesDir, engPath = engFilesDir, workingDir = commonDir):
    """Write a copy of each document containing only dictionary words.

    A word is kept when it is a key of 'words_dictionary.json' and is not a
    one- or two-letter combination of the lowercase letters a-z. Files that
    already exist in engPath are skipped.

    Args:
        txtPath: Directory of files produced by rmvStopWords.
        engPath: Directory that receives the filtered files (created if missing).
        workingDir: Directory containing 'words_dictionary.json'.

    Returns:
        None.
    """
    if not os.path.exists(txtPath):
        print('The specified directory "' + txtPath + '" does not exist')
        print('Check that the path is correct. You must call rmvStopWords first and the directory will be created')
        return
    dictPath = os.path.join(workingDir,'words_dictionary.json')
    if not os.path.exists(dictPath):
        print('FILE NOT FOUND: The specified file "' + dictPath + '" does not exist')
        return
    os.makedirs(engPath, exist_ok=True)
    with open(dictPath) as json_file:
        words = json.load(json_file)

    # Every one- and two-letter string over a-z; a set gives constant-time lookups
    alph = 'abcdefghijklmnopqrstuvwxyz'
    lets = set(alph) | {first + second for first in alph for second in alph}

    for entity in os.scandir(txtPath):
        engFilePath = os.path.join(engPath, entity.name)
        if os.path.exists(engFilePath):
            continue
        print("Now on '"+entity.name+"'. . . ", end='')
        with open(os.path.join(txtPath, entity.name), 'r', encoding='utf-8') as txtFile:
            text = txtFile.read().split(' ')
        engChars = [word for word in text if word in words and word not in lets]
        # Output is opened only after the input was read and filtered, so a
        #     failure cannot leave an empty file that later runs would skip
        with open(engFilePath, 'w', encoding='utf-8') as engFile:
            engFile.write(" ".join(engChars))
        print("done")
rmvNonEng()

In [None]:
# Stem words in all documents
def stem(txtPath = engFilesDir, stemPath = stemFilesDir, workingDir = commonDir):
    """Write a copy of each document with every word stemmed.

    Words are split on single spaces, stemmed with the English Snowball
    stemmer and re-joined with single spaces. Files that already exist in
    stemPath are skipped.

    Args:
        txtPath: Directory of files produced by rmvNonEng.
        stemPath: Directory that receives the stemmed files (created if missing).
        workingDir: Not used by this function; kept so existing calls still work.

    Returns:
        None.
    """
    if not os.path.exists(txtPath):
        print('The specified directory "' + txtPath + '" does not exist')
        print('Check that the path is correct. You must call rmvNonEng first and the directory will be created')
        return
    os.makedirs(stemPath, exist_ok=True)

    stemmer = SnowballStemmer(language='english')
    for entity in os.scandir(txtPath):
        stemFilePath = os.path.join(stemPath, entity.name)
        if os.path.exists(stemFilePath):
            continue
        print("Now on '"+entity.name+"'. . . ", end='')
        with open(os.path.join(txtPath, entity.name), 'r', encoding='utf-8') as txtFile:
            text = txtFile.read().split(' ')
        stemmed = [stemmer.stem(word) for word in text]
        # Output is opened only after stemming succeeded, so a failure
        #     cannot leave an empty file that later runs would skip
        with open(stemFilePath, 'w', encoding='utf-8') as stemFile:
            stemFile.write(" ".join(stemmed))
        print("done")
stem()

In [None]:
# Create JSON with doc_id : file_name
def update_doc_index(jsonPath = jsonDocIndex, txtPath = stemFilesDir):
    """Add every file in txtPath that is not yet indexed to the document index.

    The index maps a document ID (a string such as "1") to a file name. IDs
    already present in the JSON file are kept; new files receive the next
    free number, assigned in sorted file-name order so repeated runs on the
    same directory give the same IDs.

    Args:
        jsonPath: Path of the document index JSON file (created if missing).
        txtPath: Directory whose file names are indexed.

    Returns:
        None.
    """
    if not os.path.exists(txtPath):
        print('The specified directory "' + txtPath + '" does not exist')
        print('Check that the path is correct. You must call stem first and the directory will be created')
        return
    if os.path.exists(jsonPath):
        with open(jsonPath, 'r') as jsonFile:
            docIndex = json.load(jsonFile)
    else:
        docIndex = {}

    indexed = set(docIndex.values())
    for fileName in sorted(os.listdir(txtPath)):
        if fileName not in indexed:
            # JSON object keys are strings, so string keys are used in memory
            #     as well; this keeps loaded and newly added IDs the same type
            docIndex[str(len(docIndex)+1)] = fileName
            indexed.add(fileName)
    with open(jsonPath, 'w') as jsonFile:
        json.dump(docIndex, jsonFile, indent=4)

update_doc_index()

In [None]:
# Create JSON with {doc_id1 : {"2" : {"gram1":freq, "gram2":freq}, "3" : {...}}, doc_id2 : {...}}
def update_inv_index(ngrams, jsonDocPath = jsonDocIndex, jsonInvPath = jsonInvIndex, txtPath = stemFilesDir, workingDir = commonDir):
    """Count the n-grams of every indexed document and store them as JSON.

    For each document in the document index and each requested n-gram
    length, every window of n consecutive words (words are split on single
    spaces) is counted. Results are stored under
    invIndex[doc_id][str(n)][term] = frequency. Counts already present in
    the JSON file for a (document, n) pair are kept and not recomputed.

    Args:
        ngrams: Iterable of n-gram lengths (ints), e.g. [2, 3].
        jsonDocPath: Path of the document index written by update_doc_index.
        jsonInvPath: Path of the inverted index JSON file (created if missing).
        txtPath: Directory containing the stemmed documents.
        workingDir: Not used by this function; kept so existing calls still work.

    Returns:
        None.
    """
    if not os.path.exists(txtPath):
        print('The specified directory "' + txtPath + '" does not exist')
        print('Check that the path is correct. You must call stem first and the directory will be created')
        return
    if not os.path.exists(jsonDocPath):
        print('FILE NOT FOUND: The specified file "' + jsonDocPath + '" does not exist')
        print('Check that the path is correct. You must call update_doc_index first and the file will be created')
        return
    if os.path.exists(jsonInvPath):
        with open(jsonInvPath, 'r') as jsonFile:
            invIndex = json.load(jsonFile)
    else:
        invIndex = {}
    # Loads document index
    with open(jsonDocPath, 'r') as jsonFile:
        docIndex = json.load(jsonFile)

    for docID, fileName in docIndex.items():
        docGrams = invIndex.setdefault(docID, {})
        # Keys loaded from JSON are strings, so lengths are compared and
        #     stored as strings; an int key would never match a loaded entry
        #     and would be written out as a duplicate key
        missing = [n for n in ngrams if str(n) not in docGrams]
        if not missing:
            continue
        with open(os.path.join(txtPath, fileName), 'r', encoding='utf-8') as txtFile:
            words = txtFile.read().split(' ')
        for n in missing:
            counts = {}
            # Slide a window of n words over the document by index
            for start in range(len(words) - n + 1):
                term = " ".join(words[start:start + n])
                counts[term] = counts.get(term, 0) + 1
            docGrams[str(n)] = counts
    with open(jsonInvPath, 'w') as jsonFile:
        json.dump(invIndex, jsonFile, indent=4)

update_inv_index(list(range(2,4)))

In [None]:
# Create JSON with {"gram1" : {doc_id1:freq, doc_id2:freq}, "gram2" : {doc_id1:freq}}
def update_gram_index(minFreq = 1, jsonInvPath = jsonInvIndex, jsonGramPath = jsonGramIndex, txtPath = stemFilesDir):
    """Invert the per-document n-gram counts into a per-gram index.

    Reads the inverted index (doc_id -> n-gram length -> term -> frequency)
    and records, for every term whose frequency in a document is at least
    minFreq, gramIndex[term][doc_id] = frequency. Entries already stored in
    the gram index file are loaded first and updated in place.

    Args:
        minFreq: Smallest per-document frequency for a term to be recorded.
        jsonInvPath: Path of the inverted index written by update_inv_index.
        jsonGramPath: Path of the gram index JSON file (created if missing).
        txtPath: Not used by this function.

    Returns:
        None.
    """
    # The inverted index is the only required input
    if not os.path.exists(jsonInvPath):
        print('FILE NOT FOUND: The specified file "' + jsonInvPath + '" does not exist')
        print('Check that the path is correct. You must call update_inv_index first and the file will be created')
        return

    # Start from the stored gram index when there is one
    gramIndex = {}
    if os.path.exists(jsonGramPath):
        with open(jsonGramPath, 'r') as existing:
            gramIndex = json.load(existing)
    with open(jsonInvPath, 'r') as source:
        invIndex = json.load(source)

    for docID in invIndex:
        for terms in invIndex[docID].values():
            for term in terms:
                freq = terms[term]
                if freq < minFreq:
                    continue
                gramIndex.setdefault(term, {})[docID] = freq

    # Writes gramIndex to gram JSON file
    with open(jsonGramPath, 'w') as target:
        json.dump(gramIndex, target, indent=4)

update_gram_index()

In [None]:
# Print grams found in at least numDocs documents (3 by default)
def getGrams(numDocs = 3, jsonDocPath = jsonDocIndex, jsonGramPath = jsonGramIndex, txtPath = stemFilesDir, workingDir = commonDir):
    """Print every gram that occurs in numDocs or more documents.

    For each qualifying gram a header line with the document count is
    printed, followed by one tab-indented line per document showing the
    file name looked up in the document index.

    Args:
        numDocs: Minimum number of documents a gram must appear in.
        jsonDocPath: Path of the document index written by update_doc_index.
        jsonGramPath: Path of the gram index written by update_gram_index.
        txtPath: Directory of stemmed documents; only its existence is checked.
        workingDir: Not used by this function.

    Returns:
        None.
    """
    # All three inputs must exist before anything is loaded
    if not os.path.exists(txtPath):
        print('The specified directory "' + txtPath + '" does not exist')
        print('Check that the path is correct. You must call stem first and the directory will be created')
        return
    if not os.path.exists(jsonDocPath):
        print('FILE NOT FOUND: The specified file "' + jsonDocPath + '" does not exist')
        print('Check that the path is correct. You must call update_doc_index first and the file will be created')
        return
    if not os.path.exists(jsonGramPath):
        print('FILE NOT FOUND: The specified file "' + jsonGramPath + '" does not exist')
        print('Check that the path is correct. You must call update_gram_index first and the file will be created')
        return

    with open(jsonDocPath, 'r') as docFile:
        docIndex = json.load(docFile)
    with open(jsonGramPath, 'r') as gramFile:
        gramIndex = json.load(gramFile)

    for gram in gramIndex:
        docs = gramIndex[gram]
        hits = len(docs)
        if hits < numDocs:
            continue
        print("\""+gram+"\" found in "+str(hits)+" documents")
        for docID in docs:
            print("\t"+ str(docIndex[docID]))

getGrams(10)

In [None]:
# Build term-frequency vectors for pairs of documents and store them as JSON.
#     (The cosine similarity itself - dot product divided by the product of the
#     Euclidean norms - is computed from these vectors by cossim.)
# Prints a summary of shared/unshared terms for every pair processed
def freqMatrices(ngrams, jsonDocPath = jsonDocIndex, jsonInvPath = jsonInvIndex, jsonTFMPath = jsonTFMatrix, limit = -1):
    """Build and persist term-frequency vectors for pairs of documents.

    For every pair (doc1, doc2) with int(doc2) > int(doc1) that is not yet in
    the stored matrix, two sets of vectors are produced per n-gram length:
      "all"  - frequencies over the sorted union of both documents' terms
               (0 where a document lacks the term)
      "like" - frequencies over the sorted terms present in both documents
    They are stored as tfMatrix[doc1][doc2][kind][doc][gramLen] = [freq, ...]
    and the whole matrix is rewritten to jsonTFMPath after each pair.

    Args:
        ngrams: Collection of n-gram lengths (ints) to include, e.g. [2, 3].
        jsonDocPath: Path of the document index (doc_id -> file name).
        jsonInvPath: Path of the inverted index
            (doc_id -> n-gram length -> term -> frequency).
        jsonTFMPath: Path of the matrix JSON file (loaded if it exists).
        limit: Values <= 0 disable the limit. NOTE(review): a positive value
            appears intended to restrict the comparison to roughly the first
            `limit` documents - see the notes in the loop; confirm.

    Returns:
        None.
    """
    if not os.path.exists(jsonDocPath):
        print('FILE NOT FOUND: The specified file "' + jsonDocPath + '" does not exist')
        print('Check that the path is correct. You must call update_doc_index first and the file will be created')
        return
    if not os.path.exists(jsonInvPath):
        print('FILE NOT FOUND: The specified file "' + jsonInvPath + '" does not exist')
        print('Check that the path is correct. You must call update_inv_index first and the file will be created')
        return
    
    with open(jsonDocPath, 'r') as jsonDoc:
        docIndex = json.load(jsonDoc)
    with open(jsonInvPath, 'r') as jsonInv:
        invIndex = json.load(jsonInv)
        
    # jsonTFM is created if it does not exist
    if os.path.exists(jsonTFMPath):
        with open(jsonTFMPath, 'r') as jsonFile:
            tfMatrix = json.load(jsonFile)
    else:
        tfMatrix = {}     
    
    # lastDoc: ID of the document at which both loops stop; -1 until the limit is hit
    lastDoc = -1
    # docsProcessed: number of doc1 iterations started so far
    docsProcessed = 0
    for doc1 in invIndex:
        # docNum starts at this doc1's 1-based position and grows by one for
        #     every pair newly computed for it
        docNum = 1 + docsProcessed
        docsProcessed += 1
        # Collect all of doc1's terms for the requested n-gram lengths
        doc1Terms = []
        if doc1 not in tfMatrix:
            tfMatrix[doc1] = {}
        for ngram, terms in invIndex[doc1].items():
            # Keys loaded from JSON are strings, hence the int() conversion
            if int(ngram) in ngrams:
                for term in terms:
                    doc1Terms.append(term)
        if int(doc1) == lastDoc:
            break
        for doc2 in invIndex:
            if int(doc2) == lastDoc and lastDoc > 0:
                break
            # The limit is only evaluated while the first doc1 is processed:
            #     once docNum reaches it, the current doc2 becomes the stop marker
            # NOTE(review): pairs already stored in tfMatrix do not advance
            #     docNum, so they do not count towards the limit - confirm intended
            if  docsProcessed == 1 and docNum >= limit and limit > 0:
                lastDoc = int(doc2)
                break
            # Each unordered pair is handled once (doc2 ID greater than doc1 ID)
            #     and only if it has not been stored before
            if int(doc2) > int(doc1) and doc2 not in tfMatrix[doc1]:
                docNum += 1
                print(docIndex[doc1]+" and "+docIndex[doc2])
                if doc2 not in tfMatrix[doc1]:
                    tfMatrix[doc1][doc2] = {}
                    tfMatrix[doc1][doc2]["all"] = {}
                    tfMatrix[doc1][doc2]["like"] = {}
                # allTerms: union of both documents' terms;
                #     likeTerms: doc2 terms already present in allTerms
                allTerms = doc1Terms.copy()
                likeTerms = []
                for ngram, terms in invIndex[doc2].items():
                    if int(ngram) in ngrams:
                        for term in terms:
                            if term not in allTerms:
                                allTerms.append(term)
                            else:
                                likeTerms.append(term)
                # Sorting fixes the term order, so the two documents' vectors line up
                allTerms.sort()
                likeTerms.sort()
                # One frequency list per n-gram length, for each document and kind
                doc1all = {}
                doc2all = {}
                doc1like = {}
                doc2like = {}
                for gramLen in ngrams:
                    doc1all[gramLen] = []
                    doc2all[gramLen] = []
                    doc1like[gramLen] = []
                    doc2like[gramLen] = []
                for term in allTerms:
                    # The n-gram length is recovered from the term's word count:
                    #     int for the local dicts, str for the JSON-loaded index
                    # NOTE(review): assumes the word count always equals one of
                    #     the requested lengths; otherwise this raises KeyError
                    length = len(term.split())
                    ngram = str(len(term.split()))
                    doc1all[length].append(invIndex[doc1][ngram].get(term, 0))
                    doc2all[length].append(invIndex[doc2][ngram].get(term, 0))
                for term in likeTerms:
                    length = len(term.split())
                    ngram = str(len(term.split()))
                    doc1like[length].append(invIndex[doc1][ngram].get(term))
                    doc2like[length].append(invIndex[doc2][ngram].get(term))
                tfMatrix[doc1][doc2]["all"][doc1] = doc1all
                tfMatrix[doc1][doc2]["all"][doc2] = doc2all
                tfMatrix[doc1][doc2]["like"][doc1] = doc1like
                tfMatrix[doc1][doc2]["like"][doc2] = doc2like
                numLike = len(likeTerms)
                print("Doc1 has "+str(len(doc1Terms)-numLike)+" unique terms")
                print("Doc2 has "+str(len(allTerms)-len(doc1Terms))+" unique terms")
                for gramLength in ngrams:
                    print("{}-grams: {}".format(gramLength, len(doc1like[gramLength])))
                print("Total like terms: "+str(numLike))
                print("Total unlike terms: "+str(len(allTerms)-numLike), end="\n\n")
                # The complete matrix is rewritten after every pair
                with open(jsonTFMPath, 'w') as jsonFile:
                    json.dump(tfMatrix, jsonFile, indent=4)
grams = [2, 3]
freqMatrices(grams, limit=5)
    

In [None]:
import math
def calcCosSim(list1, list2):
    """Return the cosine similarity of two equal-length numeric sequences.

    Args:
        list1: First vector (sequence of numbers).
        list2: Second vector (sequence of numbers).

    Returns:
        The dot product divided by the product of the Euclidean norms.
        0 when that product of norms is zero (an empty or all-zero vector).
        -1 (after printing a message) when the lengths differ.
    """
    if len(list1) != len(list2):
        print("Lists do not have the same number of elements")
        return -1
    sumProd = sum(n1*n2 for n1, n2 in zip(list1, list2))
    norm1 = math.sqrt(sum(n**2 for n in list1))
    norm2 = math.sqrt(sum(n**2 for n in list2))
    denominator = norm1*norm2
    # Checked explicitly instead of catching every exception, so unrelated
    #     errors (e.g. non-numeric elements) are no longer hidden
    if denominator == 0:
        return 0
    return sumProd/denominator

def cossim(gramWeights, jsonDocPath = jsonDocIndex, jsonTFMPath = jsonTFMatrix, workingDir = commonDir):
    """Print plain and weighted cosine similarity for every stored document pair.

    For each pair in the term-frequency matrix, the "like" frequency lists of
    the n-gram lengths named in gramWeights are concatenated into one vector
    per document. The plain similarity uses the raw frequencies; the weighted
    one multiplies every frequency by the weight of its n-gram length first.

    Args:
        gramWeights: Mapping of n-gram length to weight, e.g. {2: 1, 3: 5}.
            Each length must be present in the stored matrix.
        jsonDocPath: Path of the document index written by update_doc_index.
        jsonTFMPath: Path of the matrix file written by freqMatrices.
        workingDir: Not used by this function; kept so existing calls still work.

    Returns:
        None. Results are printed.
    """
    if not os.path.exists(jsonDocPath):
        print('FILE NOT FOUND: The specified file "' + jsonDocPath + '" does not exist')
        print('Check that the path is correct. You must call update_doc_index first and the file will be created')
        # Stop here; continuing would fail when the missing file is opened
        return
    if not os.path.exists(jsonTFMPath):
        print('FILE NOT FOUND: The specified file "' + jsonTFMPath + '" does not exist')
        print('Check that the path is correct. You must call freqMatrices first and the file will be created')
        return

    with open(jsonDocPath, 'r') as jsonDoc:
        docIndex = json.load(jsonDoc)
    with open(jsonTFMPath, 'r') as jsonTFM:
        tfMatrix = json.load(jsonTFM)

    for doc1 in tfMatrix:
        for doc2, values in tfMatrix[doc1].items():
            print(docIndex[doc1]+"({}) and ".format(doc1)+docIndex[doc2]+"({})".format(doc2))
            doc1freq = []
            doc2freq = []
            doc1weighted = []
            doc2weighted = []
            for gramLen, weight in gramWeights.items():
                # Keys loaded from JSON are strings
                gramLen = str(gramLen)
                doc1like = values["like"][doc1][gramLen]
                doc2like = values["like"][doc2][gramLen]
                doc1freq += doc1like
                doc2freq += doc2like
                doc1weighted += [ val*weight for val in doc1like ]
                doc2weighted += [ val*weight for val in doc2like ]
            print("Cosine Similarity: ", calcCosSim(doc1freq, doc2freq))
            print("Weighted: ", calcCosSim(doc1weighted, doc2weighted), end="\n\n")

gramWeight = {2:1, 3:5}
cossim(gramWeight)