In [None]:
# Future improvements
# Separate dir for each step
# Create JSON array to assign IDs and
#     keep track of the PDF files processed (at each step?), etc.
# Remove numbers, urls
# Change the variable names that reuse jsonDoc, jsonInv, jsonGram to avoid redefining the function arguments

In [4]:
import logging
import os # For file/directory interaction
import time, sys
from datetime import datetime, date # For log data
import re # For text replacement
import spacy # Pipeline processes (stopword and punctuation removal, lemmatization)
from nltk.stem.snowball import SnowballStemmer # Pipeline process for stemming
import json

# Names of the working sub-directories; the functions below use them as
#     default values for their directory parameters
txtFilesDir = 'Text Files'
rtnFilesDir = 'n Removed'
spaceFilesDir = 'No Spaces'
swFilesDir = 'Stop Words'
engFilesDir = 'English Words'
stemFilesDir = 'Stemmed'
# Names of the JSON files; the index functions below use them as default values
jsonDocIndex = 'doc_dictionary.json'
jsonInvIndex = 'inverted_index.json'
jsonGramIndex = 'gram_index.json'
jsonTFMatrix = 'tf_matrix.json'
# NOTE(review): the next line assigns an attribute named 'propagate' on the
#     logging module itself, not on a Logger object. Presumably
#     logging.getLogger(<name>).propagate was intended -- confirm.
logging.propagate = False 
# Root logger only lets ERROR and above through
logging.getLogger().setLevel(logging.ERROR)

# Base directory passed as absPath to every function call in this notebook
# NOTE(review): machine-specific absolute path; must be edited before running elsewhere
absolute = 'C:/Users/micah/Documents/IWU/CIS Practicum/Files'

In [8]:
# Pre-condition: All PDF files to be processed are in the sub-directory
#     pdfDir, and pdfDir is in absPath. absPath is by default the 
#     directory in which the program is executed
# Post-condition: All PDF files processed without error are converted to
#     text files which are placed in a new sub-directory 'Text Files'
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf#process_pdf
from pdfminer.pdfparser import PDFParser
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams

from io import StringIO

# Utilizes PDFminer3k to extract text from PDF documents
def getText(pdfPath):
    """Extract the text of one PDF file and return it as a single string.

    Args:
        pdfPath: Path of the PDF file to read.

    Returns:
        Everything the text converter wrote to its in-memory buffer.

    Raises:
        Any exception raised by open() or process_pdf(). The converter and
        the buffer are closed before the exception propagates.
    """
    # Sets up necessary objects
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    device = TextConverter(rsrcmgr, sio, laparams=LAParams())
    try:
        # Reads file to the text converter
        with open(pdfPath, 'rb') as pdfFile:
            process_pdf(rsrcmgr, device, pdfFile)
        # Retrieves text result (must happen before the buffer is closed)
        return sio.getvalue()
    finally:
        # Released on failure too; unreadable PDFs are expected by the caller,
        #     so the failure path runs regularly
        device.close()
        sio.close()

def pdfToText(pdfDir, absPath = os.getcwd(), txtDir = txtFilesDir, stopAt = -1):
    """Convert the '.pdf' files in absPath/pdfDir to '.txt' files in absPath/txtDir.

    A PDF whose '.txt' counterpart already exists is skipped. A PDF that
    cannot be converted is reported on screen and recorded, with a running
    reference number, in absPath/log.txt.

    Args:
        pdfDir: Name of the sub-directory of absPath holding the PDFs.
        absPath: Base directory. The default is the working directory at
            the time this function was defined.
        txtDir: Name of the sub-directory of absPath that receives the text
            files; created if missing.
        stopAt: Stop after this many successful conversions. Values <= 0
            mean no limit.

    Returns:
        None.
    """
    pdfPath = absPath+'/'+pdfDir
    txtPath = absPath+'/'+txtDir
    if pdfDir not in os.listdir(absPath):
        # The original message referenced an undefined name and raised NameError
        print('The specified directory "' + pdfPath + '" does not exist')
        return
    # Creates 'Text Files' directory for converted PDFs
    if txtDir not in os.listdir(absPath):
        os.mkdir(txtPath)

    pdfNames = [entry.name for entry in os.scandir(pdfPath) if entry.name.endswith('.pdf')]
    # Denominator of the progress display: the requested cap, or every PDF when uncapped
    target = stopAt if stopAt > 0 else len(pdfNames)
    # Listed once up front instead of once per PDF; updated as files are written
    existing = set(os.listdir(txtPath))
    nameLimit = 10 # Longest file name echoed in full in the 'already exists' message

    docNum = 0 # Successful conversions so far
    docErr = 1 # Reference number given to the next failure
    with open(absPath+'/'+'log.txt', 'a+', encoding="utf-8") as log:
        # One header per run
        log.write("PDF to Text\n" + date.today().strftime("%m/%d/%y") +
                  " at " + datetime.now().strftime("%H:%M:%S") + "\n\n")
        for pdfName in pdfNames:
            fileName = pdfName[:-4]+'.txt' # Swap the '.pdf' suffix for '.txt'
            print("Now on '"+pdfName+"'. . . ", end='')

            if fileName in existing:
                shown = fileName if len(fileName) <= nameLimit else fileName[:nameLimit]+'...txt'
                print('"'+shown+'" already exists')
            else:
                # Some documents are protected, corrupted, etc. and text cannot
                #     be extracted. Exceptions are recorded in log.txt
                try:
                    text = getText(pdfPath+'/'+pdfName)
                    with open(txtPath+'/'+fileName, 'w+', encoding="utf-8") as txtFile:
                        txtFile.write(text)
                except Exception as e:
                    log.write(str(docErr)+": " + pdfName + ": \n\t" + str(e)+"\n")
                    print("there was an error reading this document. See log for details. Reference number "+str(docErr)+".\n")
                    docErr += 1
                else:
                    existing.add(fileName)
                    docNum += 1
                    print("done ({}/{})".format(docNum, target))
            if docNum >= stopAt and stopAt > 0:
                print("PDF to Text was stopped after "+str(docNum)+" documents.")
                break
        log.write("\n\n")
pdfToText('PDF', absPath = absolute, stopAt = 10)

Now on '00023.pdf'. . . "00023.txt" already exists
Now on '0028_504_uchida_s.pdf'. . . "0028_504_u...txt" already exists
Now on '0044_559_ishitani_y.pdf'. . . "0044_559_i...txt" already exists
Now on '00708543.pdf'. . . "00708543.t...txt" already exists
Now on '00823977.pdf'. . . "00823977.t...txt" already exists
Now on '00863988.pdf'. . . "00863988.t...txt" already exists
Now on '009.pdf'. . . "009.txt" already exists
Now on '00969115.pdf'. . . "00969115.t...txt" already exists
Now on '00a.pdf'. . . "00a.txt" already exists
Now on '01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.pdf'. . . "01-Altamur...txt" already exists
Now on '01046104.pdf'. . . "01046104.t...txt" already exists
Now on '01217605.pdf'. . . "01217605.t...txt" already exists
Now on '01237554.pdf'. . . "01237554.t...txt" already exists
Now on '01308672.pdf'. . . "01308672.t...txt" already exists
Now on '01673370.pdf'. . . "01673370.t...txt" already exists
Now on '01_AMI_Alcaniz.

Now on '33163615.pdf'. . . "33163615.t...txt" already exists
Now on '33451B Medical Device Manufacturing in the US Industry Report.pdf'. . . "33451B Med...txt" already exists
Now on '34-the-power-of-full-engagement.pdf'. . . "34-the-pow...txt" already exists
Now on '343362a0.pdf'. . . "343362a0.t...txt" already exists
Now on '37_ExampleBased_JMSP_SpecialIsuue.pdf'. . . "37_Example...txt" already exists
Now on '3_Embedded_Care_Coordination_Models_To_Manage_Diverse.pdf'. . . "3_Embedded...txt" already exists
Now on '3_Toms.pdf'. . . "3_Toms.txt" already exists
Now on '4.pdf'. . . "4.txt" already exists
Now on '419.full.pdf'. . . "419.full.t...txt" already exists
Now on '420-1094-2-PB.pdf'. . . "420-1094-2...txt" already exists
Now on '42345 Medical Supplies Wholesaling in the US Industry Report.pdf'. . . "42345 Medi...txt" already exists
Now on '427-1101-2-PB.pdf'. . . "427-1101-2...txt" already exists
Now on '4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf'. . .

In [9]:
# THIS MUST BE AFTER TEXT CONVERSION BEFORE ANY OTHER FUNCTIONS
# Function to remove \n
def rmvN(txtDir = txtFilesDir, rtnDir = rtnFilesDir, absPath = os.getcwd()):
    """Copy every file in absPath/txtDir to absPath/rtnDir with line breaks removed.

    A hyphen directly before a line break is removed together with the break.

    Args:
        txtDir: Name of the sub-directory of absPath holding the input files.
        rtnDir: Name of the sub-directory of absPath that receives the
            results; created if missing.
        absPath: Base directory. The default is the working directory at
            the time this function was defined.

    Returns:
        None. Prints a message and returns early when txtDir is missing.
    """
    # Checks that text file directory exists/is correct
    txtPath = absPath+'/'+txtDir
    if txtDir not in os.listdir(absPath):
        print('The specified directory "' + txtPath + '" does not exist')
        return
    rtnPath = absPath+'/'+rtnDir
    if rtnDir not in os.listdir(absPath):
        os.mkdir(rtnPath)

    for entity in os.scandir(txtPath):
        print("Now on '"+entity.name+"'. . . ", end='')
        # Read-only: the input is never modified, so write access is not needed.
        #     Reading before the output is opened means a failed read leaves
        #     no empty output file behind
        with open(txtPath+'/'+entity.name, 'r', encoding='utf-8') as txtFile:
            text = txtFile.read()
        # '-\n' goes first so the hyphen is dropped along with the break
        # NOTE(review): breaks are replaced by nothing, so a line that does not
        #     end in a space has its last word fused with the first word of
        #     the next line -- confirm this is intended
        text = text.replace('-\n', '').replace('\n', '')
        with open(rtnPath+'/'+entity.name, 'w', encoding='utf-8') as rtnFile:
            rtnFile.write(text)
        print("done")
rmvN(absPath = absolute)

Now on '00023.txt'. . . done
Now on '0028_504_uchida_s.txt'. . . done
Now on '0044_559_ishitani_y.txt'. . . done
Now on '00708543.txt'. . . done
Now on '00823977.txt'. . . done
Now on '00863988.txt'. . . done
Now on '009.txt'. . . done
Now on '00969115.txt'. . . done
Now on '00a.txt'. . . done
Now on '01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt'. . . done
Now on '01046104.txt'. . . done
Now on '01217605.txt'. . . done
Now on '01237554.txt'. . . done
Now on '01308672.txt'. . . done
Now on '01673370.txt'. . . done
Now on '01_AMI_Alcaniz.txt'. . . done
Now on '02_AMI_Riva.txt'. . . done
Now on '03_AMI_Gaggioli.txt'. . . done
Now on '04100664.txt'. . . done
Now on '04359339.txt'. . . done
Now on '04407722.txt'. . . done
Now on '04_AMI_istag.txt'. . . done
Now on '05_AMI_Cortese.txt'. . . done
Now on '06_AMI__Piva.txt'. . . done
Now on '07_AMI_Kameas.txt'. . . done
Now on '08_AMI_Kleiner.txt'. . . done
Now on '09.txt'. . . done
Now on '09_AMI

Now on 'A Review of 326 Children with Developmental and Physical Disabilities, Consecutively Taught at the Movement Development Clinic.txt'. . . done
Now on 'A Review of Quasi-Linear Pilot Models.txt'. . . done
Now on 'A Set of C++ Classes for Co-routine Style Programming.txt'. . . done
Now on 'A Spiral Model of Software Development and Enhancement.txt'. . . done
Now on 'A Study of Design Requirements for Mobile Learning Environments.txt'. . . done
Now on 'A Target Tracking Algorithm of Passive Sensor Normal Truncated Model.txt'. . . done
Now on 'a12.txt'. . . done
Now on 'a2-jeong.txt'. . . done
Now on 'a2.txt'. . . done
Now on 'AACOpenNewWorld.txt'. . . done
Now on 'aar.txt'. . . done
Now on 'Lafferty_pcfg-notes.txt'. . . done
Now on 'LearningExecutableSemanticParsers.txt'. . . done
Now on 'p123-zhang.txt'. . . done
Now on 'Partial Parsing Finite-State Cascades.txt'. . . done
Now on 'QueryEffectiveness.txt'. . . done
Now on 'Text Clustering Algorithms.txt'. . . done
Now on 'Text-Anal

In [10]:
# Function to move files that contain (almost) no spaces to the spacesDir directory
def checkSpaces(txtDir = rtnFilesDir, spacesDir = spaceFilesDir, absPath = os.getcwd()):
    """Move files that look unusable out of absPath/txtDir.

    A file is flagged when splitting its text on ' ' yields fewer pieces
    than one tenth of its length, or when it is shorter than 100 characters.
    The name of each flagged file is appended to absPath/Spaces.txt. The file
    is moved to absPath/spacesDir, or deleted if a file of that name is
    already there.

    Args:
        txtDir: Name of the sub-directory of absPath to inspect.
        spacesDir: Name of the sub-directory of absPath that receives the
            flagged files; created if missing.
        absPath: Base directory. The default is the working directory at
            the time this function was defined.

    Returns:
        None. Prints a message and returns early when txtDir is missing.
    """
    # Checks that text file directory exists/is correct
    txtPath = absPath+'/'+txtDir
    if txtDir not in os.listdir(absPath):
        print('The specified directory "' + txtPath + '" does not exist')
        return
    spacesPath = absPath+'/'+spacesDir
    if spacesDir not in os.listdir(absPath):
        os.mkdir(spacesPath)

    # Listed once instead of once per flagged file; updated as files are moved
    alreadyFlagged = set(os.listdir(spacesPath))
    # Snapshot of the names: files are moved/deleted inside the loop, and
    #     changing a directory while os.scandir is iterating it gives
    #     unspecified results
    fileNames = [entry.name for entry in os.scandir(txtPath)]

    with open(absPath+'/'+'Spaces.txt', 'a+', encoding='utf-8') as spaces:
        spaces.write("Check Spaces\n" + date.today().strftime("%m/%d/%y") +
                  " at " + datetime.now().strftime("%H:%M:%S") + "\n\n")
        for fileName in fileNames:
            print("Now on '"+fileName+"'. . . ", end='')
            srcPath = txtPath+'/'+fileName
            # Closed before any rename/remove, and closed for files that pass too
            with open(srcPath, 'r', encoding='utf-8') as txtFile:
                text = txtFile.read()
            # An empty file is covered by the length test
            if len(text.split(' ')) < len(text)/10 or len(text) < 100:
                spaces.write(fileName+'\n')
                if fileName not in alreadyFlagged:
                    os.rename(srcPath, spacesPath+'/'+fileName)
                    alreadyFlagged.add(fileName)
                else:
                    os.remove(srcPath)
            print("done")
        spaces.write('\n\n')
checkSpaces(absPath = absolute) 

Now on '00023.txt'. . . done
Now on '0028_504_uchida_s.txt'. . . done
Now on '0044_559_ishitani_y.txt'. . . done
Now on '00708543.txt'. . . done
Now on '00823977.txt'. . . done
Now on '00863988.txt'. . . done
Now on '009.txt'. . . done
Now on '00969115.txt'. . . done
Now on '00a.txt'. . . done
Now on '01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt'. . . done
Now on '01046104.txt'. . . done
Now on '01217605.txt'. . . done
Now on '01237554.txt'. . . done
Now on '01308672.txt'. . . done
Now on '01673370.txt'. . . done
Now on '01_AMI_Alcaniz.txt'. . . done
Now on '02_AMI_Riva.txt'. . . done
Now on '03_AMI_Gaggioli.txt'. . . done
Now on '04100664.txt'. . . done
Now on '04359339.txt'. . . done
Now on '04407722.txt'. . . done
Now on '04_AMI_istag.txt'. . . done
Now on '05_AMI_Cortese.txt'. . . done
Now on '06_AMI__Piva.txt'. . . done
Now on '07_AMI_Kameas.txt'. . . done
Now on '08_AMI_Kleiner.txt'. . . done
Now on '09.txt'. . . done
Now on '09_AMI

Now on 'A Model for Types and Levels of Human Interaction with Automation.txt'. . . done
Now on 'A Parser for Real-Time Speech Synthesis of Conversational Texts.txt'. . . done
Now on 'A Physically Based Approach to 2-D Shape Blending.txt'. . . done
Now on 'A Review of 326 Children with Developmental and Physical Disabilities, Consecutively Taught at the Movement Development Clinic.txt'. . . done
Now on 'A Review of Quasi-Linear Pilot Models.txt'. . . done
Now on 'A Set of C++ Classes for Co-routine Style Programming.txt'. . . done
Now on 'A Spiral Model of Software Development and Enhancement.txt'. . . done
Now on 'A Study of Design Requirements for Mobile Learning Environments.txt'. . . done
Now on 'A Target Tracking Algorithm of Passive Sensor Normal Truncated Model.txt'. . . done
Now on 'a12.txt'. . . done
Now on 'a2-jeong.txt'. . . done
Now on 'a2.txt'. . . done
Now on 'AACOpenNewWorld.txt'. . . done
Now on 'aar.txt'. . . done
Now on 'Lafferty_pcfg-notes.txt'. . . done
Now on 'Lear

In [11]:
# Function to remove stopwords
# NLTK or SpaCy
# Inverted File: gram:[doc1, doc3] or gram:[[doc1,freq], [doc3,freq]]
def rmvStopWords(txtDir = rtnFilesDir, swDir = swFilesDir, absPath = os.getcwd()):
    """Write a lemmatized, stop-word-free copy of every file in absPath/txtDir.

    Each file is run through the spaCy 'en_core_web_sm' pipeline. Tokens that
    are stop words, punctuation, or whose text is numeric are dropped. The
    lower-cased lemmas of the remaining tokens are joined with single spaces
    and written to a file of the same name in absPath/swDir.

    Args:
        txtDir: Name of the sub-directory of absPath holding the input files.
        swDir: Name of the sub-directory of absPath that receives the
            results; created if missing.
        absPath: Base directory. The default is the working directory at
            the time this function was defined.

    Returns:
        None. Prints a message and returns early when txtDir is missing.
    """
    txtPath = absPath+'/'+txtDir
    if txtDir not in os.listdir(absPath):
        print('The specified directory "' + txtPath + '" does not exist')
        return
    swPath = absPath+'/'+swDir
    if swDir not in os.listdir(absPath):
        os.mkdir(swPath)

    nlp = spacy.load("en_core_web_sm")
    for entity in os.scandir(txtPath):
        print("Now on '"+entity.name+"'. . . ", end='')
        # Read-only: the input is never modified, so write access is not needed
        with open(txtPath+'/'+entity.name, 'r', encoding='utf-8') as txtFile:
            doc = nlp(txtFile.read())
        noStopWords = [token.lemma_.lower() for token in doc
                       if not token.is_stop and not token.is_punct and not token.text.isnumeric()]
        # Opened only after processing succeeded, so a failure in nlp()
        #     does not leave an empty output file behind
        with open(swPath+'/'+entity.name, 'w', encoding='utf-8') as swFile:
            swFile.write(" ".join(noStopWords))
        print("done")

rmvStopWords(absPath = absolute)

Now on '00023.txt'. . . done
Now on '0028_504_uchida_s.txt'. . . done
Now on '0044_559_ishitani_y.txt'. . . done
Now on '00708543.txt'. . . done
Now on '00823977.txt'. . . done
Now on '00863988.txt'. . . done
Now on '009.txt'. . . done
Now on '00969115.txt'. . . done
Now on '00a.txt'. . . done
Now on '01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt'. . . done
Now on '01046104.txt'. . . done
Now on '01217605.txt'. . . done
Now on '01237554.txt'. . . done
Now on '01308672.txt'. . . done
Now on '01673370.txt'. . . done
Now on '01_AMI_Alcaniz.txt'. . . done
Now on '02_AMI_Riva.txt'. . . done
Now on '03_AMI_Gaggioli.txt'. . . done
Now on '04100664.txt'. . . done
Now on '04359339.txt'. . . done
Now on '04407722.txt'. . . done
Now on '04_AMI_istag.txt'. . . done
Now on '05_AMI_Cortese.txt'. . . done
Now on '06_AMI__Piva.txt'. . . done
Now on '07_AMI_Kameas.txt'. . . done
Now on '08_AMI_Kleiner.txt'. . . done
Now on '09.txt'. . . done
Now on '09_AMI

In [12]:
# Remove non-english words
def rmvNonEng(txtDir = swFilesDir, engDir = engFilesDir, absPath = os.getcwd()):
    """Write a copy of every file in absPath/txtDir that keeps only known words.

    A space-separated token is kept when it is present in the object loaded
    from absPath/words_dictionary.json and is not a one- or two-letter
    lowercase string.

    Args:
        txtDir: Name of the sub-directory of absPath holding the input files.
        engDir: Name of the sub-directory of absPath that receives the
            results; created if missing.
        absPath: Base directory. The default is the working directory at
            the time this function was defined.

    Returns:
        None. Prints a message and returns early when txtDir is missing.
    """
    txtPath = absPath+'/'+txtDir
    if txtDir not in os.listdir(absPath):
        print('The specified directory "' + txtPath + '" does not exist')
        return
    engPath = absPath+'/'+engDir
    if engDir not in os.listdir(absPath):
        os.mkdir(engPath)
    # NOTE(review): presumably a JSON object keyed by word, which makes the
    #     membership test below a hash lookup -- confirm against the file
    with open(absPath+'/'+'words_dictionary.json') as json_file:
        words = json.load(json_file)

    # Every one- and two-letter lowercase string. A set makes the per-token
    #     rejection test constant-time instead of a scan over 702 entries
    alph = 'abcdefghijklmnopqrstuvwxyz'
    lets = set(alph) | {first+second for first in alph for second in alph}

    for entity in os.scandir(txtPath):
        print("Now on '"+entity.name+"'. . . ", end='')
        # Read-only: the input is never modified, so write access is not needed
        with open(txtPath+'/'+entity.name, 'r', encoding='utf-8') as txtFile:
            text = txtFile.read().split(' ')
        engChars = [word for word in text if word in words and word not in lets]
        with open(engPath+'/'+entity.name, 'w', encoding='utf-8') as engFile:
            engFile.write(" ".join(engChars))
        print("done")
rmvNonEng(absPath = absolute)

Now on '00023.txt'. . . done
Now on '0028_504_uchida_s.txt'. . . done
Now on '0044_559_ishitani_y.txt'. . . done
Now on '00708543.txt'. . . done
Now on '00823977.txt'. . . done
Now on '00863988.txt'. . . done
Now on '009.txt'. . . done
Now on '00969115.txt'. . . done
Now on '00a.txt'. . . done
Now on '01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt'. . . done
Now on '01046104.txt'. . . done
Now on '01217605.txt'. . . done
Now on '01237554.txt'. . . done
Now on '01308672.txt'. . . done
Now on '01673370.txt'. . . done
Now on '01_AMI_Alcaniz.txt'. . . done
Now on '02_AMI_Riva.txt'. . . done
Now on '03_AMI_Gaggioli.txt'. . . done
Now on '04100664.txt'. . . done
Now on '04359339.txt'. . . done
Now on '04407722.txt'. . . done
Now on '04_AMI_istag.txt'. . . done
Now on '05_AMI_Cortese.txt'. . . done
Now on '06_AMI__Piva.txt'. . . done
Now on '07_AMI_Kameas.txt'. . . done
Now on '08_AMI_Kleiner.txt'. . . done
Now on '09.txt'. . . done
Now on '09_AMI

In [13]:
# Stem words in all documents
def stem(txtDir = engFilesDir, stemDir = stemFilesDir, absPath = os.getcwd()):
    """Write a stemmed copy of every file in absPath/txtDir to absPath/stemDir.

    Each file is split on single spaces, every piece is passed through the
    English Snowball stemmer, and the results are re-joined with single spaces.

    Args:
        txtDir: Name of the sub-directory of absPath holding the input files.
        stemDir: Name of the sub-directory of absPath that receives the
            results; created if missing.
        absPath: Base directory. The default is the working directory at
            the time this function was defined.

    Returns:
        None. Prints a message and returns early when txtDir is missing.
    """
    txtPath = absPath+'/'+txtDir
    if txtDir not in os.listdir(absPath):
        print('The specified directory "' + txtPath + '" does not exist')
        return
    stemPath = absPath+'/'+stemDir
    if stemDir not in os.listdir(absPath):
        os.mkdir(stemPath)

    stemmer = SnowballStemmer(language='english')
    for entity in os.scandir(txtPath):
        print("Now on '"+entity.name+"'. . . ", end='')
        # Read-only: the input is never modified, so write access is not needed
        with open(txtPath+'/'+entity.name, 'r', encoding='utf-8') as txtFile:
            text = txtFile.read().split(' ')
        stemmed = [stemmer.stem(word) for word in text]
        # Opened only after stemming succeeded, so a failure does not leave
        #     an empty output file behind
        with open(stemPath+'/'+entity.name, 'w', encoding='utf-8') as stemFile:
            stemFile.write(" ".join(stemmed))
        print("done")
stem(absPath = absolute)

Now on '00023.txt'. . . done
Now on '0028_504_uchida_s.txt'. . . done
Now on '0044_559_ishitani_y.txt'. . . done
Now on '00708543.txt'. . . done
Now on '00823977.txt'. . . done
Now on '00863988.txt'. . . done
Now on '009.txt'. . . done
Now on '00969115.txt'. . . done
Now on '00a.txt'. . . done
Now on '01-Altamura_Esposito_Malerba--Transforming_paper_documents_into_XML_format_with_WISDOM__.txt'. . . done
Now on '01046104.txt'. . . done
Now on '01217605.txt'. . . done
Now on '01237554.txt'. . . done
Now on '01308672.txt'. . . done
Now on '01673370.txt'. . . done
Now on '01_AMI_Alcaniz.txt'. . . done
Now on '02_AMI_Riva.txt'. . . done
Now on '03_AMI_Gaggioli.txt'. . . done
Now on '04100664.txt'. . . done
Now on '04359339.txt'. . . done
Now on '04407722.txt'. . . done
Now on '04_AMI_istag.txt'. . . done
Now on '05_AMI_Cortese.txt'. . . done
Now on '06_AMI__Piva.txt'. . . done
Now on '07_AMI_Kameas.txt'. . . done
Now on '08_AMI_Kleiner.txt'. . . done
Now on '09.txt'. . . done
Now on '09_AMI

In [14]:
# Create JSON with doc_id : file_name
def update_doc_index(jsonDoc = jsonDocIndex, txtDir = stemFilesDir, absPath = os.getcwd()):
    """Add every file in absPath/txtDir that is not yet indexed to the document index.

    The index is a JSON object mapping a document ID to a file name. An
    existing index file is loaded and extended; otherwise a new one is
    started. Each new file gets the ID len(index)+1.

    Args:
        jsonDoc: Name of the index file inside absPath.
        txtDir: Name of the sub-directory of absPath whose files are indexed.
        absPath: Base directory. The default is the working directory at
            the time this function was defined.

    Returns:
        None. Prints a message and returns early when txtDir is missing.
    """
    txtPath = absPath+'/'+txtDir
    # Same guard as the other pipeline steps (previously os.listdir raised)
    if txtDir not in os.listdir(absPath):
        print('The specified directory "' + txtPath + '" does not exist')
        return
    jsonPath = absPath+'/'+jsonDoc
    if jsonDoc in os.listdir(absPath):
        with open(jsonPath, 'r') as jsonFile:
            docIndex = json.load(jsonFile)
    else:
        docIndex = {}
    # Set of indexed names: avoids rescanning all values for every file
    known = set(docIndex.values())
    for fileName in os.listdir(txtPath):
        if fileName not in known:
            # IDs are stored as strings, which is what json.load returns for
            #     existing entries and what json.dump writes for new ones.
            #     This keeps the in-memory keys of one type
            docIndex[str(len(docIndex)+1)] = fileName
            known.add(fileName)
    with open(jsonPath, 'w') as jsonFile:
        json.dump(docIndex, jsonFile, indent=4)

update_doc_index(absPath = absolute)

In [15]:
# Create JSON with {doc_id1 : {n : {"gram1":freq, "gram2":freq}}, doc_id2 : {n : {"gram1":freq}}}
def update_inv_index(ngrams, jsonDoc = jsonDocIndex, jsonInv = jsonInvIndex, txtDir = stemFilesDir, absPath = os.getcwd()):
    """Count the n-grams of every indexed document and store them per document.

    For each length n in ngrams and each document in the document index,
    every run of n consecutive space-separated tokens is counted. Counts are
    stored under index[docID][str(n)][term]. A (document, n) pair that is
    already present in an existing index file is not recomputed.

    Args:
        ngrams: Iterable of n-gram lengths (ints).
        jsonDoc: Name of the document index file inside absPath.
        jsonInv: Name of the n-gram index file inside absPath; created if missing.
        txtDir: Name of the sub-directory of absPath holding the documents.
        absPath: Base directory. The default is the working directory at
            the time this function was defined.

    Returns:
        None. Prints a message and returns early when txtDir or jsonDoc is missing.
    """
    txtPath = absPath+'/'+txtDir
    jsonDocPath = absPath+'/'+jsonDoc
    if txtDir not in os.listdir(absPath):
        print('The specified directory "' + txtPath + '" does not exist')
        return
    if jsonDoc not in os.listdir(absPath):
        print('FILE NOT FOUND: The specified file "' + jsonDocPath + '" does not exist')
        return
    jsonInvPath = absPath+'/'+jsonInv
    if jsonInv in os.listdir(absPath):
        with open(jsonInvPath, 'r') as jsonFile:
            invIndex = json.load(jsonFile)
    else:
        invIndex = {}
    # Loads document index
    with open(jsonDocPath, 'r') as jsonFile:
        docIndex = json.load(jsonFile)

    for ngram in ngrams:
        # Keys loaded from JSON are strings. Testing the int against them
        #     never matched, so every run recounted every document and wrote
        #     a second "n" key next to the loaded one
        gramKey = str(ngram)
        for docID in docIndex:
            docGrams = invIndex.setdefault(docID, {})
            if gramKey in docGrams:
                continue
            with open(txtPath+'/'+docIndex[docID], 'r', encoding='utf-8') as txtFile:
                tokens = txtFile.read().split(' ')
            counts = {}
            # Sliding window by index; yields nothing when the document has
            #     fewer than ngram tokens
            for start in range(len(tokens) - ngram + 1):
                term = " ".join(tokens[start:start+ngram])
                counts[term] = counts.get(term, 0) + 1
            docGrams[gramKey] = counts
    with open(jsonInvPath, 'w') as jsonFile:
        json.dump(invIndex, jsonFile, indent=4)

update_inv_index(list(range(2,4)), absPath = absolute)

In [None]:
# Create JSON with {"gram1" : {doc_id1:freq, doc_id2:freq}, "gram2" : {doc_id1:freq}}
def update_gram_index(minFreq = 1, jsonInv = jsonInvIndex, jsonGram = jsonGramIndex, txtDir = stemFilesDir, absPath = os.getcwd()):
    """Regroup the per-document n-gram counts by term.

    Reads absPath/jsonInv and, for every term whose frequency in a document
    is at least minFreq, records index[term][docID] = freq. An existing
    absPath/jsonGram is loaded first and updated in place; the result is
    written back to the same file.

    Args:
        minFreq: Smallest per-document frequency that is recorded.
        jsonInv: Name of the per-document n-gram index file inside absPath.
        jsonGram: Name of the per-term index file inside absPath.
        txtDir: Not used by this function.
        absPath: Base directory. The default is the working directory at
            the time this function was defined.

    Returns:
        None. Prints a message and returns early when jsonInv is missing.
    """
    # The per-document index must already exist
    invPath = absPath+'/'+jsonInv
    if jsonInv not in os.listdir(absPath):
        print('FILE NOT FOUND: The specified file "' + invPath + '" does not exist')
        return

    # Start from the stored per-term index when there is one
    gramPath = absPath+'/'+jsonGram
    gramIndex = {}
    if jsonGram in os.listdir(absPath):
        with open(gramPath, 'r') as handle:
            gramIndex = json.load(handle)

    with open(invPath, 'r') as handle:
        invIndex = json.load(handle)

    for docID, byLength in invIndex.items():
        for termFreqs in byLength.values():
            for term, freq in termFreqs.items():
                if freq >= minFreq:
                    gramIndex.setdefault(term, {})[docID] = freq

    with open(gramPath, 'w') as handle:
        json.dump(gramIndex, handle, indent=4)

update_gram_index(absPath = absolute)

In [None]:
# Print grams found in numDocs or more documents
def getGrams(numDocs = 3, jsonDoc = jsonDocIndex, jsonGram = jsonGramIndex, txtDir = stemFilesDir, absPath = os.getcwd()):
    """Print every term that occurs in at least numDocs documents.

    For each such term in absPath/jsonGram, prints the term with its
    document count, followed by one tab-indented line per document giving
    the name stored for that document ID in absPath/jsonDoc.

    Args:
        numDocs: Smallest number of documents a term must occur in.
        jsonDoc: Name of the document index file inside absPath.
        jsonGram: Name of the per-term index file inside absPath.
        txtDir: Name of a sub-directory of absPath; only its existence is checked.
        absPath: Base directory. The default is the working directory at
            the time this function was defined.

    Returns:
        None. Prints a message and returns early when txtDir, jsonDoc or
        jsonGram is missing.
    """
    txtPath = absPath+'/'+txtDir
    docPath = absPath+'/'+jsonDoc
    gramPath = absPath+'/'+jsonGram

    # Makes sure the directory and both JSON files exist
    if txtDir not in os.listdir(absPath):
        print('The specified directory "' + txtPath + '" does not exist')
        return
    for fileName, filePath in ((jsonDoc, docPath), (jsonGram, gramPath)):
        if fileName not in os.listdir(absPath):
            print('FILE NOT FOUND: The specified file "' + filePath + '" does not exist')
            return

    with open(docPath, 'r') as handle:
        docIndex = json.load(handle)
    with open(gramPath, 'r') as handle:
        gramIndex = json.load(handle)

    for gram, docs in gramIndex.items():
        hits = len(docs)
        if hits < numDocs:
            continue
        print("\""+gram+"\" found in "+str(hits)+" documents")
        for docID in docs:
            print("\t"+ str(docIndex[docID]))

getGrams(10, absPath = absolute)

In [None]:
# Create vectors for all n-grams (with freq>1?) between two docs, multiply them,
#     then divide by the product of the Euclidean norms
# Print out similar phrases
# The cosine-similarity calls inside this function are commented out; as written
#     it only builds, prints a summary of, and saves the frequency lists
def freqMatrices(ngrams, jsonDoc = jsonDocIndex, jsonInv = jsonInvIndex, jsonTFM = jsonTFMatrix, absPath = os.getcwd()):
    """Build per-pair term-frequency lists for the documents in the n-gram index.

    For every pair (doc1, doc2) of keys of the n-gram index with
    doc2 >= doc1 that is not already in the stored matrix, this records
    under tfMatrix[doc1][doc2]:

        "all"  -> {doc1: {n: [freq, ...]}, doc2: {n: [freq, ...]}}
            one entry per term found in either document, 0 where the
            document does not contain the term
        "like" -> {doc1: {n: [freq, ...]}, doc2: {n: [freq, ...]}}
            one entry per term of doc2 that was already in the term list

    Only n-gram lengths listed in ngrams are considered. A summary is
    printed for each pair and the whole matrix is written to
    absPath/jsonTFM after each pair.

    Args:
        ngrams: Collection of n-gram lengths (ints) to include.
        jsonDoc: Name of the document index file inside absPath.
        jsonInv: Name of the n-gram index file inside absPath.
        jsonTFM: Name of the term-frequency matrix file inside absPath;
            loaded and extended when it exists.
        absPath: Base directory. The default is the working directory at
            the time this function was defined.

    Returns:
        None. Prints a message and returns early when jsonDoc or jsonInv is missing.
    """
    jsonDocPath = absPath+'/'+jsonDoc
    if jsonDoc not in os.listdir(absPath):
        print('FILE NOT FOUND: The specified file "' + jsonDocPath + '" does not exist')
        return
    jsonInvPath = absPath+'/'+jsonInv
    if jsonInv not in os.listdir(absPath):
        print('FILE NOT FOUND: The specified file "' + jsonInvPath + '" does not exist')
        return
    
    # NOTE(review): the 'as' targets below rebind the jsonDoc and jsonInv
    #     parameters to file objects; neither name is read again afterwards
    with open(jsonDocPath, 'r') as jsonDoc:
        docIndex = json.load(jsonDoc)
    with open(jsonInvPath, 'r') as jsonInv:
        invIndex = json.load(jsonInv)
        
#     jsonTFM is created if it does not exist
    jsonTFMPath = absPath+'/'+jsonTFM
    if jsonTFM in os.listdir(absPath):
        with open(jsonTFMPath, 'r') as jsonFile:
            tfMatrix = json.load(jsonFile)
    else:
        tfMatrix = {}     
        
    for doc1 in invIndex:
        # Terms of doc1 for the requested n-gram lengths
        doc1Terms = []
        if doc1 not in tfMatrix:
            tfMatrix[doc1] = {}
        for ngram, terms in invIndex[doc1].items():
            # Keys loaded from JSON are strings, hence int()
            if int(ngram) in ngrams:
                for term in terms:
                    doc1Terms.append(term)
        for doc2 in invIndex:
            # Handles each unordered pair once and skips pairs already stored.
            # The keys come from json.load, so '>=' compares them as strings
            if doc2 >= doc1 and doc2 not in tfMatrix[doc1]:
                print(docIndex[doc1]+" and "+docIndex[doc2])
                # Always true here: the enclosing condition already checked it
                if doc2 not in tfMatrix[doc1]:
                    tfMatrix[doc1][doc2] = {}
                    tfMatrix[doc1][doc2]["all"] = {}
                    tfMatrix[doc1][doc2]["like"] = {}
                # allTerms: doc1's terms plus doc2's terms not yet in the list
                # likeTerms: doc2's terms that were already in the list
                allTerms = doc1Terms.copy()
                likeTerms = []
                for ngram, terms in invIndex[doc2].items():
                    if int(ngram) in ngrams:
                        for term in terms:
                            # NOTE(review): membership test on a list, so the
                            #     cost grows with the number of terms per lookup
                            if term not in allTerms:
                                allTerms.append(term)
                            else:
                                likeTerms.append(term)
                # Sorted so both documents' frequency lists follow the same term order
                allTerms.sort()
                likeTerms.sort()
                # Frequency lists, one list per requested n-gram length
                doc1all = {}
                doc2all = {}
                doc1like = {}
                doc2like = {}
#                 ngramFreq = {}
                for gramLen in ngrams:
#                     ngramFreq[gramLen] = 0
                    doc1all[gramLen] = []
                    doc2all[gramLen] = []
                    doc1like[gramLen] = []
                    doc2like[gramLen] = []
    
                for term in allTerms:
                    # The n-gram length is recovered by counting the
                    #     whitespace-separated pieces of the term
                    # NOTE(review): assumes that count equals the length the
                    #     term was indexed under -- confirm for terms built
                    #     from empty tokens
                    length = len(term.split())
#                     ngramFreq[length] += 1
#                     weight = ngrams[length]
                    ngram = str(len(term.split()))
                    # 0 where the document does not contain the term
                    doc1all[length].append(invIndex[doc1][ngram].get(term, 0))
                    doc2all[length].append(invIndex[doc2][ngram].get(term, 0))
                for term in likeTerms:
                    length = len(term.split())
#                     ngramFreq[length] += 1
#                     weight = ngrams[length]
                    ngram = str(len(term.split()))
                    # No default passed to get(): a missing term would append None
                    doc1like[length].append(invIndex[doc1][ngram].get(term))
                    doc2like[length].append(invIndex[doc2][ngram].get(term))
#                     doc1weighted.append(invIndex[doc1][ngram].get(term)*weight)
#                     doc2weighted.append(invIndex[doc2][ngram].get(term)*weight)
#                 tfMatrix[doc1][doc2]["ngrams"] = ngramFreq
                tfMatrix[doc1][doc2]["all"][doc1] = doc1all
                tfMatrix[doc1][doc2]["all"][doc2] = doc2all
                tfMatrix[doc1][doc2]["like"][doc1] = doc1like
                tfMatrix[doc1][doc2]["like"][doc2] = doc2like
#                 cosSimilarity = calcCosSim(doc1freq, doc2freq)
#                 weightedCosSim = calcCosSim(doc1weighted, doc2weighted)
#                 print("Like terms (doc1, doc2):")
#                 for term in likeTerms:
#                     print("\t"+term+" ("+str(invIndex[doc1][str(len(term.split()))][term])+", "+str(invIndex[doc2][str(len(term.split()))][term])+")")
                # Summary for this pair
                numLike = len(likeTerms)
                print("Doc1 has "+str(len(doc1Terms)-numLike)+" unique terms")
                print("Doc2 has "+str(len(allTerms)-len(doc1Terms))+" unique terms")
                for gramLength in ngrams:
                    print("{}-grams: {}".format(gramLength, len(doc1like[gramLength])))
                print("Total like terms: "+str(numLike))
                print("Total unlike terms: "+str(len(allTerms)-numLike), end="\n\n")
#                 print("Cosine Similarity: "+ str(cosSimilarity))
#                 print("Weighted Cosine Similarity: "+ str(weightedCosSim), end='\n\n')
#     Writes tfMatrix to term-freq matrix JSON file
                # NOTE(review): this sits inside the pair loop, so the whole
                #     matrix is rewritten after every pair. Presumably a
                #     checkpoint for long runs -- confirm
                with open(jsonTFMPath, 'w') as jsonFile:
                    json.dump(tfMatrix, jsonFile, indent=4)
# n-gram lengths compared in this run
grams = [2, 3]
freqMatrices(grams, absPath = absolute)
    

In [None]:
import math
def calcCosSim(list1, list2):
    """Return the cosine similarity of two equal-length sequences of numbers.

    Args:
        list1: First sequence of numbers.
        list2: Second sequence of numbers.

    Returns:
        The dot product divided by the product of the Euclidean norms.
        0.0 when either sequence is empty or all zeros, because the
        similarity is undefined there and the division would raise
        ZeroDivisionError. -1, after printing a message, when the lengths differ.
    """
    if len(list1) != len(list2):
        print("Lists do not have the same number of elements")
        return -1
    sumProd = sum(n1*n2 for n1, n2 in zip(list1, list2))
    sumL1 = sum(n**2 for n in list1)
    sumL2 = sum(n**2 for n in list2)

    normProd = math.sqrt(sumL1)*math.sqrt(sumL2)
    # Two documents with no shared terms produce empty lists
    if normProd == 0:
        return 0.0
    return sumProd/normProd

def cossim(gramWeights, jsonDoc = jsonDocIndex, jsonTFM = jsonTFMatrix, absPath = os.getcwd()):
    """Print plain and weighted cosine similarity for every stored document pair.

    For each pair in absPath/jsonTFM, the "like" frequency lists of both
    documents are concatenated over the n-gram lengths in gramWeights. The
    weighted variant multiplies every frequency by the weight of its length.

    Args:
        gramWeights: Mapping of n-gram length to weight.
        jsonDoc: Name of the document index file inside absPath.
        jsonTFM: Name of the term-frequency matrix file inside absPath.
        absPath: Base directory. The default is the working directory at
            the time this function was defined.

    Returns:
        None. Prints a message and returns early when jsonDoc or jsonTFM is missing.
    """
    docPath = absPath+'/'+jsonDoc
    if jsonDoc not in os.listdir(absPath):
        print('FILE NOT FOUND: The specified file "' + docPath + '" does not exist')
        return
    tfmPath = absPath+'/'+jsonTFM
    if jsonTFM not in os.listdir(absPath):
        print('FILE NOT FOUND: The specified file "' + tfmPath + '" does not exist')
        return

    with open(docPath, 'r') as docFile:
        docIndex = json.load(docFile)
    with open(tfmPath, 'r') as tfmFile:
        tfMatrix = json.load(tfmFile)

    for doc1, partners in tfMatrix.items():
        for doc2, values in partners.items():
            print(docIndex[doc1]+"({}) and ".format(doc1)+docIndex[doc2]+"({})".format(doc2))
            doc1freq, doc2freq = [], []
            doc1weighted, doc2weighted = [], []
            for gramLen, weight in gramWeights.items():
                # Keys of the stored matrix are strings after the JSON round trip
                lenKey = str(gramLen)
                doc1freq.extend(values["like"][doc1][lenKey])
                doc2freq.extend(values["like"][doc2][lenKey])
                doc1weighted.extend([ val*weight for val in values["like"][doc1][lenKey] ])
                doc2weighted.extend([ val*weight for val in values["like"][doc2][lenKey] ])

            print("Cosine Similarity: ", calcCosSim(doc1freq, doc2freq))
            print("Weighted: ", calcCosSim(doc1weighted, doc2weighted), end="\n\n")


gramWeight = {2:1, 3:5}
cossim(gramWeight, absPath = absolute)

In [None]:
# Trigram parser-based inverted file
# (TF-IDF to remove trigrams common to most or all documents)

In [None]:
# Clustering algorithm based on trigram inverted file

In [None]:
# Add bigram parser-based info to inverted file

In [None]:
# Implement clustering on bigram inverted file