In [64]:
import re
import roman
import os
import csv
from treatise_reference_data import *
from master_function_list import ultimateParser as uP

Let's start with the paper object. The idea is that we'll be passing into the single file processing function a PDF. That PDF will get turned into a text file with no white space. It's that text file that will be passed here. 

We can call that our Paper object for our purposes here. We want it to have a file name and containers for cites tied to different searches along with the functions to execute different searches. Finally we also want it to be able to generate a total scoring output for all citations.

In [169]:
class Paper:
    def __init__(self, txt_file_name):
        self.name = txt_file_name
        self.nortonCites = []
        self.sbnCites = []
        self.rawParenthesesCapture = []
        self.otherCites = []
        
    def NortonSearch(self):
        citationCounter = 0 #citation counter to be used in numbering citations
        #create the citation search object
        #search pattern is
##        Capture Abstract cites with optional dash additions
##        Capture Appendix cites with optional dash additions
##        Capture Main body cites as follows:
##            Book. (capture both roman and arabic numeral versions)
##            Part. (capture both roman (capitalized and not) and arabic numerals)
##            Section (I left of the '.' here to be able to capture citations that are only Book.Part.Section with no paragraph citation)
##            optional .Paragraph(s with optional dash separator for a range of paragraphs
        nortonPattern = re.compile("""  T*Abs\d+
                                            ([-–—]\d{1,2}){0,1}|
                                        T*App\d+
                                            ([-–—]\d{1,2}){0,1}|
                                        ((I{1,3}|[123])\.)
                                        (([i]{1,3}|IV|[I]{1,3}|[1-4])\.)
                                        (\d{1,2})
                                        (\.[1-9]\d{0,2}
                                            ([-–—]\d{1,2}){0,1}
                                        ){0,1}""", re.X)
        #make the text of the paper accessible and generate the match objects
        paper_to_search = open(self.name, "r")
        text_to_search = paper_to_search.readline()
        paper_to_search.close()
        matchObjects = nortonPattern.finditer(text_to_search)
        #create a citation for each match object
        for match in matchObjects:
            citationCounter +=1
            citationObject = Citation(self.name, citationCounter, match.group())
            citationObject.startPoint = match.start()
            citationObject.endPoint = match.end()
            citationObject.search_term = match.re
            self.nortonCites.append(citationObject)
        if len(self.nortonCites) == 0:
            pass
    
    def SbnSearch(self):
        citationCounter = 0 #citation counter to be used in numbering citations
        #create the citation search object
##        the search pattern this time is to start with SBN
##        then cover page numbers (I got rid of 0 as a starting point because a file happened to have a weird SBN0 followed by a long string of numbers
##        next I have an optional dash and comma separator that can be repeated to capture the multiple pages and ranges that get cited
##        this will require some cleaning because sometimes you get a random 'i' following the comma
        sbnPattern = re.compile(""" (?<!I)
                                    (SBN)
                                    ([1-9]\d+|[xvi]+|[XVI]+)
                                    ([-–—,](\d+|[xvi]+|[XVI]+))*""", re.X)
        #make the text of the paper accessible and generate the match objects
        paper_to_search = open(self.name, "r")
        text_to_search = paper_to_search.readline()
        paper_to_search.close()
        matchObjects = sbnPattern.finditer(text_to_search)
        #create a citation for each match object
        for match in matchObjects:
            citationCounter +=1
            citationObject = Citation(self.name, citationCounter, match.group())
            citationObject.startPoint = match.start()
            citationObject.endPoint = match.end()
            citationObject.search_term = match.re
            self.sbnCites.append(citationObject)
        if len(self.sbnCites) == 0:
            pass
    
    def parenthesesCapture(self):
        citationCounter = 0 #citation counter to be used in numbering citations
        #create the citation search object
        #This idea behind this search string is to get anything in parentheses with the following structure:
            #First, it can optionally start with either a 'T', 'THN', 'Treatise', or 'Hume'
            #Second, there can be a run of some intervening text but not a close parens or any numbers
            #Third, we get a page number citation with an optional p, p., or pp.
            #fourth, we get up to a three digit page number or range of up to 3 digit page numbers in either
                #roman or arabic numerals
            #The only thing I haven't figured out how to capture yet is a brief comment that appears in a few
            #cases where the authro says something like, 'my emphases' or 'italics mine'. I think that might
            #require a different search with a more restrictive start to the parentheses
        pattern = re.compile('\((T|THN|Treatise|Hume)*([A-Z]|[a-z]|[,.])*(p*\.{0,1}(\d{1,3}|[xvi]+|[XVI]+)([-–—,](\d+|[xvi]{1,5}|[XVI]{1,5}))*)\)')
        #make the text of the paper accessible and generate the match objects
        paper_to_search = open(self.name, "r")
        text_to_search = paper_to_search.readline()
        paper_to_search.close()
        matchObjects = pattern.finditer(text_to_search)
        #create a citation for each match object
        for match in matchObjects:
            citationCounter +=1
            citationObject = Citation(self.name, citationCounter, match.group())
            citationObject.startPoint = match.start()
            citationObject.endPoint = match.end()
            citationObject.search_term = match.re
            self.rawParenthesesCapture.append(citationObject)
        if len(self.rawParenthesesCapture) == 0:
            pass
    
    def otherSearch(self, search_term):
        citationCounter = 0 #citation counter to be used in numbering citations
        #create the citation search object
        pattern = re.compile(search_term)
        #make the text of the paper accessible and generate the match objects
        paper_to_search = open(self.name, "r")
        text_to_search = paper_to_search.readline()
        paper_to_search.close()
        matchObjects = pattern.finditer(text_to_search)
        #create a citation for each match object
        for match in matchObjects:
            citationCounter +=1
            citationObject = Citation(self.name, citationCounter, match.group())
            citationObject.startPoint = match.start()
            citationObject.endPoint = match.end()
            citationObject.search_term = match.re
            self.otherCites.append(citationObject)
        if len(self.otherCites) == 0:
            pass
    
    def calculate_raw_score_sheet(self):
        #gather all the citations
        master_citation_list = []
        for citation in self.nortonCites:
            master_citation_list.append(citation)
        for citation in self.sbnCites:
            master_citation_list.append(citation)
        for citation in self.rawParenthesesCapture:
            master_citation_list.append(citation)
        for citation in self.otherCites:
            master_citation_list.append(citation)
        
        #turn each citation into a list of pairs
        master_scoring_list = []
        for citation in master_citation_list:
            if citation.cleanedCitation != "":
                try:
                    for pair in uP(citation.cleanedCitation):
                        #these pairs are (chapter, weight)
                        master_scoring_list.append(pair)
                except Exception as e:
                    print(e)
                    print(pair, "generated an error calculating raw_score_sheet while running through uP")
                    pass
                
        #create a blank scoring sheet:
        score_sheet = {}
        for para in treatise_paragraph_list:
            score_sheet[para] = 0
        
        #update score_sheet from master_score list
        for pair in master_scoring_list:
            try:
                score_sheet[pair[0]] += pair[1]
            except Exception as error:
                print(error)
                print(pair[0], 'generated a key error when trying to update the score sheet')
                pass
        
        return score_sheet
    
    def make_csv_raw_score(self):
        out_data = self.calculate_raw_score_sheet()
        output_file = open(self.name[:-4]+".csv", "w")
        csv_writer = csv.writer(output_file)
        for pair in out_data.items():
            csv_writer.writerow(pair)
        output_file.close()
        print('raw score sheet csv generated')
    
    def relative_score_sheet(self):
        raw_score = self.calculate_raw_score_sheet()
        total_score = 0
        for para in raw_score.keys():
            if raw_score[para] > 0:
                total_score += raw_score[para]
        relative_score_sheet = {}
        for para in raw_score.keys():
            if raw_score[para] > 0:
                relative_score = 100*round(raw_score[para]/total_score,5)
                relative_score_sheet[para] = relative_score
            else:
                relative_score_sheet[para] = raw_score[para]
                
        return relative_score_sheet
    
    def make_csv_relative_score(self):
        out_data = self.relative_score_sheet()
        output_file = open(self.name[:-4]+"-relative-score.csv", "w")
        csv_writer = csv.writer(output_file)
        for pair in out_data.items():
            csv_writer.writerow(pair)
        output_file.close()
        print('relative score sheet csv generated')

Now let's work with the Citation class

In [170]:
class Citation:
    def __init__(self, paper_name, order_num, search_result):
        self.order = order_num
        self.paper = paper_name
        self.search_term = ""
        self.citationScores = []
        self.startPoint = 0
        self.endPoint = 0
        self.rawCitationText = search_result
        self.precedingText = ""
        self.trailingText = ""
        self.cleanedCitation = ""

    #this function pulls a given number of characters from before the citation starts up to the start
    #of the citation
    def FindPrecedingText(self, num_chars):
        #open the paper file
        paper_file = open(self.paper, "r")
        text_to_use = paper_file.readline()
        paper_file.close()
        #generate the buffer to get the appropriate slice in case it's around an edge of teh string
        buffer = self.startPoint - num_chars
        #set the proper text in the Citation
        if self.startPoint == 0:
            self.precedingText = ''
        elif buffer >= 0:
            self.precedingText = text_to_use[buffer:self.startPoint]
        elif buffer < 0:
            self.precedingText[:self.startPoint]

    #this function pulls a given number of characters from the end of the citation going forward
    def FindTrailingText(self, num_chars):
        #open the paper file
        paper_file = open(self.paper, "r")
        text_to_use = paper_file.readline()
        paper_file.close()
        #generate the buffer to get the appropriate slice in case it's around an edge of the string
        buffer = len(text_to_use) - (self.endPoint + num_chars)
        #set the proper text in the Citation
        if buffer == 0:
            self.trailingText = ""
        elif buffer > 0:
            self.trailingText = text_to_use[self.endPoint:(self.endPoint + num_chars)]
        elif buffer < 0:
            self.trailingText = text_to_use[self.endPoint:]

    #this function takes an integer as input and runs the previous two functions
    def PopulateSurroundingTexts(self, num_chars):
        self.FindPrecedingText(num_chars)
        self.FindTrailingText(num_chars)

    def calculateScore(self):
        self.citationScores = uP(self.cleanedCitation)

let's test this out

In [171]:
test_paper = "search_texts/533Cottrell.txt"

In [103]:
test = Paper(test_paper)

In [104]:
test.NortonSearch()

In [105]:
for cite in test.nortonCites:
    cite.cleanedCitation = cite.rawCitationText

In [106]:
test.make_csv_raw_score()

raw score sheet csv generated


In [107]:
test.make_csv_relative_score()

relative score sheet csv generated


Ok so, so far the process of this function is something like:

1. Create Paper Object
2. Conduct searches
3. For each Citation in all of the different citations we need to appropriately convert the .rawCitationText to .cleanedCitation
4. Generate outputs we're interested in:
    A. For further data processing we're interested in relative and perhaps raw scoring data, returned not in the form of csv data but of a dictionary/list.
    B. For visualization we're probably just interested in relative citation csv data.
   
5. I think it's probably best to have the function return a dictionary/JSON object:
    TotalCitationCount: INT
    RelativeCitationData: Para/Score dictionary
    RawCitationData: Para/score dictionary

In [167]:
def extractCitationDataFromSinglePaper(file_in):
    #generate the paper object
    paper_obj = Paper(file_in)
   
    #conduct the searches
    paper_obj.NortonSearch()
    paper_obj.SbnSearch()
    paper_obj.parenthesesCapture()
    
    #clean the citations
    for citation in paper_obj.nortonCites:
        citation.cleanedCitation = citation.rawCitationText
    for citation in paper_obj.sbnCites:
        cleanedCitation = cite.rawCitationText[3:]
    
    #this is a temporary check to get some of the easy citations we know are to treatise pages
    #we'll use a counter to see how many clean cites we get
    clean_parens_counter = 0
    num_check = re.compile('(\d{1,3})+([-–—,](\d{1,3}))*')
    for citation in paper_obj.rawParenthesesCapture:
        #first check if the citation starts with a T
        if citation.rawCitationText[1] == "T":
           #second check if the T is followed by a page number
            if num_check.search(citation.rawCitationText) != None:
                #print('found a T cite with a page num in', paper)
                pageNum = num_check.search(citation.rawCitationText).group()
                citation.cleanedCitation = pageNum
                clean_parens_counter += 1

    #generate the raw citation data for the paper:
    raw_score_sheet = paper_obj.calculate_raw_score_sheet()
    
    #generate the relative citation data for the paper:
    relative_score_sheet = paper_obj.relative_score_sheet()
    
    #output the csv of relative citation 
    paper_obj.make_csv_relative_score()
    
    #create the output dictionary
    od = {
        'TotalCites': len(paper_obj.nortonCites)+len(paper_obj.sbnCites)+clean_parens_counter,
        'RelativeCitationData': relative_score_sheet,
        'RawCitationData': raw_score_sheet
    }
    
    #return the output dictionary
    return od

In [172]:
test_single = extractCitationDataFromSinglePaper(test_paper)

relative score sheet csv generated


In [176]:
test_single['RawCitationData']['App.10']

17.183