# Read and translate numbers to integers in text

### Cases considered:
* number words only: three million two hundred three
* number words with "and": three million two hundred and three
* two separate numbers separated by a word other than "and": two hundred chickens and three hens
* Mixed numbers and words: 3.4 million
* Numbers with commas: 2,514,200
* Percentages: counted as numbers, but not altered if they have a % sign: 45%

### TODO


In [2]:
from itertools import groupby
from operator import itemgetter
import re

In [3]:
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt

#converts pdf, returns its text content as a string
def convert(fname, pages=None):
    """Extract the text content of a PDF file and return it as one string.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to extract. When it is
            falsy (``None`` or empty) an empty set is handed to
            ``PDFPage.get_pages`` -- presumably meaning "all pages";
            confirm against the pdfminer documentation.

    Returns:
        The text written by pdfminer's ``TextConverter`` for the
        processed pages.
    """
    pagenums = set(pages) if pages else set()

    manager = PDFResourceManager()
    # The context managers guarantee the buffer and the input file are
    # released even if pdfminer raises while processing a page.
    with StringIO() as output:
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the `with` block closes it.
        return output.getvalue()

#converts all pdfs in directory pdfDir, saves all resulting txt files to txtdir
def convertMultiple(pdfDir, txtDir):
    """Convert the matching PDFs found in ``pdfDir`` to text.

    Only files with a ``.pdf`` extension whose full path contains
    ``"grace_villa"`` are converted.

    Args:
        pdfDir: Directory to scan. An empty string means the current
            working directory.
        txtDir: Intended destination directory for ``.txt`` output.
            NOTE: currently unused -- writing text files is not
            implemented here.

    Returns:
        The text of the last matching PDF that was converted, or ``""``
        when no file matched (previously this raised UnboundLocalError).
    """
    if pdfDir == "":
        pdfDir = os.getcwd()  # if no pdfDir passed in

    text = ""
    for pdf in os.listdir(pdfDir):  # iterate through pdfs in pdf directory
        if os.path.splitext(pdf)[1] != ".pdf":
            continue
        # os.path.join copes with a missing or present trailing separator
        # and uses the right separator for the platform.
        pdfFilename = os.path.join(pdfDir, pdf)
        if "grace_villa" in pdfFilename:
            text = convert(pdfFilename)  # get string of text content of pdf
    return text

# set paths accordingly:
# pdfDir is the directory scanned for PDF files; txtDir is handed to
# convertMultiple as the text output directory and is left empty here.
pdfDir = "/Users/peterschnatz/1HC/pdf_reading_test/pdf_files/"
txtDir = ""
# `text` holds the string returned by convertMultiple for pdfDir.
text = convertMultiple(pdfDir, txtDir)

In [4]:
# Cut the extracted text at every period, discard empty fragments and
# flatten line breaks into spaces.
# NOTE(review): splitting on every "." also cuts decimal numbers such as
# "3.14" in two.
split_text = [
    fragment.replace("\n", " ")
    for fragment in text.split(".")
    if fragment
]
print(split_text)

['\x0c\x0c\x0cGRACE VILLA FOUNDATION 2018 ANNUAL REPORT  Pages  Contents  2  4  6  9 10  12  16 42  44  45  47  49  50  52  Message from the Founder & CEO  Message from Board of Directors  Grace Villa - The Home  The Vision & Mission  Who We Are Our Interventions  Corporate Support  Grace Villa Highlights  Visit to State House  Visit to the United Nations  Grace Villa Budget  Grace Villa Staff   Board Members  Our Partners  \x0c  2 | ANNUAL REPORT 2018  Message from the  Founder Dear Friends It brings me great joy to share what wonderful moments,   miracles  and  achievements  that  2018  held  for  your  children', ' I shall also share the challenges and lessons  learnt', '   •   Academics  •   Food    •   Advocacy   •  Health  •  Skill building •  Sustainability   Through  you,  215  orphans,  abandoned  and  abused  girls  were  rescued,  and  52  vulnerable  families  lives  were  transformed', ' This was a 40% increase from the year before', '  We  intervened  through  the  9  str

In [5]:
# Number of fragments in split_text.
len(split_text)

588

In [6]:
def _build_numwords():
    """Build the default word -> (scale, increment) table for text2int."""
    units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
    ]
    tens = ["twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
    scales = ["hundred", "thousand", "million", "billion", "trillion"]

    # "and" is accepted inside a number but contributes nothing.
    table = {"and": (1, 0)}
    for idx, word in enumerate(units):
        table[word] = (1, idx)
    # "twenty" is 2 * 10, so numbering starts at 2 (no "" placeholder keys).
    for idx, word in enumerate(tens, start=2):
        table[word] = (1, idx * 10)
    for idx, word in enumerate(scales):
        # idx * 3 is 0 for "hundred", so `or 2` yields 10**2; the remaining
        # scales are 10**3, 10**6, 10**9 and 10**12.
        table[word] = (10 ** (idx * 3 or 2), 0)
    return table


# Built once at definition time instead of hiding a cache in a mutable
# default argument.
_DEFAULT_NUMWORDS = _build_numwords()


def text2int(textnum, numwords=None):
    """Convert a phrase of English number words to its numeric value.

    Based on
    https://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers

    Args:
        textnum: Whitespace-separated number words, e.g.
            ``"three million two hundred and three"``. Words are matched
            exactly as given (no case folding).
        numwords: Optional custom mapping ``word -> (scale, increment)``.
            When omitted or empty the built-in English table is used; a
            supplied mapping is never modified.

    Returns:
        The value as a ``float`` (``0.0`` for an empty phrase).

    Raises:
        ValueError: If a word is not present in the lookup table.
    """
    table = numwords if numwords else _DEFAULT_NUMWORDS

    current = result = 0
    # True once the group being accumulated has received at least one word.
    group_started = False
    for word in textnum.split():
        if word not in table:
            raise ValueError("Illegal word: " + word)

        scale, increment = table[word]
        if scale > 1 and not group_started:
            # A scale word with nothing in front of it ("hundred",
            # "thousand") means one of that scale, not zero.
            current = 1
        current = current * scale + increment
        group_started = True
        if scale > 100:
            # thousand and above close the current group.
            result += current
            current = 0
            group_started = False

    return float(result + current)

# Sanity check of text2int on a long phrase; the recorded output is 7100031337.0.
print(text2int("seven billion one hundred million thirty one thousand three hundred thirty seven"))

7100031337.0


In [7]:
# Vocabulary used to spot number words in a sentence.
# Words for 0 through 19.
units = (
    "zero one two three four five six seven eight nine ten eleven twelve "
    "thirteen fourteen fifteen sixteen seventeen eighteen nineteen"
).split()

# Multiples of ten from 20 through 90.
tens = "twenty thirty forty fifty sixty seventy eighty ninety".split()

# Magnitude words.
scales = "hundred thousand million billion trillion".split()

# Every word recognised as part of a number, in the order units, tens, scales.
number_words = [*units, *tens, *scales]

In [21]:
# Sample sentences exercising the translation cell below.
test_text_list = [
    # number words only, or no numbers at all
    "there are twenty three animals in the garden",
    "I have a black cat",
    "I don't want to live three Million miles away",
    "talk to me for one minute",
    "I don't want any milk",
    # number words joined by "and"
    "You have one hundred and five problems",
    "one hundred and two will not be the same as three thousand and one",
    # several separate numbers in one sentence (one string, written in two pieces)
    (
        "Let's try just one hundred more times one million two hundred thousand and five more times but"
        " not more than two more times"
    ),
    # digits, optionally with commas, decimals or trailing scale words
    "The universe is 13,456 hundred years old",
    "the earth is 4,500 years old and I am thirty three years old",
    "3.14 is the beginning of pi",
    "the earth is 13.5 billion years old",
    "i don't think this will work, 22 hundred million",
    "there are 25 hundred people that are sick",
    "I can't see past 1,525 thousand meters",
    # number words hidden inside longer words ("cones", "often", "hidden")
    "this sentence often produces cones and a hidden one and hidden ten in it",
    # many bare digits in a row
    "original sentence: grace villa foundation 2018 annual report  pages  contents  2  4  6  9 10  12  16 42  44  45  47  49  50  52  message ",
]

# Multiplier for each scale word.
num_scale = dict(
    hundred=10 ** 2,
    thousand=10 ** 3,
    million=10 ** 6,
    billion=10 ** 9,
    trillion=10 ** 12,
)

def _fold_digit_tokens(tokens):
    """Collapse each numeric token plus its trailing scale words into one float string.

    For example ``["25", "hundred", "people"]`` becomes
    ``["2500.0", "people"]``. A new list is returned; ``tokens`` is not
    modified.
    """
    folded = []
    pos = 0
    while pos < len(tokens):
        token = tokens[pos]
        pos += 1
        if not re.search(r'\d', token):
            folded.append(token)
            continue

        value = float(token)
        # Only scale words can multiply a numeric token. Testing against
        # num_scale (rather than every number word) avoids a KeyError on
        # input such as "3 one dollar bills".
        while pos < len(tokens) and tokens[pos] in num_scale:
            value = value * num_scale[tokens[pos]]
            pos += 1
        folded.append(str(value))
    return folded


def _number_word_spans(tokens):
    """Return the index lists of the number-word phrases found in ``tokens``.

    Consecutive number words form one run. Two neighbouring runs that are
    separated by exactly the single word "and" are joined into one span
    (the index of "and" is included); a run that has been joined is not
    joined again with the run after it.
    """
    runs = []
    for idx, token in enumerate(tokens):
        if token not in number_words:
            continue
        if runs and idx == runs[-1][-1] + 1:
            runs[-1].append(idx)
        else:
            runs.append([idx])

    spans = []
    k = 0
    while k < len(runs):
        run = runs[k]
        following = runs[k + 1] if k + 1 < len(runs) else None
        if (following is not None
                and following[0] - run[-1] == 2
                and tokens[run[-1] + 1] == "and"):
            spans.append(run + [run[-1] + 1] + following)
            k += 2
        else:
            spans.append(run)
            k += 1
    return spans


# Translated version of every sample sentence that contains a number.
translated_str = list()
for sent_num, statement in enumerate(test_text_list):
    s = statement.lower()

    #### remove thousands separators: a comma between a digit and three digits
    s = re.sub(r"(?<=\d),(?=\d{3})", "", s)

    ### separate non number characters from numbers by space
    s = re.sub(r"([0-9]+(\.[0-9]+)?)", r" \1 ", s).strip()

    ### Tokenise on single spaces and remove empty strings
    word_split = [w for w in s.split(" ") if w != ""]

    ### Skip sentences containing neither digits nor number words
    has_number_word = any(w in number_words for w in word_split)
    if not (has_number_word or re.search(r'\d', s)):
        continue

    print(sent_num)
    print("original sentence: {}".format(statement))

    ### Digits (with any scale words right after them) become floats first
    word_split = _fold_digit_tokens(word_split)

    ### Then translate every remaining number-word phrase. Spans are
    ### replaced from right to left so the indices of the spans still to
    ### be processed stay valid.
    for span in reversed(_number_word_spans(word_split)):
        full_number = " ".join([word_split[i] for i in span])
        numeric_sub = text2int(full_number)
        word_split[span[0]: span[-1] + 1] = [str(numeric_sub)]

    ### rebuild sentence
    s = " ".join(word_split)

    translated_str.append(s)
    print("translated sentence: {}".format(s))
    print("*"*100)

0
original sentence: there are twenty three animals in the garden
translated sentence: there are 23.0 animals in the garden
****************************************************************************************************
2
original sentence: I don't want to live three Million miles away
translated sentence: i don't want to live 3000000.0 miles away
****************************************************************************************************
3
original sentence: talk to me for one minute
translated sentence: talk to me for 1.0 minute
****************************************************************************************************
5
original sentence: You have one hundred and five problems
translated sentence: you have 105.0 problems
****************************************************************************************************
6
original sentence: one hundred and two will not be the same as three thousand and one
translated sentence: 102.0 will not be the same as 3001

In [22]:
# Number of sentences collected in translated_str.
len(translated_str)

15

In [23]:
# Display the collected translated sentences.
translated_str

['there are 23.0 animals in the garden',
 "i don't want to live 3000000.0 miles away",
 'talk to me for 1.0 minute',
 'you have 105.0 problems',
 '102.0 will not be the same as 3001.0',
 "let's try just 100.0 more times 1200005.0 more times but not more than 2.0 more times",
 'the universe is 1345600.0 years old',
 'the earth is 4500.0 years old and i am 33.0 years old',
 '3.14 is the beginning of pi',
 'the earth is 13500000000.0 years old',
 "i don't think this will work, 2200000000.0",
 'there are 2500.0 people that are sick',
 "i can't see past 1525000.0 meters",
 'this sentence often produces cones and a hidden 1.0 and hidden 10.0 in it',
 'original sentence: grace villa foundation 2018.0 annual report pages contents 2.0 4.0 6.0 9.0 10.0 12.0 16.0 42.0 44.0 45.0 47.0 49.0 50.0 52.0 message']