Pipeline for Extracting Text from Aventura Joven Books

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install tika

Collecting tika
  Downloading https://files.pythonhosted.org/packages/96/07/244fbb9c74c0de8a3745cc9f3f496077a29f6418c7cbd90d68fd799574cb/tika-1.24.tar.gz
Building wheels for collected packages: tika
  Building wheel for tika (setup.py) ... [?25l[?25hdone
  Created wheel for tika: filename=tika-1.24-cp37-none-any.whl size=32885 sha256=71154154143d7e5a5f1eaf7b0d8807367a00efe41ccba55fd3cf1322888da73f
  Stored in directory: /root/.cache/pip/wheels/73/9c/f5/0b1b738442fc2a2862bef95b908b374f8e80215550fb2a8975
Successfully built tika
Installing collected packages: tika
Successfully installed tika-1.24


In [8]:
files_dir = "/content/drive/MyDrive/capstone/"
filenames = ['Aventura Joven 01 - Persecucion - Elvira Sancho, Jordi Suris.pdf',
              'Aventura Joven 02 - Misterio en - Elvira Sancho, Jordi Suris.pdf',
              'Aventura Joven 03 - Perdidos en - Elvira Sancho, Jordi Suris.pdf',
              'Aventura Joven 04 - La chica de - Elvira Sancho, Jordi Suris.pdf',
              'Aventura Joven 05 - El fantasma - Elvira Sancho, Jordi Suris.pdf',
              'Aventura Joven 06 - El monstruo - Elvira Sancho, Jordi Suris.pdf']

In [9]:
# import parser object from tika
from tika import parser  
import unicodedata
import re

data_list = []
text_orig_list = []
text_proc_list = []
for filename in filenames:
  parsed_pdf = parser.from_file(files_dir+filename) 
  data = parsed_pdf['content'] 
  data_list.append(data)

  # Find beginning and end of text
  beg_idx = data.lower().index('capítulo')
  end_idx = data.lower().index('después de la lectura\n')
  text = data[beg_idx:end_idx]
  text_orig_list.append(text)

  # write the initial text to a file (no preprocessing done here)
  with open(files_dir+filename[:17]+' Text No Preproc.txt', 'w') as f:
    f.write(text)

  # preprocess text
  # remove words with numbers in them, ex altura1 or hostal2 (footnote indicator)
  text_wo_ft_words = text
  for word in re.findall('[-a-zA-ZÀ-ÖØ-öø-ÿ]+»?,?!?\.{0,3}[1234567890]{1,2}', text_wo_ft_words):
    word_idx = text_wo_ft_words.index(word)
    m = re.search('[-a-zA-ZÀ-ÖØ-öø-ÿ]+»?,?!?\.{0,3}', word)
    stripped_word = m.group(0)
    text_wo_ft_words = text_wo_ft_words[:word_idx] + stripped_word + text_wo_ft_words[word_idx+len(word):]

  # remove words that contain '-\n' because they didn't fit on one line
  text_wo_broken_words = text_wo_ft_words
  for word in re.findall('[-a-zA-ZÀ-ÖØ-öø-ÿ]+\-\n{1,2}[-a-zØ-öø-ÿ]+', text_wo_broken_words):
    word_idx = text_wo_broken_words.index(word)
    hyphen_idx = word.index('-')
    if '-\n\n' in word:
      len_sep = 3
    else:
      len_sep = 2
    modified_word = word[:hyphen_idx] + word[hyphen_idx+len_sep:]
    text_wo_broken_words = text_wo_broken_words[:word_idx] + modified_word + text_wo_broken_words[word_idx+len(word):]

  # remove page numbers
  text_wo_page_nums = text_wo_broken_words
  for word in re.findall('\n[0-9]{1,2}\n', text_wo_page_nums):
    word_idx = text_wo_page_nums.index(word)
    m = re.search('[0-9]{1,2}', word)
    text_wo_page_nums = text_wo_page_nums[:word_idx] + '\n' + text_wo_page_nums[word_idx+len(word):]
  
  # remove unnecessary newline breaks
  text_wo_sent_breaks = text_wo_page_nums
  for word in re.findall('[-a-zA-ZÀ-ÖØ-öø-ÿ]+ ?\n\n[-a-zA-ZÀ-ÖØ-öø-ÿ]+', text_wo_sent_breaks):
    word_idx = text_wo_sent_breaks.index(word)
    newline_idx = word.index('\n\n')
    modified_word = word[:newline_idx] + word[newline_idx+2:]
    text_wo_sent_breaks = text_wo_sent_breaks[:word_idx] + modified_word + text_wo_sent_breaks[word_idx+len(word):]
  
  text_proc_list.append(text_wo_sent_breaks)
  
  with open(files_dir+ filename[:17] + ' Text.txt', 'w') as f:
    f.write(text_wo_sent_breaks)

Turning the files into JSON objects

In [98]:
from collections import defaultdict

chapter_indices = defaultdict(list)
text_chap = text_proc_list[0]
text_len = len(data_list[0])
i = 0
cur_data_chap = 0
while 'capítulo' in text_chap.lower():
  chapter_index = text_chap.lower().index('capítulo')
  newline_after_chapter_index = text_chap[chapter_index+8:].index('\n') + chapter_index+8
  text_chap = text_chap[newline_after_chapter_index:]
  if 'capítulo' in text_chap.lower():
    end_chap_idx = text_chap.lower().index('capítulo')
  else:
    end_chap_idx = len(text_proc_list[0])
  chapter_indices[str(i+1)] = [chapter_index + cur_data_chap, chapter_index+end_chap_idx+cur_data_chap+10]
  cur_data_chap += newline_after_chapter_index
  i += 1

In [99]:
chapter_indices

defaultdict(list,
            {'1': [0, 7144],
             '2': [7144, 11450],
             '3': [11450, 16769],
             '4': [16769, 23578],
             '5': [23578, 31948],
             '6': [31948, 38223],
             '7': [38223, 82927]})

In [None]:
print(text_proc_list[0][38223:82927])

In [120]:
from collections import defaultdict
dict_list = []
author = "Elvira Sancho, Jordi Suris"
for book_idx, (data, text) in enumerate(zip(data_list,text_proc_list)):
  # metadata
  # ISBN
  isbn_index = data_list[book_idx].index('ISBN')
  newline_after_isbn = data_list[book_idx][isbn_index+5:].index('\n\n') + isbn_index+5
  source = data_list[book_idx][isbn_index+6:newline_after_isbn]

  # Level
  if book_idx <= 4:
    level = "A1"
  else:
    level= "A2"

  # Title
  title_index = data_list[book_idx].index('Título')
  newline_after_title = data_list[book_idx][title_index+7:].index('\n\n') + title_index+7
  title = data_list[book_idx][title_index+8:newline_after_title]
  content = text

  # separate by chapters
  chapter_indices = defaultdict(list)
  text_chap = text
  j = 0
  cur_data_chap = 0
  while 'capítulo' in text_chap.lower():
    chapter_index = text_chap.lower().index('capítulo')
    newline_after_chapter_index = text_chap[chapter_index+8:].index('\n') + chapter_index+8
    text_chap = text_chap[newline_after_chapter_index:]
    if 'capítulo' in text_chap.lower():
      end_chap_idx = text_chap.lower().index('capítulo')
    else:
      end_chap_idx = len(text)
    chapter_indices[str(j+1)] = [chapter_index + cur_data_chap, chapter_index+end_chap_idx+cur_data_chap+10]
    cur_data_chap += newline_after_chapter_index
    j += 1
    
  # make a dictionary per chapter
  for chap, chap_indices in chapter_indices.items():
    chapter_text = text[chap_indices[0]: chap_indices[1]]
    chap_dict = defaultdict(str)
    chap_dict['author'] = author
    chap_dict['source'] = source
    chap_dict['level'] = level
    chap_dict['title'] = title + ", " + chap
    #chap_dict['chapter'] = chap
    chap_dict['content'] = chapter_text
    dict_list.append(chap_dict)

In [121]:
dict_list

[defaultdict(str,
             {'author': 'Elvira Sancho, Jordi Suris',
              'content': 'caPítulo 1\n\n—¡Eh, Lucas!, ¡mira, allí!\n—¿Dónde?\n—Allí. ¿Ves a ese chico?\n—¿Quién?\n—Aquél, el de los vaqueros y las gafas. Allá, delante del supermercado.\nLucas mira con atención al chico. Después dice:\n—No, ese chico no es. Es demasiado alto y no tiene el pelo negro.\n—¡Ah!\n—Además, tiene que entrar o salir de esta casa de ventanas verdes —Lucas señala una portería.\n—El número veintitrés.\n—Sí, el veintitrés.\nLucas y Rony están en la terraza de un bar de una placita en el barrio de La Latina de Madrid. Están bebiendo una cerveza y miran \ncon atención a la gente que pasa. Lucas es alto, de nariz grande y \nojos pequeños. Rony es bajo, un poco gordo y lleva una chaqueta \nvaquera azul.\n\n�\n\n1 placita: diminutivo de plaza, «pequeña plaza».\n2  barrio de La Latina: barrio monumental de Madrid (cerca de la Plaza del Sol) \n\nque comprende parte del Madrid antiguo. Es un barrio mu

In [123]:
import json
with open(files_dir + 'aventura.json', 'w') as outfile:
    json.dump(dict_list, outfile)

Attempt at Removing Footnote Definitions

In [None]:
# def modify_words(regex_find, regex_modify, text, modify):
#   ''' Modifies all words in text that match the regex_find regex to match the regex_modify regex. '''
#   text_removed = text
#   for word in re.findall(regex_find, text_removed):
#     word_idx = text_removed.index(word)
    
#     if modify == 'regex':
#       m = re.search(regex_modify, word)
#       modified_word = m.group(0)
#     elif modify == 'remove':
#       rm_char_idx = word.index(regex_modify)
#       modified_word = word[:rm_char_idx] + word[rm_char_idx+len(regex_modify):]
    
#     text_removed = text_removed[:word_idx] + modified_word + text_removed[word_idx+len(word):]

#   return text_removed

In [None]:
#text_wo_broken_words = modify_words('[-a-zA-ZÀ-ÖØ-öø-ÿ]+\-\n{1,2}[-a-zØ-öø-ÿ]+', '-\n', text_wo_ft_words, 'remove')
#print(text_wo_broken_words[800:2000])

In [None]:
# remove the footnotes

In [120]:
# Attempt at removing the footnotes
import re

# normalize the data to ignore special characters in Spanish
#norm_data = unicodedata.normalize('NFD', data).encode('ascii', 'ignore').decode('utf-8')
#norm_text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode('utf-8')

# m = re.search('[A-z]+[1234567890]{1,2}', norm_data[1880:])

# m.group(0)
# footnoted_words = [word.strip() for word in re.findall('[A-z]+[1234567890]', norm_data[1880:])]
# footnote_defs = [word.strip() for word in re.findall('[0-9]{1,2}  [A-z]+[\s\w]*:[\w \n]*\.', norm_data[1880:])]

#print(footnoted_words)
#print(footnote_defs)
all_matches = re.findall('([0-9]{1,2}  [A-z]+: ((([A-z]|[0-9]|,)+ ?)+\n))', '''1  altura: Cusco esta a unos 3500 metros de altura, lo que en algunas personas 
provoca el llamado mal de altura o soroche. Los sintomas del mal de altura 
son: dolor de cabeza, mareos, trastornos estomacales y cansancio. Puede com-
batirse con pastillas, ejercicios de respiracion o mate.

2  hostal: alojamiento normalmente mas barato y personal que un hotel, aunque 
sin sus comodidades. La palabra se utiliza principalmente en medios rurales. 
''')
print(len(all_matches))
print(all_matches)

ERROR:root:An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 0))



KeyboardInterrupt: ignored