In [86]:
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select

class downloader():
    """
    OCR a single page of the scanned book through https://www.newocr.com/.

    The book is stored as several pdf files, each nominally holding 100 pages
    ('001-100', '101-200', ...). Given a 1-based book page number, the object
    works out which pdf holds the page and which page inside that pdf it is,
    uploads the pdf in a Selenium-driven Chrome session, runs the OCR and keeps
    the recognised text in ``self.text``.

    Args:
        page (int): 1-based page number of the book.
        pdf_folder (str, optional): folder containing the pdf files. Defaults
            to the folder hard-coded in ``_pdf_folder_address``.

    Attributes:
        page (int): current book page. Assigning to it re-selects the pdf file
            but does NOT re-run the OCR; call ``extract_pdf()`` afterwards.
        processed_page (int): 1-based page number inside the selected pdf.
        driver: Selenium Chrome driver, created by ``extract_pdf``.
        text (pandas.DataFrame): one column, ``lines``, one row per OCR line.
        processed_text (pandas.DataFrame): optional, assigned by the user;
            this is what ``write_csv`` writes by default.

    Raises:
        ValueError: if ``page`` is smaller than 1.
    """

    # Default location of the pdf files (also the parent of the csv output folder).
    _pdf_folder_address = 'C:\\Users\\user1\\Documents\\GitHub\\JapaneseExtraction\\jp_book'

    _PDF_NAME_START = 'jp_book_basic_'
    _PDF_NAME_END = '.pdf'
    _PAGES_PER_PDF = 100
    # Middle part of each pdf file name, in book order. The last file is open-ended.
    # NOTE(review): the last name says '600', not '601'; it is kept as-is because it
    # must match the file on disk. Page 600 itself is looked up in '501-600'.
    _PDF_PAGE_RANGES = (
        '001-100',
        '101-200',
        '201-300',
        '301-400',
        '401-500',
        '501-600',
        '600-end',
    )

    _OCR_URL = 'https://www.newocr.com/'
    # Position of Japanese among the <li> elements of the language picker.
    # Fragile: depends on the current layout of the site.
    _JAPANESE_OPTION_INDEX = 58

    # Class-level defaults so that setup_page / extract_pdf never have to probe
    # for missing attributes.
    _pdf_full_address = None          # pdf selected for the current page
    _requires_driver_refresh = True   # True until that pdf is loaded in the browser

    def __init__(self, page, pdf_folder=None):
        if pdf_folder is not None:
            self._pdf_folder_address = pdf_folder
        self._page = page
        self.setup_page()
        self.extract_pdf()

    @classmethod
    def _split_page(cls, page):
        """
        Translate a book page number into a position in the pdf files.

        Args:
            page (int): 1-based page number of the book.
        Returns:
            tuple: (index into ``_PDF_PAGE_RANGES``, 1-based page inside that pdf).
        Raises:
            ValueError: if ``page`` is smaller than 1.
        """
        if page < 1:
            raise ValueError('page must be 1 or greater, got {!r}'.format(page))
        # Pages are 1-based: page 100 is the LAST page of the first file, so shift
        # by one before dividing. Everything past the last nominal range lives in
        # the open-ended final file, hence the clamp.
        file_index = min((page - 1) // cls._PAGES_PER_PDF, len(cls._PDF_PAGE_RANGES) - 1)
        return file_index, page - file_index * cls._PAGES_PER_PDF

    @property
    def page(self):
        """
        The current book page.
        Kept as a property so that assigning a new page also re-selects the
        matching pdf file (through self.setup_page).
        """
        return self._page

    @page.setter
    def page(self, value):
        """
        Set the book page and re-select the matching pdf file.
        The OCR text is not refreshed; call self.extract_pdf() for that.

        Raises:
            ValueError: if ``value`` is smaller than 1 (the page is left unchanged).
        """
        self._split_page(value)  # validate before touching any state
        self._page = value
        self.setup_page()

    @property
    def processed_page(self):
        """
        Page number of self.page inside its pdf file (1-based).
        Used in method self.extract_pdf.
        """
        return self._split_page(self.page)[1]

    def setup_page(self):
        """
        Work out which pdf file holds self.page.

        Stores the full path in ``self._pdf_full_address`` and, when the path
        changed, marks the browser session as needing a refresh. The flag is
        only ever raised here; ``extract_pdf`` lowers it once the new pdf has
        actually been uploaded, so several page changes in a row cannot hide a
        pending refresh.
        """
        file_index, _ = self._split_page(self.page)
        pdf_name = self._PDF_NAME_START + self._PDF_PAGE_RANGES[file_index] + self._PDF_NAME_END
        pdf_full_address = os.path.join(self._pdf_folder_address, pdf_name)

        if pdf_full_address != self._pdf_full_address:
            self._pdf_full_address = pdf_full_address
            self._requires_driver_refresh = True

    def extract_pdf(self):
        """
        Go through the selenium process to get the text of the current page.
        Creates attribute self.driver when needed, then creates and fills self.text.

        Args:
            None
        Returns:
            None
        """
        if self._requires_driver_refresh:
            # A different pdf is needed: drop any existing session and upload anew.
            self.close()
            self.driver = webdriver.Chrome()
            self.driver.get(self._OCR_URL)

            # Initial page, upload file
            self.driver.find_element(By.NAME, 'userfile').send_keys(self._pdf_full_address)
            self.driver.find_element(By.ID, 'preview').click()

            # Preferences page, add Japanese language
            self.driver.find_element(By.CLASS_NAME, 'search-field').click()
            language_list = self.driver.find_elements(By.TAG_NAME, 'li')
            language_list[self._JAPANESE_OPTION_INDEX].click()

            # Only now is the right pdf loaded; a failure above leaves the flag
            # raised so the next call starts over with a fresh session.
            self._requires_driver_refresh = False

        # change page number of pdf and run the OCR
        page_picker = Select(self.driver.find_element(By.NAME, 'p'))
        page_picker.select_by_value(str(self.processed_page))
        self.driver.find_element(By.NAME, 'ocr').click()

        ocr_lines = self.driver.find_element(By.TAG_NAME, 'textarea').text.split('\n')
        self.text = pd.DataFrame(ocr_lines, columns=['lines'])

    def close(self):
        """
        Quit the browser session, if there is one.
        A later call to extract_pdf opens a new session and uploads the pdf again.
        """
        driver = getattr(self, 'driver', None)
        if driver is not None and driver.session_id is not None:
            driver.quit()
        self._requires_driver_refresh = True

    def write_csv(self, choose_processed_text=True, csv_path=None):
        """
        Write the extracted text to a utf-8 csv file.

        Args:
            choose_processed_text (bool): write self.processed_text when True,
                otherwise self.text.
            csv_path: destination path or writable file object. Defaults to
                '<pdf folder>/csv/jp_book_basic_p_<page>.csv'.
        Returns:
            None
        Raises:
            RuntimeError: if processed text is requested but has not been set.
        """
        if csv_path is None:
            csv_name = 'jp_book_basic_p_' + str(self.page) + '.csv'
            csv_path = os.path.join(self._pdf_folder_address, 'csv', csv_name)

        if choose_processed_text:
            if not hasattr(self, 'processed_text'):
                raise RuntimeError('Processed text has not been defined.')
            frame = self.processed_text
        elif hasattr(self, 'text'):
            frame = self.text
        else:
            # Nothing has been extracted yet; writing raw text is best-effort.
            return

        # Create the destination folder when a path (not a file object) is given.
        if not hasattr(csv_path, 'write'):
            target_folder = os.path.dirname(csv_path)
            if target_folder:
                os.makedirs(target_folder, exist_ok=True)

        frame.to_csv(csv_path, encoding='utf-8')

    # def get_text_file(self):
        ## The following code is for downloading the text file
        # driver.find_element_by_class_name('dropdown-toggle').click()
        # dl_dropdown = driver.find_element_by_class_name('dropdown-menu')
        # dl_dropdown.find_element_by_tag_name('li').click()
    
    
    
    


In [87]:
# Build a downloader for book page 207. Construction already selects the pdf,
# opens a Chrome session and runs the OCR, so this cell needs a working
# chromedriver and network access.
dl = downloader(207)

In [78]:
# Show the raw OCR result: a DataFrame with one row per recognised line in column 'lines'.
dl.text

Unnamed: 0,lines
0,koto ga aru' 197
1,高かっ た こと が ある (There was a time when s.t. was ...
2,takakatta koto ga aru
3,静か だ っ た こと が ある (There was a time when s.t. w...
4,shizukadatta koto ga aru
5,先生 だ っ た こと が ある (There was a time when s.o. w...
6,sensei datta koto ga aru
7,
8,Gace
9,(a) 私 は 中 学校 で 英語 を 教え た こと が あり ます 。


In [88]:
# Hand-pick the OCR rows to export; write_csv() writes dl.processed_text by default.
# NOTE(review): rows 1 and 3 are Japanese lines, but row 6 is the romaji line
# "sensei datta koto ga aru" -- the matching Japanese line is row 5. Confirm that
# 6 (and not 5) is intended.
dl.processed_text = dl.text.loc[[1,3,6]]

In [80]:
# Check the rows selected for export.
dl.processed_text

Unnamed: 0,lines
1,高かっ た こと が ある (There was a time when s.t. was ...
3,静か だ っ た こと が ある (There was a time when s.t. w...
6,sensei datta koto ga aru


In [89]:
# Write dl.processed_text to the default location:
# <pdf folder>/csv/jp_book_basic_p_207.csv (utf-8).
dl.write_csv()