In [1]:
# Human-facing 104.com.tw search URL (keyword "python"); the AJAX form of this URL
# carries "list" at the end of its path.
url = 'https://www.104.com.tw/jobs/search/?ro=0&keyword=python'

In [6]:
import json
import requests
from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

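- Transform the search URL into the AJAX URL (append `list` to the path so the site returns JSON instead of HTML).
- Apply the default query parameters to the AJAX URL.
- Send a request to read the total number of result pages.
- While the total page count is not below `MAX_TOTAL_PAGE` (150), split the query further:
    - by area, if the area can be subdivided;
    - otherwise by job category, if it can be subdivided;
    - otherwise by industry category, if it can be subdivided.
- Return the AJAX URLs, one per result page.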

In [3]:
# A search is subdivided further until its total page count drops below this value.
MAX_TOTAL_PAGE = 150

# Query parameters forced onto every search URL (not recommended to change).
DEFAULT_QUERY_PARAM = dict(
    page=1,     # page number
    mode='s',   # s: summary display
    order=11,   # 11: sort by date
    ro=0,       # 0: all job openings, 1: full-time openings only
    asc=0,      # 0: descending order, 1: ascending order
)

In [4]:
def _load_breakdown(path):
    ''' Read one breakdown table from the JSON file at "path" and return the decoded object.
    The tables are used as dicts that map a code (or "root") to the list of its child codes.
    '''
    # Pin the encoding: without it open() decodes with the platform locale, which can
    # fail or corrupt non-ASCII JSON on some systems.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


AREA_BREAKDOWN_DICT = _load_breakdown('area_breakdown_json.txt')
JOBCAT_BREAKDOWN_DICT = _load_breakdown('jobcat_breakdown_json.txt')
INDUST_BREAKDOWN_DICT = _load_breakdown('indust_breakdown_json.txt')

In [20]:
class StartUrlProcessor:
    ''' Expand one 104.com.tw job-search URL into the list of AJAX result-page URLs.

    A search whose total page count is not below "MAX_TOTAL_PAGE" is split into narrower
    searches (by area, then job category, then industry category) until every search is
    below the limit or cannot be split any further.
    '''

    # Seconds to wait for the search endpoint before a request is abandoned.
    REQUEST_TIMEOUT = 30

    def process(self, url):
        ''' Run the whole pipeline for one search url.

        url: a search-page url or its ajax ("list") form.
        return: list of ajax urls, one per result page of every (subdivided) search.
        '''
        # transform url to ajax url
        ajax_url = self.transform_search_url_to_ajax_url(url)
        print('ajax_url')
        print(ajax_url)

        # get query parameters
        url_param = self.get_url_query_param(ajax_url)
        print('url_param')
        print(url_param)

        # set query parameters to ajax url; values in DEFAULT_QUERY_PARAM win over the url's own
        format_url_param = {**url_param, **DEFAULT_QUERY_PARAM}
        format_url = self.format_url_with_query_params(ajax_url, format_url_param)
        print('format_url')
        print(format_url)

        # subdivide query params if "total_page" >= MAX_TOTAL_PAGE
        all_page_urls = self.subdivide_url(format_url)
        print(all_page_urls)

        return all_page_urls


    @staticmethod
    def transform_search_url_to_ajax_url(url):
        ''' Change url's path, and the website will return json-response instead of html.
        For example, "https://www.104.com.tw/jobs/search/?ro=0" to "https://www.104.com.tw/jobs/search/list?ro=0"
        A url whose path already ends with "list" is returned unchanged.
        '''
        url_parse = urlparse(url)
        endswith = 'list'
        if url_parse.path.endswith(endswith):
            return url
        # Normalise the trailing slash first, so that ".../search" and ".../search/"
        # both become ".../search/list" (and never ".../searchlist").
        new_path = url_parse.path.rstrip('/') + '/' + endswith
        return urlunparse(url_parse._replace(path=new_path))


    @staticmethod
    def get_url_query_param(url) -> dict:
        ''' Get query parameters in url. For example,
        input: "https://www.104.com.tw/jobs/search/list?ro=0&keyword=python",
        return: {'ro': '0', 'keyword': 'python'}
        '''
        return dict(parse_qsl(urlparse(url).query))


    @staticmethod
    def format_url_with_query_params(url, new_query_params=None):
        ''' Return url with "new_query_params" merged into its query string.
        Existing keys keep their position and get the new value; new keys are appended.
        With no parameters given, the query string is only re-encoded.
        '''
        url_parse = urlparse(url)
        url_query = dict(parse_qsl(url_parse.query))
        # "None" instead of a shared "{}" default, so no state can leak between calls
        url_query.update(new_query_params or {})
        return urlunparse(url_parse._replace(query=urlencode(url_query)))


    @staticmethod
    def get_total_page_from_search_page(url):
        ''' Request the ajax url and return the "totalPage" value of its json response.
        Return None (after printing a message) when the request fails, the status code
        is not 200, or the response does not have the expected json layout.
        '''
        headers = {'Referer': url}
        try:
            # NOTE(review): verify=False turns TLS certificate verification off; kept as in
            # the original request, confirm that it is still required.
            response = requests.get(url, headers=headers, verify=False,
                                    timeout=StartUrlProcessor.REQUEST_TIMEOUT)
        except requests.RequestException as error:
            print(f'Error happened, when get total page: {error}')
            return None

        if response.status_code == 200:
            try:
                return json.loads(response.text)['data']['totalPage']
            except (ValueError, KeyError, TypeError) as error:
                print(f'Error happened, when get total page: {error!r}')
                return None

        print('Error happened, when get total page')
        return None


    @classmethod
    def subdivide_url(cls, url, urls=None):
        ''' If the result of totalPage is not less than "MAX_TOTAL_PAGE", then subdivide url,
        based on area, job category, and industry category.

        url: ajax url of one search.
        urls: optional list to append to; it is extended in place and returned.
              A new list is created when omitted.
        return: list of ajax urls, one per result page.
        '''
        if urls is None:
            urls = []

        # get the result of total page number
        total_page = cls.get_total_page_from_search_page(url)

        # the request failed: the url cannot be sized, so skip it instead of crashing
        if total_page is None:
            print(f'>>> URL: {url}')
            print('    Skipped: total page number is unavailable.')
            return urls

        # if total page is 0, then drop the url
        if total_page == 0:
            return urls

        # if total page is less than MAX_TOTAL_PAGE, then keep the url
        if total_page < MAX_TOTAL_PAGE:
            print(f'>>> URL: {url}')
            print(f'    Total Pages: {total_page}')

            # generate urls with all pages
            urls.extend(cls.generate_each_page_url_from_start_url(url, total_page))
            return urls

        # subdivide by the first criterion that still yields narrower searches:
        # area, then job category, then industry category
        narrower_urls = (
            cls.subdivide_area(url)
            or cls.subdivide_job_category(url)
            or cls.subdivide_industry_category(url)
        )
        if narrower_urls:
            for narrower_url in narrower_urls:
                urls = cls.subdivide_url(narrower_url, urls)
            return urls

        # nothing left to subdivide: keep the url although it is still too large
        print(f'>>> URL: {url}')
        print(f'    It is still more than {MAX_TOTAL_PAGE} pages.')
        if url not in urls:
            # generate urls with all pages
            urls.extend(cls.generate_each_page_url_from_start_url(url, total_page))

        return urls


    @classmethod
    def _subdivide_by_param(cls, url, param_name, breakdown):
        ''' Return one url per child code of the url's current "param_name" code.
        The current code defaults to "root"; return None when it has no entry in "breakdown".
        '''
        current_code = cls.get_url_query_param(url).get(param_name, 'root')
        if current_code not in breakdown:
            return None
        return [
            cls.format_url_with_query_params(url, {param_name: child_code})
            for child_code in breakdown[current_code]
        ]


    @classmethod
    def subdivide_area(cls, url):
        ''' Split url by the "area" parameter using AREA_BREAKDOWN_DICT; None if not possible.
        '''
        return cls._subdivide_by_param(url, 'area', AREA_BREAKDOWN_DICT)


    @classmethod
    def subdivide_job_category(cls, url):
        ''' Split url by the "jobcat" parameter using JOBCAT_BREAKDOWN_DICT; None if not possible.
        '''
        return cls._subdivide_by_param(url, 'jobcat', JOBCAT_BREAKDOWN_DICT)


    @classmethod
    def subdivide_industry_category(cls, url):
        ''' Split url by the "indust" parameter using INDUST_BREAKDOWN_DICT; None if not possible.
        '''
        return cls._subdivide_by_param(url, 'indust', INDUST_BREAKDOWN_DICT)


    @classmethod
    def generate_each_page_url_from_start_url(cls, url, pages):
        ''' Return the urls of page 1 .. "pages" of the search described by url.
        '''
        return [
            cls.format_url_with_query_params(url, {'page': page_number})
            for page_number in range(1, pages + 1)
        ]
        

# Start from the AJAX ("list") form of the keyword search, already carrying the default
# query parameters.
url = 'https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0'
url_processor = StartUrlProcessor()
# process() sends one request per (sub)search it inspects and prints its progress.
all_search_page_urls = url_processor.process(url)
# Bare last expression: display the resulting url list as the cell output.
all_search_page_urls

ajax_url
https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0
url_param
{'ro': '0', 'keyword': 'python', 'page': '1', 'mode': 's', 'order': '11', 'asc': '0'}
format_url
https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0
>>> URL: https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001001000
    Total Pages: 131
>>> URL: https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001002000
    Total Pages: 41
>>> URL: https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001003000
    Total Pages: 1
>>> URL: https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001004000
    Total Pages: 1
>>> URL: https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001005000
    Total Pages: 16
>>> URL: https://www.104.com.tw/jobs/

['https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001001000',
 'https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001002000',
 'https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001003000',
 'https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001004000',
 'https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001005000',
 'https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001006000',
 'https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001007000',
 'https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001008000',
 'https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001010000',
 'https://www.104.c

In [8]:
# Fetch the unsplit keyword search once; its response is the reference for the checks
# in the following cells.
url = 'https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0'
headers = {'Referer': url}
# A timeout keeps an unresponsive server from blocking the cell forever.
# NOTE(review): verify=False turns TLS certificate verification off; confirm it is still needed.
response = requests.get(url, headers=headers, verify=False, timeout=30)

In [11]:
# Decode the JSON body of the response fetched in the previous cell.
j = json.loads(response.text)

In [13]:
# Display the "totalCount" reported for the unsplit search
# (presumably the number of matching jobs; confirm against the API).
j['data']['totalCount']

5543

In [22]:
# Sanity check: add up "totalCount" over the subdivided searches so the sum can be
# compared with the count of the unsplit search.
# "all_search_page_urls" holds one url per result page, and "totalCount" appears to be
# reported per search rather than per page, so only the first page of each search is
# counted; counting every page would add the same search several times.
total_count = 0
for search_page_url in all_search_page_urls:
    page_no = dict(parse_qsl(urlparse(search_page_url).query)).get('page', '1')
    if page_no != '1':
        continue

    headers = {'Referer': search_page_url}
    # A timeout keeps an unresponsive server from blocking the loop forever.
    response = requests.get(search_page_url, headers=headers, verify=False, timeout=30)
    j = json.loads(response.text)
    job_count = int(j['data']['totalCount'])

    total_count += job_count
    print()
    print(search_page_url)
    print(job_count)
    print(total_count)


https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001001000
2603
2603

https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001002000
802
3405

https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001003000
8
3413

https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001004000
4
3417

https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001005000
318
3735

https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001006000
973
4708

https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001007000
45
4753

https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&asc=0&area=6001008000
391
5144

https://www.104.com.tw/jobs/search/list?ro=0&keyword=python&page=1&mode=s&order=11&