In [35]:
import sys
# Add the parent directory to the module search path; presumably so that the
# project-local `config` and `app` imports below resolve when this notebook is
# run from a subdirectory (confirm against the repository layout).
sys.path.append("..")

In [42]:
import re
import time
import requests
from bs4 import BeautifulSoup

from config import config
from app.conf.conversion_table import AREA_DICT, INDUSTRY_DICT, JOBCAT_DICT


# Request settings read once from the project config and used by
# ServiceSplitUrl for every HTTP request it sends.
# HEADERS: passed as the `headers` argument of requests.get.
HEADERS = config['headers']
# USE_PROXY: when truthy, requests are sent with the PROXIES mapping.
USE_PROXY = config['use_proxy']
# PROXIES: passed as the `proxies` argument of requests.get when USE_PROXY is set.
PROXIES = config['proxies']


class ServiceSplitUrl:
    """Split job-search URLs until each one reports a small enough result count.

    Every URL is requested and its ``totalCount`` is read from the page.  A URL
    whose count is within ``MAX_JOBS_PER_URL`` is kept as is; a larger one is
    narrowed by adding or refining one query parameter (``indcat``, then
    ``area``, then ``jobcat``) and each narrower URL is processed recursively.

    :param origin_url_list: search URLs to split (defaults to an empty list)
    :param max_retries: how many times a page whose count cannot be read is
        re-requested before the URL is skipped; ``None`` retries forever
    :param timeout: ``timeout`` forwarded to ``requests.get``; ``None`` means
        no timeout
    """

    # Largest result count a single URL may report before it gets split.
    MAX_JOBS_PER_URL = 3000
    # Pause before re-requesting a page whose count could not be read.
    RETRY_DELAY_SECONDS = 2
    # Query parameters used for narrowing, in the order they are tried.
    SPLIT_DIMENSIONS = ('indcat', 'area', 'jobcat')

    def __init__(self, origin_url_list: list = None, max_retries: int = None,
                 timeout: float = None):
        self.total_counts = 0
        # A fresh list per instance: a `[]` default would be shared by every
        # instance created without an argument.
        self.origin_url_list = [] if origin_url_list is None else origin_url_list
        self.return_url_list = []
        self.max_retries = max_retries
        self.timeout = timeout

    def process(self):
        """Split every origin URL and return the de-duplicated result list.

        :return: list of URLs, in first-seen order
        """
        for url in self.origin_url_list:
            self.return_url_list.extend(self._split_url(url, []))

        # dict.fromkeys removes duplicates while keeping a deterministic order.
        self.return_url_list = list(dict.fromkeys(self.return_url_list))
        return self.return_url_list

    @staticmethod
    def _get_search_id(search_type: str, search_id: str = ''):
        """Return the IDs one level below ``search_id`` in a conversion table.

        The slicing below assumes hierarchical ID strings: a 4-character top
        level prefix, then two 3-character groups, with ``'000'`` marking an
        unused lower level (presumably 10-digit IDs such as ``'1001001000'``).

        :param search_type: 'area', 'jobcat' or 'indcat'
        :param search_id: currently selected ID; '' selects the top level
        :return: list of child IDs (never containing ``search_id`` itself);
            empty when ``search_id`` is already at the lowest level
        :raises ValueError: if ``search_type`` is not one of the three names
        """
        if search_type == 'area':
            search_dict = AREA_DICT
        elif search_type == 'jobcat':
            search_dict = JOBCAT_DICT
        elif search_type == 'indcat':
            search_dict = INDUSTRY_DICT
        else:
            raise ValueError(f'Invalid search_type: {search_type}')

        all_ids = search_dict.values()
        if search_id == '':
            # No selection yet: offer every top-level ID.
            return [ele_id for ele_id in all_ids if ele_id[-6:] == '000000']

        # The selected ID is filtered out instead of list.remove()d, so an ID
        # that is missing from the table no longer raises ValueError.
        if search_id[-6:] == '000000':
            return [
                ele_id for ele_id in all_ids
                if ele_id[:4] == search_id[:4]
                and ele_id[-3:] == '000'
                and ele_id != search_id
            ]
        if search_id[-3:] == '000':
            return [
                ele_id for ele_id in all_ids
                if ele_id[:7] == search_id[:7] and ele_id != search_id
            ]
        return []

    @staticmethod
    def _get_total_job_num(soup):
        """Return the ``totalCount`` digits found in ``soup.text``.

        :param soup: object exposing the page text as ``.text``
        :return: the count as a string of digits, or None when absent
        """
        match = re.search(r'"totalCount":(\d+)', soup.text)
        return match.group(1) if match else None

    def _fetch(self, url: str):
        """Request ``url`` with the configured headers and optional proxy."""
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept as in the original code, confirm it is still required.
        request_kwargs = {
            'headers': HEADERS,
            'verify': False,
            'allow_redirects': False,
            'timeout': self.timeout,
        }
        if USE_PROXY:
            request_kwargs['proxies'] = PROXIES
        return requests.get(url, **request_kwargs)

    def _narrow_url(self, url: str, requested_url: str, dimension: str):
        """Build the narrower URLs obtained by refining one query parameter.

        :param url: URL being split
        :param requested_url: URL of the request actually sent for ``url``
        :param dimension: query parameter name ('indcat', 'area' or 'jobcat')
        :return: list of narrower URLs; empty when the parameter cannot be
            refined any further
        """
        if f'{dimension}=' not in url:
            print(f'>>> add {dimension} into url:', url)
            print()
            return [
                f'{url}&{dimension}={search_id}'
                for search_id in self._get_search_id(dimension)
            ]

        print(f'>>> split {dimension} url:', requested_url)
        print()
        selected = re.search(rf'{dimension}=(\d+)', requested_url)
        if selected is None:
            # The parameter is present but has no numeric value to refine.
            return []
        value_pattern = re.compile(rf'({dimension}=)\d+')
        return [
            value_pattern.sub(rf'\g<1>{search_id}', url)
            for search_id in self._get_search_id(dimension, selected.group(1))
        ]

    def _split_url(self, url: str, split_url_list: list):
        """Append to ``split_url_list`` the URLs that cover ``url``.

        :param url: search URL to examine
        :param split_url_list: accumulator shared by the recursive calls
        :return: ``split_url_list`` (always a list, also after retries)
        """
        failed_attempts = 0
        # Retry in a loop rather than by recursion so that a long outage cannot
        # exhaust the call stack and the caller always gets the list back.
        while True:
            response = self._fetch(url)
            soup = BeautifulSoup(response.text, 'lxml')
            total_job_num = self._get_total_job_num(soup)
            if total_job_num is not None:
                break

            print('==============================')
            print('>>> ERROR:', url)
            print('==============================')
            failed_attempts += 1
            if self.max_retries is not None and failed_attempts > self.max_retries:
                # Give up on this URL but keep what was collected so far.
                return split_url_list
            time.sleep(self.RETRY_DELAY_SECONDS)

        print('counts:', total_job_num)
        print('url:', url)
        print()

        job_count = int(total_job_num)
        if job_count == 0:
            return split_url_list

        # Small enough: keep the URL unchanged.
        if job_count <= self.MAX_JOBS_PER_URL:
            split_url_list.append(url)
            self.total_counts += job_count
            print('==============================')
            print('>>> TOTAL COUNTS:', self.total_counts)
            print('==============================')
            return split_url_list

        # Too many results: make sure the URL carries a page parameter first.
        if 'page=' not in url:
            self._split_url(url + '&page=1', split_url_list)
            return split_url_list

        # Refine the first parameter that still has narrower values.
        for dimension in self.SPLIT_DIMENSIONS:
            next_urls = self._narrow_url(url, response.request.url, dimension)
            if next_urls:
                for next_url in next_urls:
                    self._split_url(next_url, split_url_list)
                break

        # NOTE(review): a URL that is still above the limit but cannot be
        # narrowed by any parameter is not added to the result (unchanged from
        # the original behaviour) - confirm that dropping it is intended.
        return split_url_list

In [44]:
# Seed searches: the same job category (jobcat=2007000000) combined with three
# different top-level industry categories (indcat).
urls_list = [
    'https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001000000&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc',
    'https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1004000000&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc',
    'https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1002000000&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc'
]
# Split every seed URL; process() sends HTTP requests and prints its progress.
test = ServiceSplitUrl(urls_list)
# Last expression of the cell, so the returned URL list is displayed.
test.process()



counts: 17175
url: https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001000000&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc

>>> split indcat url: https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001000000&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc

counts: 10842
url: https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001001000&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc

>>> split indcat url: https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001001000&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc

counts: 2316
url: https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001001001&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc

>>> TOTAL COUNTS: 2316
counts: 4485
url: https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001001002&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc

>>> split indcat url: https://www.104.com.tw/jobs/search/?ro=0&jobcat=200700000

counts: 1120
url: https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001001006&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc

>>> TOTAL COUNTS: 10817
counts: 797
url: https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001002000&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc

>>> TOTAL COUNTS: 11614
counts: 2691
url: https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001003000&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc

>>> TOTAL COUNTS: 14305
counts: 544
url: https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001004000&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc

>>> TOTAL COUNTS: 14849
counts: 858
url: https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001005000&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc

>>> TOTAL COUNTS: 15707
counts: 1443
url: https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001006000&order=11&asc=0&page=1&mode=s&jobsource=2018index

['https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001004000&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc',
 'https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001001002&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc&area=6001020000',
 'https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001001002&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc&area=6001016000',
 'https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001001002&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc&area=6001013000',
 'https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001001003&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc',
 'https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001001001&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc',
 'https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001001002&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc&area=6001018000',
 'htt

In [30]:
def _get_url_query_string(url):
    parsed_url = urlparse(url)
    query_string = parse_qs(parsed_url.query)
    return query_string



In [33]:
# Parse the query parameters of URL. Despite its name, query_string holds the
# dict returned by _get_url_query_string, not a raw query string.
query_string = _get_url_query_string(URL)
# Show which parameter names are present.
query_string.keys()

dict_keys(['ro', 'jobcat', 'order', 'asc', 'page', 'mode', 'jobsource'])

In [27]:
from urllib.parse import urlparse, parse_qs, urlsplit, urlunsplit, urlencode
# Sample search URLs. The second assignment overwrites the first, so after this
# cell URL is the variant whose indcat holds several comma-separated
# (%2C-encoded) IDs.
URL='https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001000000&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc'
URL='https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1004001001%2C1004001007%2C1004003001%2C1004003002&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc'


# NOTE(review): query_string, scheme, netloc, path and fragment are not defined
# in this cell; presumably they are left over from a urlsplit(...) call in
# another cell - confirm before re-running this cell on its own.
qs = parse_qs(query_string)
print(qs)
print()

# Override two parameters; values are lists, matching what parse_qs returns.
qs['jobsource'] = ['ABCDEFG']
qs['indcat'] = ['1,2,3']
# doseq=True encodes every list element as its own key=value pair.
new_query_string = urlencode(qs, doseq=True)
print(new_query_string)
print()

# Reassemble a full URL from the components and the rebuilt query string.
new_url = urlunsplit((scheme, netloc, path, new_query_string, fragment))
print(new_url)
print()

{'ro': ['0'], 'jobcat': ['2007000000'], 'order': ['11'], 'asc': ['0'], 'page': ['1'], 'mode': ['s'], 'jobsource': ['2018indexpoc']}

ro=0&jobcat=2007000000&order=11&asc=0&page=1&mode=s&jobsource=ABCDEFG&indcat=1%2C2%2C3

https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&order=11&asc=0&page=1&mode=s&jobsource=ABCDEFG&indcat=1%2C2%2C3



In [6]:
# Single seed URL used for a quick check; the bare expression below displays it.
urls_list = ['https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001000000&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc']
urls_list[0]

'https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1001000000&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc'

In [1]:
from urllib.parse import urlparse, parse_qs, urlsplit, urlunsplit, urlencode

In [2]:
# Search URL whose indcat parameter carries four IDs joined by %2C (an encoded comma).
url = 'https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1004001001%2C1004001007%2C1004003001%2C1004003002&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc'

In [4]:
# NOTE: parse_qs is given the whole URL here rather than only its query
# component, so everything before the first '=' becomes the first key (see the
# 'https://www.104.com.tw/jobs/search/?ro' key in the output below).
parse_qs(url)

{'https://www.104.com.tw/jobs/search/?ro': ['0'],
 'jobcat': ['2007000000'],
 'indcat': ['1004001001,1004001007,1004003001,1004003002'],
 'order': ['11'],
 'asc': ['0'],
 'page': ['1'],
 'mode': ['s'],
 'jobsource': ['2018indexpoc']}

In [5]:
# Unpack the five URL components returned by urlsplit. Note that the name
# new_query_string receives the URL's existing (unmodified) query component.
scheme, netloc, path, new_query_string, fragment = urlsplit(url)

SplitResult(scheme='https', netloc='www.104.com.tw', path='/jobs/search/', query='ro=0&jobcat=2007000000&indcat=1004001001%2C1004001007%2C1004003001%2C1004003002&order=11&asc=0&page=1&mode=s&jobsource=2018indexpoc', fragment='')

In [16]:
# Search URL in which both jobcat and indcat carry several %2C-separated IDs.
url = 'https://www.104.com.tw/jobs/search/?ro=0&jobcat=2003000000%2C2007000000%2C2002000000&indcat=1004003002%2C1004003001%2C1004001001%2C1004001007&order=12&asc=0&page=1&mode=s&jobsource=2018indexpoc'

In [28]:
# Split the URL into its components, then expand each parameter's first value
# into the list of comma-separated items it contains.
scheme, netloc, path, query_string, fragment = urlsplit(url)
query_dict = {
    name: values[0].split(',')
    for name, values in parse_qs(query_string).items()
}
print(query_dict)
print()
# Parameters whose list has more than one item are the multi-valued ones.


{'ro': ['0'], 'jobcat': ['2003000000', '2007000000', '2002000000'], 'indcat': ['1004003002', '1004003001', '1004001001', '1004001007'], 'order': ['12'], 'asc': ['0'], 'page': ['1'], 'mode': ['s'], 'jobsource': ['2018indexpoc']}



In [43]:
from itertools import product

# Expand a multi-valued search URL into one URL per combination of single values.
scheme, netloc, path, query_string, fragment = urlsplit(url)
query_dict = {
    name: values[0].split(',')
    for name, values in parse_qs(query_string).items()
}
print(query_dict)
print()

# Cartesian product over the per-parameter value lists, in parameter order.
query_value_combination_list = list(product(*query_dict.values()))
for query_value_combination in query_value_combination_list:
    # Pair each parameter name with the single value chosen for it.
    new_query_dict = {
        name: [value]
        for name, value in zip(query_dict, query_value_combination)
    }
    new_query_string = urlencode(new_query_dict, doseq=True)
    new_url = urlunsplit((scheme, netloc, path, new_query_string, fragment))
    print(new_url)

{'ro': ['0'], 'jobcat': ['2003000000', '2007000000', '2002000000'], 'indcat': ['1004003002', '1004003001', '1004001001', '1004001007'], 'order': ['12'], 'asc': ['0'], 'page': ['1'], 'mode': ['s'], 'jobsource': ['2018indexpoc']}

https://www.104.com.tw/jobs/search/?ro=0&jobcat=2003000000&indcat=1004003002&order=12&asc=0&page=1&mode=s&jobsource=2018indexpoc
https://www.104.com.tw/jobs/search/?ro=0&jobcat=2003000000&indcat=1004003001&order=12&asc=0&page=1&mode=s&jobsource=2018indexpoc
https://www.104.com.tw/jobs/search/?ro=0&jobcat=2003000000&indcat=1004001001&order=12&asc=0&page=1&mode=s&jobsource=2018indexpoc
https://www.104.com.tw/jobs/search/?ro=0&jobcat=2003000000&indcat=1004001007&order=12&asc=0&page=1&mode=s&jobsource=2018indexpoc
https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1004003002&order=12&asc=0&page=1&mode=s&jobsource=2018indexpoc
https://www.104.com.tw/jobs/search/?ro=0&jobcat=2007000000&indcat=1004003001&order=12&asc=0&page=1&mode=s&jobsource=2018indexp