In [1]:
import requests
import re
import unicodedata
    
    
def join_list_of_dict_item(l: list, key: str):
    ''' input:  l = [{"name": "Ian"}, {"name": "Wang"}]
                key = 'name'
        output: "Ian、Wang"
    '''
    dict_item_list = [d[key] for d in l]
    return '、'.join(dict_item_list)


def join_list_of_element(l: list) -> str:
    ''' input:  ["A", "B", "C"]
        output: "A、B、C"
    '''
    return '、'.join(l)



def convert_full_width_to_half_width(text: str) -> str:
    ''' convert full-width character to half-width one, such as "Ａ" to "A"
    '''
    text = unicodedata.normalize('NFKC', text)
    return text


def remove_unicode_text(text: str) -> str:
    text = text.replace('\xa0', '')
    text = text.replace('\u3000', '')
    text = text.replace('\r', '')
    return text


def remove_extra_newline(text: str) -> str:
    text = re.sub('\n+', '\n', text)
    return text

    
def clean_text(text: str) -> str:
    text = convert_full_width_to_half_width(text)
    text = remove_unicode_text(text)
    text = remove_extra_newline(text)
    return text
    

In [7]:
job_id = '6vdp2'
job_id = '7aboj'
job_id = '7b726'
job_id = '6621b'
job_id = '7bmdb'
url = 'https://www.104.com.tw/job/ajax/content/{}'.format(job_id)
url

'https://www.104.com.tw/job/ajax/content/7bmdb'

In [8]:
headers = {
    'Referer': url
}

In [9]:
response = requests.get(url, headers=headers)
response

<Response [200]>

In [10]:
import json
j = json.loads(clean_text(response.text))

In [11]:
item = {}
item['job_page'] = j
item['job_page']

{'data': {'corpImageRight': {'corpImageRight': {'imageUrl': '', 'link': ''}},
  'header': {'corpImageTop': {'imageUrl': '', 'link': ''},
   'jobName': '測試工程師 Test Engineer FAE (大陸常駐FAE, 非業務人員)',
   'appearDate': '2021/07/15',
   'custName': '香港商顯通香港科技有限公司台灣分公司',
   'custUrl': 'https://www.104.com.tw/company/1a2x6bke5m',
   'applyDate': '',
   'analysisType': 1,
   'analysisUrl': '//www.104.com.tw/jobs/apply/analysis/7bmdb',
   'isSaved': False,
   'isApplied': False},
  'contact': {'hrName': '陳先生',
   'email': '',
   'visit': '',
   'phone': '',
   'other': 'Please reply with your English resume/CV.',
   'reply': '合適者將於5個工作天內主動聯繫，不合適者將不另行通知',
   'suggestExam': False},
  'environmentPic': {'environmentPic': [],
   'corpImageBottom': {'imageUrl': '', 'link': ''}},
  'condition': {'acceptRole': {'role': [{'code': 1, 'description': '上班族'},
     {'code': 2, 'description': '應屆畢業生'},
     {'code': 32, 'description': '外籍人士'}],
    'disRole': {'needHandicapCompendium': False, 'disability': []}}

In [48]:
item['job_page']['data']['jobDetail']

{'jobDescription': '• 與開發團隊協調以確定應用程序要求。\n• 使用 Python 編程語言編寫可擴展的代碼。\n• 測試和調試應用程序。\n• 開發後端組件。\n• 使用服務器端邏輯集成面向用戶的元素。\n• 評估和確定客戶端功能請求的優先級。\n• 集成數據存儲解決方案。\n• 與前端開發人員協調。\n• 重新編程現有數據庫以改進功能。',
 'jobCategory': [{'code': '2007001004', 'description': '軟體設計工程師'}],
 'salary': '月薪80,000~120,000元',
 'salaryMin': 80000,
 'salaryMax': 120000,
 'salaryType': 50,
 'jobType': 1,
 'workType': [],
 'addressNo': '6003003014',
 'addressRegion': '阿拉伯聯合大公國',
 'addressDetail': '杜拜',
 'industryArea': '',
 'longitude': '55.2707828',
 'latitude': '25.2048493',
 'manageResp': '不需負擔管理責任',
 'businessTrip': '無需出差外派',
 'workPeriod': '日班/晚班，跨國所有需要有不同時區要求。因每個部門不同所以看部門調整',
 'vacationPolicy': '依公司規定',
 'startWorkingDay': '一個月內',
 'hireType': 0,
 'delegatedRecruit': '',
 'needEmp': '5~10人',
 'landmark': ''}

In [47]:
item['job_page']['data']['condition']['acceptRole']['disRole']['disability']

[]

In [50]:
parsed_item = {}

# 工作類型
parsed_item['job_cat'] = join_list_of_dict_item(item['job_page']['data']['jobDetail']['jobCategory'], 'code')

# 工作類型
parsed_item['job_cat_desc'] = join_list_of_dict_item(item['job_page']['data']['jobDetail']['jobCategory'], 'description')

# 需求人數描述 
parsed_item['need_count_desc'] = item['job_page']['data']['jobDetail']['needEmp']

# 技能要求
parsed_item['skill'] = join_list_of_dict_item(item['job_page']['data']['condition']['skill'], 'description')

# 專長要求
parsed_item['specialty'] = join_list_of_dict_item(item['job_page']['data']['condition']['specialty'], 'description')

# 科系要求
parsed_item['major'] = join_list_of_element(item['job_page']['data']['condition']['major'])

# 語言要求
parsed_item['lang'] = get_language_requirement(item['job_page']['data']['condition']['language'])

# 地方語言要求
parsed_item['local_lang'] = get_language_requirement(item['job_page']['data']['condition']['localLanguage'])

# 證照要求
parsed_item['cert'] = join_list_of_element(item['job_page']['data']['condition']['certificate'])

# 駕照要求
parsed_item['driver_license'] = join_list_of_element(item['job_page']['data']['condition']['driverLicense'])

# 其他要求
parsed_item['other'] = item['job_page']['data']['condition']['other']

# 接受身份
parsed_item['accept_role'] = join_list_of_dict_item(item['job_page']['data']['condition']['acceptRole']['role'], 'description')

# 婉拒身份
parsed_item['disaccept_role'] = join_list_of_dict_item(item['job_page']['data']['condition']['acceptRole']['disRole']['disability'], 'type')

# 管理責任
parsed_item['manage_resp'] = item['job_page']['data']['jobDetail']['manageResp']

# 出差外派
parsed_item['business_trip'] = item['job_page']['data']['jobDetail']['businessTrip']

# 上班時段
parsed_item['work_period'] = item['job_page']['data']['jobDetail']['workPeriod']

# 休假制度
parsed_item['vacation_policy'] = item['job_page']['data']['jobDetail']['vacationPolicy']

# 可上班日
parsed_item['start_work_day'] = item['job_page']['data']['jobDetail']['startWorkingDay']

# 產業編號
parsed_item['industry_no'] = item['job_page']['data']['industryNo']

parsed_item

{'job_cat': '2007001004',
 'job_cat_desc': '軟體設計工程師',
 'need_count_desc': '5~10人',
 'skill': '系統架構規劃、軟體程式設計、資料庫系統管理維護',
 'specialty': 'Python、JavaScript',
 'major': '',
 'lang': '',
 'local_lang': '',
 'cert': '',
 'driver_license': '',
 'other': '計算機科學、計算機工程或相關領域的學士學位。\n • 2-5 年的 Python 開發經驗。\n • Python 和相關框架（包括 Django 和 Flask）的專家知識。\n • 熟悉 sqlalchemy ORM 與 SQL Syntax\n • 熟悉 mysql 資料儲存系統。熟悉 redis\n • 對 restful 格式有深刻理解，並可依其設計開發 API\n • 理解 HTML5，CSS，Javascript\n • 使用 Git， AWS， Docker， Kubernetes\n • 深入理解 Python 的多進程架構和線程限制。\n • 熟悉服務器端模板語言，包括 Jinja 2 和 Mako。\n • 能夠將多個數據源集成到一個系統中。\n • 熟悉測試工具。\n • 能夠在項目上進行協作並在需要時獨立工作',
 'accept_role': '上班族、應屆畢業生',
 'disaccept_role': '',
 'manage_resp': '不需負擔管理責任',
 'business_trip': '無需出差外派',
 'work_period': '日班/晚班，跨國所有需要有不同時區要求。因每個部門不同所以看部門調整',
 'vacation_policy': '依公司規定',
 'start_work_day': '一個月內',
 'industry_no': '1001001002'}

In [44]:
def get_language_requirement(language_list):
    if len(language_list) == 0:
        return ''
    
    required_lang_list = []
    for lang in language_list:
        if '精通' in lang['ability']:
            required_lang_list.append(lang['language'])
    return '、'.join(required_lang_list)

get_language_requirement(item['job_page']['data']['condition']['language'])


'英文'