In [1]:
import requests
import re
import unicodedata
    
    
def join_list_of_dict_item(l: list, key: str):
    ''' input:  l = [{"name": "Ian"}, {"name": "Wang"}]
                key = 'name'
        output: "Ian、Wang"
    '''
    dict_item_list = [d[key] for d in l]
    return '、'.join(dict_item_list)


def join_list_of_element(l: list) -> str:
    ''' input:  ["A", "B", "C"]
        output: "A、B、C"
    '''
    return '、'.join(l)



def convert_full_width_to_half_width(text: str) -> str:
    ''' convert full-width character to half-width one, such as "Ａ" to "A"
    '''
    text = unicodedata.normalize('NFKC', text)
    return text


def remove_unicode_text(text: str) -> str:
    text = text.replace('\xa0', '')
    text = text.replace('\u3000', '')
    text = text.replace('\r', '')
    return text


def remove_extra_newline(text: str) -> str:
    text = re.sub('\n+', '\n', text)
    return text

    
def clean_text(text: str) -> str:
    text = convert_full_width_to_half_width(text)
    text = remove_unicode_text(text)
    text = remove_extra_newline(text)
    return text
    

In [2]:
job_no = '11211417'
url = 'https://www.104.com.tw/jb/104i/applyAnalysisToJob/all?job_no={}'.format(job_no)
url

'https://www.104.com.tw/jb/104i/applyAnalysisToJob/all?job_no=11211417'

In [3]:
headers = {
    'Referer': url
}

In [4]:
response = requests.get(url, headers=headers)
response

<Response [200]>

In [5]:
import json
j = json.loads(clean_text(response.text))

In [6]:
item = {}
item['analysis_page'] = j
item['analysis_page']

{'sex': {'0': {'sexName': '男', 'count': '2', 'percent': '66.67'},
  '1': {'sexName': '女', 'count': '1', 'percent': '33.33'},
  'update_time': '2021-07-08 02:02:07',
  'total': '3'},
 'edu': {'0': {'eduName': '博碩士', 'count': '1', 'percent': '33.33'},
  '1': {'eduName': '大學', 'count': '1', 'percent': '33.33'},
  '2': {'eduName': '專科', 'count': 0, 'percent': '0.00'},
  '3': {'eduName': '高中職', 'count': 0, 'percent': '0.00'},
  '4': {'eduName': '國中(含)以下', 'count': 0, 'percent': '0.00'},
  '5': {'eduName': '不拘', 'count': 0, 'percent': '0.00'},
  '6': {'eduName': '無法判斷', 'count': 1, 'percent': '33.33'},
  'update_time': '2021-07-08 02:02:07',
  'total': '3'},
 'yearRange': {'0': {'yearRangeName': '20歲以下',
   'count': '1',
   'percent': '33.33'},
  '1': {'yearRangeName': '21~25歲', 'count': '1', 'percent': '33.33'},
  '2': {'yearRangeName': '26~30歲', 'count': 0, 'percent': '0.00'},
  '3': {'yearRangeName': '31~35歲', 'count': 0, 'percent': '0.00'},
  '4': {'yearRangeName': '36~40歲', 'count': 0, 

In [7]:
json.dumps(item['analysis_page']['sex'], ensure_ascii=False)

'{"0": {"sexName": "男", "count": "2", "percent": "66.67"}, "1": {"sexName": "女", "count": "1", "percent": "33.33"}, "update_time": "2021-07-08 02:02:07", "total": "3"}'

In [8]:
parsed_item = {}

# 更新日期
parsed_item['update_date'] = get_update_date(item['analysis_page']['sex']['update_time'])

# 性別json
parsed_item['sex_json'] = item['analysis_page']['sex']

# 學歷json
parsed_item['edu_json'] = item['analysis_page']['edu']

# 年齡json
parsed_item['age_json'] = item['analysis_page']['yearRange']

# 工作經驗json
parsed_item['work_exp_json'] = item['analysis_page']['exp']

# 語言json
parsed_item['lang_json'] = item['analysis_page']['language']

# 科系json
parsed_item['major_json'] = item['analysis_page']['major']

# 技能json
parsed_item['skill_json'] = item['analysis_page']['skill']

# 證照json
parsed_item['cert_json'] = item['analysis_page']['cert']

parsed_item

NameError: name 'get_update_date' is not defined

In [None]:
def get_update_date(update_time_string):
    update_date = ''.join(re.findall('\d+', update_time_string[:10]))
    return int(update_date)

get_update_date('2021-07-04 02:00:27')


# analysis json

In [62]:
sex = {"0": {"sexName": "男", "count": "3", "percent": "42.86"}, "1": {"sexName": "女", "count": "4", "percent": "57.14"}, "update_time": "2021-07-09 02:04:31", "total": "7"}
edu = {"0": {"eduName": "博碩士", "count": 0, "percent": "0.00"}, "1": {"eduName": "大學", "count": "2", "percent": "28.57"}, "2": {"eduName": "專科", "count": "1", "percent": "14.29"}, "3": {"eduName": "高中職", "count": "2", "percent": "28.57"}, "4": {"eduName": "國中(含)以下", "count": 0, "percent": "0.00"}, "5": {"eduName": "不拘", "count": 0, "percent": "0.00"}, "6": {"eduName": "無法判斷", "count": 2, "percent": "28.57"}, "update_time": "2021-07-09 02:04:31", "total": "7"}
age = {"0": {"yearRangeName": "20歲以下", "count": "2", "percent": "28.57"}, "1": {"yearRangeName": "21~25歲", "count": "3", "percent": "42.86"}, "2": {"yearRangeName": "26~30歲", "count": 0, "percent": "0.00"}, "3": {"yearRangeName": "31~35歲", "count": 0, "percent": "0.00"}, "4": {"yearRangeName": "36~40歲", "count": 0, "percent": "0.00"}, "5": {"yearRangeName": "41~45歲", "count": 0, "percent": "0.00"}, "6": {"yearRangeName": "46~50歲", "count": "1", "percent": "14.29"}, "7": {"yearRangeName": "51~55歲", "count": "1", "percent": "14.29"}, "8": {"yearRangeName": "56~60歲", "count": 0, "percent": "0.00"}, "9": {"yearRangeName": "60歲以上", "count": 0, "percent": "0.00"}, "update_time": "2021-07-09 02:04:31", "total": "7"}
work_exp = {"0": {"expName": "無工作經驗", "count": "1", "percent": "14.29"}, "1": {"expName": "1年以下", "count": "1", "percent": "14.29"}, "2": {"expName": "1~3年 ", "count": "2", "percent": "28.57"}, "3": {"expName": "3~5年", "count": "1", "percent": "14.29"}, "4": {"expName": "5~10年", "count": "1", "percent": "14.29"}, "5": {"expName": "10~15年", "count": 0, "percent": "0.00"}, "6": {"expName": "15~20年", "count": 0, "percent": "0.00"}, "7": {"expName": "20~25年", "count": "1", "percent": "14.29"}, "8": {"expName": "25年以上", "count": 0, "percent": "0.00"}, "update_time": "2021-07-09 02:04:31", "total": "7"}
lang = {"update_time": "2021-07-09 02:04:31", "total": "11", "0": {"lang_no": "2", "langName": "日文", "count": 2, "percent": 18.18, "level": {"1": {"level_no": "4", "levelName": "略懂", "count": "2", "percent": "18.18"}}}, "1": {"lang_no": "18", "langName": "中文", "count": 2, "percent": 18.18, "level": {"3": {"level_no": "2", "levelName": "精通", "count": "2", "percent": "18.18"}}}, "2": {"lang_no": "1", "langName": "英文", "count": 4, "percent": 36.36, "level": {"1": {"level_no": "4", "levelName": "略懂", "count": "2", "percent": "18.18"}, "2": {"level_no": "8", "levelName": "中等", "count": "2", "percent": "18.18"}}}}
major = {"update_time": "2021-07-09 02:04:31", "total": "7", "0": {"major": "3018001000", "majorName": "普通科", "count": "2", "percent": "28.57"}, "1": {"major": "3006001000", "majorName": "一般商業學類", "count": "1", "percent": "14.29"}, "2": {"major": "3006003000", "majorName": "會計學相關", "count": "1", "percent": "14.29"}, "3": {"major": "3006010000", "majorName": "國際貿易相關", "count": "1", "percent": "14.29"}}
skill = {"update_time": "2021-07-09 02:04:31", "total": "7", "0": {"skill": "12001001003", "skillName": "DOS", "count": "1", "percent": "14.29"}, "1": {"skill": "12001004030", "skillName": "Access", "count": "1", "percent": "14.29"}, "2": {"skill": "12001010002", "skillName": "中文打字20~50", "count": "2", "percent": "28.57"}, "3": {"skill": "12001001035", "skillName": "Windows 98", "count": "1", "percent": "14.29"}, "4": {"skill": "12001001032", "skillName": "Windows 2000", "count": "1", "percent": "14.29"}, "5": {"skill": "12001010009", "skillName": "英文打字20~50", "count": "1", "percent": "14.29"}, "6": {"skill": "12001010003", "skillName": "中文打字50~75", "count": "1", "percent": "14.29"}, "7": {"skill": "12002002001", "skillName": "Adobe Photoshop", "count": "1", "percent": "14.29"}, "8": {"skill": "12001008011", "skillName": "Outlook", "count": "1", "percent": "14.29"}, "9": {"skill": "12001008012", "skillName": "PowerPoint", "count": "3", "percent": "42.86"}}
cert = {"update_time": "2021-07-09 02:04:31", "total": "7", "0": {"cert": "4005002017", "certName": "丙級電腦輔助建築製圖技術士", "count": "1", "percent": "14.29"}, "1": {"cert": "4005002006", "certName": "丙級測量技術士", "count": "1", "percent": "14.29"}, "2": {"cert": "4006001038", "certName": "會計能力測驗三級合格證書", "count": "1", "percent": "14.29"}, "3": {"cert": "4006003010", "certName": "證券商業務員", "count": "1", "percent": "14.29"}, "4": {"cert": "4014003004", "certName": "CPR證照", "count": "1", "percent": "14.29"}, "5": {"cert": "4009002005", "certName": "丙級中餐烹調技術士", "count": "1", "percent": "14.29"}, "6": {"cert": "4006002018", "certName": "人身保險業務員", "count": "1", "percent": "14.29"}, "7": {"cert": "4001001005", "certName": "TOEIC (多益測驗)", "count": "1", "percent": "14.29"}}

sex

{'0': {'sexName': '男', 'count': '3', 'percent': '42.86'},
 '1': {'sexName': '女', 'count': '4', 'percent': '57.14'},
 'update_time': '2021-07-09 02:04:31',
 'total': '7'}

In [66]:
def format_basic_analysis_dict(analysis_dict):
    ''' format sample: {0: {'男': 3}, 1: {'女': 4}}
    '''
    sequence_num = 0
    formatted_analysis_dict = {}
    for serial, record in analysis_dict.items():
        if not serial.isdigit():
            continue
        
        has_record_value = False
        for record_name, record_value in record.items():
            if 'Name' in record_name:
                record_name_desc = record_value
            elif 'count' in record_name:
                record_count = int(record_value)
                has_record_value = True
                formatted_analysis_dict[sequence_num] = {}
        
        if has_record_value:
            formatted_analysis_dict[sequence_num][record_name_desc] = record_count
            sequence_num += 1
        
    return formatted_analysis_dict

                
format_basic_analysis_dict(sex) # sex, edu, age, work_exp

{0: {'男': 3}, 1: {'女': 4}}

In [65]:
def format_option_analysis_dict(analysis_dict, analysis_type):
    ''' format sample: {0: {'3018001000': 2, '普通科': 2}, 1: {'3006001000': 1, '一般商業學類': 1}}
    '''
    sequence_num = 0
    formatted_analysis_dict = {}
    for serial, record in analysis_dict.items():
        if not serial.isdigit():
            continue
        
        has_record_value = False
        for record_name, record_value in record.items():
            if analysis_type == record_name:
                record_name_no = record_value
            elif 'Name' in record_name:
                record_name_desc = record_value
            elif 'count' in record_name:
                record_count = int(record_value)
                has_record_value = True
                formatted_analysis_dict[sequence_num] = {}
        
        if has_record_value:
            formatted_analysis_dict[sequence_num][record_name_no] = record_count
            formatted_analysis_dict[sequence_num][record_name_desc] = record_count
            sequence_num += 1
        
    return formatted_analysis_dict

format_option_analysis_dict(major, 'major') # major, skill, cert

{0: {'3018001000': 2, '普通科': 2},
 1: {'3006001000': 1, '一般商業學類': 1},
 2: {'3006003000': 1, '會計學相關': 1},
 3: {'3006010000': 1, '國際貿易相關': 1}}

In [67]:
def format_language_analysis_dict(analysis_dict):
    ''' format sample: {0: {'中文': 2}}
    Only if level is up to "精通", then add the language to the dictionary.
    '''
    sequence_num = 0
    formatted_analysis_dict = {}
    for serial, record in analysis_dict.items():
        if not serial.isdigit():
            continue
        
        has_record_value = False
        for record_name, record_value in record.items():
            if 'Name' in record_name:
                record_name_desc = record_value
            elif 'count' in record_name:
                record_count = int(record_value)
            elif 'level' in record_name:
                if '精通' in str(record_value):
                    has_record_value = True
                    formatted_analysis_dict[sequence_num] = {}
                    
        if has_record_value:
            formatted_analysis_dict[sequence_num][record_name_desc] = record_count
            sequence_num += 1
        
    return formatted_analysis_dict

print(lang)
print()
format_language_analysis_dict(lang)


{'update_time': '2021-07-09 02:04:31', 'total': '11', '0': {'lang_no': '2', 'langName': '日文', 'count': 2, 'percent': 18.18, 'level': {'1': {'level_no': '4', 'levelName': '略懂', 'count': '2', 'percent': '18.18'}}}, '1': {'lang_no': '18', 'langName': '中文', 'count': 2, 'percent': 18.18, 'level': {'3': {'level_no': '2', 'levelName': '精通', 'count': '2', 'percent': '18.18'}}}, '2': {'lang_no': '1', 'langName': '英文', 'count': 4, 'percent': 36.36, 'level': {'1': {'level_no': '4', 'levelName': '略懂', 'count': '2', 'percent': '18.18'}, '2': {'level_no': '8', 'levelName': '中等', 'count': '2', 'percent': '18.18'}}}}



{0: {'中文': 2}}