In [None]:
import os
import requests
from bs4 import BeautifulSoup
import json

In [33]:
def find_all(soup, _class):
    """Return the direct children of ``soup`` that carry the class ``_class``.

    Only immediate children are searched (``recursive=False``); matching
    elements nested deeper in the tree are not returned.

    Args:
        soup: A BeautifulSoup tag or document to search.
        _class: Value of the ``class`` attribute to look for.

    Returns:
        Whatever ``soup.find_all`` returns (a list-like result set of tags).
    """
    # ``findAll`` is the deprecated BeautifulSoup 3 spelling; ``find_all`` is
    # the bs4 name of the same method.
    return soup.find_all(attrs={'class': _class}, recursive=False)

def fetch_pos_blocks(soup):
    """Select every ``div.entry-body__el`` element found in ``soup``.

    Args:
        soup: Parsed page (anything exposing a CSS ``select`` method).

    Returns:
        The result of ``soup.select('div.entry-body__el')``.
    """
    return soup.select('div.entry-body__el')

def fetch_pos_tag(pos_block):
    """Read the part-of-speech label from a part-of-speech block.

    Takes the first direct ``pos-header`` child of ``pos_block`` and returns
    the text of the first ``.pos`` element inside it.

    Raises:
        IndexError: If there is no ``pos-header`` child or no ``.pos`` element.
    """
    first_header = find_all(pos_block, 'pos-header')[0]
    pos_nodes = first_header.select('.pos')
    return pos_nodes[0].text

def fetch_big_sense_blocks(pos_block):
    """Return the ``dsense`` children of the first ``pos-body`` in ``pos_block``.

    Raises:
        IndexError: If ``pos_block`` has no direct ``pos-body`` child.
    """
    pos_body = find_all(pos_block, 'pos-body')[0]
    return find_all(pos_body, 'dsense')

def fetch_guide_word(big_sense_block):
    """Return the guide word of a big-sense block, or ``''`` if it has none.

    A block whose ``class`` list contains ``dsense-noh`` yields an empty
    string. Otherwise the stripped text of the first ``dsense_gw`` child of
    the first ``dsense_h`` child is returned.

    Raises:
        IndexError: If the expected header or guide-word element is missing.
    """
    if 'dsense-noh' not in big_sense_block['class']:
        sense_header = find_all(big_sense_block, 'dsense_h')[0]
        guide_node = find_all(sense_header, 'dsense_gw')[0]
        return guide_node.text.strip()
    return ''

def fetch_sense_blocks(big_sense_block):
    """Return the ``ddef_block`` children of the block's first ``dsense_b``.

    Raises:
        IndexError: If ``big_sense_block`` has no direct ``dsense_b`` child.
    """
    sense_body = find_all(big_sense_block, 'dsense_b')[0]
    return find_all(sense_body, 'ddef_block')

def fetch_extra_examples(big_sense_block):
    """Collect the text of the extra examples attached to a big-sense block.

    Looks for direct ``daccord`` children; when at least one exists, the text
    of every ``li.dexamp`` element inside the first one is returned.

    Returns:
        A list of strings, empty when the block has no ``daccord`` child.
    """
    accordions = find_all(big_sense_block, 'daccord')
    if not accordions:
        return []
    return [item.text for item in accordions[0].select('li.dexamp')]

def fetch_phrase_blocks(big_sense_block):
    """Return the ``dphrase-block`` children of the block's first ``dsense_b``.

    Raises:
        IndexError: If ``big_sense_block`` has no direct ``dsense_b`` child.
    """
    sense_body = find_all(big_sense_block, 'dsense_b')[0]
    return find_all(sense_body, 'dphrase-block')

def fetch_phrase_data(phrase_block):
    """Build the data dictionary for one phrase block.

    The result holds the phrase head data (as returned by
    ``fetch_phrase_head_data``) plus a ``'sense'`` key listing the data of
    each sense block of the phrase, extracted with ``is_phrase=True``.
    """
    head_data = fetch_phrase_head_data(fetch_phrase_head_block(phrase_block))
    senses = []
    for sense_block in fetch_phrase_sense_blocks(phrase_block):
        senses.append(fetch_sense_data(sense_block, is_phrase=True))
    # Copy the head data so the caller's dict is not mutated, then add senses.
    merged = dict(head_data)
    merged['sense'] = senses
    return merged

def fetch_phrase_head_block(phrase_block):
    """Return the first direct ``dphrase_h`` child of ``phrase_block``.

    Raises:
        IndexError: If no such child exists.
    """
    return find_all(phrase_block, 'dphrase_h')[0]

def fetch_phrase_sense_blocks(phrase_block):
    """Return the ``ddef_block`` children of the phrase's first ``dphrase_b``.

    Raises:
        IndexError: If ``phrase_block`` has no direct ``dphrase_b`` child.
    """
    phrase_body = find_all(phrase_block, 'dphrase_b')[0]
    return find_all(phrase_body, 'ddef_block')

def fetch_phrase_head_data(phrase_head_block):
    """Extract the term and level from a phrase head block.

    Args:
        phrase_head_block: The ``dphrase_h`` element of a phrase block.

    Returns:
        A dict with two keys:
            ``'term'``: text of the first ``dphrase-title`` child.
            ``'level'``: stripped text of the first ``dxref`` child of the
            first ``dphrase-info`` child, or ``''`` when either is absent.

    Raises:
        IndexError: If the head block has no ``dphrase-title`` child.
    """
    term = find_all(phrase_head_block, 'dphrase-title')[0].text
    info_block = find_all(phrase_head_block, 'dphrase-info')
    level = ''
    if info_block:
        level_block = find_all(info_block[0], 'dxref')
        # Strip surrounding whitespace so the level matches the format
        # produced by fetch_sense_level for ordinary senses.
        level = level_block[0].text.strip() if level_block else ''
    head_data = {
        'term': term,
        'level': level,
    }
    return head_data

def fetch_sense_data(sense_block, is_phrase=False):
    """Extract definitions and example sentences from one sense block.

    Args:
        sense_block: A ``ddef_block`` element.
        is_phrase: When true, the ``'level'`` key is omitted from the result.

    Returns:
        A dict with ``'en_def'`` (text of the first ``ddef_d`` in the head),
        ``'ch_def'`` (text of the first ``dtrans`` in the body) and
        ``'example_sents'`` (see ``fetch_example_sents``); plus ``'level'``
        from ``fetch_sense_level`` unless ``is_phrase`` is true.

    Raises:
        IndexError: If any of the expected child elements is missing.
    """
    def_head = find_all(sense_block, 'ddef_h')[0]
    english = find_all(def_head, 'ddef_d')[0].text
    def_body = find_all(sense_block, 'ddef_b')[0]
    translation = find_all(def_body, 'dtrans')[0].text
    examples = fetch_example_sents(find_all(def_body, 'dexamp'))
    result = dict(en_def=english, ch_def=translation, example_sents=examples)
    if is_phrase:
        return result
    result['level'] = fetch_sense_level(sense_block)
    return result

def fetch_example_sents(example_blocks):
    """Flatten example blocks into an alternating list of sentence texts.

    For each block, the text of its first ``deg`` child is followed by the
    text of its first ``dtrans`` child, so the result has two entries per
    block, in block order.

    Raises:
        IndexError: If a block lacks a ``deg`` or ``dtrans`` child.
    """
    return [
        sentence
        for example in example_blocks
        for sentence in (
            find_all(example, 'deg')[0].text,
            find_all(example, 'dtrans')[0].text,
        )
    ]

def fetch_sense_level(sense_block):
    """Return the level label of a sense block, or ``''`` when it has none.

    The label is the stripped text of the first ``dxref`` child of the first
    ``ddef-info`` child inside the block's first ``ddef_h`` child.

    Raises:
        IndexError: If ``sense_block`` has no direct ``ddef_h`` child.
    """
    def_head = find_all(sense_block, 'ddef_h')[0]
    info_nodes = find_all(def_head, 'ddef-info')
    if not info_nodes:
        return ''
    level_nodes = find_all(info_nodes[0], 'dxref')
    if not level_nodes:
        return ''
    return level_nodes[0].text.strip()

# TODO: figure out how to extract data from a sense block

In [3]:
# Download and parse the dictionary page for a single headword.
vocab = 'back'
home_url = 'https://dictionary.cambridge.org/dictionary/english-chinese-traditional'
# Join URL parts with an explicit '/': os.path.join uses the OS path
# separator (a backslash on Windows) and would produce an invalid URL there.
# The timeout keeps the cell from hanging forever if the server never answers.
res = requests.get('/'.join((home_url, vocab)), timeout=10)
bs = BeautifulSoup(res.text, 'html.parser')

In [60]:
# Walk down the parsed page for manual inspection: take the second
# part-of-speech block, its first big-sense block, that block's first phrase
# block, and finally the sense blocks of that phrase.
pos_blocks = fetch_pos_blocks(bs)
big_sense_blocks = fetch_big_sense_blocks(pos_blocks[1])
phrase_blocks = fetch_phrase_blocks(big_sense_blocks[0])
phrase_sense_blocks = fetch_phrase_sense_blocks(phrase_blocks[0])

In [82]:
# Second walk down the parsed page, stored under "2"-suffixed names. Each step
# consumes the previous "2" result, so this cell depends only on `bs` and not
# on variables left behind by another cell.
pos_blocks2 = fetch_pos_blocks(bs)
big_sense_blocks2 = fetch_big_sense_blocks(pos_blocks2[1])
phrase_blocks2 = fetch_phrase_blocks(big_sense_blocks2[0])
phrase_sense_blocks2 = fetch_phrase_sense_blocks(phrase_blocks2[0])

In [69]:
# Display the extracted data of the third phrase block.
fetch_phrase_data(phrase_blocks[2])

{'term': 'the back of your hand',
 'level': '',
 'sense': [{'en_def': 'the side of your hand that has hair growing on it',
   'ch_def': '手背',
   'example_sents': []}]}

In [4]:
def fetch_vocab_data(page_bs):
    """Extract all sense data of a dictionary page, grouped by part of speech.

    Args:
        page_bs: Parsed dictionary page, as accepted by ``fetch_pos_blocks``.

    Returns:
        A dict mapping each part-of-speech tag to a list of big-sense dicts.
        Each big-sense dict has the keys ``'sense'`` (list of sense data from
        ``fetch_sense_data``), ``'extra_sents'`` (from
        ``fetch_extra_examples``) and ``'guidword'`` (from
        ``fetch_guide_word``). When several entry blocks on the page share
        the same tag, their big-sense lists are concatenated in page order.
    """
    vocab_data = {}
    for pos_block in fetch_pos_blocks(page_bs):
        pos_tag = fetch_pos_tag(pos_block)
        big_sense_data_list = []
        for big_sense_block in fetch_big_sense_blocks(pos_block):
            guide_word = fetch_guide_word(big_sense_block)
            extra_sents = fetch_extra_examples(big_sense_block)
            senses_data = [fetch_sense_data(sense_block)
                           for sense_block in fetch_sense_blocks(big_sense_block)]
            # NOTE(review): the 'guidword' spelling is kept as-is; consumers of
            # this dict presumably rely on the existing key name.
            big_sense_data_list.append({'sense': senses_data,
                                        'extra_sents': extra_sents,
                                        'guidword': guide_word})
        # Merge rather than assign: a plain assignment would silently discard
        # the senses of an earlier entry block that has the same tag.
        vocab_data.setdefault(pos_tag, []).extend(big_sense_data_list)
    return vocab_data

In [57]:
# Fetch the page for 'back' and run the full extraction on it.
# NOTE(review): the variables are named `bank_*` although the headword
# requested here is 'back'; the names are kept because other cells use them.
# The URL is joined with an explicit '/' because os.path.join would insert a
# backslash on Windows; the timeout prevents the request from hanging forever.
res = requests.get('/'.join((home_url, 'back')), timeout=10)
bank_bs = BeautifulSoup(res.text, 'html.parser')
bank_data = fetch_vocab_data(bank_bs)

In [73]:
# Sanity check: show the type of the parsed page object.
type(bs)

bs4.BeautifulSoup

In [47]:
# Inspect the third big-sense entry stored under the 'noun' tag.
bank_data['noun'][2]

{'sense': [{'en_def': 'a traffic light',
   'ch_def': '自動交通信號燈',
   'example_sents': [],
   'level': ''}],
 'extra_sents': [],
 'guidword': '(TRAFFIC LIGHT)'}

In [70]:
# Inspect the first sense of the first big-sense entry under the 'adverb' tag.
bank_data['adverb'][0]['sense'][0]

{'en_def': 'in, into, or towards a previous place or condition, or an earlier time',
 'ch_def': '在原處；回到原處；恢復原狀；返回；回到以前',
 'example_sents': ['When you take the scissors, remember to put them back.',
  '用完剪刀後，記著要放回原處。',
  'He left a note saying "Gone out. Back soon."',
  '他留了字條，上面寫著「出去一會兒，馬上回來」。',
  "She went to Brazil for two years, but now she's back (= has returned).",
  '她在巴西住了兩年，現在回來了。',
  'He looked back (= looked behind him) and saw they were following him.',
  '他回頭看了看，發現他們正跟著他。',
  'Looking at her old photographs brought back (= made her remember) a lot of memories.',
  '她看著舊照片，想起了許多的往事。',
  "I was woken by a thunderstorm, and I couldn't get back to sleep (= could not sleep again).",
  '我被雷雨弄醒後，再也無法入睡。',
  'The last time we saw Lowell was back (= at an earlier time) in January.',
  '我們最後一次見到羅威爾是在早一月的時候。',
  'This tradition dates back to (= to the earlier time of) the 16th century.',
  '這個傳統可以追溯到16世紀。'],
 'level': 'B2'}

In [78]:
# Collect the sense blocks of the first big-sense block for inspection.
sense_blocks = fetch_sense_blocks(big_sense_blocks[0])

In [59]:
# Print the part-of-speech tag of every entry block on the parsed page.
for pos_block in fetch_pos_blocks(bs):
    print(fetch_pos_tag(pos_block))

noun


In [49]:
# Load a previously crawled dictionary from disk.
# The encoding is given explicitly: the file holds Chinese text, and the
# platform default encoding used by open() is not UTF-8 everywhere.
# NOTE(review): this is an absolute, machine-specific path; the cell only
# runs where that file exists.
with open('/home/old/fun/crawling-cambridge-dictionary/cambridge.word.666.json', 'r', encoding='utf-8') as f:
    camb_dict = json.load(f)

In [72]:
# Inspect the first sense of the first 'adverb' entry for 'back' in the
# dictionary loaded from JSON.
camb_dict['back']['adverb'][0]['sense'][0]

{'en_def': 'in, into, or towards a previous place or condition, or an earlier time',
 'ch_def': '在原處；回到原處；恢復原狀；返回；回到以前',
 'level': 'B2',
 'examples': ['When you take the scissors, remember to put them back.',
  '用完剪刀後，記著要放回原處。',
  'He left a note saying "Gone out. Back soon."',
  '他留了字條，上面寫著「出去一會兒，馬上回來」。',
  "She went to Brazil for two years, but now she's back (= has returned).",
  '她在巴西住了兩年，現在回來了。',
  'He looked back (= looked behind him) and saw they were following him.',
  '他回頭看了看，發現他們正跟著他。',
  'Looking at her old photographs brought back (= made her remember) a lot of memories.',
  '她看著舊照片，想起了許多的往事。',
  "I was woken by a thunderstorm, and I couldn't get back to sleep (= could not sleep again).",
  '我被雷雨弄醒後，再也無法入睡。',
  'The last time we saw Lowell was back (= at an earlier time) in January.',
  '我們最後一次見到羅威爾是在早一月的時候。',
  'This tradition dates back to (= to the earlier time of) the 16th century.',
  '這個傳統可以追溯到16世紀。'],
 'gcs': ''}