In [1]:
import os
import requests
from bs4 import BeautifulSoup
import json
import random
import sys
import tqdm
sys.path.append('../')
from ScrambleDict.ScrambleDict.vocab_page import VocabPage

In [2]:
def print_bar(symbol):
    print('\n' + symbol * 20 + '\n')

def print_vocab_page(vocab_page):
    print(f'`{vocab_page.vocab}`')
    for pos_block in vocab_page.pos_blocks:
        print(pos_block.pos_tag)
        for big_sense_block in pos_block.big_sense_blocks:
            print(f'guide word: {big_sense_block.guideword}')
            print_bar('-')
            print('`sense blocks`')
            for sense_block in big_sense_block.sense_blocks:
                print(f'en def: {sense_block.en_def}')
                print(f'ch def: {sense_block.ch_def}')
                print(f'level: {sense_block.level}')
                print('example sents:')
                print(sense_block.examples)
                print_bar('#')
            print_bar('-')
            print('`phrase_blocks`')
            for phrase_block in big_sense_block.phrase_blocks:
                print(f'term: {phrase_block.term}')
                print(f'level: {phrase_block.level}')
                print('sense list:')
                print(phrase_block.sense_list)
            print_bar('-')
            print('`extra sents`')
            print(big_sense_block.extra_sents)
            print_bar('=')
        print_bar('*')

In [3]:
with open('../ScrambleDict/vocab_urls.csv', 'r') as f:
    vocab_urls = f.read().split('\n')[1:-1]

In [4]:
random.seed(104702016)
sample_urls = random.choices(vocab_urls, k=2000)

In [5]:
vocab_page_list = []
problematic_urls = []
for url in tqdm.tqdm_notebook(sample_urls):
    vocab = url.split('/')[-1]
    res = requests.get(url)
    bs = BeautifulSoup(res.text, 'html.parser')
    try:
        vocab_page_list.append(VocabPage(vocab, bs))
    except IndexError:
        problematic_urls.append(url)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [20]:
vocab_page_list[0].vocab_page_bs.select('.pv-block')

[]

In [35]:
problematic_urls[18]

'https://dictionary.cambridge.org/dictionary/english-chinese-traditional/back-down'

In [21]:
res = requests.get('https://dictionary.cambridge.org/dictionary/english-chinese-traditional/kick-off')
bs = BeautifulSoup(res.text, 'html.parser')

In [27]:
bs.select('.entry-body__el')[0].select('.pv-block')[0]

<div class="pv-block"><div class="di-title"><h2 class="headword tw-bw dhw dpos-h_hw"><b>kick off</b></h2></div> <span class="di-info"><div class="pos-header dpos-h"><span class="anc-info-head danc-info-head">— <span class="pos dpos" title="Verb with an adverb or preposition, with meaning different from meaning of its parts.">phrasal verb</span> with <span class="hw dhw">kick</span> </span><span class="pos dpos" title="A word that describes an action, condition or experience.">verb</span> <div></div><span class="uk dpron-i"><span class="region dreg">uk</span><span class="daud">
<amp-audio controlslist="nodownload" id="ampaudio1" layout="nodisplay" preload="none">
<div class="hdib" fallback="">
<p>Your browser doesn't support HTML5 audio</p>
</div>
<source src="/media/english-chinese-traditional/uk_pron/u/ukk/ukker/ukkero_028.mp3" type="audio/mpeg"/>
<source src="/media/english-chinese-traditional/uk_pron_ogg/u/ukk/ukker/ukkero_028.ogg" type="audio/ogg"/>
</amp-audio>
<div class="i i-vol

In [7]:
tmp = list(filter(lambda x: int(len(x.pos_blocks) == 0), vocab_page_list))

In [8]:
len(tmp)

235

In [52]:
len(vocab_page_list[0].pos_blocks)

1

In [5]:
vocab = 'cat'
home_url = 'https://dictionary.cambridge.org/dictionary/english-chinese-traditional'
res = requests.get(os.path.join(home_url, vocab))
bs = BeautifulSoup(res.text, 'html.parser')
vocab_page = VocabPage(vocab, bs)
print_vocab_page(vocab_page)

`cat`
noun
guide word: 

--------------------

`sense blocks`
en def: a small animal with fur, four legs, a tail, and claws, usually kept as a pet or for catching mice 
ch def: 貓
level: A1
example sents:
[]

####################

en def: any member of the group of animals similar to the cat, such as the lion
ch def: 貓科動物
level: 
example sents:
[{'en': 'the cat family ', 'ch': ''}]

####################


--------------------

`phrase_blocks`

--------------------

`extra sents`
['My cat likes dozing in front of the fire.', "I usually feed the neighbour's cat while she's away.", "She's always chasing cats out of the garden to protect her precious birds.", 'A cat was miaowing pitifully outside the door.', 'The cat purred as I stroked its fur.']



********************



In [28]:
def fetch_vocab_data(page_bs):
    pos_blocks = fetch_pos_blocks(page_bs)
    vocab_data = {}
    for pos_block in pos_blocks:
        pos_tag = fetch_pos_tag(pos_block)
        big_sense_blocks = fetch_big_sense_blocks(pos_block)
        big_sense_data_list = []
        for big_sense_block in big_sense_blocks:
            guide_word = fetch_guide_word(big_sense_block)
            extra_sents = fetch_extra_examples(big_sense_block)
            senses_data = [fetch_sense_data(sense_block)
                           for sense_block in fetch_sense_blocks(big_sense_block)]
            big_sense_data = {'sense': senses_data,
                              'extra_sents': extra_sents,
                              'guidword': guide_word}
            big_sense_data_list.append(big_sense_data)
        vocab_data[pos_tag] = big_sense_data_list
    return vocab_data

In [29]:
res = requests.get(os.path.join(home_url, 'back'))
bank_bs = BeautifulSoup(res.text, 'html.parser')
bank_data = fetch_vocab_data(bank_bs)

In [47]:
with open('/home/old/fun/crawling-cambridge-dictionary/cambridge.word.666.json', 'r') as f:
    camb_dict = json.load(f)

In [48]:
camb_dict['come-to-that']

KeyError: 'come-to-that'