In [1]:
import os
import requests
from bs4 import BeautifulSoup
import json
import random
import sys
import tqdm
sys.path.append('../')
from ScrambleDict.ScrambleDict.page_parser.vocab_page import VocabPage
from ScrambleDict.ScrambleDict.page_parser.utils import fetch_vocab_data, find_all

In [2]:
def print_bar(symbol):
    print('\n' + symbol * 20 + '\n')

def print_vocab_page(vocab_page):
    print(f'`{vocab_page.vocab}`')
    for pos_block in vocab_page.pos_blocks:
        print(pos_block.pos_tag)
        for big_sense_block in pos_block.big_sense_blocks:
            print(f'guide word: {big_sense_block.guideword}')
            print_bar('-')
            print('`sense blocks`')
            for sense_block in big_sense_block.sense_blocks:
                print(f'en def: {sense_block.en_def}')
                print(f'ch def: {sense_block.ch_def}')
                print(f'level: {sense_block.level}')
                print('example sents:')
                print(sense_block.examples)
                print_bar('#')
            print_bar('-')
            print('`phrase_blocks`')
            for phrase_block in big_sense_block.phrase_blocks:
                print(f'term: {phrase_block.term}')
                print(f'level: {phrase_block.level}')
                print('sense list:')
                print(phrase_block.sense_list)
            print_bar('-')
            print('`extra sents`')
            print(big_sense_block.extra_sents)
            print_bar('=')
        print_bar('*')

In [19]:
with open('../ScrambleDict/vocab_urls.csv', 'r') as f:
    vocab_urls = f.read().split('\n')[1:-1]

In [21]:
from collections import Counter
for url, count in Counter(vocab_urls).items():
    if count > 1:
        print(url)

In [4]:
random.seed(104702016)
sample_urls = random.choices(vocab_urls, k=2000)

In [5]:
vocab_page_list = []
problematic_urls = []
for url in tqdm.tqdm_notebook(sample_urls):
    vocab = url.split('/')[-1]
    res = requests.get(url)
    bs = BeautifulSoup(res.text, 'html.parser')
    try:
        vocab_page_list.append(VocabPage(vocab, bs))
    except IndexError:
        problematic_urls.append(url)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [35]:
problematic_urls[18]

'https://dictionary.cambridge.org/dictionary/english-chinese-traditional/back-down'

In [2]:
res = requests.get('https://dictionary.cambridge.org/dictionary/english-chinese-traditional/back')
bs = BeautifulSoup(res.text, 'html.parser')
vocab_page = VocabPage('second-best', bs)
vocab_data = fetch_vocab_data(vocab_page)
#print_vocab_page(vocab_page)

In [11]:
vocab_data['verb'][0]

{'headword': 'back',
 'big_sense': [{'guideword': '(SUPPORT)',
   'sense': [{'en_def': 'to give support to someone or something with money or words',
     'ch_def': '支持；資助，援助',
     'level': 'C2',
     'gcs': 'T',
     'examples': [{'en': 'The management has refused to back our proposals.',
       'ch': '管理層拒不支持我們的提議。'}]}],
   'phrase': [],
   'extra_sents': ['This theory needs to be backed up with solid empirical evidence.',
    'The troops were backed by tanks, artillery, and other heavy armour.',
    'It is hoped that all sides will back the peace plan.',
    "A poll indicated that 77 percent of Americans backed the president's plan, with only 6 percent in opposition.",
    'Although Paris backs the U.N. demands, it has made it clear it will not take part in any new military action.']},
  {'guideword': '(RISK MONEY)',
   'sense': [{'en_def': 'to risk money by saying that you think a horse, team, etc. will win a race, game, or competition in order to win more money if they do',
     

In [3]:
vocab_page.pos_blocks[2].big_sense_blocks[2].sense_blocks[0].gcs

'I or T,  + adv/prep'

In [7]:
vocab_data['noun'][0]['big_sense'][0]

{'guideword': '',
 'sense': [{'en_def': 'the part, space, or side of something that is inside',
   'ch_def': '裡面，內部',
   'level': 'B2',
   'examples': [{'en': 'Did you clean the inside of the car?',
     'ch': '你把車內部擦乾淨了嗎？'},
    {'en': 'The hotel looked shabby from the street, but it was fine on the inside.',
     'ch': '這家飯店從街上看去顯得很破舊，但裡面很好。'},
    {'en': "the insides of people's houses", 'ch': '人們的房子內部'}]},
  {'en_def': 'The inside of a part of the body such as the arm or leg is the part facing in towards the rest of the body.',
   'ch_def': '（肢體的）內側',
   'level': '',
   'examples': [{'en': 'She dabbed perfume on the inside of her wrist.',
     'ch': '她在手腕內側搽了香水。'}]}],
 'phrase': [{'term': 'insides',
   'level': '',
   'sense': [{'en_def': "a person's or animal's internal organs, especially the stomach or bowels",
     'ch_def': '內臟，（尤指）腸胃',
     'examples': [{'en': "The dead seal's insides were spread all over the snow.",
       'ch': '死海豹的內臟散落在雪地上。'}]}]}],
 'extra_sents': ["The in

In [30]:
vocab_page.pos_blocks[4].headword

'inside'

In [32]:
vocab_data['noun'][0]

[{'guideword': '',
  'sense': [{'en_def': 'the part, space, or side of something that is inside',
    'ch_def': '裡面，內部',
    'level': 'B2',
    'examples': [{'en': 'Did you clean the inside of the car?',
      'ch': '你把車內部擦乾淨了嗎？'},
     {'en': 'The hotel looked shabby from the street, but it was fine on the inside.',
      'ch': '這家飯店從街上看去顯得很破舊，但裡面很好。'},
     {'en': "the insides of people's houses", 'ch': '人們的房子內部'}]},
   {'en_def': 'The inside of a part of the body such as the arm or leg is the part facing in towards the rest of the body.',
    'ch_def': '（肢體的）內側',
    'level': '',
    'examples': [{'en': 'She dabbed perfume on the inside of her wrist.',
      'ch': '她在手腕內側搽了香水。'}]}],
  'phrase': [{'term': 'insides',
    'level': '',
    'sense': [{'en_def': "a person's or animal's internal organs, especially the stomach or bowels",
      'ch_def': '內臟，（尤指）腸胃',
      'examples': [{'en': "The dead seal's insides were spread all over the snow.",
        'ch': '死海豹的內臟散落在雪地上。'}]}]}],
  'e

In [17]:
vocab_page.pos_blocks[3].headword

'back'

In [11]:
res = requests.get('https://dictionary.cambridge.org/dictionary/english-chinese-traditional/vt')
bs = BeautifulSoup(res.text, 'html.parser')
vocab_page = VocabPage('kick-off', bs)
#print_vocab_page(vocab_page)

In [3]:
def fetch_vocab_data(vocab_page):
    vocab_data = {}
    fo.examples pos_block in vocab_page.pos_blocks:
        big_sense_data_list = [fetch_big_sense_data(big_sense_block)
                               for big_sense_block in pos_block.big_sense_blocks]
        vocab_data[pos_block.pos_tag] = big_sense_data_list
    return vocab_data

def fetch_big_sense_data(big_sense_block):
    big_sense_data = {
        'guideword': big_sense_block.guideword,
        'sense': [fetch_sense_data(sense_block)
                  for sense_block in big_sense_block.sense_blocks],
        'phrase': [fetch_phrase_data(phrase_block)
                   for phrase_block in big_sense_block.phrase_blocks],
        'extra_sents': big_sense_block.extra_sents
    }
    return big_sense_data

def fetch_sense_data(sense_block):
    sense_data = {
        'en_def': sense_block.en_def,
        'ch_def': sense_block.ch_def,
        'level': sense_block.level,
        'examples': sense_block.examples
    }
    return sense_data

def fetch_phrase_data(phrase_block):
    phrase_data = {
        'term': phrase_block.term,
        'level': phrase_block.level,
        'sense': phrase_block.sense_list
    }
    return phrase_data

In [8]:
tmp = list(filter(lambda x: int(len(x.pos_blocks) == 0), vocab_page_list))

In [41]:
vocab = 'caddie'
home_url = 'https://dictionary.cambridge.org/dictionary/english-chinese-traditional'
res = requests.get(os.path.join(home_url, vocab))
bs = BeautifulSoup(res.text, 'html.parser')
vocab_page = VocabPage(vocab, bs)
vocab_data = fetch_vocab_data(vocab_page)

In [80]:
p = []
for vocab, data in tmp_data.items():
    if data and '' in data.keys():
        p.append({vocab: data})

In [29]:
res = requests.get(os.path.join(home_url, 'back'))
bank_bs = BeautifulSoup(res.text, 'html.parser')
bank_data = fetch_vocab_data(bank_bs)

In [38]:
with open('/home/old/fun/crawling-cambridge-dictionary/cambridge.word.666.json', 'r') as f:
    camb_dict = json.load(f)

In [52]:
camb_dict['back']['noun'][0]['sense'][0]['examples'][0]

'He jotted her name down on the back of an envelope.'

In [3]:
with open('../ScrambleDict/vocab_data.json', 'r') as f:
    tmp_data = json.load(f)

In [6]:
for vd in tmp_data[:5]:
    vocab, data = vd.items()

ValueError: not enough values to unpack (expected 2, got 1)

In [10]:
vd.items()

dict_items([('b', {'': [{'guideword': '', 'sense': [{'en_def': 'written abbreviation for\nborn verb ', 'ch_def': '出生（born的縮寫）', 'level': '', 'examples': [{'en': 'John Winston Lennon (b. 9 October 1940, Liverpool, d. 8 December 1980, New York).', 'ch': '約翰‧溫斯頓‧連儂（1940年10月9日生於利物浦，1980年12月8日卒於紐約）'}]}], 'phrase': [], 'extra_sents': []}]})])

In [62]:
list(tmp_data[0].keys())[0]

'b'

In [68]:
tmp_data = {list(x.keys())[0]: list(x.values())[0]
            for x in tmp_data}

In [74]:
p = []
for x in tmp_data:
    if not tmp_data[x]:
        p.append(x)

In [28]:
res = requests.get('https://dictionary.cambridge.org/dictionary/english-chinese-traditional/application')
bs = BeautifulSoup(res.text, 'html.parser')

In [35]:
bs.select('.dgram')[0].text

'[ C or U ]'

In [27]:
pos_blocks_bs = bs.select('div.entry-body__el')
header_bs = pos_blocks_bs[0].select('.pos-header')
title_bs = header_bs[0].select('.di-title')[0]
title_bs.select('.dpos-h_hw')[0].text

'fiancé'

In [21]:
pos_blocks_bs = bs.select('div.entry-body__el')
pos_blocks_bs = pos_blocks_bs[0].select('.pv-block')
#header_bs = pos_blocks_bs[0].select('.pos-header')[0]
title_bs = pos_blocks_bs[0].select('.di-title')[0]
title_bs.select('.dpos-h_hw')[0].text

<div class="pos-header dpos-h"><span class="anc-info-head danc-info-head">— <span class="pos dpos" title="Verb with an adverb or preposition, with meaning different from meaning of its parts.">phrasal verb</span> with <span class="hw dhw">kick</span> </span><span class="pos dpos" title="A word that describes an action, condition or experience.">verb</span> <div></div><span class="uk dpron-i"><span class="region dreg">uk</span><span class="daud">
<amp-audio controlslist="nodownload" id="ampaudio1" layout="nodisplay" preload="none">
<div class="hdib" fallback="">
<p>Your browser doesn't support HTML5 audio</p>
</div>
<source src="/media/english-chinese-traditional/uk_pron/u/ukk/ukker/ukkero_028.mp3" type="audio/mpeg"/>
<source src="/media/english-chinese-traditional/uk_pron_ogg/u/ukk/ukker/ukkero_028.ogg" type="audio/ogg"/>
</amp-audio>
<div class="i i-volume-up c_aud htc hdib hp hv-1 fon tcu tc-bd lmr-10 lpt-3" on="tap: ampaudio1.play" role="button" tabindex="0">
</div>
</span><span cla

In [23]:
res = requests.get('https://dictionary.cambridge.org/dictionary/english-chinese-traditional/second-best')
bs = BeautifulSoup(res.text, 'html.parser')
x = bs.findAll('div', {'class': 'idiom-block'})

In [35]:
x[1]

<div class="idiom-block"><div class="di-title"><h2 class="headword tw-bw dhw dpos-h_hw"><b>second best</b></h2></div><span class="idiom-body didiom-body">
<div class="pr dsense dsense-noh"><div class="cid" id="caldzh-cnt-1-1"></div> <div class="sense-body dsense_b"><div class="def-block ddef_block" data-wl-senseid="ID_00028555_06">
<div class="dwl hax">
<a amp-access="loggedIn" amp-access-hide="" class="dwla wordlist-add-button" on="tap:AMP.setState({ stateGlobal: { wlSenseId: 'ID_00028555_06' } }), sidebarWordList.open" title="Add this meaning to a word list">
<i aria-hidden="true" class="i i-plus"></i>
<i class="i i-list-ul"></i>
</a>
<a amp-access="NOT loggedIn" class="dwla wordlist-add-button" on="tap:amp-access.login-sign-in" title="Add this meaning to a word list">
<i aria-hidden="true" class="i i-plus"></i>
<i class="i i-list-ul"></i>
</a>
</div>
<div class="ddef_h"><span class="def-info ddef-info"> </span><div class="def ddef_d db">not as good as the <a class="query" href="http