In [2]:
import os
import requests
from bs4 import BeautifulSoup
import json
import random
import sys
import tqdm
sys.path.append('../')
from ScrambleDict.ScrambleDict.page_parser.vocab_page import VocabPage

In [2]:
def print_bar(symbol):
    print('\n' + symbol * 20 + '\n')

def print_vocab_page(vocab_page):
    print(f'`{vocab_page.vocab}`')
    for pos_block in vocab_page.pos_blocks:
        print(pos_block.pos_tag)
        for big_sense_block in pos_block.big_sense_blocks:
            print(f'guide word: {big_sense_block.guideword}')
            print_bar('-')
            print('`sense blocks`')
            for sense_block in big_sense_block.sense_blocks:
                print(f'en def: {sense_block.en_def}')
                print(f'ch def: {sense_block.ch_def}')
                print(f'level: {sense_block.level}')
                print('example sents:')
                print(sense_block.examples)
                print_bar('#')
            print_bar('-')
            print('`phrase_blocks`')
            for phrase_block in big_sense_block.phrase_blocks:
                print(f'term: {phrase_block.term}')
                print(f'level: {phrase_block.level}')
                print('sense list:')
                print(phrase_block.sense_list)
            print_bar('-')
            print('`extra sents`')
            print(big_sense_block.extra_sents)
            print_bar('=')
        print_bar('*')

In [19]:
with open('../ScrambleDict/vocab_urls.csv', 'r') as f:
    vocab_urls = f.read().split('\n')[1:-1]

In [21]:
from collections import Counter
for url, count in Counter(vocab_urls).items():
    if count > 1:
        print(url)

In [4]:
random.seed(104702016)
sample_urls = random.choices(vocab_urls, k=2000)

In [5]:
vocab_page_list = []
problematic_urls = []
for url in tqdm.tqdm_notebook(sample_urls):
    vocab = url.split('/')[-1]
    res = requests.get(url)
    bs = BeautifulSoup(res.text, 'html.parser')
    try:
        vocab_page_list.append(VocabPage(vocab, bs))
    except IndexError:
        problematic_urls.append(url)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [35]:
problematic_urls[18]

'https://dictionary.cambridge.org/dictionary/english-chinese-traditional/back-down'

In [3]:
res = requests.get('https://dictionary.cambridge.org/dictionary/english-chinese-traditional/kick-off')
bs = BeautifulSoup(res.text, 'html.parser')
vocab_page = VocabPage('kick-off', bs)
print_vocab_page(vocab_page)

`kick-off`
phrasal verb
guide word: (START)

--------------------

`sense blocks`
en def: If a game of football kicks off, it starts.
ch def: （足球比賽）開球，開始
level: C1
example sents:
[{'en': 'What time does the game kick off?', 'ch': '比賽甚麼時候開始？'}]

####################


--------------------

`phrase_blocks`

--------------------

`extra sents`
[]


guide word: (TROUBLE OR EXCITEMENT)

--------------------

`sense blocks`
en def: to start to get angry or complain in a noisy way
ch def: 生氣；吵嚷地抱怨
level: 
example sents:
[{'en': "The children started to kick off so I couldn't stay.", 'ch': '孩子都開始吵嚷地抱怨了，因此我必須走了。'}]

####################


--------------------

`phrase_blocks`

--------------------

`extra sents`
[]


guide word: (DIE)

--------------------

`sense blocks`
en def: to die
ch def: 死掉
level: 
example sents:
[]

####################


--------------------

`phrase_blocks`

--------------------

`extra sents`
[]



********************

noun
guide word: 

--------------------

`sense

In [53]:
res = requests.get('https://dictionary.cambridge.org/dictionary/english-chinese-traditional/back')
bs = BeautifulSoup(res.text, 'html.parser')
vocab_page = VocabPage('kick-off', bs)
print_vocab_page(vocab_page)

`kick-off`
adverb
guide word: (RETURN)

--------------------

`sense blocks`
en def: in, into, or towards a previous place or condition, or an earlier time
ch def: 在原處；回到原處；恢復原狀；返回；回到以前
level: B2
example sents:
[{'en': 'When you take the scissors, remember to put them back.', 'ch': '用完剪刀後，記著要放回原處。'}, {'en': 'He left a note saying "Gone out. Back soon."', 'ch': '他留了字條，上面寫著「出去一會兒，馬上回來」。'}, {'en': "She went to Brazil for two years, but now she's back (= has returned).", 'ch': '她在巴西住了兩年，現在回來了。'}, {'en': 'He looked back (= looked behind him) and saw they were following him.', 'ch': '他回頭看了看，發現他們正跟著他。'}, {'en': 'Looking at her old photographs brought back (= made her remember) a lot of memories.', 'ch': '她看著舊照片，想起了許多的往事。'}, {'en': "I was woken by a thunderstorm, and I couldn't get back to sleep (= could not sleep again).", 'ch': '我被雷雨弄醒後，再也無法入睡。'}, {'en': 'The last time we saw Lowell was back (= at an earlier time) in January.', 'ch': '我們最後一次見到羅威爾是在早一月的時候。'}, {'en': 'This tradition dates back

In [3]:
def fetch_vocab_data(vocab_page):
    vocab_data = {}
    for pos_block in vocab_page.pos_blocks:
        big_sense_data_list = [fetch_big_sense_data(big_sense_block)
                               for big_sense_block in pos_block.big_sense_blocks]
        vocab_data[pos_block.pos_tag] = big_sense_data_list
    return vocab_data

def fetch_big_sense_data(big_sense_block):
    big_sense_data = {
        'guideword': big_sense_block.guideword,
        'sense': [fetch_sense_data(sense_block)
                  for sense_block in big_sense_block.sense_blocks],
        'phrase': [fetch_phrase_data(phrase_block)
                   for phrase_block in big_sense_block.phrase_blocks],
        'extra_sents': big_sense_block.extra_sents
    }
    return big_sense_data

def fetch_sense_data(sense_block):
    sense_data = {
        'en_def': sense_block.en_def,
        'ch_def': sense_block.ch_def,
        'level': sense_block.level,
        'examples': sense_block.examples
    }
    return sense_data

def fetch_phrase_data(phrase_block):
    phrase_data = {
        'term': phrase_block.term,
        'level': phrase_block.level,
        'sense': phrase_block.sense_list
    }
    return phrase_data

In [8]:
tmp = list(filter(lambda x: int(len(x.pos_blocks) == 0), vocab_page_list))

In [41]:
vocab = 'caddie'
home_url = 'https://dictionary.cambridge.org/dictionary/english-chinese-traditional'
res = requests.get(os.path.join(home_url, vocab))
bs = BeautifulSoup(res.text, 'html.parser')
vocab_page = VocabPage(vocab, bs)
vocab_data = fetch_vocab_data(vocab_page)

In [80]:
p = []
for vocab, data in tmp_data.items():
    if data and '' in data.keys():
        p.append({vocab: data})

In [29]:
res = requests.get(os.path.join(home_url, 'back'))
bank_bs = BeautifulSoup(res.text, 'html.parser')
bank_data = fetch_vocab_data(bank_bs)

In [38]:
with open('/home/old/fun/crawling-cambridge-dictionary/cambridge.word.666.json', 'r') as f:
    camb_dict = json.load(f)

In [52]:
camb_dict['back']['noun'][0]['sense'][0]['examples'][0]

'He jotted her name down on the back of an envelope.'

In [3]:
with open('../ScrambleDict/vocab_data.json', 'r') as f:
    tmp_data = json.load(f)

In [6]:
for vd in tmp_data[:5]:
    vocab, data = vd.items()

ValueError: not enough values to unpack (expected 2, got 1)

In [10]:
vd.items()

dict_items([('b', {'': [{'guideword': '', 'sense': [{'en_def': 'written abbreviation for\nborn verb ', 'ch_def': '出生（born的縮寫）', 'level': '', 'examples': [{'en': 'John Winston Lennon (b. 9 October 1940, Liverpool, d. 8 December 1980, New York).', 'ch': '約翰‧溫斯頓‧連儂（1940年10月9日生於利物浦，1980年12月8日卒於紐約）'}]}], 'phrase': [], 'extra_sents': []}]})])

In [62]:
list(tmp_data[0].keys())[0]

'b'

In [68]:
tmp_data = {list(x.keys())[0]: list(x.values())[0]
            for x in tmp_data}

In [74]:
p = []
for x in tmp_data:
    if not tmp_data[x]:
        p.append(x)

In [77]:
len(p)

5927

In [15]:
tmp_data[:600]

'[\n{"b": {"": [{"guideword": "", "sense": [{"en_def": "written abbreviation for\\nborn verb ", "ch_def": "\\u51fa\\u751f\\uff08born\\u7684\\u7e2e\\u5beb\\uff09", "level": "", "examples": [{"en": "John Winston Lennon (b. 9 October 1940, Liverpool, d. 8 December 1980, New York).", "ch": "\\u7d04\\u7ff0\\u2027\\u6eab\\u65af\\u9813\\u2027\\u9023\\u5102\\uff081940\\u5e7410\\u67089\\u65e5\\u751f\\u65bc\\u5229\\u7269\\u6d66\\uff0c1980\\u5e7412\\u67088\\u65e5\\u5352\\u65bc\\u7d10\\u7d04\\uff09"}]}], "phrase": [], "extra_sents": []}]}},\n{"durian": {"noun": [{"guideword": "", "sense": [{"en_def": "a large, oval, tropical fruit with a hard'

In [78]:
with open('../ScrambleDict/logs/20200530081948_info.log') as f:
    logs = f.read().split('\n')[:-1]

In [79]:
logs[0]

'2020-05-30 08:19:49,420 - VOCAB_PAGES_SPIDER - INFO - trying to parse https://dictionary.cambridge.org/dictionary/english-chinese-traditional/d-ya'

In [46]:
c = Counter(list(map(lambda x: x.split(' ')[-1], logs)))

In [47]:
for url, count in c.items():
    if count > 1:
        print(url)

In [49]:
len(logs)

50441

In [140]:
from collections import Counter
Counter(list(map(lambda x: x.split(' ')[5], logs)))

Counter({'INFO': 50441})