In [1]:
import numpy as np
import pandas as pd
import re, json, csv, requests, time, glob, tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from collections import Counter

In [2]:
def remove_rt(text):
    """Return ``text`` with every ``<rt>...</rt>`` element removed.

    The match is non-greedy and requires at least one character between the
    opening and closing tag; ``.`` does not match a newline, so an ``<rt>``
    element spanning several lines is left untouched.
    """
    rt_element = re.compile('<rt>.+?</rt>')
    return rt_element.sub('', text)

def tag(text):
    """Replace the colour-class ``<span>`` wrappers with brace markers.

    ``colorC`` becomes ``{org}...{/org}``, ``colorL`` becomes
    ``{plc}...{/plc}`` and ``colorN`` becomes ``{per}...{/per}``. The rules
    are applied one after another in that order, and the rewritten string is
    returned.
    """
    span_rules = (
        (r'<span class="colorC">(.+?)</span>', r"{org}\1{/org}"),
        (r'<span class="colorL">(.+?)</span>', r"{plc}\1{/plc}"),
        (r'<span class="colorN">(.+?)</span>', r"{per}\1{/per}"),
    )
    for span_pattern, brace_form in span_rules:
        text = re.sub(span_pattern, brace_form, text)
    return text

def retag(text):
    """Turn the brace markers produced by ``tag`` into angle-bracket tags.

    ``{org}...{/org}``, ``{plc}...{/plc}`` and ``{per}...{/per}`` become
    ``<org>...</org>``, ``<plc>...</plc>`` and ``<per>...</per>``. A marker
    pair with nothing between it is left unchanged because the pattern needs
    at least one character.
    """
    brace_rules = (
        (r'{org}(.+?){/org}', r"<org>\1</org>"),
        (r'{plc}(.+?){/plc}', r"<plc>\1</plc>"),
        (r'{per}(.+?){/per}', r"<per>\1</per>"),
    )
    for brace_pattern, angle_form in brace_rules:
        text = re.sub(brace_pattern, angle_form, text)
    return text

def remove_a(text):
    """Strip link and image markup from an HTML fragment.

    Applied in order:

    1. opening and closing ``<a>`` tags are removed (their content is kept);
    2. ``<span class="under">word</span>`` is unwrapped to ``word`` (only when
       the content consists of word characters);
    3. ``<img ...>`` tags, together with a directly following ``<br>``, are
       removed;
    4. a single ``<br>`` at the very start of the string is removed.

    Returns the result with leading/trailing whitespace stripped.
    """
    # Match only real anchor tags: "<a>", "</a>" or "<a " followed by
    # attributes. The previous pattern '</?a.*?>' also deleted every tag whose
    # name merely starts with "a" (e.g. <abbr>, <article>) and could not match
    # an <a> tag whose attributes were split across lines.
    text = re.sub(r'</?a(?:\s[^>]*)?>', '', text)
    text = re.sub(r'<span class="under">(\w+)</span>', r'\1', text)
    text = re.sub(r'<img.+?>(<br ?/?>)?', '', text)
    text = re.sub(r'^<br ?/?>', '', text)
    return text.strip()

# Parser for the old "web easy" page layout.
def scrape_easy_one(html):
    """Extract one article record from an old-layout "web easy" page.

    Parameters
    ----------
    html : str or bytes
        Page markup, passed straight to ``BeautifulSoup(html, "html.parser")``.

    Returns
    -------
    dict
        Keys ``id``, ``title_easy``, ``title_easy_ruby``, ``article_easy``,
        ``article_easy_ruby``, ``url_easy``, ``url_normal``, ``date_easy``.

    Raises
    ------
    AttributeError
        When one of the expected elements is missing (``soup.find`` returned
        ``None``). The scraping loop in this notebook relies on that failure
        to fall back to ``scrape_easy_one_new``, so it is deliberately not
        caught here.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Link to the regular-news version of the article. When the href carries a
    # prefix in front of the real URL (".../http://..." or ".../https://..."),
    # keep only the real URL. An href without such a prefix is left as it is;
    # previously it always got an extra "https://" prepended, which turned an
    # already absolute URL into "https://https://...".
    url_normal = soup.find('div', id="regularnews").a.get('href')
    if '/http://' in url_normal:
        url_normal = 'http://' + url_normal.split('/http://')[-1]
    elif '/https://' in url_normal:
        url_normal = 'https://' + url_normal.split('/https://')[-1]

    # Drops the first and last character of the date text
    # (presumably surrounding brackets -- TODO confirm against a real page).
    date = soup.find('p', id="newsDate").text[1:-1]
    url_easy = soup.find('meta', attrs={'name':'shorturl'}).get('content')

    # Title: once with the ruby markup kept, once as plain text without <rt>.
    title_node = soup.find('div', id='newstitle').h2
    title_easy_ruby = ''.join([str(t) for t in title_node.contents]).strip()
    title_easy = BeautifulSoup(remove_rt(str(title_node)), "html.parser").text.strip()

    # Body: tag() converts the colour spans to brace markers so that they
    # survive the text extraction; retag() below turns them into <org>/<plc>/<per>.
    body_node = soup.find('div', id="newsarticle")
    article_easy = BeautifulSoup(tag(remove_rt(str(body_node))), "html.parser").text.strip()

    # Ruby version of the body: one line per <p>, links/images stripped.
    # The former `if p != []` guard is dropped: a bs4 Tag never compares equal
    # to a list, so it filtered nothing and empty paragraphs still produce
    # empty lines exactly as before.
    article_easy_ruby = '\n'.join(
        ''.join(remove_a(str(node)) for node in p.contents)
        for p in body_node.find_all('p')
    ).strip()

    return {
        'id':url_easy.split('/')[-1].split('.html')[0],
        'title_easy':title_easy,
        'title_easy_ruby':title_easy_ruby,
        'article_easy':retag(article_easy),
        'article_easy_ruby':article_easy_ruby,
        'url_easy':url_easy,
        'url_normal':url_normal,
        'date_easy':date
    }

# Parser for the new "web easy" page layout.
def scrape_easy_one_new(html):
    """Extract one article record from a new-layout "web easy" page.

    Parameters
    ----------
    html : str or bytes
        Page markup, passed straight to ``BeautifulSoup(html, "html.parser")``.

    Returns
    -------
    dict
        Keys ``id``, ``title_easy``, ``title_easy_ruby``, ``article_easy``,
        ``article_easy_ruby``, ``url_easy``, ``url_normal``, ``date_easy``
        (the same shape as the record returned by ``scrape_easy_one``).

    Raises
    ------
    AttributeError
        When one of the expected elements is missing (``soup.find`` returned
        ``None``).
    """
    soup = BeautifulSoup(html, "html.parser")

    # og:url may carry a prefix in front of the real address
    # (".../https://..."); keep what follows the last "/https://" and restore
    # the scheme. If the value already starts with a scheme (no prefix was
    # present) it is kept unchanged -- previously "https://" was prepended
    # unconditionally, producing "https://https://...".
    url_easy = soup.find('meta', property="og:url").get('content').split('/https://')[-1]
    if not url_easy.startswith(('http://', 'https://')):
        url_easy = 'https://' + url_easy

    url_normal = soup.find('div', class_="link-to-normal").a.get('href')
    # Drops the first and last character of the date text
    # (presumably surrounding brackets -- TODO confirm against a real page).
    date = soup.find('p', class_="article-main__date").text[1:-1]

    # Title: once with the ruby markup kept, once as plain text without <rt>.
    title_node = soup.find('h1', class_="article-main__title")
    title_easy_ruby = ''.join([str(t) for t in title_node.contents]).strip()
    title_easy = BeautifulSoup(remove_rt(str(title_node)), "html.parser").text.strip()

    # Body: tag() converts the colour spans to brace markers so that they
    # survive the text extraction; retag() below turns them into <org>/<plc>/<per>.
    body_node = soup.find('div', class_="article-main__body article-body")
    article_easy = BeautifulSoup(tag(remove_rt(str(body_node))), "html.parser").text.strip()

    # Ruby version of the body: one line per <p>, links/images stripped.
    # The former `if p != []` guard is dropped: a bs4 Tag never compares equal
    # to a list, so it filtered nothing and empty paragraphs still produce
    # empty lines exactly as before.
    article_easy_ruby = '\n'.join(
        ''.join(remove_a(str(node)) for node in p.contents)
        for p in body_node.find_all('p')
    ).strip()

    return {
        'id':url_easy.split('/')[-1].split('.html')[0],
        'title_easy':title_easy,
        'title_easy_ruby':title_easy_ruby,
        'article_easy':retag(article_easy),
        'article_easy_ruby':article_easy_ruby,
        'url_easy':url_easy,
        'url_normal':url_normal,
        'date_easy':date
    }



def get_link(start=0):
    """List regular-news URLs that are referenced but not yet scraped.

    Reads three files from the working directory:

    * ``nhkweb.json``      -- its ``url`` column holds the URLs already scraped;
    * ``nhkwebeasy.json``  -- its ``url_normal`` column holds the URLs that the
      easy articles point to;
    * ``nolinknormal.txt`` -- whitespace-separated URLs to exclude.

    Parameters
    ----------
    start : int, default 0
        Only rows of ``nhkwebeasy.json`` from this position onward are
        considered.

    Returns
    -------
    list
        Sorted, de-duplicated ``url_normal`` values that appear neither in
        ``nhkweb.json`` nor in ``nolinknormal.txt``.
    """
    # Sets give constant-time membership; the previous list lookup inside a
    # loop re-scanned the whole list of scraped URLs for every candidate.
    scraped = set(pd.read_json('nhkweb.json', encoding='utf-8')['url'])
    df_e = pd.read_json('nhkwebeasy.json', encoding='utf-8')
    candidates = set(df_e['url_normal'][start:])
    with open('nolinknormal.txt') as f:
        nolink = set(f.read().split())
    return sorted(candidates - scraped - nolink)
        
def js_e(dic):
    """Insert or replace one record in ``nhkwebeasy.json``.

    The file is expected to hold a JSON list of dicts that each have an
    ``id`` key. Every existing entry whose ``id`` equals ``dic['id']`` is
    replaced by ``dic``; if there is none, ``dic`` is appended. The list is
    then sorted by ``id`` and written back (indent 4, non-ASCII kept as-is).

    Parameters
    ----------
    dic : dict
        Record to store; must contain the key ``'id'``.

    Raises
    ------
    KeyError
        If ``dic`` (or an existing entry) has no ``'id'`` key. The file is
        left untouched in that case.
    """
    with open('nhkwebeasy.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Build the new list BEFORE reopening the file for writing: opening with
    # 'w' truncates immediately, so any error raised while updating the list
    # used to leave an empty file behind.
    new_id = dic['id']
    replaced = False
    for i, d in enumerate(data):
        if d['id'] == new_id:
            data[i] = dic
            replaced = True
    if not replaced:
        data.append(dic)
    data = sorted(data, key=lambda x:x['id'])

    with open('nhkwebeasy.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

# scrape

In [4]:
# Start a Selenium-controlled Chrome session; the scraping cells use `driver`
# to load pages and read `driver.page_source`.
driver = webdriver.Chrome()

# clean category & keyword

In [6]:
# Print how many records each of the two JSON files currently holds
# (regular articles first, then easy articles).
for json_path in ('nhkweb.json', 'nhkwebeasy.json'):
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
        print(len(data))

6814
3317


In [9]:
# check category
# Count how often each genre label occurs over all records in nhkweb.json.

with open('nhkweb.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
print('articles: ', len(data))
genre = Counter(label for record in data for label in record['genre'])
genre.most_common()

articles:  6814


[('社会', 2122),
 ('国際', 1796),
 ('科学・文化', 1045),
 ('ビジネス', 1037),
 ('スポーツ', 904),
 ('政治', 700),
 ('暮らし', 490),
 ('地域', 423),
 ('気象・災害', 318)]

In [8]:
# genre <> keywords
# Normalise every record in nhkweb.json so that `genre` only contains the
# canonical category names and everything else ends up in `keywords`.
# The file is rewritten in place.

with open('nhkweb.json','r', encoding='utf-8') as f:
    data = json.load(f)

category = ['社会', '国際', 'ビジネス', 'スポーツ', '政治', '科学・文化', '暮らし', '地域', '気象・災害']

# Non-canonical labels that are folded into a canonical category.
# More aliases are recognised in the `genre` field than in `keywords`;
# this mirrors the original rules.
genre_alias = {
    "災害": '気象・災害',
    "気象": '気象・災害',
    "科学・医療": '科学・文化',
    "文化・エンタメ": '科学・文化',
    "経済": 'ビジネス',
}
keyword_alias = {
    "災害": '気象・災害',
    "気象": '気象・災害',
}

for dic in data:
    newgenre = []
    newkey = []
    for labels, alias in ((dic['genre'], genre_alias), (dic['keywords'], keyword_alias)):
        for j in labels:
            if j in category:
                newgenre.append(j)
            elif j in alias:
                newgenre.append(alias[j])
            else:
                newkey.append(j)
    # De-duplicate while keeping first-seen order. list(set(...)) produced an
    # order that can change from one interpreter run to the next (string
    # hashing is randomised), so re-running the cell kept reshuffling the file.
    dic['genre'] = list(dict.fromkeys(newgenre))
    dic['keywords'] = list(dict.fromkeys(newkey))

with open('nhkweb.json','w', encoding='utf-8') as f:
    json.dump(data, f, indent=4, ensure_ascii=False)
    

In [8]:
# Occurrence count per article id in nhkwebeasy.json; a count above 1 would
# indicate a duplicated record.
# NOTE(review): the frame is named `normal` although it is loaded from the
# *easy* file; the name is kept in case other cells refer to it.
normal = pd.read_json('nhkwebeasy.json')
normal['id'].value_counts()

k10011070741000    1
k10011066251000    1
k10011303361000    1
k10010573201000    1
k10011997261000    1
                  ..
k10012266621000    1
k10011017691000    1
k10011351651000    1
k10011121361000    1
k10012229751000    1
Name: id, Length: 3721, dtype: int64

# NHK web easy (new) ID k1001140020 ~

In [15]:
# Load the candidate URLs and drop those whose article id is already stored
# in nhkwebeasy.json. Both counts (before / after filtering) are printed.
with open('tobescraped.txt') as f:
    urls = f.read().split()
print(len(urls))

# A set makes the per-URL membership test constant-time.
ids = set(pd.read_json('nhkwebeasy.json')['id'])
# The id is the last path component without its ".html" extension, derived the
# same way as in the scrape functions. str.strip('.html') must not be used
# for this: it removes any of the characters ".", "h", "t", "m", "l" from
# BOTH ends of the string rather than the literal suffix.
urls = [url for url in urls if url.split('/')[-1].split('.html')[0] not in ids]
print(len(urls))
del ids

1841
683


In [4]:
# Start a Selenium-controlled Chrome session for the scraping loop.
# NOTE(review): an earlier cell contains the same statement; presumably only
# one of the two needs to be run per kernel session -- confirm.
driver = webdriver.Chrome()

In [16]:
# Visit each candidate URL, follow the first link that appears after the text
# "between" or "1 time" (presumably the capture summary of a web.archive.org
# listing page -- TODO confirm), scrape the page behind it and store the record.
for url in urls[:1000]:
    driver.get(url)
    time.sleep(6)  # give the page time to render before reading its source
    # str() of a bytes object yields its repr ("b'...'"): line breaks show up
    # as the two characters backslash + "n", so the `.` in the regex below can
    # run across what were separate lines. Do not "simplify" this to
    # driver.page_source without also adding re.DOTALL.
    html = str(driver.page_source.encode('utf-8'))
    snap = re.search(r'(between|1 time).*?<a href="(.+?)">', html)
    if snap is None:
        # No link found on this page: skip the URL.
        continue
    driver.get('https://web.archive.org' + snap.group(2))
    time.sleep(10)
    html = driver.page_source.encode('utf-8')
    try:
        dic = scrape_easy_one(html)
    except Exception:
        # The old-layout parser fails when its elements are missing; try the
        # new layout. `except Exception` (instead of a bare `except`) lets
        # KeyboardInterrupt / SystemExit stop the loop.
        dic = scrape_easy_one_new(html)
    js_e(dic)

In [13]:
# Manual run: parse whatever page the browser is currently showing with the
# old-layout parser and display the resulting record.
html = driver.page_source.encode('utf-8')
dic = scrape_easy_one(html)
dic

{'id': 'k10010882931000',
 'title_easy': '東京都議会\u3000豊洲に決めた理由などを百条委員会で調べる',
 'title_easy_ruby': '<ruby>東京都<rt>とうきょうと</rt></ruby><ruby>議会<rt>ぎかい</rt></ruby>\u3000<ruby>豊洲<rt>とよす</rt></ruby>に<ruby>決<rt>き</rt></ruby>めた<ruby>理由<rt>りゆう</rt></ruby>などを<ruby>百<rt>ひゃく</rt></ruby><ruby>条<rt>じょう</rt></ruby><ruby>委員会<rt>いいんかい</rt></ruby>で<ruby>調<rt>しら</rt></ruby>べる',
 'article_easy': '<plc>東京都</plc>は、<org>築地市場</org>が古くなったため、<plc>豊洲</plc>に新しく市場を建てて引っ越す予定でした。しかし、新しい市場の地下の水から体に悪い物質が見つかって問題になっています。\n<org>東京都議会</org>の議員のグループは、<plc>豊洲</plc>に引っ越すと決まった理由などを調べるために「百条委員会」を開くと決めました。この委員会に呼ばれたら、理由がないとき以外は出席しなければならないと法律で決まっています。この委員会でうそを言うと、訴えられます。\n議員たちは、<plc>豊洲</plc>に引っ越すと決まったときに知事だった<per>石原</per><per>慎太郎</per>さんなどを百条委員会に呼んで話を聞こうと考えています。',
 'article_easy_ruby': '<span class="colorL"><ruby>東京都<rt>とうきょうと</rt></ruby></span>は、<span class="colorC"><ruby>築地市場<rt>つきじしじょう</rt></ruby></span>が<ruby>古<rt>ふる</rt></ruby>くなったため、<span class="colorL"><ruby>豊洲<rt>とよす</rt></ruby></span>に<ruby>新<rt>あたら</rt></ruby>しく<ruby>市場<rt

In [14]:
# Store the record held in `dic` in nhkwebeasy.json (insert, or replace by id).
js_e(dic)

In [36]:
# Inspect the "link-to-normal" block of the page held in `html`.
# The parser is named explicitly, as in every other BeautifulSoup call in this
# notebook: without it bs4 picks whichever parser is installed (and warns),
# so the parsed tree could differ between machines.
soup = BeautifulSoup(html, "html.parser")
soup.find('div', class_='link-to-normal')

<div class="link-to-normal" id="js-regular-news-wrapper">
<a class="btn" href="https://www3.nhk.or.jp/news/html/20200206/k10012274671000.html" id="js-regular-news" target="_blank"><ruby>普通<rt>ふつう</rt></ruby>のニュースを<ruby>読<rt>よ</rt></ruby>む</a>
</div>