In [1]:
import numpy as np
import pandas as pd
import re, json, csv, requests, time, glob, tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from collections import Counter

In [0]:
def scrape(html):
    """Extract one article from an NHK news page that embeds JSON-LD metadata.

    Metadata is taken from the last ``application/ld+json`` script on the
    page. HTML elements are consulted only when the corresponding JSON-LD
    key is absent.

    Args:
        html: Page markup in any form BeautifulSoup accepts (str or bytes).

    Returns:
        dict with the keys ``id``, ``title``, ``article``, ``genre``,
        ``keywords``, ``url``, ``datePublished`` and ``dateModified``.

    Raises:
        IndexError: The page has no ``application/ld+json`` script.
        AttributeError: A required element (or a fallback that is actually
            needed) is missing from the page.
    """
    soup = BeautifulSoup(html, "html.parser")
    json_data = json.loads(soup.find_all("script", type="application/ld+json")[-1].text)

    # The HTML fallbacks are computed only when the JSON-LD key is missing.
    # Passing them as the default of dict.get() would evaluate them eagerly
    # and crash on pages that lack the fallback element even though the
    # JSON-LD value is present.
    if 'headline' in json_data:
        title = json_data['headline']
    else:
        title = soup.find('h1', class_='content--title').text
    if 'datePublished' in json_data:
        date = json_data['datePublished']
    else:
        date = re.search(r'datetime:.*?(\d{4}-\d{2}-\d{2}T\d{2}:\d{2})', str(html)).group(1)
    date_m = json_data.get('dateModified', '')

    genre = json_data.get('genre', [])
    if genre == []:
        # Fall back to the <meta name="keywords"> list minus site boilerplate.
        genre = [k for k in soup.find('meta', attrs={'name':'keywords'}).get('content').split(',') if k not in ['NHK','ニュース', 'NHK NEWS WEB']]
    keywords = json_data.get('keywords', [])

    article = soup.find('div', id="news_textbody").text
    # Keep only what follows the last 'https:' in og:url.
    # NOTE(review): presumably this drops a web.archive.org prefix; an
    # http-only og:url would come out as 'https:http://...' - confirm.
    url_normal = 'https:' + soup.find('meta', property="og:url").get('content').rsplit('https:')[-1]

    # Continuation blocks are appended to the lead paragraph, one per line.
    for textmore in soup.find_all('div', id="news_textmore"):
        article += ('\n' + textmore.text)
    for newsadd in soup.find_all('div', class_="news_add"):
        if newsadd.h3 is not None:
            # The section heading is removed so it does not end up in the text.
            newsadd.h3.extract()
        article += ('\n' + newsadd.text)

    return {
        'id':url_normal.split('/')[-1].split('.html')[0],
        'title':title,
        'article':article.strip(),
        'genre':genre,
        'keywords':keywords,
        'url':url_normal,
        'datePublished':date,
        'dateModified':date_m
    }

def remove_rt(text):
    """Return ``text`` with every non-empty ``<rt>...</rt>`` element removed."""
    rt_element = re.compile('<rt>.+?</rt>')
    return rt_element.sub('', text)

def tag(text):
    """Replace the colour-coded ``<span>`` wrappers with brace markers.

    ``colorC`` becomes ``{org}...{/org}``, ``colorL`` becomes
    ``{plc}...{/plc}`` and ``colorN`` becomes ``{per}...{/per}``, applied
    in that order.
    """
    substitutions = (
        (r'<span class="colorC">(.+?)</span>', r"{org}\1{/org}"),
        (r'<span class="colorL">(.+?)</span>', r"{plc}\1{/plc}"),
        (r'<span class="colorN">(.+?)</span>', r"{per}\1{/per}"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def retag(text):
    """Turn the ``{org}``/``{plc}``/``{per}`` brace markers into angle-bracket tags."""
    conversions = {
        r'{org}(.+?){/org}': r"<org>\1</org>",
        r'{plc}(.+?){/plc}': r"<plc>\1</plc>",
        r'{per}(.+?){/per}': r"<per>\1</per>",
    }
    result = text
    # Dicts keep insertion order, so the markers are rewritten org, plc, per.
    for marker_pattern, tagged in conversions.items():
        result = re.sub(marker_pattern, tagged, result)
    return result

def remove_a(text):
    """Strip anchor tags and unwrap ``<span class="under">`` words.

    Opening and closing ``<a>`` tags are deleted while their content is
    kept. A ``<span class="under">`` wrapper is removed when it contains
    only word characters.

    Args:
        text: HTML fragment as a string.

    Returns:
        The fragment with the markup described above removed.
    """
    # The lookahead demands that the tag name is exactly "a". Without it,
    # other tags that merely start with "a" (<abbr>, <address>, ...) would
    # be stripped too. [^>]* also copes with attributes split across lines.
    text = re.sub(r'</?a(?=[\s/>])[^>]*>', '', text)
    text = re.sub(r'<span class="under">(\w+)</span>', r'\1', text)
    return text

# for old web easy
def scrape_easy_one(html):
    """Parse one old-layout "easy" news page into a record.

    Args:
        html: Page markup in any form BeautifulSoup accepts.

    Returns:
        dict with the keys ``id``, ``title_easy``, ``title_easy_ruby``,
        ``article_easy``, ``article_easy_ruby``, ``url_easy``,
        ``url_normal`` and ``date_easy``. The ``*_ruby`` values keep the
        inner HTML; the plain values have ``<rt>`` elements removed and
        markup reduced to text.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Link to the regular article: keep only what follows the last
    # '/http://' in the href, then put the scheme back.
    regular_href = soup.find('div', id="regularnews").a.get('href')
    url_normal = 'http://' + regular_href.split('/http://')[-1]

    # The first and last character of the date text are dropped.
    date = soup.find('p', id="newsDate").text[1:-1]
    url_easy = soup.find('meta', attrs={'name':'shorturl'}).get('content')

    headline = soup.find('div', id='newstitle').h2
    title_easy_ruby = ''.join(str(node) for node in headline.contents).strip()
    title_easy = BeautifulSoup(remove_rt(str(headline)), "html.parser").text.strip()

    body = soup.find('div', id="newsarticle")
    # The colour spans are turned into {org}/{plc}/{per} markers before the
    # markup is flattened, so they survive as text until retag() below.
    article_easy = BeautifulSoup(tag(remove_rt(str(body))), "html.parser").text.strip()

    paragraph_lines = []
    for p in body.find_all('p'):
        # NOTE(review): comparing a Tag with [] looks always true, so this
        # filter does not appear to skip anything - confirm the intent.
        if p != []:
            paragraph_lines.append(''.join(remove_a(str(node)) for node in p.contents))
    article_easy_ruby = '\n'.join(paragraph_lines).strip()

    return {
        'id':url_easy.split('/')[-1].split('.html')[0],
        'title_easy':title_easy,
        'title_easy_ruby':title_easy_ruby,
        'article_easy':retag(article_easy),
        'article_easy_ruby':article_easy_ruby,
        'url_easy':url_easy,
        'url_normal':url_normal,
        'date_easy':date
    }

def make_date_normal_old(hmd, time):
    """Combine a date string and a ``H時M分`` time into ``YYYY-MM-DDTHH:MM``.

    Args:
        hmd: Date as eight characters, sliced as year ``[:4]``, month
            ``[4:6]`` and day ``[6:]``.
        time: Time text containing exactly one ``時`` separator, with the
            minutes optionally followed by ``分`` (e.g. ``'9時5分'``).
            Surrounding whitespace is ignored.

    Returns:
        The timestamp string, with single-digit hours and minutes
        zero-padded to two digits.

    Raises:
        ValueError: ``time`` does not contain exactly one ``時``.
    """
    year, month, day = hmd[:4], hmd[4:6], hmd[6:]
    # Scraped element text can carry newlines or spaces around the value.
    # Left in place, they end up inside the timestamp and stop strip('分')
    # from seeing the trailing 分.
    hour, minute = (part.strip() for part in time.split('時'))
    minute = minute.strip('分').strip()
    if len(hour) == 1:
        hour = '0' + hour
    if len(minute) == 1:
        minute = '0' + minute
    return f"{year}-{month}-{day}T{hour}:{minute}"


# for old web normal
def scrape_normal_one(html, url):
    """Parse one old-layout regular news page into a record.

    Args:
        html: Page markup in any form BeautifulSoup accepts.
        url: Original article URL. Its last path segment supplies the id
            and its second-to-last segment the date part.

    Returns:
        dict with the same keys as ``scrape``: ``id``, ``title``,
        ``article``, ``genre``, ``keywords`` (always ``[]``), ``url``,
        ``datePublished`` and ``dateModified`` (always ``""``).
    """
    soup = BeautifulSoup(html, "html.parser")
    title = soup.find('span', class_="contentTitle").text.strip()

    # Date comes from the URL path, time of day from the page itself.
    date = make_date_normal_old(url.split('/')[-2], soup.find('span', id="news_time").text)

    # Genres are the <meta name="keywords"> entries minus site boilerplate.
    boilerplate = ['NHK','ニュース', 'NHK NEWS WEB','ＮＨＫ','ＮＨＫニュース','']
    raw_keywords = soup.find('meta', attrs={'name':'keywords'}).get('content').split(',')
    genre = [k for k in raw_keywords if k not in boilerplate]

    # Lead paragraph first, then every continuation block on its own line.
    parts = [soup.find('div', id="news_textbody").text]
    for textmore in soup.find_all('div', id="news_textmore"):
        parts.append(textmore.text)
    for newsadd in soup.find_all('div', class_="news_add"):
        if newsadd.h3 is not None:
            # Drop the section heading so it is not mixed into the body text.
            newsadd.h3.extract()
        parts.append(newsadd.text)
    article = '\n'.join(parts)

    return {
        'id':url.split('/')[-1].split('.html')[0],
        'title':title,
        'article':article.strip(),
        'genre':genre,
        'keywords':[],
        'url':url,
        'datePublished':date,
        'dateModified':""
    }

def get_link(start=0):
    """List regular-article URLs that still need to be scraped.

    Takes the ``url_normal`` values of ``nhkwebeasy.json`` from row
    ``start`` onwards. It removes the URLs already present in
    ``nhkweb.json`` and those listed in ``nolinknormal.txt``.

    Args:
        start: Row offset into the easy dataset; earlier rows are skipped.

    Returns:
        Sorted list of unique URLs.
    """
    # Sets make each membership check constant-time; the result is
    # unchanged because it was already de-duplicated and sorted.
    scraped = set(pd.read_json('nhkweb.json', encoding='utf-8')['url'])
    df_e = pd.read_json('nhkwebeasy.json', encoding='utf-8')
    with open('nolinknormal.txt') as f:
        nolink = set(f.read().split())
    candidates = set(df_e['url_normal'][start:])
    return sorted(candidates - scraped - nolink)

def get_link_nogenre(start=0):
    """List the URLs of stored articles whose ``genre`` list is empty.

    Args:
        start: Index of the first record of ``nhkweb.json`` to inspect.
            The default 0 inspects every record. (The parameter used to
            be accepted but ignored; it now skips the leading records,
            matching how ``get_link`` uses its ``start``.)

    Returns:
        URLs in file order.
    """
    with open('nhkweb.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    return [dic['url'] for dic in data[start:] if dic['genre'] == []]

def _upsert_json_record(path, dic):
    """Insert ``dic`` into the JSON list at ``path``, or replace entries with the same id.

    The list is kept sorted by ``id`` and written back with a 4-space
    indent and non-ASCII characters unescaped.

    The new file content is built completely in memory before the file is
    reopened for writing. Opening in ``'w'`` mode truncates immediately,
    so doing the work afterwards would wipe the dataset whenever the
    record is malformed or cannot be serialised.

    Raises:
        KeyError: ``dic`` or a stored record has no ``'id'`` key.
    """
    target_id = dic['id']
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    replaced = False
    for i, d in enumerate(data):
        if d['id'] == target_id:
            data[i] = dic
            replaced = True
    if not replaced:
        data.append(dic)

    data.sort(key=lambda x: x['id'])
    payload = json.dumps(data, indent=4, ensure_ascii=False)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(payload)


def js(dic):
    """Store a regular-article record in ``nhkweb.json`` (insert or replace by id)."""
    _upsert_json_record('nhkweb.json', dic)


def js_e(dic):
    """Store an easy-article record in ``nhkwebeasy.json`` (insert or replace by id)."""
    _upsert_json_record('nhkwebeasy.json', dic)

In [4]:
# One hard-coded article URL; presumably a manual test input for the scraper.
htmls = ['https://www3.nhk.or.jp/news/html/20191117/k10012180511000.html']

In [36]:
# Regular-article URLs referenced by the easy dataset that are not yet in
# nhkweb.json (and not listed in nolinknormal.txt). Show the count and a sample.
htmls = get_link(0)
print(len(htmls))
htmls[:10]

1121


['http://www3.nhk.or.jp/news/html/20151022/k10010278681000.html',
 'http://www3.nhk.or.jp/news/html/20151103/k10010292821000.html',
 'http://www3.nhk.or.jp/news/html/20151107/k10010298041000.html',
 'http://www3.nhk.or.jp/news/html/20151108/k10010298051000.html',
 'http://www3.nhk.or.jp/news/html/20151108/k10010298411000.html',
 'http://www3.nhk.or.jp/news/html/20151109/k10010298901000.html',
 'http://www3.nhk.or.jp/news/html/20151109/k10010299191000.html',
 'http://www3.nhk.or.jp/news/html/20151109/k10010299481000.html',
 'http://www3.nhk.or.jp/news/html/20151110/k10010299961000.html',
 'http://www3.nhk.or.jp/news/html/20151110/k10010300521000.html']

In [29]:
# URLs of stored articles whose genre list is empty; this rebinds `htmls`.
htmls = get_link_nogenre(); htmls[:50]

[]

# scrape

In [4]:
# Start a Selenium-controlled Chrome session; later cells fetch pages through `driver`.
driver = webdriver.Chrome()

In [37]:
# For every pending URL: open the Wayback Machine listing of its 2019
# captures, follow the first capture link found, parse the archived page
# and store the record. URLs that cannot be processed are printed.
for nhkurl in tqdm.tqdm(htmls[3:]):
    driver.get(f'https://web.archive.org/web/2019*/{nhkurl}')
    time.sleep(10)  # give the listing page time to render
    # str() of the encoded bytes yields a one-line repr (newlines appear as
    # the two characters "\n"), which lets `.*?` below span the whole page.
    html = str(driver.page_source.encode('utf-8'))
    snap = re.search(r'(between|1 time).*?<a href="(.+?)">', html)
    if not snap:
        print(nhkurl)
        continue
    driver.get('https://web.archive.org' + snap.group(2))
    time.sleep(30)  # archived pages load slowly
    html = driver.page_source.encode('utf-8')
    try:
        dic = scrape_normal_one(html, nhkurl)
        js(dic)
    except Exception as err:
        # `except Exception` (not a bare except) keeps Ctrl-C / kernel
        # interrupt working during this multi-hour loop. The cause is
        # reported on its own line so the URL line stays unchanged.
        print(nhkurl)
        print(f'  -> {type(err).__name__}: {err}')





  0%|          | 0/1118 [00:00<?, ?it/s][A[A[A[A



  0%|          | 1/1118 [01:07<21:05:17, 67.97s/it][A[A[A[A



  0%|          | 2/1118 [02:22<21:38:27, 69.81s/it][A[A[A[A



  0%|          | 3/1118 [03:39<22:19:47, 72.10s/it][A[A[A[A



  0%|          | 4/1118 [04:48<22:00:52, 71.14s/it][A[A[A[A



  0%|          | 5/1118 [06:21<24:04:02, 77.85s/it][A[A[A[A



  1%|          | 6/1118 [07:36<23:41:53, 76.72s/it][A[A[A[A



  1%|          | 7/1118 [08:53<23:43:25, 76.87s/it][A[A[A[A



  1%|          | 8/1118 [10:06<23:20:20, 75.69s/it][A[A[A[A



  1%|          | 9/1118 [11:20<23:09:39, 75.18s/it][A[A[A[A



  1%|          | 10/1118 [12:36<23:13:13, 75.45s/it][A[A[A[A



  1%|          | 11/1118 [13:49<23:01:42, 74.89s/it][A[A[A[A



  1%|          | 12/1118 [14:43<21:05:07, 68.63s/it][A[A[A[Ahttp://www3.nhk.or.jp/news/html/20151111/k10010302431000.html




  1%|          | 13/1118 [15:46<20:28:20, 66.70s/it][A[A[A[A



  1

In [29]:
# Manual single-page run: parse whatever page the Selenium driver currently
# shows, attribute it to the hard-coded article URL, print and store it.
html = driver.page_source.encode('utf-8')
soup = BeautifulSoup(html, "html.parser")  # not used further in this cell
dic = scrape_normal_one(html, 'http://www3.nhk.or.jp/news/html/20150929/k10010252101000.html')
print(dic)
js(dic)

{'id': 'k10010252101000', 'title': '選手強化費 クレジットカードのポイントで寄付', 'article': 'クレジットカードのポイントをオリンピックを目指すトップ選手の強化費への寄付として募るというユニークな取り組みが、ＪＯＣ＝日本オリンピック委員会とクレジットカード大手の協力で２９日から始まりました。\n\nこの取り組みは、２０２０年東京オリンピックなどを見据え、選手強化の充実が求められるなか、ＪＯＣがオリンピックの公式スポンサーでクレジットカード大手のＶＩＳＡと協力して始めました。具体的には、ＶＩＳＡカードの会員がカードの利用に伴ってたまるポイントをオリンピックを目指すトップ選手を対象とした強化費としてＪＯＣに寄付することを選択します。カード会社は、ポイントを現金化してＪＯＣの口座に振り込み、ＪＯＣが来年のリオデジャネイロ大会に参加する２８の競技団体に配分するということです。配分の割合は通常の強化費配分の指針となる競技団体のランクなどに応じて決めて、配分した金額はホームページ上で公表するということです。期間は当面、２９日から来年８月末までの予定で、５年後の東京大会などに向けては、直接、ＪＯＣに寄付できる仕組みも検討するということです。取り組みに参加しているカード会社は、２９日現在、１４８社あり、こうした寄付の取り組みは世界のオリンピック委員会でも初めてだということです。\n\n\n\n会見に同席したＪＯＣの理事で陸上、ハンマー投げの４０歳、室伏広治選手は、「強化のサポートが十分ではない競技団体や選手がまだまだいるので、１人でも多くの選手がメダルを取れるように、また、東京大会への機運が高まるようにお願いしたい」と呼びかけたうえで、「自分自身、カードを利用してためたポイントが期限を迎えて消えてしまうことがたびたびある。消えてしまう前に寄付して、多くのアスリートや若い世代に行き渡るようにしたいですね」と話し会見場を和ませていました。また、競泳女子の１８歳、渡部香生子選手は、「競技を続ける環境が整わずに苦労している選手がいる。今回の取り組みで強化の支援はもちろん、若い選手たちの励みになれば」と話していました。', 'genre': ['スポーツ'], 'keywords': [], 'url': 'http://www3.nhk.o

In [91]:
# Debug probe: search the current page source for a `"genre":` entry.
html = str(driver.page_source.encode('utf-8'))
re.search(r'"genre":(.+?),', html)

# clean category & keyword

In [44]:
# Print the number of records in each dataset, regular first, then easy.
for dataset_path in ('nhkweb.json', 'nhkwebeasy.json'):
    with open(dataset_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
        print(len(data))

5147
2530


In [18]:
# check category
# Tally how often each genre label occurs across all stored articles.

with open('nhkweb.json','r', encoding='utf-8') as f:
    data = json.load(f)
print('articles: ', len(data))
genre = Counter(label for article in data for label in article['genre'])
genre.most_common()

articles:  5147


[('社会', 1639),
 ('国際', 1322),
 ('ビジネス', 823),
 ('科学・文化', 729),
 ('スポーツ', 702),
 ('政治', 615),
 ('暮らし', 410),
 ('地域', 319),
 ('気象・災害', 271)]

In [24]:
# genre <> keywords

with open('nhkweb.json','r', encoding='utf-8') as f:
    data = json.load(f)

category = ['社会', '国際', 'ビジネス', 'スポーツ', '政治', '科学・文化', '暮らし', '地域', '気象・災害']
for i, dic in enumerate(data):
    newgenre = []
    newkey = []
    for j in dic['genre']:
        if j in category:
            newgenre.append(j)
        elif j == "災害" or j == "気象":
            newgenre.append('気象・災害')
        elif j == "科学・医療":
            newgenre.append('科学・文化')
        elif j == "文化・エンタメ":
            newgenre.append('科学・文化')
        elif j == "経済":
            newgenre.append('ビジネス')
        else:
            newkey.append(j)
    for j in dic['keywords']:
        if j in category:
            newgenre.append(j)
        elif j == "災害" or j == "気象":
            newgenre.append('気象・災害')
        else:
            newkey.append(j)
    data[i]['genre'] = list(set(newgenre))
    data[i]['keywords'] = list(set(newkey))

with open('nhkweb.json','w', encoding='utf-8') as f:
    json.dump(data, f, indent=4, ensure_ascii=False)
    

In [34]:
# Count how often each id occurs in the easy dataset (a duplicate check).
# NOTE(review): the frame is named `normal` although it is read from
# nhkwebeasy.json - confirm which dataset was meant.
normal = pd.read_json('nhkwebeasy.json')
normal.id.value_counts()

k10012182341000    1
k10010531981000    1
k10011968211000    1
k10010547391000    1
k10011928501000    1
                  ..
k10010492921000    1
k10012044261000    1
k10012198791000    1
k10012178011000    1
k10011856231000    1
Name: id, Length: 2454, dtype: int64

# web easy archive

In [39]:
# Easy-article URLs listed in tobescraped.txt that are neither stored in
# nhkwebeasy.json nor listed in nolinkeasy.txt.
with open('tobescraped.txt') as f:
    lst = f.read().split()
with open('nolinkeasy.txt') as f:
    nolink = f.read().split()
with open('nhkwebeasy.json') as f:
    urls = [record['url_easy'] for record in json.load(f)]
pending = set(lst)
pending.difference_update(urls, nolink)
htmls = sorted(pending)
print(len(htmls))
htmls

14


['http://www3.nhk.or.jp/news/easy/k10010255571000/k10010255571000.html',
 'http://www3.nhk.or.jp/news/easy/k10010257891000/k10010257891000.html',
 'http://www3.nhk.or.jp/news/easy/k10010258241000/k10010258241000.html',
 'http://www3.nhk.or.jp/news/easy/k10010258631000/k10010258631000.html',
 'http://www3.nhk.or.jp/news/easy/k10010260881000/k10010260881000.html',
 'http://www3.nhk.or.jp/news/easy/k10010260921000/k10010260921000.html',
 'http://www3.nhk.or.jp/news/easy/k10010282071000/k10010282071000.html',
 'http://www3.nhk.or.jp/news/easy/k10010290761000/k10010290761000.html',
 'http://www3.nhk.or.jp/news/easy/k10010293221000/k10010293221000.html',
 'http://www3.nhk.or.jp/news/easy/k10010672751000/k10010672751000.html',
 'http://www3.nhk.or.jp/news/easy/k10010674481000/k10010674481000.html',
 'http://www3.nhk.or.jp/news/easy/k10010705131000/k10010705131000.html',
 'http://www3.nhk.or.jp/news/easy/k10010729201000/k10010729201000.html',
 'http://www3.nhk.or.jp/news/easy/k10010731741000/k

In [10]:
# Start a Selenium-controlled Chrome session for the easy-article archive loop.
driver = webdriver.Chrome()

In [38]:
# For every pending easy-article URL: open the Wayback Machine listing of
# its 2016 captures, follow the first capture link found, parse the
# archived page and store the record. URLs that cannot be processed are
# printed.
for nhkurl in htmls[:]:
    driver.get(f'https://web.archive.org/web/2016*/{nhkurl}')
    time.sleep(10)  # give the listing page time to render
    # str() of the encoded bytes yields a one-line repr (newlines appear as
    # the two characters "\n"), which lets `.*?` below span the whole page.
    html = str(driver.page_source.encode('utf-8'))
    snap = re.search(r'(between|1 time).*?<a href="(.+?)">', html)
    if not snap:
        print(nhkurl)
        continue
    driver.get('https://web.archive.org' + snap.group(2))
    time.sleep(20)  # archived pages load slowly
    html = driver.page_source.encode('utf-8')
    try:
        dic = scrape_easy_one(html)
        js_e(dic)
    except Exception as err:
        # One unparsable capture should not abort the remaining URLs; this
        # mirrors the handling in the regular-article loop.
        print(nhkurl)
        print(f'  -> {type(err).__name__}: {err}')







  0%|          | 0/25 [00:00<?, ?it/s][A[A[A[A[A[A





  4%|▍         | 1/25 [00:14<05:39, 14.13s/it][A[A[A[A[A[Ahttp://www3.nhk.or.jp/news/easy/k10010255571000/k10010255571000.html






  8%|▊         | 2/25 [00:26<05:14, 13.69s/it][A[A[A[A[A[Ahttp://www3.nhk.or.jp/news/easy/k10010257891000/k10010257891000.html






 12%|█▏        | 3/25 [00:37<04:42, 12.83s/it][A[A[A[A[A[Ahttp://www3.nhk.or.jp/news/easy/k10010258241000/k10010258241000.html






 16%|█▌        | 4/25 [00:49<04:20, 12.41s/it][A[A[A[A[A[Ahttp://www3.nhk.or.jp/news/easy/k10010258631000/k10010258631000.html






 20%|██        | 5/25 [01:01<04:06, 12.33s/it][A[A[A[A[A[Ahttp://www3.nhk.or.jp/news/easy/k10010260881000/k10010260881000.html






 24%|██▍       | 6/25 [01:12<03:48, 12.00s/it][A[A[A[A[A[Ahttp://www3.nhk.or.jp/news/easy/k10010260921000/k10010260921000.html






 28%|██▊       | 7/25 [01:24<03:38, 12.15s/it][A[A[A[A[A[Ahttp://www3.nhk.or.jp/news/easy/k1

In [32]:
# Manual single-page run: parse the page the Selenium driver currently shows.
html = driver.page_source.encode('utf-8')
dic = scrape_easy_one(html)


In [33]:
# Store the most recently parsed record (`dic`) in nhkwebeasy.json.
js_e(dic)

In [25]:
# Fetch one fixed Wayback Machine capture with requests (no browser) and parse it.
html = requests.get('https://web.archive.org/web/20160824120410/http://www3.nhk.or.jp/news/easy/k10010642151000/k10010642151000.html').text
dic = scrape_easy_one(html)