In [1]:
import numpy as np
import pandas as pd
import re, json, csv, requests, time, glob, tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from collections import Counter

In [3]:
def scrape_one_new(html):
    """Extract one article record from a page that carries JSON-LD metadata.

    The last ``<script type="application/ld+json">`` element is parsed as the
    metadata source. Values missing from it are recovered from the markup:
    the title from ``<h1 class="content--title">``, the publication time from
    a ``datetime: YYYY-MM-DDTHH:MM`` fragment in the raw page, and the genre
    from the ``keywords`` meta tag (site-name entries removed).

    The article text is the ``news_textbody`` div followed by every
    ``news_textmore`` div and every ``news_add`` div (with its ``<h3>``
    removed), joined by newlines.

    Args:
        html: Page source accepted by ``BeautifulSoup``.

    Returns:
        dict with keys ``id``, ``title``, ``article``, ``genre``,
        ``keywords``, ``url``, ``datePublished`` and ``dateModified``.

    Raises:
        IndexError: if the page has no JSON-LD script element.
        AttributeError: if an element that is actually needed is missing.
    """
    soup = BeautifulSoup(html, "html.parser")
    json_data = json.loads(soup.find_all("script", type="application/ld+json")[-1].text)

    # The markup fallbacks are only evaluated when the JSON-LD value is
    # absent. Passing them as dict.get() defaults computes them eagerly, which
    # crashed on pages that had the JSON-LD value but not the fallback markup.
    if 'headline' in json_data:
        title = json_data['headline']
    else:
        title = soup.find('h1', class_='content--title').text
    if 'datePublished' in json_data:
        date = json_data['datePublished']
    else:
        date = re.search(r'datetime:.*?(\d{4}-\d{2}-\d{2}T\d{2}:\d{2})', str(html)).group(1)
    date_m = json_data.get('dateModified', '')

    genre = json_data.get('genre', [])
    if genre == []:
        genre = [k for k in soup.find('meta', attrs={'name':'keywords'}).get('content').split(',') if k not in ['NHK','ニュース', 'NHK NEWS WEB']]
    keywords = json_data.get('keywords', [])

    article = soup.find('div', id="news_textbody").text
    # Keep only what follows the last 'https:' in og:url, then restore the scheme.
    url_normal = 'https:' + soup.find('meta', property="og:url").get('content').rsplit('https:')[-1]
    for textmore in soup.find_all('div', id="news_textmore"):
        article += ('\n' + textmore.text)
    for newsadd in soup.find_all('div', class_="news_add"):
        # Drop the section heading so only the body text is appended.
        if newsadd.h3 is not None:
            newsadd.h3.extract()
        article += ('\n' + newsadd.text)

    return {
        'id':url_normal.split('/')[-1].split('.html')[0],
        'title':title,
        'article':article.strip(),
        'genre':genre,
        'keywords':keywords,
        'url':url_normal,
        'datePublished':date,
        'dateModified':date_m
    }

# Helpers for the old page layout of the regular ("normal") NHK news site
def make_date_normal_old(hmd,time):
    """Build a ``YYYY-MM-DDTHH:MM`` timestamp from a date string and a clock text.

    Args:
        hmd: Date as ``YYYYMMDD``; it is sliced positionally, not validated.
        time: Text containing a clock time written as ``H時M分``. Surrounding
            whitespace or other text (for example a leading ``5月15日``) is
            tolerated; the first ``H時M`` occurrence is used.

    Returns:
        The timestamp string, with hour and minute left-padded to two digits.

    Raises:
        ValueError: if no ``H時M`` time can be found in ``time``.
    """
    # Searching for the pattern, rather than splitting on '時' and stripping
    # '分', keeps stray whitespace or a date prefix out of the hour/minute.
    found = re.search(r'(\d{1,2})時\s*(\d{1,2})分?', time)
    if found is None:
        raise ValueError(f"no 'H時M分' time found in {time!r}")
    hour, minute = found.groups()
    year, month, day = hmd[:4], hmd[4:6], hmd[6:]
    return f"{year}-{month}-{day}T{hour.zfill(2)}:{minute.zfill(2)}"

def scrape_one_old(html, url):
    """Extract one article record from an old-layout page.

    The title comes from ``<span class="contentTitle">``. The publication
    date combines the second-to-last path segment of ``url`` with the text of
    ``<span id="news_time">``. The genre is the ``keywords`` meta tag minus
    site-name entries. The article text is ``news_textbody`` followed by
    every ``news_textmore`` and ``news_add`` element (``div`` or ``p``), the
    latter with its ``<h3>`` removed, joined by newlines.

    Args:
        html: Page source accepted by ``BeautifulSoup``.
        url: Original article URL; also supplies the record id.

    Returns:
        dict with the same keys as ``scrape_one_new``; ``keywords`` is always
        an empty list and ``dateModified`` an empty string.
    """
    soup = BeautifulSoup(html, "html.parser")
    headline = soup.find('span', class_="contentTitle").text.strip()

    url_parts = url.split('/')
    published = make_date_normal_old(url_parts[-2], soup.find('span', id="news_time").text)

    ignored = ['NHK','ニュース', 'NHK NEWS WEB','ＮＨＫ','ＮＨＫニュース','']
    meta_keywords = soup.find('meta', attrs={'name':'keywords'}).get('content')
    genre = [word for word in meta_keywords.split(',') if word not in ignored]

    body_tags = ['div','p']
    pieces = [soup.find(body_tags, id="news_textbody").text]
    pieces.extend(more.text for more in soup.find_all(body_tags, id="news_textmore"))
    for extra in soup.find_all(body_tags, class_="news_add"):
        heading = extra.h3
        if heading is not None:
            # Remove the section heading before taking the text.
            heading.extract()
        pieces.append(extra.text)

    record = {}
    record['id'] = url_parts[-1].split('.html')[0]
    record['title'] = headline
    record['article'] = '\n'.join(pieces).strip()
    record['genre'] = genre
    record['keywords'] = []
    record['url'] = url
    record['datePublished'] = published
    record['dateModified'] = ""
    return record

def js(dic):
    """Insert or replace one record in ``nhkweb.json``, keeping it sorted by id.

    Every stored record whose ``id`` equals ``dic['id']`` is replaced by
    ``dic``; if none matches, ``dic`` is appended. The list is then sorted by
    ``id`` and written back as indented, non-ASCII-escaped JSON.

    Args:
        dic: Record to store; must have an ``id`` key and be JSON-serialisable.

    Raises:
        TypeError: if the data cannot be serialised. The file is left
            untouched in that case.
    """
    with open('nhkweb.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    replaced = False
    for i, d in enumerate(data):
        if dic['id'] == d['id']:
            data[i] = dic
            replaced = True
    if not replaced:
        data.append(dic)
    data = sorted(data, key=lambda x:x['id'])

    # Serialise before reopening the file for writing: opening with 'w'
    # truncates it at once, so a failure while merging, sorting or encoding
    # must happen first or the existing data would be lost.
    payload = json.dumps(data, indent=4, ensure_ascii=False)
    with open('nhkweb.json', 'w', encoding='utf-8') as f:
        f.write(payload)

def geturl():
    """Return the lookup URLs of articles that still have to be scraped.

    An article is pending when its id is in ``nhkwebeasy.json`` but not in
    ``nhkweb.json``. Entries of ``nolinknormal.txt`` are excluded; each entry
    may be a bare article id (excludes the whole article) or a full URL as
    listed in ``linknormal.txt`` (excludes only that URL).

    Returns:
        list of URLs from ``linknormal.txt``, in file order.
    """
    idnormal = pd.read_json('nhkweb.json', encoding='utf-8')['id'].tolist()
    ideasy = pd.read_json('nhkwebeasy.json', encoding='utf-8')['id'].tolist()
    nolink = set(pd.read_csv('nolinknormal.txt', encoding='utf-8', header=None)[0].tolist())
    notyet = set(ideasy) - set(idnormal) - nolink
    existurl = pd.read_csv('linknormal.txt', encoding='utf-8', header=None)[0].tolist()
    # The scraping loop records failures as full URLs, which never match the
    # id set above, so known-bad URLs are filtered here by exact match.
    return [
        url for url in existurl
        if url not in nolink and url.split('.html')[0].split('/')[-1] in notyet
    ]

def checkwrongid(): # check wrong ID in newswebeasy
    """List the ids in ``nhkwebeasy.json`` that disagree with their ``url_normal``.

    For each record the id implied by ``url_normal`` is the last path segment
    up to ``.html``. Records whose ``id`` differs from it are reported. The
    total number of records is printed as a side effect.

    Returns:
        list of the mismatching ``id`` values.
    """
    df = pd.read_json('nhkwebeasy.json', encoding='utf-8')
    print(len(df))
    # split('.html')[0] removes the extension. str.strip('.html') would strip
    # any of the characters '.', 'h', 't', 'm', 'l' from both ends instead.
    df['normalID'] = df['url_normal'].apply(lambda x:x.split('/')[-1].split('.html')[0])
    return df[df['id'] != df['normalID']]['id'].tolist()

def wrongscrape():
    """Print the original article URL for every id flagged by ``checkwrongid``.

    ``linknormal.txt`` is searched from its last line to its first; for each
    flagged id the first URL containing it is printed with the
    ``.../*/`` lookup prefix removed. Ids with no matching URL print nothing.
    """
    # The original called an undefined `wrongid()`; `checkwrongid()` is the
    # helper in this file that returns the list of mismatching ids.
    wrongids = checkwrongid()
    existurl = pd.read_csv('linknormal.txt', encoding='utf-8', header=None)[0].tolist()[::-1]
    for ID in wrongids:
        for url in existurl:
            if ID in url:
                print(url.split('/*/')[-1])
                break

# scrape

In [4]:
# Collect the lookup URLs that still need scraping, report how many there are
# and preview the first 15.
urls = geturl()
print(len(urls))
urls[:15]

71


['https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20121226/k10014434551000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20130109/k10014684831000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20130109/k10014696511000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20130109/k10014696581000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20130424/k10014142721000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20130508/k10014434551000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20130514/k10014552461000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20130518/k10014672051000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20130527/k10014881651000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20131121/k10013226641000.html',
 'https://web.archive.org/web/*/http://w

In [4]:
# Start the Selenium-controlled Chrome session that the scraping loop uses.
driver = webdriver.Chrome()

In [16]:
# For every pending lookup URL: open it in the browser, pick a snapshot link
# from the rendered page, download that snapshot, parse it with the old-layout
# scraper and store the record in nhkweb.json. URLs whose snapshot cannot be
# parsed are appended to nolinknormal.txt.
id_exist = set(pd.read_json('nhkweb.json')['id'].tolist())
for url in urls:
    ID = url.split('.html')[0].split('/')[-1]
    if ID in id_exist:
        continue
    driver.get(url)
    time.sleep(8)  # presumably waits for the page to finish rendering
    # str() of the encoded bytes is the b'...' repr, in which newlines appear
    # as the two characters '\' 'n'. This lets '.' in the pattern below match
    # across what were separate source lines.
    html = str(driver.page_source.encode('utf-8'))
    snap = re.search(r'(between|1 time).*?<a href="(.+?)">', html)
    if snap is None:
        continue
    archiveurl = 'https://web.archive.org' + snap.group(2)
    response = requests.get(archiveurl)
    if response.status_code == 504:
        # One retry on gateway timeout; a second 504 aborts the whole run.
        response = requests.get(archiveurl)
        if response.status_code == 504:
            raise AssertionError(f"repeated 504 from {archiveurl}")
    time.sleep(3)
    if response.status_code != 200:
        # Without a successful download there is nothing valid to parse.
        # Previously the browser's lookup page was parsed instead and the URL
        # was recorded as having no usable link. Skip it for this run.
        continue
    html = response.text
    url_true = url.split('/*/')[-1]
    try:
        dic = scrape_one_old(html, url_true)
        js(dic)
    except Exception:
        # Parsing or storing failed: remember the URL as unusable.
        # KeyboardInterrupt is no longer swallowed here.
        with open('nolinknormal.txt', 'a', encoding='utf-8') as f:
            f.write(url)
            f.write('\n')
    else:
        # The record is now in nhkweb.json; no need to re-read the file.
        id_exist.add(dic['id'])

NoSuchWindowException: Message: no such window: window was already closed
  (Session info: chrome=79.0.3945.130)


In [71]:
# Manual one-off: parse whatever page is currently loaded in the browser as
# the old-layout article at this URL, and display the resulting record.
url_true = "http://www3.nhk.or.jp/news/html/20130515/k10014593831000.html"
html = driver.page_source.encode('utf-8')
dic = scrape_one_old(html, url_true)
dic

'。'

In [72]:
# Store the record currently held in `dic` in nhkweb.json.
js(dic)

In [66]:
# Inspect the main body element (a <p> or <div> with id "news_textbody") of
# the page held in `html`.
soup = BeautifulSoup(html, "html.parser")
soup.find(['p', 'div'], id='news_textbody')

<p id="news_textbody">福井県にある敦賀原子力発電所の断層を半年にわたって検証してきた国の原子力規制委員会の専門家会議は１５日、「２号機の真下を走る断層は活断層である」という報告書を最終的に取りまとめました。<br/>国の指針では、原子炉の真下に活断層があることを認めておらず、事業者が専門家会議の見解を覆せないかぎり敦賀原発２号機は運転ができなくなり、廃炉に追い込まれる可能性があります。</p>

# clean

In [50]:
# Row counts of the three working files: the easy-edition records, the
# normal-edition records, and the list of lookup URLs.
print('easy', len(pd.read_json('nhkwebeasy.json', encoding='utf-8')))
print('normal', len(pd.read_json('nhkweb.json', encoding='utf-8')))
print('link', len(pd.read_csv('linknormal.txt', encoding='utf-8', header=None)))

easy 5602
normal 8730
link 56520


In [None]:
# check category

# Load every stored article, report how many there are, and tally how often
# each genre label occurs (most frequent first).
with open('nhkweb.json','r', encoding='utf-8') as f:
    data = json.load(f)
print('articles: ', len(data))
genre = Counter(label for record in data for label in record['genre'])
genre.most_common()

In [None]:
# genre <> keywords
# Re-sort the tags of every record in nhkweb.json: labels that are (or map to)
# one of the canonical categories go to 'genre', everything else goes to
# 'keywords'. The file is rewritten in place.
with open('nhkweb.json','r', encoding='utf-8') as f:
    data = json.load(f)

category = ['社会', '国際', 'ビジネス', 'スポーツ', '政治', '科学・文化', '暮らし', '地域', '気象・災害']
# Labels found in 'genre' that are folded into a canonical category.
genre_alias = {
    "災害": '気象・災害',
    "気象": '気象・災害',
    "科学・医療": '科学・文化',
    "文化・エンタメ": '科学・文化',
    "経済": 'ビジネス',
}
# NOTE(review): for 'keywords' only the weather/disaster labels were folded in
# the original code; that is preserved here. Confirm whether the other aliases
# should apply to keywords as well.
keyword_alias = {
    "災害": '気象・災害',
    "気象": '気象・災害',
}

for dic in data:
    newgenre = []
    newkey = []
    for tags, alias in ((dic['genre'], genre_alias), (dic['keywords'], keyword_alias)):
        for tag in tags:
            if tag in category:
                newgenre.append(tag)
            elif tag in alias:
                newgenre.append(alias[tag])
            else:
                newkey.append(tag)
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # rewritten file is identical from run to run. list(set(...)) produced a
    # hash-dependent order instead.
    dic['genre'] = list(dict.fromkeys(newgenre))
    dic['keywords'] = list(dict.fromkeys(newkey))

with open('nhkweb.json','w', encoding='utf-8') as f:
    json.dump(data, f, indent=4, ensure_ascii=False)
    

In [17]:
# Count how often each article id occurs in nhkweb.json; a count above 1
# would mean a duplicated record.
normal = pd.read_json('nhkweb.json')
normal.id.value_counts()

k10012275841000    1
k10012253571000    1
k10011780361000    1
k10012278171000    1
k10012160001000    1
                  ..
k10012275331000    1
k10012278751000    1
k10012261661000    1
k10012167431000    1
k10012257381000    1
Name: id, Length: 8730, dtype: int64