In [1]:
import numpy as np
import pandas as pd
import re, json, csv, requests, time, glob, tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from collections import Counter

In [2]:
def scrape_one_new(html, url):
    """Scrape an article page that carries JSON-LD metadata.

    Metadata is read from the last ``application/ld+json`` script tag; the
    HTML itself is only consulted for fields the JSON-LD does not provide.

    Args:
        html: page markup as a string.
        url: original article URL; its last path segment (without ``.html``)
            becomes the record id.

    Returns:
        dict with keys ``id``, ``title``, ``article``, ``genre``,
        ``keywords``, ``url``, ``datePublished`` and ``dateModified``.

    Raises:
        Whatever the lookups raise (IndexError, AttributeError, JSON decoding
        errors, ...) when the page does not have the expected structure.
        get_article_from_archiveurl relies on this to fall back to the
        old-layout scraper.
    """
    soup = BeautifulSoup(html, "html.parser")
    json_data = json.loads(soup.find_all("script", type="application/ld+json")[-1].text)

    # The HTML fallbacks are computed only when the JSON-LD key is missing.
    # Passing them as the default of dict.get() evaluated them eagerly, so a
    # page with complete JSON-LD still failed if the fallback element was absent.
    if 'headline' in json_data:
        title = json_data['headline']
    else:
        title = soup.find('span', class_='contentTitle').text
    if 'datePublished' in json_data:
        date = json_data['datePublished']
    else:
        date = re.search(r'datetime:.*?(\d{4}-\d{2}-\d{2}T\d{2}:\d{2})', str(html)).group(1)
    date_m = json_data.get('dateModified', '')

    genre = json_data.get('genre', [])
    if genre == []:
        # No genre in the JSON-LD: use the meta keywords minus the site-wide ones.
        genre = [k for k in soup.find('meta', attrs={'name':'keywords'}).get('content').split(',') if k not in ['NHK','ニュース', 'NHK NEWS WEB']]
    keywords = json_data.get('keywords', [])

    # Body text = main body, then every "read more" part, then every
    # additional section with its <h3> heading removed.
    article = soup.find('div', id="news_textbody").text
    for textmore in soup.find_all('div', id="news_textmore"):
        article += ('\n' + textmore.text)
    for newsadd in soup.find_all('div', class_="news_add"):
        if newsadd.h3 is not None:
            newsadd.h3.extract()
        article += ('\n' + newsadd.text)

    return {
        'id':url.split('/')[-1].split('.html')[0],
        'title':title,
        'article':article.strip(),
        'genre':genre,
        'keywords':keywords,
        'url':url,
        'datePublished':date,
        'dateModified':date_m
    }

# for old web normal
def make_date_normal_old(hmd, time):
    """Build a ``YYYY-MM-DDTHH:MM`` string from a date stamp and a Japanese clock time.

    Args:
        hmd: date as ``YYYYMMDD`` (first four characters are the year, the
            next two the month, the rest the day).
        time: time written as ``<hour>時<minute>分``.

    Returns:
        The combined timestamp; one-character hour/minute parts are
        left-padded with a single ``0``.
    """
    def _two_digits(part):
        # Only a one-character part is padded; anything else is kept as is.
        return '0' + part if len(part) == 1 else part

    date_part = "{}-{}-{}".format(hmd[:4], hmd[4:6], hmd[6:])
    hour_text, minute_text = time.split('時')
    minute_text = minute_text.strip('分')
    return "{}T{}:{}".format(date_part, _two_digits(hour_text), _two_digits(minute_text))

def scrape_one_old(html, url):
    """Scrape an article page that has no usable JSON-LD metadata.

    The publication date is assembled from the second-to-last URL path
    segment and the ``news_time`` span; title, genre and body come from the
    HTML. ``keywords`` and ``dateModified`` are always empty in the result.

    Args:
        html: page markup as a string.
        url: original article URL.

    Returns:
        dict with the same keys as scrape_one_new.
    """
    ignored_keywords = ('NHK', 'ニュース', 'NHK NEWS WEB', 'ＮＨＫ', 'ＮＨＫニュース', '')
    body_tags = ['div', 'p']

    page = BeautifulSoup(html, "html.parser")
    headline = page.find('span', class_="contentTitle").text.strip()
    published = make_date_normal_old(
        url.split('/')[-2],
        page.find('span', id="news_time").text,
    )
    keyword_meta = page.find('meta', attrs={'name': 'keywords'})
    topics = [word for word in keyword_meta.get('content').split(',') if word not in ignored_keywords]

    # Collect the text pieces in document order of their kinds: main body,
    # "read more" parts, then additional sections without their <h3> heading.
    pieces = [page.find(body_tags, id="news_textbody").text]
    pieces.extend(extra.text for extra in page.find_all(body_tags, id="news_textmore"))
    for addition in page.find_all(body_tags, class_="news_add"):
        if addition.h3 is not None:
            addition.h3.extract()
        pieces.append(addition.text)

    record = {}
    record['id'] = url.split('/')[-1].split('.html')[0]
    record['title'] = headline
    record['article'] = '\n'.join(pieces).strip()
    record['genre'] = topics
    record['keywords'] = []
    record['url'] = url
    record['datePublished'] = published
    record['dateModified'] = ""
    return record

def get_archiveurl_from_id(ID, date):
    """Find a Wayback Machine snapshot URL for the article with the given ID.

    The Wayback calendar page is loaded through the module-level Selenium
    ``driver``, first for the https article URL and, if that yields nothing
    usable, for the http one.

    Args:
        ID: article number; the article file name is ``k{ID}1000.html``.
        date: ``YYYYMMDD`` directory the article is expected under.

    Returns:
        The snapshot URL (``https://web.archive.org/...``), or None when no
        link containing ``nhk`` is found for either scheme.
    """
    def _first_snapshot(calendar_url):
        # Load the calendar page and return the first snapshot link, or None.
        driver.get(calendar_url)
        time.sleep(3)  # give the page's JavaScript time to render the calendar
        # str(bytes) yields the bytes repr, in which line breaks are the two
        # characters "\n"; this lets `.*?` below span what were several lines.
        html = str(driver.page_source.encode('utf-8'))
        snap = re.search(r'(times between|1 time|times).*?<a href="(.+?)">', html)
        if snap is None:
            # No capture summary on the page: treat as "not archived" instead
            # of failing with AttributeError on snap.group().
            return None
        archiveurl = 'https://web.archive.org' + snap.group(2)
        # A link that does not point at an NHK page is not a snapshot of the article.
        return archiveurl if 'nhk' in archiveurl else None

    for scheme in ('https', 'http'):
        found = _first_snapshot(
            f'https://web.archive.org/web/*/{scheme}://www3.nhk.or.jp/news/html/{date}/k{ID}1000.html'
        )
        if found is not None:
            return found
    return None

def get_article_from_archiveurl(archiveurl):
    """Download an archived article page and scrape it.

    scrape_one_new is tried first; if it raises, scrape_one_old is tried.

    Args:
        archiveurl: Wayback Machine snapshot URL that embeds the original
            article URL after the timestamp.

    Returns:
        The scraped article dict, or None when the snapshot is reported as
        unavailable or neither scraper can parse the page.
    """
    response = requests.get(archiveurl)
    time.sleep(2)  # pause between requests to the archive
    html = response.text
    # Recover the original article URL embedded in the snapshot URL.
    url_true = 'http' + archiveurl.split('/http')[-1]
    if 'This page is not available on the web' in html:
        return None
    # `except Exception` rather than a bare `except:` so that Ctrl-C
    # (KeyboardInterrupt) and SystemExit still stop a long scraping run.
    try:
        return scrape_one_new(html, url_true)
    except Exception:
        pass
    try:
        return scrape_one_old(html, url_true)
    except Exception:
        return None

def js(dic, year):
    """Insert or replace one article record in ``nhkweb{year}.json``.

    The file holds a JSON list of article dicts. Every record whose ``id``
    equals ``dic['id']`` is replaced; if there is none, ``dic`` is appended.
    The list is then sorted by ``id`` and written back.

    Args:
        dic: article record to store, or None (in which case nothing happens).
        year: value interpolated into the file name.

    Notes:
        A missing file is treated as an empty list. The new content is fully
        serialised before the file is opened for writing, so a failure while
        merging or serialising leaves the existing file untouched.
    """
    if dic is None:
        return

    path = f'nhkweb{year}.json'
    try:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        data = []

    replaced = False
    for i, d in enumerate(data):
        if d['id'] == dic['id']:
            data[i] = dic
            replaced = True
    if not replaced:
        data.append(dic)
    data.sort(key=lambda x: x['id'])

    # Serialise first: opening with 'w' truncates the file immediately, so
    # nothing that can fail may run between the truncation and the write.
    payload = json.dumps(data, indent=4, ensure_ascii=False)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(payload)

# check data

In [20]:
# Load the scraped 2019 articles and display how many records the file holds.
data = pd.read_json('nhkweb2019.json')
len(data)

10679

# get missing url

In [21]:
# Reload the 2019 articles and reduce them to the columns used for gap detection.
data = pd.read_json('nhkweb2019.json')
# .copy() makes the column subset an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
data = data[['id', 'url']].copy()
# Stored ids look like 'k' + number + '1000'; keep only the number part.
data['id'] = data['id'].apply(lambda x:x[1:-4])
# The date directory is the path segment right after 'news/html/'.
data['date'] = data['url'].apply(lambda x:x.split('news/html/')[-1].split('/')[0])
data[data['id'] == "1001177001"]

Unnamed: 0,id,url,date
483,1001177001,https://www3.nhk.or.jp/news/html/20190107/k100...,20190107


In [18]:
# Inspect the rows at positions 520-549 (id, url, date).
data.iloc[520:550,:]

Unnamed: 0,id,url,date
520,1001177040,https://www3.nhk.or.jp/news/html/20190108/k100...,20190108
521,1001177041,http://www3.nhk.or.jp/news/html/20190108/k1001...,20190108
522,1001177042,https://www3.nhk.or.jp/news/html/20190108/k100...,20190108
523,1001177043,http://www3.nhk.or.jp/news/html/20190108/k1001...,20190108
524,1001177044,https://www3.nhk.or.jp/news/html/20190108/k100...,20190108
525,1001177045,http://www3.nhk.or.jp/news/html/20190108/k1001...,20190108
526,1001177046,https://www3.nhk.or.jp/news/html/20190108/k100...,20190108
527,1001177047,http://www3.nhk.or.jp/news/html/20190108/k1001...,20190108
528,1001177048,http://www3.nhk.or.jp/news/html/20190108/k1001...,20190108
529,1001177049,http://www3.nhk.or.jp/news/html/20190108/k1001...,20190108


In [19]:
# Start a Chrome session; get_archiveurl_from_id reads this module-level `driver`.
driver = webdriver.Chrome()

In [22]:
year = 2019


def _recover_article(ID, date):
    """Look up article ID under `date` on the Wayback Machine, scrape it and store it.

    Returns True when a snapshot URL was found (even if scraping it yielded
    nothing), False otherwise.
    """
    archiveurl = get_archiveurl_from_id(ID, date)
    if archiveurl is None:
        return False
    js(get_article_from_archiveurl(archiveurl), year)
    return True


# Walk neighbouring rows of `data` (column 0 = id, column 2 = date) and try to
# recover every ID missing between two consecutive known articles.
for i in range(500, 600):
    ID1, ID2 = data.iat[i,0], data.iat[i+1,0]
    date1, date2 = data.iat[i,2], data.iat[i+1,2]

    if int(ID1) + 1 == int(ID2): # continuous = no missing
        continue

    print(ID1)
    if date1 == date2:  # not continuous, but in the same day
        for ID in range(int(ID1)+1, int(ID2)):
            _recover_article(ID, date1)
    else:  # not continuous, not in the same day
        # Missing IDs are tried under date1 until one is only found under
        # date2; from then on the rest of the gap is looked up under date2.
        is_date1 = True
        for ID in range(int(ID1)+1, int(ID2)):
            if is_date1:
                if not _recover_article(ID, date1):
                    if _recover_article(ID, date2):
                        is_date1 = False
            else:
                # Previously this branch stored the stale `dic` from an earlier
                # iteration without scraping the newly found snapshot.
                _recover_article(ID, date2)

1001177030
1001177063
1001177067
1001177073
1001177078
1001177088
1001177103
1001177112
1001177114
1001177121
1001177134
1001177145
1001177150
1001177168
1001177172
1001177180
1001177182
1001177186
1001177193


NoSuchWindowException: Message: no such window: window was already closed
  (Session info: chrome=80.0.3987.132)


In [None]:
# Scratch cell: evaluates the page currently loaded in `driver` as UTF-8 bytes.
driver.page_source.encode("utf8")