In [1]:
import numpy as np
import pandas as pd
import re, json, csv, requests, time, glob, tqdm, datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from collections import Counter

In [2]:
def scrape_one_new(html, url):
    """Extract one article record from a new-layout page that carries JSON-LD.

    The last ``<script type="application/ld+json">`` element supplies the
    headline, publication/modification dates, genre and keywords. Values that
    the JSON-LD does not provide are recovered from the HTML itself.

    Args:
        html: Page source as text.
        url: Original (non-archive) article URL; the record id is the last path
            component of this URL with ``.html`` removed.

    Returns:
        dict with the keys ``id``, ``title``, ``article``, ``genre``,
        ``keywords``, ``url``, ``datePublished`` and ``dateModified``.

    Raises:
        IndexError, AttributeError, json.JSONDecodeError (among others) when
        the page lacks the expected structure, e.g. no JSON-LD script or no
        ``news_textbody`` element. ``get_article_from_archiveurl`` relies on
        this to fall back to the old-layout parser.
    """
    soup = BeautifulSoup(html, "html.parser")
    json_data = json.loads(soup.find_all("script", type="application/ld+json")[-1].text)

    # The HTML fallbacks are evaluated only when the JSON-LD key is absent.
    # Passing them as the default of dict.get() would evaluate them every time
    # and raise on pages where the JSON-LD is complete but the fallback
    # element is missing.
    if 'headline' in json_data:
        title = json_data['headline']
    else:
        title = soup.find('span', class_='contentTitle').text
    if 'datePublished' in json_data:
        date = json_data['datePublished']
    else:
        date = re.search(r'datetime:.*?(\d{4}-\d{2}-\d{2}T\d{2}:\d{2})', str(html)).group(1)
    date_m = json_data.get('dateModified', '')

    genre = json_data.get('genre', [])
    if genre == []:
        # No genre in the JSON-LD: use the <meta name="keywords"> entries,
        # minus the site-wide boilerplate ones.
        genre = [k for k in soup.find('meta', attrs={'name':'keywords'}).get('content').split(',') if k not in ['NHK','ニュース', 'NHK NEWS WEB']]
    keywords = json_data.get('keywords', [])

    # Body text = main block, then every "more" block, then every "add" block.
    article = soup.find('div', id="news_textbody").text
    for textmore in soup.find_all('div', id="news_textmore"):
        article += ('\n' + textmore.text)
    for newsadd in soup.find_all('div', class_="news_add"):
        if newsadd.h3 is not None:
            # Drop the sub-heading so only the paragraph text is kept.
            newsadd.h3.extract()
        article += ('\n' + newsadd.text)

    return {
        'id':url.split('/')[-1].split('.html')[0],
        'title':title,
        'article':article.strip(),
        'genre':genre,
        'keywords':keywords,
        'url':url,
        'datePublished':date,
        'dateModified':date_m
    }

# for old web normal
def make_date_normal_old(hmd,time):
    """Build a ``YYYY-MM-DDTHH:MM`` timestamp from a date string and a Japanese clock string.

    Args:
        hmd: Date as ``YYYYMMDD`` (the first four characters are the year, the
            next two the month, the rest the day).
        time: Clock text of the form ``<hour>時<minute>分``. Leading/trailing
            whitespace is ignored, because the caller passes the raw text of
            an HTML element. Note that this parameter shadows the ``time``
            module inside the function.

    Returns:
        The timestamp string, with single-digit hour/minute left-padded with ``0``.

    Raises:
        ValueError: if ``time`` does not contain exactly one ``時``.
    """
    year, month, day = hmd[:4], hmd[4:6], hmd[6:]
    hour, minute = time.strip().split('時')
    hour = hour.strip()
    # Strip whitespace first so a trailing newline cannot shield the 分 suffix.
    minute = minute.strip().strip('分').strip()
    if len(hour) == 1:
        hour = '0' + hour
    if len(minute) == 1:
        minute = '0' + minute
    return f"{year}-{month}-{day}T{hour}:{minute}"

def scrape_one_old(html, url):
    """Extract one article record from an old-layout page (no JSON-LD).

    The title comes from the ``contentTitle`` span, the publication time from
    the date folder in ``url`` combined with the ``news_time`` span, and the
    genre from ``<meta name="keywords">`` with boilerplate entries removed.
    ``keywords`` and ``dateModified`` are always empty for this layout.

    Args:
        html: Page source as text.
        url: Original (non-archive) article URL.

    Returns:
        dict with the keys ``id``, ``title``, ``article``, ``genre``,
        ``keywords``, ``url``, ``datePublished`` and ``dateModified``.
    """
    boilerplate = ['NHK','ニュース', 'NHK NEWS WEB','ＮＨＫ','ＮＨＫニュース','']
    body_tags = ['div','p']
    page = BeautifulSoup(html, "html.parser")
    url_parts = url.split('/')

    headline = page.find('span', class_="contentTitle").text.strip()
    date_folder = url_parts[-2]
    clock_text = page.find('span', id="news_time").text
    published = make_date_normal_old(date_folder, clock_text)

    genre = []
    for word in page.find('meta', attrs={'name':'keywords'}).get('content').split(','):
        if word not in boilerplate:
            genre.append(word)

    # Collect the text pieces in page order: main body, "more" blocks, "add" blocks.
    pieces = [page.find(body_tags, id="news_textbody").text]
    for more in page.find_all(body_tags, id="news_textmore"):
        pieces.append(more.text)
    for extra in page.find_all(body_tags, class_="news_add"):
        if extra.h3 is not None:
            # Remove the sub-heading before reading the block's text.
            extra.h3.extract()
        pieces.append(extra.text)
    body = '\n'.join(pieces)

    return {
        'id':url_parts[-1].split('.html')[0],
        'title':headline,
        'article':body.strip(),
        'genre':genre,
        'keywords':[],
        'url':url,
        'datePublished':published,
        'dateModified':""
    }

def write_nolink(ID):
    """Append ``ID`` on its own line to ``nolinknormal.txt`` in the working directory."""
    with open('nolinknormal.txt', 'a') as log:
        print(ID, file=log)

def get_archiveurl_from_id(ID, date, http=True):
    """Look up a Wayback Machine snapshot link for one NHK article.

    Loads the Wayback calendar page for the article URL in the module-level
    Selenium ``driver``, waits three seconds, and takes the first
    ``<a href="...">`` that follows the text "times between", "1 time" or
    "times" in the page source.

    Args:
        ID: Numeric part of the article id; the URL uses ``k{ID}1000.html``.
        date: Date folder of the article URL (``YYYYMMDD``).
        http: Query the ``http://`` form of the article URL when true,
            otherwise the ``https://`` form.

    Returns:
        The absolute ``https://web.archive.org/...`` link, or ``None`` when no
        such link is found or the link found does not contain ``nhk``.
    """
    scheme = 'http' if http else 'https'
    url = f'https://web.archive.org/web/*/{scheme}://www3.nhk.or.jp/news/html/{date}/k{ID}1000.html'

    driver.get(url)
    time.sleep(3)
    # str() of a bytes object is its repr, so real newlines become the two
    # characters "\n" and the regex below can match across what were separate
    # lines. Presumably intentional; do not "simplify" to page_source.
    html = str(driver.page_source.encode('utf-8'))
    snap = re.search(r'(times between|1 time|times).*?<a href="(.+?)">', html)
    if snap is None:
        # No capture summary on the page: report "not found" instead of
        # raising AttributeError on snap.group().
        return None
    archiveurl = 'https://web.archive.org' + snap.group(2)
    return None if 'nhk' not in archiveurl else archiveurl

def get_article_from_archiveurl(archiveurl):
    """Download an archived page and parse it into an article record.

    The new-layout parser is tried first; if it raises, the old-layout parser
    is tried. A three-second pause follows the download.

    Args:
        archiveurl: Wayback Machine URL whose tail is the original ``http...`` URL.

    Returns:
        The dict produced by ``scrape_one_new`` or ``scrape_one_old``, or
        ``None`` when the archive reports the page as unavailable or neither
        parser succeeds.
    """
    response = requests.get(archiveurl)
    time.sleep(3)
    html = response.text
    # Recover the original article URL from the tail of the archive URL.
    url_true = 'http' + archiveurl.split('/http')[-1]
    if 'This page is not available on the web' in html:
        return None

    # Catch Exception rather than using a bare except, so that
    # KeyboardInterrupt / SystemExit still stop a long-running scrape.
    try:
        return scrape_one_new(html, url_true)
    except Exception:
        pass
    try:
        return scrape_one_old(html, url_true)
    except Exception:
        return None

def js(dic, year):
    """Insert or replace one article record in ``nhkweb{year}.json``.

    The file must already exist and hold a JSON list of records with an
    ``id`` key. If a record with the same id is present it is replaced
    (every occurrence); otherwise ``dic`` is appended. The list is written
    back sorted by ``id``, indented by 4, with non-ASCII text kept as-is.

    Args:
        dic: Record to store, or ``None`` to do nothing.
        year: Year used in the file name.

    Returns:
        None.
    """
    if dic is None:
        return
    path = f'nhkweb{year}.json'
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if any(d['id'] == dic['id'] for d in data):
        data = [dic if d['id'] == dic['id'] else d for d in data]
    else:
        data.append(dic)
    data.sort(key=lambda x: x['id'])

    # Serialise BEFORE opening the file for writing: opening with 'w'
    # truncates it, so a failure while building or encoding the data would
    # otherwise wipe the whole dataset.
    payload = json.dumps(data, indent=4, ensure_ascii=False)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(payload)

def datebefore(ymd:str):
    """Return the calendar day before ``ymd`` (``YYYYMMDD``), in the same format."""
    today = datetime.date(int(ymd[:4]), int(ymd[4:6]), int(ymd[6:]))
    yesterday = today - datetime.timedelta(days=1)
    # isoformat() is always zero-padded YYYY-MM-DD; drop the separators.
    return yesterday.isoformat().replace('-', '')

def dateafter(ymd:str):
    """Return the calendar day after ``ymd`` (``YYYYMMDD``), in the same format."""
    today = datetime.date(int(ymd[:4]), int(ymd[4:6]), int(ymd[6:]))
    tomorrow = today + datetime.timedelta(days=1)
    # isoformat() is always zero-padded YYYY-MM-DD; drop the separators.
    return tomorrow.isoformat().replace('-', '')

# check data

In [3]:
# Load the 2019 article dump and show how many records it contains.
data = pd.read_json('nhkweb2019.json')
data.shape[0]

18615

# get missing url & article

In [15]:
# Rebuild the lookup table used to find gaps in the ID sequence:
#   id   - article id without its first character and last four characters
#   url  - article URL
#   date - the path component that follows "news/html/" in the URL
data = pd.read_json('nhkweb2019.json')[['id', 'url']].assign(
    id=lambda frame: frame.id.apply(lambda raw_id: raw_id[1:-4]),
    date=lambda frame: frame.url.apply(lambda link: link.split('news/html/')[-1].split('/')[0]),
)
print(len(data))
data.head()  # not rendered: only the cell's last expression is displayed
data[data.id == "1001186603"]

18615


Unnamed: 0,id,url,date
9318,1001186603,http://www3.nhk.or.jp/news/html/20190329/k1001...,20190329


In [17]:
# Start the Chrome session that get_archiveurl_from_id uses through the
# module-level name `driver`.
options = Options()
# Headless mode is left disabled here; uncomment to run without a visible window.
#options.headless = True
driver = webdriver.Chrome(options=options)

In [18]:
year = 2019

# Walk consecutive rows of `data`; whenever two neighbouring IDs are not
# consecutive, try to recover every ID in between from the Wayback Machine.
# The range bounds are row positions in `data` (hard-coded for this run).
for i in tqdm.tqdm(range(9318, 12000)):
    # get the present row & next row
    ID1, ID2 = data.iat[i,0], data.iat[i+1,0]
    date = data.iat[i,2]
    before = datebefore(date)
    after = dateafter(date)

    # ID is continuous = no missing
    if int(ID1) + 1 == int(ID2):
        continue

    # not continuous
    for ID in range(int(ID1)+1, int(ID2)):
        # A missing article may be filed under the same day as its
        # predecessor, the next day, or the previous day: try in that order
        # and stop at the first date for which an archive link exists.
        for candidate_date in (date, after, before):
            archiveurl = get_archiveurl_from_id(ID, candidate_date)
            if archiveurl is None:
                continue
            dic = get_article_from_archiveurl(archiveurl)
            if dic:
                js(dic, year)
            else:
                write_nolink(ID) # error in NHK
            break
        else:
            # No archive link under any of the three dates.
            write_nolink(ID) # nolink



  0%|          | 0/2682 [00:00<?, ?it/s][A
  0%|          | 1/2682 [01:52<84:01:51, 112.84s/it][A
  0%|          | 4/2682 [02:37<62:05:57, 83.48s/it][A
  0%|          | 6/2682 [02:48<44:39:59, 60.09s/it][A
  0%|          | 9/2682 [02:59<32:02:23, 43.15s/it][A
  0%|          | 13/2682 [03:10<23:00:31, 31.03s/it][A
  1%|          | 28/2682 [03:21<16:10:47, 21.95s/it][A
  1%|          | 31/2682 [03:34<12:15:42, 16.65s/it][A
  3%|▎         | 74/2682 [03:47<8:30:27, 11.74s/it][A
  3%|▎         | 81/2682 [03:59<6:19:17,  8.75s/it][A
  8%|▊         | 203/2682 [04:10<4:14:06,  6.15s/it][A
  8%|▊         | 203/2682 [04:25<4:14:06,  6.15s/it][A
  8%|▊         | 208/2682 [04:30<3:47:38,  5.52s/it][A
  9%|▊         | 229/2682 [04:54<2:51:47,  4.20s/it][A
 14%|█▎        | 364/2682 [05:05<1:54:39,  2.97s/it][A
 16%|█▋        | 440/2682 [05:17<1:19:23,  2.12s/it][A
 16%|█▋        | 441/2682 [05:32<3:40:33,  5.90s/it][A
 21%|██        | 559/2682 [05:44<2:27:20,  4.16s/it][A
 22%|██▏

NoSuchWindowException: Message: no such window: window was already closed
  (Session info: chrome=80.0.3987.132)


In [18]:
# Show the value `ID` held when the loop stopped (presumably to decide where to resume).
ID

1001185289

In [10]:
# Rewrite nolinknormal.txt so it holds only the IDs that are still missing:
# de-duplicate, drop every ID that is now present in `data.id`, and sort.
nolinklist = pd.read_csv('nolinknormal.txt', header=None)
still_missing = sorted(set(map(str, nolinklist[0])) - set(data.id))
# header=False matters: Series.to_csv writes the series name ("0") as a
# header line by default, and the next read_csv(header=None) would then
# pick it up as a bogus ID. dtype=object keeps an empty result well-defined.
pd.Series(still_missing, dtype=object).to_csv('nolinknormal.txt', index=False, header=False)

  
