In [1]:
import numpy as np
import pandas as pd
import re, json, csv, requests, time, glob, tqdm, datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from collections import Counter

In [2]:
def scrape_one_new(html, url):
    soup = BeautifulSoup(html, "html.parser")
    json_data = json.loads(soup.find_all("script", type="application/ld+json")[-1].text)
    title = json_data.get('headline', soup.find('span', class_='contentTitle').text)
    date = json_data.get('datePublished', re.search(r'datetime:.*?(\d{4}-\d{2}-\d{2}T\d{2}:\d{2})', str(html)).group(1))
    date_m = json_data.get('dateModified', '')
    genre = json_data.get('genre', [])
    if genre == []:
        genre = [k for k in soup.find('meta', attrs={'name':'keywords'}).get('content').split(',') if k not in ['NHK','ニュース', 'NHK NEWS WEB']]
    keywords = json_data.get('keywords', [])
    article = soup.find('div', id="news_textbody").text
    if soup.find_all('div', id="news_textmore") != []:
        for textmore in soup.find_all('div', id="news_textmore"):
            article += ('\n' + textmore.text)
    if soup.find_all('div', class_="news_add") != []:
        for newsadd in soup.find_all('div', class_="news_add"):
            if newsadd.h3 != None:
                newsadd.h3.extract()
            article += ('\n' + newsadd.text)
            
    return {
        'id':url.split('/')[-1].split('.html')[0],
        'title':title,
        'article':article.strip(),
        'genre':genre,
        'keywords':keywords,
        'url':url,
        'datePublished':date,
        'dateModified':date_m
    }

# for old web normal
def make_date_normal_old(ymd,time):
    year, month, day = ymd[:4], ymd[4:6], ymd[6:]
    hour, minute = time.split('時')
    minute = minute.strip('分')
    if len(hour) == 1:
        hour = '0' + hour
    if len(minute) == 1:
        minute = '0' + minute
    return f"{year}-{month}-{day}T{hour}:{minute}"

def scrape_one_old(html, url_true):
    soup = BeautifulSoup(html, "html.parser")
    title = soup.find('span', class_="contentTitle").text.strip()
    ymd_ = url_true.split('/')[-2]
    time_ = soup.find('span', id="news_time").text
    date = make_date_normal_old(ymd_, time_)
    genre = [k for k in soup.find('meta', attrs={'name':'keywords'}).get('content').split(',') if k not in ['NHK','ニュース', 'NHK NEWS WEB','ＮＨＫ','ＮＨＫニュース','']]
    article = soup.find(['div','p'], id="news_textbody").text
    if soup.find_all(['div','p'], id="news_textmore") != []:
        for textmore in soup.find_all(['div','p'], id="news_textmore"):
            article += ('\n' + textmore.text)
    if soup.find_all(['div','p'], class_="news_add") != []:
        for newsadd in soup.find_all(['div','p'], class_="news_add"):
            if newsadd.h3 != None:
                newsadd.h3.extract()
            article += ('\n' + newsadd.text)
            
    return {
        'id':url_true.split('/')[-1].split('.html')[0],
        'title':title,
        'article':article.strip(),
        'genre':genre,
        'keywords':[],
        'url':url_true,
        'datePublished':date,
        'dateModified':""
    }

def write_nolink(ID, year):
    with open(f'nolinknormal{year}.txt', 'a') as f:
        f.write(str(ID) + '\n')

def get_archiveurl_from_id(ID, date, http=True):
    if http:
        url = f'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/{date}/k{ID}1000.html'
    else:
        url = f'https://web.archive.org/web/*/https://www3.nhk.or.jp/news/html/{date}/k{ID}1000.html'

    driver.get(url)
    time.sleep(5)
    html = str(driver.page_source.encode('utf-8'))
    snap = re.search(r'(times between|1 time|times).*?<a href="(.+?)">', html)
    print(snap)
    try:
        archiveurl = 'https://web.archive.org' + snap.group(2)
    except:
        time.sleep(5)
        archiveurl = 'https://web.archive.org' + snap.group(2)
    return None if 'nhk' not in archiveurl else archiveurl

def get_article_from_archiveurl(archiveurl, browser=True):
    if browser == False:
        response = requests.get(archiveurl)
        html = response.text
    else:
        driver.get(archiveurl)
        time.sleep(3)
        html = driver.page_source.encode('utf-8')

    url_true = 'http' + archiveurl.split('/http')[-1]
    if 'This page is not available on the web' in str(html):
        return None
    try:
        try:
            dic = scrape_one_old(html, url_true)
        except:
            dic = scrape_one_new(html, url_true)
        return dic
    except:
        return None
    
def read_json(year):
    data = pd.read_json(f'nhkweb{year}.json')
    data = data[['id', 'url']]
    data.id = data.id.apply(lambda x:x[1:-4])
    data['date'] = data.url.apply(lambda x:x.split('news/html/')[-1].split('/')[0])
    print('length:', len(data))
    return data
    

def js(dic, year):
    if dic == None:
        return
    with open(f'nhkweb{year}.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    with open(f'nhkweb{year}.json', 'w', encoding='utf-8') as f:
        if dic['id'] not in [x['id'] for x in data]:
            data.append(dic)
        else:
            for i, d in enumerate(data):
                if dic['id'] == d['id']:
                    data[i] = dic
        data = sorted(data, key=lambda x:x['id'])
        json.dump(data, f, indent=4, ensure_ascii=False)

def datebefore(ymd:str):
    year, month, day = int(ymd[:4]), int(ymd[4:6]), int(ymd[6:])
    before = str(datetime.datetime(year, month, day) + datetime.timedelta(-1))
    year, month, day = before[:4], before[5:7], before[8:10]
    return f'{year}{month}{day}'

def dateafter(ymd:str):
    year, month, day = int(ymd[:4]), int(ymd[4:6]), int(ymd[6:])
    before = str(datetime.datetime(year, month, day) + datetime.timedelta(1))
    year, month, day = before[:4], before[5:7], before[8:10]
    return f'{year}{month}{day}'

# scrape from url list

In [3]:
# already exist

year = 2013
data = read_json(year)
data

length: 7764


Unnamed: 0,id,url,date
0,1001300062,http://www3.nhk.or.jp:80/news/html/20130306/k1...,20130306
1,1001300185,http://www3.nhk.or.jp:80/news/html/20130306/k1...,20130306
2,1001300268,http://www3.nhk.or.jp:80/news/html/20130306/k1...,20130306
3,1001300307,http://www3.nhk.or.jp:80/news/html/20130306/k1...,20130306
4,1001300311,http://www3.nhk.or.jp:80/news/html/20130306/k1...,20130306
...,...,...,...
7759,1001597255,http://www3.nhk.or.jp:80/news/html/20130305/t1...,20130305
7760,1001597664,http://www3.nhk.or.jp:80/news/html/20130305/t1...,20130305
7761,1001598638,http://www3.nhk.or.jp:80/news/html/20130306/t1...,20130306
7762,1001598682,http://www3.nhk.or.jp:80/news/html/20130306/t1...,20130306


In [4]:
data.id.value_counts()

1001550550    1
1001568624    1
1001566946    1
1001399656    1
1001372804    1
             ..
1001346757    1
1001400391    1
1001377461    1
1001595272    1
1001507831    1
Name: id, Length: 7764, dtype: int64

In [5]:
# to be scraped

urls = pd.read_csv(f'linknormal{year}.txt', header=None)
urls['id'] = urls[0].apply(lambda x:x.split('/')[-1][1:11])
print('to be scraped:', len(set(urls.id) - set(data.id)))
urls.head()

to be scraped: 13793


Unnamed: 0,0,id
0,https://web.archive.org/web/*/http://www3.nhk....,1001454510
1,https://web.archive.org/web/*/http://www3.nhk....,1001454527
2,https://web.archive.org/web/*/http://www3.nhk....,1001454540
3,https://web.archive.org/web/*/http://www3.nhk....,1001454546
4,https://web.archive.org/web/*/http://www3.nhk....,1001454570


In [6]:
# selenium

#options = Options()
#options.headless = True
driver = webdriver.Chrome()
count = 0

In [None]:
while True:
    try:
        data = read_json(year)
        id_exists = set(data.id)
        with open(f'nolinknormal{year}.txt', 'r') as f:
            noid = f.read()

        for i, row in urls.iterrows():
            
            # refresh
            if count == 500:
                driver.close()
                driver = webdriver.Chrome()
                count = 0

            # check ID
            ID = row.id
            if ID in id_exists or ID in noid:
                continue

            # get archive url
            driver.get(row[0])
            time.sleep(8)
            html = str(driver.page_source.encode('utf-8'))
            snap = re.search(r'(times between|1 time|times).*?<a href="(.+?)">', html)
            try:
                archiveurl = 'https://web.archive.org' + snap.group(2)
            except:
                #driver.get(row[0])
                time.sleep(5)
                archiveurl = 'https://web.archive.org' + snap.group(2)

            if 'nhk' not in archiveurl:
                write_nolink(ID, year)
                continue

            # get article
            dic = get_article_from_archiveurl(archiveurl, browser=False)
            if dic:
                js(dic, year)
                id_exists.add(ID)
            else:
                write_nolink(ID, year)
            count += 1
            
    except:
        print('ERROR', ID)
        pass

length: 7764
ERROR 1001552075
length: 9001
ERROR 1001523478
length: 13662
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
le

length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length

length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length

length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length

length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length

length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length

length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length

length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length

length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length

length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length

length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length: 17558
length

# get missing url & article (anaume)

26942
           id                                                url      date
0  1001127630  http://www3.nhk.or.jp/news/html/20180101/k1001...  20180101
1  1001127631  http://www3.nhk.or.jp/news/html/20180101/k1001...  20180101
2  1001127632  http://www3.nhk.or.jp/news/html/20171231/k1001...  20171231
3  1001127633  http://www3.nhk.or.jp/news/html/20171231/k1001...  20171231
4  1001127634  http://www3.nhk.or.jp/news/html/20171231/k1001...  20171231


Unnamed: 0,id,url,date
21378,1001152245,http://www3.nhk.or.jp/news/html/20180708/k1001...,20180708


In [39]:
year = 2018
start = 21373

In [None]:
for i in tqdm.tqdm(range(start, 26800)):
    # get the present row & next row
    ID1, ID2 = data.iat[i,0], data.iat[i+1,0]
    date = data.iat[i,2]
    before = datebefore(date)
    after = dateafter(date)

    # ID is continuous = no missing
    if int(ID1) + 1 == int(ID2):
        continue

    # not continuous
    else:  
        for ID in range(int(ID1)+1, int(ID2)):
            archiveurl = get_archiveurl_from_id(ID, date) # try the same day
            if archiveurl != None:
                dic = get_article_from_archiveurl(archiveurl)
                if dic:
                    js(dic, year)
                else:
                    write_nolink(ID, year) # error in NHK
            else:
                archiveurl = get_archiveurl_from_id(ID, after) # try the next day
                if archiveurl != None:
                    dic = get_article_from_archiveurl(archiveurl)
                    if dic:
                        js(dic, year)
                    else:
                        write_nolink(ID, year) # error in NHK
                else: 
                    archiveurl = get_archiveurl_from_id(ID, before) # try the previous day
                    if archiveurl != None:
                        dic = get_article_from_archiveurl(archiveurl)
                        if dic:
                            js(dic, year)
                        else:
                            write_nolink(ID, year) # error in NHK
                    else:
                        write_nolink(ID, year) # nolink

In [44]:
ID1

'1001152240'

# drop duplicate

In [42]:
year = 2018
with open(f'nhkweb{year}.json', 'r') as f:
    data = json.load(f)
newlst = []
for dic in data:
    if dic not in newlst:
        newlst.append(dic)
with open(f'nhkweb{year}.json', 'w', encoding='utf8') as f:
    json.dump(newlst, f, indent=4, ensure_ascii=False)

In [None]:
del data, newlst

In [5]:
A = np.array([[1, 2]])
B = np.array([[3,4], [5, 6]])

In [6]:
np.dot(B, A)

ValueError: shapes (2,2) and (1,2) not aligned: 2 (dim 1) != 1 (dim 0)