In [1]:
import numpy as np
import pandas as pd
import re, json, csv, requests, time, glob, tqdm, datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from collections import Counter

In [2]:
def scrape_one_new(html, url):
    """Parse an article page that carries JSON-LD metadata.

    Metadata is read from the last ``<script type="application/ld+json">``
    element; values missing there are recovered from the page markup.

    Parameters
    ----------
    html : str or bytes
        Page source handed to BeautifulSoup.
    url : str
        URL of the article; its last path segment (minus ``.html``) becomes
        the record id.

    Returns
    -------
    dict
        Keys: ``id``, ``title``, ``article``, ``genre``, ``keywords``, ``url``,
        ``datePublished``, ``dateModified``.

    Raises
    ------
    IndexError
        If the page has no JSON-LD script element.
    AttributeError
        If an element or pattern that is actually needed is absent
        (e.g. no ``news_textbody`` div).
    """
    soup = BeautifulSoup(html, "html.parser")
    json_data = json.loads(soup.find_all("script", type="application/ld+json")[-1].text)

    # The markup fallbacks are only evaluated when the JSON-LD key is missing.
    # Passing them as the default of dict.get() would evaluate them every time
    # and raise on pages that have the metadata but lack the fallback markup.
    if 'headline' in json_data:
        title = json_data['headline']
    else:
        title = soup.find('span', class_='contentTitle').text

    if 'datePublished' in json_data:
        date = json_data['datePublished']
    else:
        date = re.search(r'datetime:.*?(\d{4}-\d{2}-\d{2}T\d{2}:\d{2})', str(html)).group(1)

    date_m = json_data.get('dateModified', '')

    genre = json_data.get('genre', [])
    if genre == []:
        # No genre in the metadata: use the <meta name="keywords"> entries,
        # dropping the generic site-wide keywords.
        genre = [k for k in soup.find('meta', attrs={'name':'keywords'}).get('content').split(',') if k not in ['NHK','ニュース', 'NHK NEWS WEB']]
    keywords = json_data.get('keywords', [])

    # Body text = main block, then every "more" block, then every "add" block,
    # each separated by a newline.
    article = soup.find('div', id="news_textbody").text
    for textmore in soup.find_all('div', id="news_textmore"):
        article += ('\n' + textmore.text)
    for newsadd in soup.find_all('div', class_="news_add"):
        # Remove the block's <h3> heading so it does not end up in the text.
        if newsadd.h3 is not None:
            newsadd.h3.extract()
        article += ('\n' + newsadd.text)

    return {
        'id':url.split('/')[-1].split('.html')[0],
        'title':title,
        'article':article.strip(),
        'genre':genre,
        'keywords':keywords,
        'url':url,
        'datePublished':date,
        'dateModified':date_m
    }

# for old web normal
def make_date_normal_old(ymd,time):
    """Build a ``YYYY-MM-DDTHH:MM`` timestamp from a date and a Japanese time.

    Parameters
    ----------
    ymd : str
        Date as ``YYYYMMDD``.
    time : str
        Time written as ``<hour>時<minute>分`` (e.g. ``'20時20分'``).
        Surrounding whitespace is ignored. Note that this parameter shadows
        the ``time`` module inside the function.

    Returns
    -------
    str
        Timestamp with hour and minute zero-padded to two digits; an empty
        hour or minute component is rendered as ``00``.

    Raises
    ------
    ValueError
        If ``time`` does not contain exactly one ``時``.
    """
    year, month, day = ymd[:4], ymd[4:6], ymd[6:]
    # Text scraped from the page may carry leading/trailing whitespace, which
    # would otherwise defeat both the '分' removal and the zero padding.
    hour, minute = time.strip().split('時')
    hour = hour.strip().zfill(2)
    minute = minute.strip().strip('分').strip().zfill(2)
    return f"{year}-{month}-{day}T{hour}:{minute}"

def scrape_one_old(html, url_true):
    """Parse an article page that has no JSON-LD metadata.

    The publication date is assembled from the second-to-last URL path
    segment (``YYYYMMDD``) and the text of the ``news_time`` span. Any missing
    element raises (typically ``AttributeError``), which callers can use as a
    signal that the page is not in this layout.

    Returns a dict with the keys ``id``, ``title``, ``article``, ``genre``,
    ``keywords`` (always empty), ``url``, ``datePublished`` and
    ``dateModified`` (always empty).
    """
    generic_keywords = ['NHK','ニュース', 'NHK NEWS WEB','ＮＨＫ','ＮＨＫニュース','']
    text_tags = ['div','p']

    soup = BeautifulSoup(html, "html.parser")
    title = soup.find('span', class_="contentTitle").text.strip()

    url_parts = url_true.split('/')
    published = make_date_normal_old(url_parts[-2], soup.find('span', id="news_time").text)

    meta_content = soup.find('meta', attrs={'name':'keywords'}).get('content')
    genre = [kw for kw in meta_content.split(',') if kw not in generic_keywords]

    # Collect the text pieces in page order and join them with newlines.
    pieces = [soup.find(text_tags, id="news_textbody").text]
    for more in soup.find_all(text_tags, id="news_textmore"):
        pieces.append(more.text)
    for extra in soup.find_all(text_tags, class_="news_add"):
        heading = extra.h3
        if heading is not None:
            # Detach the heading so only the block's body text is kept.
            heading.extract()
        pieces.append(extra.text)
    article = '\n'.join(pieces)

    record = {}
    record['id'] = url_parts[-1].split('.html')[0]
    record['title'] = title
    record['article'] = article.strip()
    record['genre'] = genre
    record['keywords'] = []
    record['url'] = url_true
    record['datePublished'] = published
    record['dateModified'] = ""
    return record

def write_nolink(ID, year):
    """Append ``ID`` as its own line to ``nolinknormal{year}.txt``.

    The file is opened in append mode in the current working directory, so
    repeated calls accumulate one id per line.
    """
    log_path = f'nolinknormal{year}.txt'
    with open(log_path, 'a') as log_file:
        # print() writes str(ID) followed by a newline.
        print(ID, file=log_file)

def get_archiveurl_from_id(ID, date, http=True):
    """Look up a Wayback Machine snapshot link for one article.

    Loads the ``web.archive.org/web/*/...`` listing page for the article URL
    in the module-level Selenium ``driver`` and extracts the first link that
    follows the "N times" summary text.

    Parameters
    ----------
    ID : str or int
        Middle part of the article id; the page is ``k{ID}1000.html``.
    date : str
        Directory segment of the article URL (``news/html/{date}/``).
    http : bool, default True
        Query the ``http://`` form of the original URL when true, otherwise
        the ``https://`` form.

    Returns
    -------
    str or None
        Absolute snapshot URL, or ``None`` when no snapshot link is found or
        the link found does not contain ``'nhk'``.
    """
    scheme = 'http' if http else 'https'
    url = f'https://web.archive.org/web/*/{scheme}://www3.nhk.or.jp/news/html/{date}/k{ID}1000.html'
    # DOTALL lets ".*?" span line breaks between the summary text and the link.
    snapshot_link = re.compile(r'(times between|1 time|times).*?<a href="(.+?)">', re.DOTALL)

    driver.get(url)
    time.sleep(5)
    snap = snapshot_link.search(driver.page_source)
    if snap is None:
        # Presumably the page had not finished rendering: wait once more and
        # re-read the page source. Retrying on the stale match object could
        # never succeed.
        time.sleep(5)
        snap = snapshot_link.search(driver.page_source)
    if snap is None:
        return None

    archiveurl = 'https://web.archive.org' + snap.group(2)
    return None if 'nhk' not in archiveurl else archiveurl

def get_article_from_archiveurl(archiveurl, browser=True):
    """Download an archived article page and parse it into a record.

    Parameters
    ----------
    archiveurl : str
        Wayback Machine snapshot URL; the original article URL is recovered
        from everything after its last ``/http``.
    browser : bool, default True
        Fetch through the module-level Selenium ``driver`` when true (waits
        3 s after loading), otherwise through ``requests``.

    Returns
    -------
    dict or None
        The record from ``scrape_one_old``, or from ``scrape_one_new`` if the
        former fails; ``None`` if the snapshot is unavailable or both parsers
        fail.
    """
    if browser:
        driver.get(archiveurl)
        time.sleep(3)
        # bytes here, str in the requests branch; both parsers accept either.
        html = driver.page_source.encode('utf-8')
    else:
        html = requests.get(archiveurl).text

    url_true = 'http' + archiveurl.split('/http')[-1]
    if 'This page is not available on the web' in str(html):
        return None

    # Catch Exception rather than using a bare except so that Ctrl-C
    # (KeyboardInterrupt) still stops a long scraping run.
    try:
        return scrape_one_old(html, url_true)
    except Exception:
        pass
    try:
        return scrape_one_new(html, url_true)
    except Exception:
        return None
    
def read_json(year):
    """Load ``nhkweb{year}.json`` as an ``id`` / ``url`` / ``date`` table.

    Each ``id`` loses its first character and its last four characters, and
    ``date`` is the path segment that follows ``news/html/`` in the URL.
    The number of rows is printed before the frame is returned.
    """
    def _core_id(full_id):
        return full_id[1:-4]

    def _date_segment(link):
        return link.split('news/html/')[-1].split('/')[0]

    articles = pd.read_json(f'nhkweb{year}.json')[['id', 'url']]
    articles['id'] = articles['id'].map(_core_id)
    articles['date'] = articles['url'].map(_date_segment)
    print('length:', len(articles))
    return articles
    

def js(dic, year):
    """Insert or replace one article record in ``nhkweb{year}.json``.

    The file must already exist and contain a JSON list of records. Every
    existing record whose ``id`` equals ``dic['id']`` is replaced by ``dic``;
    if there is none, ``dic`` is appended. The list is written back sorted
    by ``id``.

    Parameters
    ----------
    dic : dict or None
        Record to store; ``None`` is a no-op.
    year : int or str
        Selects the target file ``nhkweb{year}.json``.
    """
    import os  # local import: only needed for the atomic replace below

    if dic is None:
        return

    path = f'nhkweb{year}.json'
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    replaced = False
    for i, d in enumerate(data):
        if dic['id'] == d['id']:
            data[i] = dic
            replaced = True
    if not replaced:
        data.append(dic)
    data = sorted(data, key=lambda x:x['id'])

    # Serialise completely before touching the file on disk: opening it with
    # 'w' first would truncate it, and any error while building or dumping the
    # data would then lose every stored record.
    payload = json.dumps(data, indent=4, ensure_ascii=False)
    tmp_path = path + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        f.write(payload)
    # Replace the old file in one step so readers never see a partial file.
    os.replace(tmp_path, path)

def datebefore(ymd:str):
    """Return the day before ``ymd``, both written as ``YYYYMMDD``."""
    current = datetime.date(int(ymd[:4]), int(ymd[4:6]), int(ymd[6:]))
    previous = current - datetime.timedelta(days=1)
    return f'{previous.year:04d}{previous.month:02d}{previous.day:02d}'

def dateafter(ymd:str):
    """Return the day after ``ymd``, both written as ``YYYYMMDD``."""
    current = datetime.date(int(ymd[:4]), int(ymd[4:6]), int(ymd[6:]))
    following = current + datetime.timedelta(days=1)
    return f'{following.year:04d}{following.month:02d}{following.day:02d}'

# open selenium & scrape manually

In [3]:
# selenium
# Start a Chrome session controlled by Selenium. get_archiveurl_from_id and
# get_article_from_archiveurl read this module-level `driver`, so this cell
# has to run before either of them is called.
driver = webdriver.Chrome()

In [4]:
# Scrape whatever page the Selenium-controlled browser is showing right now.
# NOTE(review): this presumably relies on the browser having been navigated by
# hand to an archived article first, so the cell is not reproducible from code
# alone — confirm before using it in a Restart & Run All workflow.
url = driver.current_url
html = driver.page_source.encode('utf-8')
# Keep only the part from the last '/http' onward, i.e. the URL embedded in
# the address of the page being viewed.
url_true = 'http' + url.split('/http')[-1]
dic = scrape_one_old(html, url_true)
# Display the parsed record for a visual check.
dic

{'id': 'k10011630731000',
 'title': '北海道の電力 冬も確保できる見通し 資源エネルギー庁',
 'article': '資源エネルギー庁は14日午後、記者会見で、北海道では冷え込みが厳しくなる冬に向け、電力需要が高まるものの火力発電所の再稼働などで供給力は確保されるという見通しを示しました。\n北海道では14日までに京極水力発電所の１号機と２号機が再稼働したことで当面、想定される需要のピーク、383万キロワットを上回る供給力を確保しました。また、今月末以降に苫東厚真火力発電所の１号機が復旧して再稼働すれば、来月前半には合わせて421万キロワットを確保できる見通しだとしています。さらに来月中旬以降に苫東厚真火力発電所の２号機も稼働すれば、来月後半には供給力は481万キロワットまで積み増すことができるとしています。その後も定期検査中の火力発電所を順次、再稼働することで、前の冬のことし１月25日、北海道で需要がピークになった525万キロワットを上回る供給力が確保できる見通しだとしています。しかし、老朽化した火力発電所を稼働させているため、トラブルなどで運転を停止した場合は再び電力が不足する事態に陥りかねないとして、引き続き節電の協力を求めています。',
 'genre': ['地震 ライフライン', '北海道地震', '災害'],
 'keywords': [],
 'url': 'http://www3.nhk.or.jp/news/html/20180914/k10011630731000.html',
 'datePublished': '2018-09-14T20:20',
 'dateModified': ''}

In [5]:
# Selects the target file nhkweb{year}.json.
year = 2018

# Insert the record scraped above into that file, or replace the entry that
# has the same id.
js(dic, year)