In [1]:
import numpy as np
import pandas as pd
import re, json, csv, requests, time, glob, tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from collections import Counter

In [2]:
def scrape_one_new(html, url):
    soup = BeautifulSoup(html, "html.parser")
    json_data = json.loads(soup.find_all("script", type="application/ld+json")[-1].text)
    title = json_data.get('headline', soup.find('span', class_='contentTitle').text)
    date = json_data.get('datePublished', re.search(r'datetime:.*?(\d{4}-\d{2}-\d{2}T\d{2}:\d{2})', str(html)).group(1))
    date_m = json_data.get('dateModified', '')
    genre = json_data.get('genre', [])
    if genre == []:
        genre = [k for k in soup.find('meta', attrs={'name':'keywords'}).get('content').split(',') if k not in ['NHK','ニュース', 'NHK NEWS WEB']]
    keywords = json_data.get('keywords', [])
    article = soup.find('div', id="news_textbody").text
    if soup.find_all('div', id="news_textmore") != []:
        for textmore in soup.find_all('div', id="news_textmore"):
            article += ('\n' + textmore.text)
    if soup.find_all('div', class_="news_add") != []:
        for newsadd in soup.find_all('div', class_="news_add"):
            if newsadd.h3 != None:
                newsadd.h3.extract()
            article += ('\n' + newsadd.text)
            
    return {
        'id':url.split('/')[-1].split('.html')[0],
        'title':title,
        'article':article.strip(),
        'genre':genre,
        'keywords':keywords,
        'url':url,
        'datePublished':date,
        'dateModified':date_m
    }

# for old web normal
def make_date_normal_old(hmd,time):
    year, month, day = hmd[:4], hmd[4:6], hmd[6:]
    hour, minute = time.split('時')
    minute = minute.strip('分')
    if len(hour) == 1:
        hour = '0' + hour
    if len(minute) == 1:
        minute = '0' + minute
    return f"{year}-{month}-{day}T{hour}:{minute}"

def scrape_one_old(html, url):
    soup = BeautifulSoup(html, "html.parser")
    title = soup.find('span', class_="contentTitle").text.strip()
    hmd_ = url.split('/')[-2]
    time_ = soup.find('span', id="news_time").text
    date = make_date_normal_old(hmd_, time_)
    genre = [k for k in soup.find('meta', attrs={'name':'keywords'}).get('content').split(',') if k not in ['NHK','ニュース', 'NHK NEWS WEB','ＮＨＫ','ＮＨＫニュース','']]
    article = soup.find(['div','p'], id="news_textbody").text
    if soup.find_all(['div','p'], id="news_textmore") != []:
        for textmore in soup.find_all(['div','p'], id="news_textmore"):
            article += ('\n' + textmore.text)
    if soup.find_all(['div','p'], class_="news_add") != []:
        for newsadd in soup.find_all(['div','p'], class_="news_add"):
            if newsadd.h3 != None:
                newsadd.h3.extract()
            article += ('\n' + newsadd.text)
            
    return {
        'id':url.split('/')[-1].split('.html')[0],
        'title':title,
        'article':article.strip(),
        'genre':genre,
        'keywords':[],
        'url':url,
        'datePublished':date,
        'dateModified':""
    }

def get_archiveurl(url):
    driver.get(url)
    time.sleep(6)
    html = str(driver.page_source.encode('utf-8'))
    snap = re.search(r'(times between|1 time|times).*?<a href="(.+?)">', html)
    #if snap == None:
        #return None
    archiveurl = 'https://web.archive.org' + snap.group(2)
    return archiveurl

def js(dic):
    with open('nhkweb.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    with open('nhkweb.json', 'w', encoding='utf-8') as f:
        if dic['id'] not in [x['id'] for x in data]:
            data.append(dic)
        else:
            for i, d in enumerate(data):
                if dic['id'] == d['id']:
                    data[i] = dic
        data = sorted(data, key=lambda x:x['id'])
        json.dump(data, f, indent=4, ensure_ascii=False)

def geturl(year=2019):
    idnormal = pd.read_json('nhkweb.json', encoding='utf-8')['id'].tolist()
    existurl = pd.read_csv('linknormal.txt', encoding='utf-8', header=None)[0].tolist()
    nolink = pd.read_csv('nolinknormal.txt', encoding='utf-8', header=None)[0].tolist()
    urls = set(existurl) - set(nolink)
    return sorted([url for url in urls if (url.split('.html')[0].split('/')[-1] not in idnormal) and f'html/{year}' in url])

def checkwrongid(): # check wrong ID in newswebeasy
    df = pd.read_json('nhkwebeasy.json', encoding='utf-8')
    print(len(df))
    df['normalID'] = df['url_normal'].apply(lambda x:x.split('/')[-1].strip('.html'))
    return df[df['id'] != df['normalID']]['id'].tolist()

def wrongscrape():
    wrongids = wrongid()
    existurl = pd.read_csv('linknormal.txt', encoding='utf-8', header=None)[0].tolist()[::-1]
    for ID in wrongids:
        for url in existurl:
            if ID in url:
                print(url.split('/*/')[-1])
                break

# scrape

In [8]:
urls = geturl(2019)

print(len(urls))
urls[:15]

2362


['https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20190910/k10012072781000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20190910/k10012072801000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20190910/k10012072821000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20190910/k10012072861000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20190910/k10012072901000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20190910/k10012072941000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20190910/k10012072971000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20190910/k10012072981000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20190910/k10012072991000.html',
 'https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20190910/k10012073011000.html',
 'https://web.archive.org/web/*/http://w

In [9]:
driver = webdriver.Chrome()

In [10]:
while True:
    try:
        urls = geturl(2019)
        id_exist = set(pd.read_json('nhkweb.json')['id'].tolist())
        for url in urls[:3000]:
            # check URL
            ID = url.split('.html')[0].split('/')[-1]
            if ID in id_exist:
                continue

            # get archive URL
            archiveurl = get_archiveurl(url)

            # request
            response = requests.get(archiveurl)
            if response.status_code == 200:
                html = response.text
            elif response.status_code == 504:
                response = requests.get(archiveurl)
                if response.status_code == 504:
                    raise AssertionError
                html = response.text
            time.sleep(2)

            # scrape
            url_true = url.split('/*/')[-1]

            try:
                dic = scrape_one_new(html, url_true)
            except:
                dic = scrape_one_old(html, url_true)
            js(dic)
            id_exist.add(ID)
            """
            except:
                with open('nolinknormal.txt', 'a') as f:
                    f.write(url)
                    f.write('\n')
            """
    except:
        print('error' , driver.current_url)
        pass

error https://web.archive.org/web/2019*/http://www3.nhk.or.jp/news/html/20190911/k10012075131000.html
error https://web.archive.org/web/2019*/http://www3.nhk.or.jp/news/html/20190911/k10012075241000.html
error https://web.archive.org/web/2019*/http://www3.nhk.or.jp/news/html/20190912/k10012077491000.html
error https://web.archive.org/web/2019*/http://www3.nhk.or.jp/news/html/20190914/k10012082001000.html
error https://web.archive.org/web/2019*/http://www3.nhk.or.jp/news/html/20190917/k10012085031000.html
error https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20190922/k10012094101000.html
error https://web.archive.org/web/2019*/http://www3.nhk.or.jp/news/html/20190922/k10012094171000.html
error https://web.archive.org/web/2019*/http://www3.nhk.or.jp/news/html/20190927/k10012101251000.html
error https://web.archive.org/web/*/http://www3.nhk.or.jp/news/html/20191003/k10012110371000.html
error https://web.archive.org/web/2019*/http://www3.nhk.or.jp/news/html/20191007/k10012115

NoSuchWindowException: Message: no such window: window was already closed
  (Session info: chrome=80.0.3987.132)


In [None]:
url_true = driver.current_url.split('/*/')[-1]
try:
    dic = scrape_one_new(html, url_true)
except:
    dic = scrape_one_old(html, url_true)

In [72]:
js(dic)

# clean

In [8]:
print('easy', len(pd.read_json('nhkwebeasy.json', encoding='utf-8')))
print('normal', len(pd.read_json('nhkweb.json', encoding='utf-8')))
print('link', len(pd.read_csv('linknormal.txt', encoding='utf-8', header=None)))

easy 5810
normal 19277
link 81596


In [12]:
# check category

with open('nhkweb.json','r', encoding='utf-8') as f:
    data = json.load(f)
print('articles: ', len(data))
genre = Counter()
for dic in data:
    for g in dic['genre']:
        genre[g] += 1
genre.most_common()

articles:  21404


[('国際', 6237),
 ('社会', 6094),
 ('スポーツ', 3135),
 ('ビジネス', 3082),
 ('科学・文化', 2582),
 ('政治', 2274),
 ('気象・災害', 1356),
 ('暮らし', 1151),
 ('地域', 941)]

In [11]:
# genre <> keywords
with open('nhkweb.json','r', encoding='utf-8') as f:
    data = json.load(f)

category = ['社会', '国際', 'ビジネス', 'スポーツ', '政治', '科学・文化', '暮らし', '地域', '気象・災害']
for i, dic in enumerate(data):
    newgenre = []
    newkey = []
    for j in dic['genre']:
        if j in category:
            newgenre.append(j)
        elif j == "災害" or j == "気象":
            newgenre.append('気象・災害')
        elif j == "科学・医療" or j == "文化・エンタメ" or j == "科学":
            newgenre.append('科学・文化')
        elif j == "暮らし文化":
            newgenre.append('暮らし')
            newgenre.append('科学・文化')
        elif j == "経済":
            newgenre.append('ビジネス')
        else:
            newkey.append(j)
    for j in dic['keywords']:
        if j in category:
            newgenre.append(j)
        elif j == "科学・医療" or j == "文化・エンタメ" or j == "科学":
            newgenre.append('科学・文化')
        elif j == "暮らし文化":
            newgenre.append('暮らし')
            newgenre.append('科学・文化')
        elif j == "災害" or j == "気象":
            newgenre.append('気象・災害')
        else:
            newkey.append(j)
    data[i]['genre'] = list(set(newgenre))
    data[i]['keywords'] = list(set(newkey))

with open('nhkweb.json','w', encoding='utf-8') as f:
    json.dump(data, f, indent=4, ensure_ascii=False)
    

In [10]:
normal = pd.read_json('nhkweb.json')
normal.id.value_counts()

k10011977251000    1
k10012302521000    1
k10012305501000    1
k10012309361000    1
k10011906611000    1
                  ..
k10012262621000    1
k10012268281000    1
k10012279461000    1
k10011932601000    1
k10011826251000    1
Name: id, Length: 19277, dtype: int64