In [1]:
import numpy as np
import pandas as pd
import re, json, csv, requests, time, glob, tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from collections import Counter

In [2]:
def scrape_one_new(html):
    """Parse one article page that carries JSON-LD metadata (newer layout).

    Metadata is read from the last ``application/ld+json`` script on the
    page. When a field is absent there, the value is taken from the HTML
    instead.

    Args:
        html: Page source of the article (``str`` or ``bytes``).

    Returns:
        dict with the keys ``id``, ``title``, ``article``, ``genre``,
        ``keywords``, ``url``, ``datePublished`` and ``dateModified``.

    Raises:
        IndexError: if the page has no ``application/ld+json`` script.
        AttributeError: if an element that is actually needed (article
            body, ``og:url`` meta tag, or an HTML fallback for a field
            missing from the JSON-LD) is not present in the page.
    """
    soup = BeautifulSoup(html, "html.parser")
    json_data = json.loads(soup.find_all("script", type="application/ld+json")[-1].text)

    # The HTML fallbacks are evaluated only when the JSON-LD key is absent.
    # Passing them as the default argument of dict.get() evaluates them on
    # every call, which raised AttributeError on pages lacking the fallback
    # element even though the JSON-LD value was available.
    if 'headline' in json_data:
        title = json_data['headline']
    else:
        title = soup.find('h1', class_='content--title').text
    if 'datePublished' in json_data:
        date = json_data['datePublished']
    else:
        date = re.search(r'datetime:.*?(\d{4}-\d{2}-\d{2}T\d{2}:\d{2})', str(html)).group(1)
    date_m = json_data.get('dateModified', '')

    genre = json_data.get('genre', [])
    if genre == []:
        # No genre in the JSON-LD: use the keywords meta tag minus site-wide boilerplate.
        genre = [k for k in soup.find('meta', attrs={'name':'keywords'}).get('content').split(',') if k not in ['NHK','ニュース', 'NHK NEWS WEB']]
    keywords = json_data.get('keywords', [])

    article = soup.find('div', id="news_textbody").text
    # Keep only what follows the last 'https:' in og:url, then re-add the scheme.
    url_normal = 'https:' + soup.find('meta', property="og:url").get('content').rsplit('https:')[-1]

    # find_all() returns an empty list when nothing matches, so no guard is needed.
    for textmore in soup.find_all('div', id="news_textmore"):
        article += ('\n' + textmore.text)
    for newsadd in soup.find_all('div', class_="news_add"):
        if newsadd.h3 is not None:
            # Drop the section heading so only the body text is appended.
            newsadd.h3.extract()
        article += ('\n' + newsadd.text)

    return {
        'id':url_normal.split('/')[-1].split('.html')[0],
        'title':title,
        'article':article.strip(),
        'genre':genre,
        'keywords':keywords,
        'url':url_normal,
        'datePublished':date,
        'dateModified':date_m
    }

# Helpers for regular (non-easy) article pages in the old page layout
def make_date_normal_old(hmd,time):
    """Build a ``YYYY-MM-DDTHH:MM`` timestamp from a date and a Japanese clock time.

    Args:
        hmd: Date as the digit string ``YYYYMMDD``.
        time: Clock time written as ``<hour>時<minute>分``. Surrounding
            whitespace (as found in raw element text) is ignored.

    Returns:
        The timestamp string, with single-digit hours and minutes
        zero-padded to two digits.

    Raises:
        ValueError: if ``time`` does not contain exactly one ``時``.
    """
    year, month, day = hmd[:4], hmd[4:6], hmd[6:]
    hour, minute = time.strip().split('時')
    # Strip whitespace before padding: a value such as ' 4' has length 2 and
    # would otherwise skip the zero-padding and leak the blank into the result.
    hour = hour.strip()
    minute = minute.strip().strip('分').strip()
    if len(hour) == 1:
        hour = '0' + hour
    if len(minute) == 1:
        minute = '0' + minute
    return f"{year}-{month}-{day}T{hour}:{minute}"

def scrape_one_old(html, url):
    """Parse one article page in the old layout into a record dict.

    Args:
        html: Page source of the article.
        url: Original article URL. Its second-to-last path segment is used
            as the publication date and its file name as the record id.

    Returns:
        dict with the keys ``id``, ``title``, ``article``, ``genre``,
        ``keywords`` (always empty), ``url``, ``datePublished`` and
        ``dateModified`` (always empty).

    Raises:
        AttributeError: if an expected element is missing from the page.
    """
    # Site-wide keywords that carry no genre information.
    boilerplate = ['NHK','ニュース', 'NHK NEWS WEB','ＮＨＫ','ＮＨＫニュース','']

    page = BeautifulSoup(html, "html.parser")
    headline = page.find('span', class_="contentTitle").text.strip()
    published = make_date_normal_old(
        url.split('/')[-2],
        page.find('span', id="news_time").text,
    )

    meta_keywords = page.find('meta', attrs={'name':'keywords'}).get('content')
    topics = [word for word in meta_keywords.split(',') if word not in boilerplate]

    # Collect the body first, then continuation blocks, then additional sections.
    pieces = [page.find('div', id="news_textbody").text]
    pieces.extend(more.text for more in page.find_all('div', id="news_textmore"))
    for section in page.find_all('div', class_="news_add"):
        heading = section.h3
        if heading is not None:
            # Remove the section heading so only its body text is kept.
            heading.extract()
        pieces.append(section.text)
    body = '\n'.join(pieces)

    record = {}
    record['id'] = url.split('/')[-1].split('.html')[0]
    record['title'] = headline
    record['article'] = body.strip()
    record['genre'] = topics
    record['keywords'] = []
    record['url'] = url
    record['datePublished'] = published
    record['dateModified'] = ""
    return record

def js(dic, path='nhkweb.json'):
    """Insert or update one record in the JSON database file.

    The file must already exist and contain a JSON list of dicts that each
    have an ``'id'`` key. Every entry whose id equals ``dic['id']`` is
    replaced by ``dic``; if there is none, ``dic`` is appended. The list is
    then sorted by id and written back.

    Args:
        dic: Record to store; must contain the key ``'id'``.
        path: JSON file to update. Defaults to ``'nhkweb.json'``.

    Raises:
        KeyError: if ``dic`` or a stored entry has no ``'id'``.
        TypeError: if the records cannot be serialised to JSON.
    """
    record_id = dic['id']
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    replaced = False
    for i, d in enumerate(data):
        if d['id'] == record_id:
            data[i] = dic
            replaced = True
    if not replaced:
        data.append(dic)
    data.sort(key=lambda x: x['id'])

    # Serialise before opening the file for writing: opening with 'w'
    # truncates immediately, so a failure while merging, sorting or dumping
    # would otherwise leave the database empty or half-written.
    payload = json.dumps(data, indent=4, ensure_ascii=False)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(payload)

def getlink():
    """Return the URLs that still have to be scraped, sorted.

    These are the ``url_normal`` values of ``nhkwebeasy.json`` that appear
    neither in the ``url`` column of ``nhkweb.json`` nor in the first column
    of ``nolinknormal.txt``. All three files are read from the current
    working directory.

    Returns:
        Sorted list of unique URLs.
    """
    # Sets make each membership test O(1); the previous list lookup scanned
    # every already-scraped URL for each candidate.
    scraped = set(pd.read_json('nhkweb.json', encoding='utf-8')['url'].tolist())
    nolink = set(pd.read_csv('nolinknormal.txt', encoding='utf-8', header=None)[0].tolist())
    candidates = set(pd.read_json('nhkwebeasy.json', encoding='utf-8')['url_normal'].tolist())
    return sorted(candidates - scraped - nolink)

def wrongid():
    """Return the ids in ``nhkwebeasy.json`` that disagree with their ``url_normal``.

    For every row, the id embedded in ``url_normal`` (the file name up to
    ``.html``) is compared with the row's ``id``. The number of rows read is
    printed as a side effect.

    Returns:
        List of ``id`` values whose ``url_normal`` points at a different id.
    """
    df = pd.read_json('nhkwebeasy.json', encoding='utf-8')
    print(len(df))
    # split('.html')[0] removes the suffix. str.strip('.html') would instead
    # strip any of the characters '.', 'h', 't', 'm', 'l' from both ends and
    # mangle ids that start or end with one of them.
    df['normalID'] = df['url_normal'].apply(lambda x: x.split('/')[-1].split('.html')[0])
    return df[df['id'] != df['normalID']]['id'].tolist()

def wrongscrape():
    """Print, for each id reported by ``wrongid()``, a matching known URL.

    The URLs in the first column of ``linknormal.txt`` are searched from the
    last line to the first. For the first one containing the id, the part
    after the last ``/*/`` is printed. Ids without a match print nothing.
    """
    mismatched = wrongid()
    known = pd.read_csv('linknormal.txt', encoding='utf-8', header=None)[0].tolist()
    known.reverse()
    for wanted in mismatched:
        hit = next((candidate for candidate in known if wanted in candidate), None)
        if hit is not None:
            print(hit.split('/*/')[-1])

In [13]:
# List the ids in nhkwebeasy.json whose url_normal points at a different id.
wrongid()

3678


['k10011372331000',
 'k10011375491000',
 'k10011377641000',
 'k10011378271000',
 'k10011378581000',
 'k10011380591000',
 'k10011382671000',
 'k10011383841000',
 'k10011400201000',
 'k10011400981000',
 'k10011418641000',
 'k10011424301000',
 'k10011424771000',
 'k10011427811000',
 'k10011428421000',
 'k10011430621000',
 'k10011432981000',
 'k10011438481000',
 'k10011440601000',
 'k10011444081000',
 'k10011450551000',
 'k10011456741000',
 'k10011463631000',
 'k10011466511000',
 'k10011468501000',
 'k10011470251000',
 'k10011472611000',
 'k10011486141000',
 'k10011489441000',
 'k10011489791000',
 'k10011500211000',
 'k10011514211000',
 'k10011529871000',
 'k10011536821000',
 'k10011539821000',
 'k10011550321000',
 'k10011557611000',
 'k10011581681000',
 'k10011582271000',
 'k10011585491000',
 'k10011587591000',
 'k10011605041000',
 'k10011606281000',
 'k10011611151000',
 'k10011636841000',
 'k10011638011000',
 'k10011640811000',
 'k10011642191000',
 'k10011643751000',
 'k10011646511000',


# scrape

In [6]:
# URLs that still have to be scraped, as computed by getlink().
links = getlink()
# Show how many are pending and preview the first ten.
print(len(links))
links[:10]

775


['http://www3.nhk.or.jp/news/easy/k10010964371000/k10010964371000.html',
 'http://www3.nhk.or.jp/news/html/20150603/k10010102161000.html',
 'http://www3.nhk.or.jp/news/html/20150604/k10010102281000.html',
 'http://www3.nhk.or.jp/news/html/20150604/k10010102771000.html',
 'http://www3.nhk.or.jp/news/html/20150604/k10010103041000.html',
 'http://www3.nhk.or.jp/news/html/20150604/k10010103101000.html',
 'http://www3.nhk.or.jp/news/html/20150604/k10010103261000.html',
 'http://www3.nhk.or.jp/news/html/20150605/k10010103951000.html',
 'http://www3.nhk.or.jp/news/html/20150605/k10010103971000.html',
 'http://www3.nhk.or.jp/news/html/20150606/k10010105501000.html']

In [4]:
# Start a Chrome session through Selenium; later cells fetch pages with it.
driver = webdriver.Chrome()

In [5]:
def _mark_no_link(url):
    """Append ``url`` to nolinknormal.txt, the list that getlink() excludes."""
    with open('nolinknormal.txt', 'a', encoding='utf-8') as f:
        f.write(url + '\n')


# NOTE(review): the slice skips links[0] and stops at 300 — presumably a
# manually chosen batch; confirm before re-running.
for link in links[1:300]:
    # Open the Wayback Machine overview page for this URL.
    driver.get(f'https://web.archive.org/web/*/{link}')
    time.sleep(6)  # give the page time to load before reading its source
    # str() of the encoded bytes yields the b'...' repr, in which newlines are
    # the two characters backslash + n, so '.' in the regex can span what were
    # separate lines of the page.
    html = str(driver.page_source.encode('utf-8'))
    snap = re.search(r'(between|1 time).*?<a href="(.+?)">', html)
    if snap is None:
        # No snapshot link found on the overview page.
        _mark_no_link(link)
        continue
    driver.get('https://web.archive.org' + snap.group(2))
    time.sleep(10)  # give the snapshot time to load
    html = driver.page_source.encode('utf-8')
    try:
        dic = scrape_one_old(html, link)
    except Exception:
        # The page could not be parsed as an article. Record the URL and move
        # on; without the `continue`, js() would run with an unbound `dic` or
        # with the record left over from the previous iteration.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
        _mark_no_link(link)
        continue
    js(dic)

In [24]:
# Debug cell: re-run scrape_one_old on the page currently loaded in `driver`,
# using the `link` variable left over from the scraping loop.
# In the recorded run this raised AttributeError, i.e. a lookup inside
# scrape_one_old returned None for this page.
html = driver.page_source.encode('utf-8')
soup = BeautifulSoup(html, "html.parser")
dic = scrape_one_old(html,link)

AttributeError: 'NoneType' object has no attribute 'text'

In [10]:
# Store the record currently held in `dic` in nhkweb.json.
js(dic)