In [46]:
import numpy as np
import pandas as pd
import re, json, csv, requests, time, glob, tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from collections import Counter

In [47]:
def scrape_one_new(html, url):
    """Parse an article page that carries JSON-LD metadata (newer page layout).

    Parameters
    ----------
    html : str or bytes
        Page source of the article.
    url : str
        Article URL; the record id is its last path component without ``.html``.

    Returns
    -------
    dict
        Keys: id, title, article, genre, keywords, url, datePublished, dateModified.

    Raises
    ------
    IndexError
        If the page has no ``application/ld+json`` script (callers use this to
        fall back to ``scrape_one_old``).
    AttributeError
        If a required element is missing from both the JSON-LD and the HTML.
    """
    soup = BeautifulSoup(html, "html.parser")
    json_data = json.loads(soup.find_all("script", type="application/ld+json")[-1].text)

    # title, date, genre, keyword
    # The HTML fallbacks are only evaluated when the JSON-LD lacks the key.
    # (dict.get(key, default) would evaluate the fallback eagerly and raise
    # AttributeError on pages where only the JSON-LD carries the value.)
    if 'headline' in json_data:
        title = json_data['headline']
    else:
        title = soup.find('span', class_='contentTitle').text
    if 'datePublished' in json_data:
        date = json_data['datePublished']
    else:
        date = re.search(r'datetime:.*?(\d{4}-\d{2}-\d{2}T\d{2}:\d{2})', str(html)).group(1)
    date_m = json_data.get('dateModified', '')
    genre = json_data.get('genre', [])
    if genre == []:
        # no genre in the JSON-LD: use the meta keywords minus the site-wide boilerplate
        meta_keywords = soup.find('meta', attrs={'name':'keywords'}).get('content')
        genre = [k for k in meta_keywords.split(',') if k not in ['NHK','ニュース', 'NHK NEWS WEB']]
    keywords = json_data.get('keywords', [])

    # article: news_textbody > news_textmore > news_add (paragraph titles are h3)
    article = soup.find('div', id="news_textbody").text
    for textmore in soup.find_all('div', id="news_textmore"):
        article += ('\n' + textmore.text)
    for newsadd in soup.find_all('div', class_="news_add"):
        if newsadd.h3 is not None:
            newsadd.h3.extract()  # drop the paragraph title from the body text
        article += ('\n' + newsadd.text)

    return {
        'id':url.split('/')[-1].split('.html')[0],
        'title':title.strip(),
        'article':article.strip(),
        'genre':genre,
        'keywords':keywords,
        'url':url,
        'datePublished':date,
        'dateModified':date_m
    }

# for old web normal
def make_datetime_normal_old(hmd, time):
    """Build a ``YYYY-MM-DDTHH:MM`` string from a date and a Japanese clock time.

    Parameters
    ----------
    hmd : str
        Date as ``YYYYMMDD``.
    time : str
        Clock time written as ``<hour>時<minute>分``.

    Returns
    -------
    str
        The combined timestamp; one-digit hours and minutes get a leading zero.
    """
    raw_hour, raw_minute = time.split('時')
    raw_minute = raw_minute.strip('分')
    # pad only single-character parts, everything else is passed through untouched
    hh, mm = [('0' + part) if len(part) == 1 else part for part in (raw_hour, raw_minute)]
    return "{}-{}-{}T{}:{}".format(hmd[:4], hmd[4:6], hmd[6:], hh, mm)

def scrape_one_old(html, url):
    """Parse an article page of the older layout, which has no JSON-LD metadata.

    The publication date is assembled from the date directory in ``url``
    (second-to-last path component) and the ``news_time`` span of the page.

    Parameters
    ----------
    html : str or bytes
        Page source of the article.
    url : str
        Article URL of the form ``.../<YYYYMMDD>/<id>.html``.

    Returns
    -------
    dict
        Same keys as ``scrape_one_new``; ``keywords`` is always ``[]`` and
        ``dateModified`` is always ``""``.
    """
    soup = BeautifulSoup(html, "html.parser")
    body_tags = ['div','p']
    boilerplate = ['NHK','ニュース', 'NHK NEWS WEB','ＮＨＫ','ＮＨＫニュース','']

    # title, date, genre
    headline = soup.find('span', class_="contentTitle").text.strip()
    day_part = url.split('/')[-2]
    clock_part = soup.find('span', id="news_time").text
    published = make_datetime_normal_old(day_part, clock_part)
    meta_keywords = soup.find('meta', attrs={'name':'keywords'}).get('content')
    labels = [word for word in meta_keywords.split(',') if word not in boilerplate]

    # article: news_textbody > news_textmore > news_add (paragraph titles are h3)
    pieces = [soup.find(body_tags, id="news_textbody").text]
    pieces.extend(more.text for more in soup.find_all(body_tags, id="news_textmore"))
    for extra in soup.find_all(body_tags, class_="news_add"):
        if extra.h3 is not None:
            extra.h3.extract()
        pieces.append(extra.text)

    record = {}
    record['id'] = url.split('/')[-1].split('.html')[0]
    record['title'] = headline.strip()
    record['article'] = '\n'.join(pieces).strip()
    record['genre'] = labels
    record['keywords'] = []
    record['url'] = url
    record['datePublished'] = published
    record['dateModified'] = ""
    return record

def get_archiveurl(url, sleeptime=5):
    """Open ``url`` in the browser and return the first snapshot link found on it.

    Uses the module-level Selenium ``driver``. The page is searched for the
    first ``<a href="...">`` that follows the text "times between", "1 time"
    or "times"; that href is appended to ``https://web.archive.org``.

    Parameters
    ----------
    url : str
        Page to open (the callers pass Wayback Machine calendar URLs).
    sleeptime : int or float
        Seconds to wait after loading before the page source is read.

    Returns
    -------
    str
        Absolute archive URL.

    Raises
    ------
    ValueError
        If no snapshot link can be found in the page.
    """
    driver.get(url)
    time.sleep(sleeptime)
    # Search the real page text instead of the repr of its UTF-8 bytes, which
    # escaped every non-ASCII character (and some quotes) in the captured href.
    # re.DOTALL keeps the old ability of `.` to run across line breaks (the
    # bytes repr contained no real newlines).
    html = driver.page_source
    snap = re.search(r'(times between|1 time|times).*?<a href="(.+?)">', html, re.DOTALL)
    if snap is None:
        raise ValueError(f'no archived snapshot link found on {url}')
    return 'https://web.archive.org' + snap.group(2)

def js(dic, year):
    """Insert or replace one record in ``nhkweb{year}.json``.

    The file holds a JSON list of records. A record whose ``id`` equals
    ``dic['id']`` is replaced, otherwise ``dic`` is appended; the list is then
    sorted by ``id`` and written back.

    Parameters
    ----------
    dic : dict
        Record to store; must contain the key ``'id'``.
    year : int or str
        Selects the file ``nhkweb{year}.json``, which must already exist.

    Raises
    ------
    FileNotFoundError
        If the year file does not exist.
    KeyError
        If ``dic`` or a stored record has no ``'id'``.
    """
    path = f'nhkweb{year}.json'
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    replaced = False
    for i, d in enumerate(data):
        if d['id'] == dic['id']:
            data[i] = dic
            replaced = True
    if not replaced:
        data.append(dic)
    data.sort(key=lambda x: x['id'])

    # Serialise BEFORE re-opening the file for writing: opening with 'w'
    # truncates it, so any error raised afterwards would wipe the whole year.
    text = json.dumps(data, indent=4, ensure_ascii=False)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)

def geturl(year=2019):
    """List the article URLs of ``year`` that still have to be scraped.

    Reads ``linknormal.txt`` (candidate URLs, one per line), drops the URLs
    listed in ``nolinknormal.txt`` and those whose id is already present in
    ``nhkweb{year}.json``, and keeps only URLs containing ``html/{year}``.

    Parameters
    ----------
    year : int
        Year to select.

    Returns
    -------
    list of str
        Remaining URLs in ascending order.
    """
    # a set makes the per-URL membership test O(1) instead of scanning a list
    done_ids = set(pd.read_json(f'nhkweb{year}.json', encoding='utf-8')['id'].tolist())
    existurl = pd.read_csv('linknormal.txt', encoding='utf-8', header=None)[0].tolist()
    nolink = pd.read_csv('nolinknormal.txt', encoding='utf-8', header=None)[0].tolist()
    candidates = set(existurl) - set(nolink)
    year_marker = f'html/{year}'
    return sorted(
        url for url in candidates
        if url.split('.html')[0].split('/')[-1] not in done_ids and year_marker in url
    )

def checkwrongid(): # check wrong ID in newswebeasy
    """Return the ids in ``nhkwebeasy.json`` that disagree with their ``url_normal``.

    For every record the id embedded in ``url_normal`` (file name without
    ``.html``) is compared with the record's ``id``. The number of records is
    printed as a side effect.

    Returns
    -------
    list
        ``id`` values of the records whose two ids differ.
    """
    df = pd.read_json('nhkwebeasy.json', encoding='utf-8')
    print(len(df))
    # split off the ".html" suffix; str.strip('.html') would instead strip any
    # of the characters ".", "h", "t", "m", "l" from BOTH ends of the file name
    df['normalID'] = df['url_normal'].apply(lambda x: x.split('.html')[0].split('/')[-1])
    return df[df['id'] != df['normalID']]['id'].tolist()

def wrongscrape():
    """Print, for every mismatching id, the first matching URL of ``linknormal.txt``.

    The ids come from ``checkwrongid()``. ``linknormal.txt`` is searched from
    its last line to its first; for the first URL containing the id, the part
    after the last ``/*/`` is printed.
    """
    # the original called the undefined name `wrongid`; the helper defined in
    # this notebook is `checkwrongid`
    wrongids = checkwrongid()
    existurl = pd.read_csv('linknormal.txt', encoding='utf-8', header=None)[0].tolist()[::-1]
    for ID in wrongids:
        for url in existurl:
            if ID in url:
                print(url.split('/*/')[-1])
                break

# scrape

In [7]:
# Load the records stored in nhkweb2015.json and display them for a quick visual check.
df = pd.read_json('nhkweb2015.json')
df

Unnamed: 0,id,title,article,genre,keywords,url,datePublished,dateModified
0,k10010000061000,ノルディック複合団体ラージヒル 日本６位,スウェーデンで行われているノルディックスキーの世界選手権は２８日、ノルディック複合の団体ラー...,[スポーツ],[],http://www3.nhk.or.jp:80/news/html/20150301/k1...,2015-03-01T03:03,
1,k10010000071000,びわ湖毎日マラソン きょう開催,ことし北京で行われる陸上の世界選手権の代表選考レースを兼ねた「びわ湖毎日マラソン」が１日、行...,[スポーツ],[],http://www3.nhk.or.jp:80/news/html/20150301/k1...,2015-03-01T04:15,
2,k10010000091000,ジャンプ男子団体ラージヒル 日本は４位,スウェーデンで行われているノルディックスキーの世界選手権は２８日、ジャンプの男子団体ラージヒ...,[スポーツ],[],http://www3.nhk.or.jp:80/news/html/20150301/k1...,2015-03-01T04:04,
3,k10010000111000,過激派組織ＩＳ より過激なグループが主導権か,過激派組織ＩＳ＝イスラミックステートがフリージャーナリストの後藤健二さんを殺害したとする映像...,[国際],[],http://www3.nhk.or.jp/news/html/20150301/k1001...,2015-03-01T06:24,
4,k10010000121000,インドで日本の新幹線導入目指しＰＲ,インドで初めてとなる高速鉄道に日本の新幹線の技術やノウハウを導入してもらおうと、首都ニューデ...,[国際],[],http://www3.nhk.or.jp:80/news/html/20150301/k1...,2015-03-01T04:23,
...,...,...,...,...,...,...,...,...
5652,k10015826111000,英ウィリアム王子 首相と福島視察,安倍総理大臣は日本を訪れているイギリス王室のウィリアム王子と共に福島県を訪れ、原発事故の影響...,"[社会, ウィリアム, イギリス王室, 王子]",[],http://www3.nhk.or.jp/news/html/20150228/k1001...,2015-02-28T19:15,
5653,k10015826211000,着床前スクリーニング 研究計画を公表,体外受精をしても妊娠できなかったり流産を繰り返したりする女性を対象に、受精卵のすべての染色体...,[科学・医療],[],http://www3.nhk.or.jp/news/html/20150228/k1001...,2015-02-28T19:26,
5654,k10015827381000,転売巡る指摘 北朝鮮メディアが非難,北朝鮮の国営メディアは、朝鮮総連＝在日本朝鮮人総連合会の中央本部の土地と建物の転売を巡り、日...,[国際],[],http://www3.nhk.or.jp/news/html/20150228/k1001...,2015-02-28T22:30,
5655,k10015827681000,捜索に抵抗し逃走の男 警察に出頭し逮捕,今月６日、山口県下関市で麻薬特例法違反の疑いで警察の捜索を受けた男が刃物で抵抗して逃走した事...,[社会],[],http://www3.nhk.or.jp/news/html/20150228/k1001...,2015-02-28T23:16,


In [21]:
# Start the Chrome session bound to the global name `driver`, which
# get_archiveurl() reads. The headless switch is left disabled.
options = Options()
#options.headless = True
driver = webdriver.Chrome(options=options)

In [23]:
# Scrape every article of `year` that is listed in linknormal.txt but not yet
# stored in nhkweb{year}.json; each record is saved as soon as it is parsed.
year = 2015
urls = pd.read_csv('linknormal.txt', header=None)
urls = sorted(urls[urls[0].str.contains(f'html/{year}')][0].tolist(), reverse=True)
id_exist = set(pd.read_json(f'nhkweb{year}.json')['id'].tolist())

for url in urls:
    # check URL
    ID = url.split('.html')[0].split('/')[-1]
    if ID in id_exist:
        continue

    # get archive URL
    archiveurl = get_archiveurl(url)

    # request: retry once on a gateway timeout, then accept only a 200.
    # (Previously any non-504 answer to the retry, e.g. a 404 page, was
    # scraped as if it were the article.)
    response = requests.get(archiveurl)
    if response.status_code == 504:
        response = requests.get(archiveurl)
    if response.status_code != 200:
        continue
    html = response.text
    time.sleep(4)

    # scrape
    url_true = 'htt' + url.split('/htt')[-1]  # strip the archive prefix, keep the original URL

    try:
        dic = scrape_one_new(html, url_true)
    except Exception:
        # page without usable JSON-LD: fall back to the old layout parser.
        # `except Exception` (not a bare except) lets Ctrl-C stop the loop.
        dic = scrape_one_old(html, url_true)
    js(dic, year)
    id_exist.add(ID)

AttributeError: 'NoneType' object has no attribute 'text'

In [9]:
# Re-scrape the page currently open in the browser (manual recovery after the
# loop above stopped on a page it could not parse).
url_true = 'htt' + driver.current_url.split('/htt')[-1]  # original URL without the archive prefix
html = driver.page_source.encode('utf-8')
try:
    dic = scrape_one_new(html, url_true)
except Exception:
    # fall back to the old layout parser; a bare except would also swallow Ctrl-C
    dic = scrape_one_old(html, url_true)
dic

{'id': 'k10010627901000',
 'title': '台風５号 北日本～東北の太平洋側 暴風・高波に警戒',
 'article': '台風５号は日本の東の海上を北上していて、気象庁は北日本から東北にかけての太平洋側を中心に暴風や高波に警戒するよう呼びかけています。\n気象庁の観測によりますと、台風５号は９日午前６時には、仙台市の東２７０キロの海上を１時間に２５キロの速さで北へ進んでいます。中心の気圧は９８０ヘクトパスカル、最大風速は３０メートル、最大瞬間風速は４０メートルで中心の東側２８０キロ以内と西側１７０キロ以内では風速２５メートル以上の暴風が吹いています。台風はこのあとも北上を続け、９日夜には北海道の東の海上に進み、１０日の朝には千島の近海で温帯低気圧に変わる見込みです。東北の太平洋側では、海はうねりを伴って大しけとなっていて、１０日は北海道の太平洋側でも大しけとなる見込みです。また、北海道の太平洋側では９日夜にかけて非常に強い風が吹き、最大風速は２０メートル、最大瞬間風速は３０メートルに達すると予想されています。気象庁は北日本から東北にかけての太平洋側を中心に、暴風や高波に警戒するよう呼びかけています。',
 'genre': ['気象', '災害'],
 'keywords': [],
 'url': 'http://www3.nhk.or.jp/news/html/20160809/k10010627901000.html',
 'datePublished': '2016-08-09T06:29',
 'dateModified': ''}

In [10]:
# Store `dic` in nhkweb{year}.json (replaced if its id is already present, appended otherwise).
# `dic` and `year` must have been set by earlier cells.
js(dic, year)

# clean

In [73]:
# check category: how many articles carry each genre label

year = 2017

with open(f'nhkweb{year}.json','r', encoding='utf-8') as f:
    data = json.load(f)
print('articles: ', len(data))

genre = Counter()
for dic in data:
    # one article may carry several genre labels; each one is counted
    genre.update(dic['genre'])
genre.most_common()

articles:  10740


[('国際', 4223),
 ('社会', 2827),
 ('政治', 1525),
 ('ビジネス', 1341),
 ('スポーツ', 943),
 ('気象・災害', 792),
 ('科学・文化', 783),
 ('暮らし', 519),
 ('地域', 423)]

In [68]:
# genre <> keywords
# Sort every label of an article into either `genre` (one of the fixed
# categories, after mapping older label names onto them) or `keywords`
# (everything else). Uses `year` set by an earlier cell.
with open(f'nhkweb{year}.json','r', encoding='utf-8') as f:
    data = json.load(f)

category = ['社会', '国際', 'ビジネス', 'スポーツ', '政治', '科学・文化', '暮らし', '地域', '気象・災害']
# other label names -> the categories they stand for.
# One table is applied to both the genre and the keywords lists; before, the
# two copy-pasted chains disagreed ("経済" was mapped only when found in genre).
alias = {
    '災害': ['気象・災害'],
    '気象': ['気象・災害'],
    '科学・医療': ['科学・文化'],
    '文化・エンタメ': ['科学・文化'],
    '科学': ['科学・文化'],
    '暮らし文化': ['暮らし', '科学・文化'],
    '経済': ['ビジネス'],
}


def split_labels(labels):
    """Split ``labels`` into (genres, keywords), both de-duplicated in first-seen order."""
    genres, keys = [], []
    for label in labels:
        if label in category:
            genres.append(label)
        elif label in alias:
            genres.extend(alias[label])
        else:
            keys.append(label)
    # dict.fromkeys removes duplicates but, unlike set(), keeps a stable order,
    # so re-running the cell does not reshuffle the file
    return list(dict.fromkeys(genres)), list(dict.fromkeys(keys))


for dic in data:
    dic['genre'], dic['keywords'] = split_labels(list(dic['genre']) + list(dic['keywords']))

# serialise first so that a failure cannot leave a truncated file behind
text = json.dumps(data, indent=4, ensure_ascii=False)
with open(f'nhkweb{year}.json','w', encoding='utf-8') as f:
    f.write(text)
    

In [56]:
# Count how often each id occurs in nhkweb{year}.json; a count above 1 means a duplicated record.
pd.read_json(f'nhkweb{year}.json').id.value_counts()

KeyboardInterrupt: 

# nhk thai

In [40]:
# Load the URL list stored in column 0 of nhkthailink.json,
# then show how many there are and the first ten.
urls = pd.read_json('nhkthailink.json')[0].tolist()
print(len(urls))
urls[:10]

1840


['https://web.archive.org/web/2019*/https://www3.nhk.or.jp//nhkworld/th/news/128206/',
 'https://web.archive.org/web/2019*/https://www3.nhk.or.jp//nhkworld/th/news/128207/',
 'https://web.archive.org/web/2019*/https://www3.nhk.or.jp//nhkworld/th/news/128208/',
 'https://web.archive.org/web/2019*/https://www3.nhk.or.jp//nhkworld/th/news/128209/',
 'https://web.archive.org/web/2019*/https://www3.nhk.or.jp//nhkworld/th/news/128210/',
 'https://web.archive.org/web/2019*/https://www3.nhk.or.jp//nhkworld/th/news/128211/',
 'https://web.archive.org/web/2019*/https://www3.nhk.or.jp//nhkworld/th/news/128212/',
 'https://web.archive.org/web/2019*/https://www3.nhk.or.jp//nhkworld/th/news/128213/',
 'https://web.archive.org/web/2019*/https://www3.nhk.or.jp//nhkworld/th/news/128214/',
 'https://web.archive.org/web/2019*/https://www3.nhk.or.jp//nhkworld/th/news/128215/']

In [44]:
# Browser session for the cells below; this rebinds the global `driver`
# to Firefox. The Chrome setup is kept here, commented out.
#options = Options()
#options.headless = True
#driver = webdriver.Chrome(options=options)

driver = webdriver.Firefox()

def writejson(diclist):
    """Merge ``diclist`` into the records of ``nhk.json`` and write them to ``nhk2.json``.

    Records of ``diclist`` that are not already in ``nhk.json`` (compared as
    whole dicts) are appended; the merged list is sorted by ``id`` and written
    to ``nhk2.json``. ``nhk.json`` itself is not modified.

    Parameters
    ----------
    diclist : list of dict
        New records; each must contain the key ``'id'``.
    """
    with open ('nhk.json', 'r', encoding='utf-8') as g:
        old_list = json.load(g)
    for dic in diclist:
        if dic not in old_list:
            old_list.append(dic)
    old_list = sorted(old_list, key=lambda x: x['id'])

    # Read, merge and serialise before nhk2.json is opened: opening with 'w'
    # truncates it, so an earlier failure used to leave an empty output file.
    text = json.dumps(old_list, ensure_ascii=False, indent=4)
    with open ('nhk2.json', 'w', encoding='utf-8') as f:
        f.write(text)

In [45]:
# Scrape every page of `urls` whose id is not yet in nhk.json and collect the
# records in `diclist`; writejson() then merges them into nhk2.json.
diclist = []
with open ('nhk.json', 'r', encoding='utf-8') as g:
    idlist = json.load(g)
idlist = {x['id'] for x in idlist}  # set: O(1) membership test per URL

try:
    for url in urls:
        ID = url.split('/')[-2]
        if ID in idlist:
            continue
        # up to five attempts to get a snapshot link that points at an nhkworld page
        for i in range(5):
            archiveurl = get_archiveurl(url, 5)
            if 'nhkworld' in archiveurl:
                break
        else:
            # no usable link after all attempts: skip instead of scraping a wrong page
            print(f'no snapshot found, skipped: {url}')
            continue
        print(archiveurl)
        driver.get(archiveurl)
        time.sleep(5)
        html = driver.page_source.encode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')
        data = soup.find_all('script', type="application/ld+json")[-1]
        data = json.loads(data.text)
        diclist.append({
            'headline': data['headline'],
            'article': data['articleBody'],
            'date': data['datePublished'],
            'url': url,
            'id': ID,
        })
finally:
    # save what has been collected even if the loop is interrupted or a page fails
    writejson(diclist)

https://web.archive.org/web/20190826060201/https://www3.nhk.or.jp/nhkworld/th/news/179442/
https://web.archive.org/web/20190826060147/https://www3.nhk.or.jp/nhkworld/th/news/179603/
https://web.archive.org/web/20190826060148/https://www3.nhk.or.jp/nhkworld/th/news/179604/
https://web.archive.org/web/20190826060150/https://www3.nhk.or.jp/nhkworld/th/news/179605/
https://web.archive.org/web/20190826060151/https://www3.nhk.or.jp/nhkworld/th/news/179606/
https://web.archive.org/web/20190826060152/https://www3.nhk.or.jp/nhkworld/th/news/179607/
https://web.archive.org/web/20190826060153/https://www3.nhk.or.jp/nhkworld/th/news/179608/
https://web.archive.org/web/20190828081234/https://www3.nhk.or.jp/nhkworld/th/news/179784/
https://web.archive.org/web/20190828081235/https://www3.nhk.or.jp/nhkworld/th/news/179785/
https://web.archive.org/web/20190828081235/https://www3.nhk.or.jp/nhkworld/th/news/179786/
https://web.archive.org/web/20190828081236/https://www3.nhk.or.jp/nhkworld/th/news/179787/

In [43]:
# Merge the records collected in `diclist` with nhk.json and write the result to nhk2.json.
writejson(diclist)