# NHK-Easy - Downloader 0x01

(C) Maxim Gansert, Mindscan, 2021

This notebook contains code that can be used to download easy Japanese content from the NHK website. The code reads the main index of NHK News Web Easy and can then download, process and store the article content and some additional files, such as the vocabulary / dictionary files.

It would be very nice if I could also download the MP3 audio files from Akamai and extract the transport stream. This would be a nice resource for reading and listening to Japanese news.

The content might then additionally be annotated with N5-N1 vocabulary (in the future).

In [None]:
import requests
import json
import re
import os
import time
import random
import shutil

from bs4 import BeautifulSoup

In [None]:
# Root folder of the local news cache (relative path). The per-article folders
# and the index_json folder used further below are located inside it.
LOCAL_NEWS_CACHE_FOLDER = '../data/nhk_easy/'

## Reading the catalog

The catalog should be saved too, so that we are able to extract additional information from the JSON files later on.

In [None]:
# request the json catalog of the news
r = requests.get('https://www3.nhk.or.jp/news/easy/news-list.json')
# 'utf-8-sig' drops a leading byte order mark (if there is one) when r.text is decoded
r.encoding = 'utf-8-sig'

# the parsed catalog; it is inspected in the following cells
main_list = json.loads(r.text)

In [None]:
# display the parsed catalog
main_list

## Pointer to the NHK WEB NEWS EASY Articles

A description for an NHK Article, e.g. title, locations. (Still TODO: Support all these timestamps.)

In [None]:
class NewsDataPointerNHK(object):
    """Accessor wrapper around one article entry of the NHK news catalog.

    The entry (a mapping) is stored unchanged in ``self.data``; each accessor
    returns the value of exactly one key of that mapping. The two
    ``...IfPossible`` methods derive the URLs of the easy article and of its
    vocabulary file from the regular article URL.
    """

    def __init__(self, pointer_data):
        self.data = pointer_data

    # --- titles and id -----------------------------------------------------

    def getTitle(self):
        """Return the 'title' entry."""
        return self.data['title']

    def getTitleWithRuby(self):
        """Return the 'title_with_ruby' entry."""
        return self.data['title_with_ruby']

    def getNewsID(self):
        """Return the 'news_id' entry."""
        return self.data['news_id']

    # --- media of the regular news article ---------------------------------

    def hasWebImage(self):
        """Return the 'has_news_web_image' entry."""
        return self.data['has_news_web_image']

    def getWebImageURI(self):
        """Return the 'news_web_image_uri' entry."""
        return self.data['news_web_image_uri']

    def hasWebMovie(self):
        """Return the 'has_news_web_movie' entry."""
        return self.data['has_news_web_movie']

    def getWebMovieURI(self):
        """Return the 'news_web_movie_uri' entry."""
        return self.data['news_web_movie_uri']

    # --- media of the easy news article ------------------------------------

    def hasEasyImage(self):
        """Return the 'has_news_easy_image' entry."""
        return self.data['has_news_easy_image']

    def getEasyImageURI(self):
        """Return the 'news_easy_image_uri' entry."""
        return self.data['news_easy_image_uri']

    def hasEasyMovie(self):
        """Return the 'has_news_easy_movie' entry."""
        return self.data['has_news_easy_movie']

    def getEasyMovieURI(self):
        """Return the 'news_easy_movie_uri' entry."""
        return self.data['news_easy_movie_uri']

    def hasEasyVoice(self):
        """Return the 'has_news_easy_voice' entry."""
        return self.data['has_news_easy_voice']

    def getEasyVoiceURI(self):
        """Return the 'news_easy_voice_uri' entry."""
        return self.data['news_easy_voice_uri']

    # --- URLs ----------------------------------------------------------------

    def getNewsWebURL(self):
        """Return the 'news_web_url' entry.

        Example: https://www3.nhk.or.jp/news/html/20210512/k10013025211000.html
        """
        return self.data['news_web_url']

    def _easyArticleURL(self):
        """Rewrite the regular article URL into its /easy/<news id>/ form."""
        regular_url = self.getNewsWebURL()
        easy_folder = '/easy/' + self.getNewsID() + '/'
        return re.sub(r"\/html\/.*\/", easy_folder, regular_url)

    def getNewsWebURLEasyIfPossible(self):
        """Return the easy article URL, or the regular URL without easy voice.

        Structure of the easy news article URL:
        https://www3.nhk.or.jp/news/easy/k10013025211000/k10013025211000.html
        """
        easy_url = self._easyArticleURL()
        if self.hasEasyVoice():
            return easy_url
        return self.getNewsWebURL()

    def getVocabularyURLIfPossible(self):
        """Return the vocabulary URL, or the regular URL without easy voice.

        Example:
        https://www3.nhk.or.jp/news/easy/k10013025211000/k10013025211000.out.dic
        """
        vocabulary_url = re.sub(r"\.html", '.out.dic', self._easyArticleURL())
        if self.hasEasyVoice():
            return vocabulary_url
        return self.getNewsWebURL()
        


In [None]:
def calcDacheDirectoryName(pointer:NewsDataPointerNHK):
    """Return the local cache folder for the article described by *pointer*.

    The folder name is derived from the regular article URL: everything up to
    and including ``/html/`` is dropped, ``.html`` is removed and the remaining
    slashes are turned into underscores, e.g.
    ``https://www3.nhk.or.jp/news/html/20210512/k10013025211000.html`` becomes
    ``20210512_k10013025211000``. That name is joined onto
    ``LOCAL_NEWS_CACHE_FOLDER``.

    Args:
        pointer: catalog entry wrapper providing ``getNewsWebURL()``.

    Returns:
        The path of the cache folder as a string.
    """
    web_url = pointer.getNewsWebURL()
    # Raw string patterns: '\/' and '\.' inside a normal string literal are
    # invalid escape sequences and trigger a warning when the code is compiled.
    web_url = re.sub(r'.*/html/', '', web_url)
    web_url = re.sub(r'\.html', '', web_url)
    web_url = web_url.replace('/', '_')
    return os.path.join(LOCAL_NEWS_CACHE_FOLDER, web_url)
    

## Indexing and Building the Content Cache

In [None]:
def _download_binary(url, target_path):
    """Stream the resource at *url* into the binary file *target_path*."""
    # The context manager closes the response (and releases its connection)
    # even when writing fails; iter_content() yields the body with any
    # gzip/deflate content encoding already undone, unlike response.raw.
    with requests.get(url, stream=True) as response:
        with open(target_path, 'wb') as out_file:
            for chunk in response.iter_content(chunk_size=8192):
                out_file.write(chunk)


def index_easy_nhk_news_article(news_date, nhk_pointer: NewsDataPointerNHK):
    """Download one easy news article into its local cache folder.

    Nothing happens when the cache folder of the article already exists.
    Otherwise the folder is created and filled with:

    * ``extracted.html``      - title and article body wrapped in a small page,
    * ``dict_vocab.json``     - the vocabulary / dictionary file of the article,
    * ``news_web_image.jpg``  - only if ``nhk_pointer.hasWebImage()``,
    * ``news_easy_image.jpg`` - only if ``nhk_pointer.hasEasyImage()``.

    Args:
        news_date: date key of the catalog entry (currently not used).
        nhk_pointer: wrapper around the catalog entry of the article.
    """
    cache_dir = calcDacheDirectoryName(nhk_pointer)

    # An existing folder marks the article as already indexed.
    # NOTE(review): the folder is created before anything is downloaded, so a
    # failed download leaves an incomplete folder that is skipped from then on.
    if os.path.isdir(cache_dir):
        return

    # makedirs (instead of mkdir) also creates missing parent folders, e.g. on
    # the very first run when the cache root does not exist yet.
    os.makedirs(cache_dir)

    news_uri = nhk_pointer.getNewsWebURLEasyIfPossible()
    r = requests.get(news_uri)
    r.encoding = 'utf-8'

    soup = BeautifulSoup(r.text, 'html.parser')
    title = soup.find('h1', attrs={'class':'article-main__title'})
    article = soup.find('div', attrs={'id':'js-article-body'})

    # Replace every link inside the article body by its content.
    for a in article.find_all('a'):
        a.unwrap()

    # save the extracted title and article body as a minimal html page
    with open(os.path.join(cache_dir, 'extracted.html'), 'w', encoding="utf-8") as contentFile:
        print("<!DOCTYPE html>", file=contentFile)
        print("<html lang='ja'>", file=contentFile)
        print("<head><meta charset='utf-8'></head>", file=contentFile)
        print("<style>p { font-size: 150%; line-height: 3.2; padding-bottom: 20px; }</style>", file=contentFile)
        print("<body>", file=contentFile)
        print(title, file=contentFile)
        print(article, file=contentFile)
        print("</body>", file=contentFile)
        print("</html>", file=contentFile)

    # save the dictionary as well:
    # e.g. https://www3.nhk.or.jp/news/easy/k10013023931000/k10013023931000.out.dic
    # The download and parsing happen before the file is opened, so a failing
    # request does not leave an empty dict_vocab.json behind.
    vocab_url = nhk_pointer.getVocabularyURLIfPossible()
    vocab = requests.get(vocab_url)
    vocab.encoding = 'utf-8-sig'
    vocab_dict = json.loads(vocab.text)
    with open(os.path.join(cache_dir, 'dict_vocab.json'), 'w', encoding="utf-8") as vocabFile:
        json.dump(vocab_dict, vocabFile)

    # save the images of this article
    if nhk_pointer.hasWebImage():
        _download_binary(nhk_pointer.getWebImageURI(),
                         os.path.join(cache_dir, 'news_web_image.jpg'))

    if nhk_pointer.hasEasyImage():
        # the easy image uri is combined with the easy folder of the article
        imageurl = 'https://www3.nhk.or.jp/news/easy/' + nhk_pointer.getNewsID() + '/' + nhk_pointer.getEasyImageURI()
        _download_binary(imageurl, os.path.join(cache_dir, 'news_easy_image.jpg'))

    # TODO: we should also save the audio transport stream
    # TODO: we should also save the movie if exists

    # Debug output of the raw catalog entry. It is read from the pointer
    # itself, not from a global variable that only exists while the main loop
    # of the notebook is running.
    print(nhk_pointer.data.keys())
    print(nhk_pointer.data)

def index_nhk_news_article(news_date, news_data_pointer):
    """Print a summary of one catalog entry and index it if it has easy voice.

    Args:
        news_date: date key under which the entry is listed in the catalog.
        news_data_pointer: raw catalog entry (mapping) of one article.
    """
    pointer = NewsDataPointerNHK(news_data_pointer)

    print("title: {}\n       {}".format(pointer.getTitle(), pointer.getTitleWithRuby()) )

    # the remaining summary lines all follow the same "<label> <url>" pattern
    url_lines = (
        ("url:   {}", pointer.getNewsWebURL),
        ("easy:  {}", pointer.getNewsWebURLEasyIfPossible),
        ("vocab: {}", pointer.getVocabularyURLIfPossible),
    )
    for template, url_getter in url_lines:
        print(template.format(url_getter()))

    # entries without easy voice are only listed, not downloaded
    if not pointer.hasEasyVoice():
        return

    index_easy_nhk_news_article(news_date, pointer)


## The keys of the dictionary are date-information

When we inspect the keys, we can see how to proceed from here: for each date there is a list of news articles.

In [None]:
# display the keys of the first element of the catalog
main_list[0].keys()

## The Main Loop for building the index

The indexer should work at a slow pace, e.g. one article every 3 minutes or so (the loop below currently pauses for a random 30 to 75 seconds after each article). There is no need to bully the NHK website.

In [None]:
def getNHKNewsItems():
    """Fetch the news catalog, archive it locally and return its entries.

    The downloaded catalog is written to ``index_<unix timestamp>.json`` in the
    ``index_json`` folder below ``LOCAL_NEWS_CACHE_FOLDER``; the folder is
    created when it does not exist yet.

    Returns:
        The ``items()`` view of the first element of the catalog. The main
        loop treats each key as a date and each value as the list of article
        entries of that date.
    """
    r = requests.get('https://www3.nhk.or.jp/news/easy/news-list.json')
    # 'utf-8-sig' drops a leading byte order mark, which json.loads rejects
    r.encoding = 'utf-8-sig'
    contents = json.loads(r.text)

    # keep a timestamped copy of the catalog as it was downloaded
    index_dir = os.path.join(LOCAL_NEWS_CACHE_FOLDER, 'index_json')
    os.makedirs(index_dir, exist_ok=True)
    indexfilename = "index_{}.json".format(int(time.time()))
    with open(os.path.join(index_dir, indexfilename), 'w', encoding='utf-8') as indexfile:
        json.dump(contents, indexfile, indent=2)

    return contents[0].items()

# Walk through the catalog date by date and index every listed article.
for news_date, news_for_the_day in getNHKNewsItems():
    print(news_date)
    for news_data_pointer in news_for_the_day:
        index_nhk_news_article(news_date, news_data_pointer)
        # pause for a random 30-75 seconds after each article to keep the
        # request rate towards the website low
        time.sleep(random.randint(30,75))

# BeautifulSoup and Content Playground

## BeautifulSoup

In [None]:
# fetch a single easy news article page to experiment with
r = requests.get("https://www3.nhk.or.jp/news/easy/k10013025211000/k10013025211000.html")
r.encoding = 'utf-8'

    

In [None]:
# parse the page and pick the title element and the article body element
soup = BeautifulSoup(r.text, 'html.parser')
title = soup.find('h1', attrs={'class':'article-main__title'})
article = soup.find('div', attrs={'id':'js-article-body'})

In [None]:
# replace every link in the article body by its content
for a in article.findAll('a'):
    a.unwrap()

# display the resulting article body
article

In [None]:
# display the extracted title element
title

## Dictionary Playground

In [None]:
# fetch the vocabulary / dictionary file of a single article
r = requests.get('https://www3.nhk.or.jp/news/easy/k10013023931000/k10013023931000.out.dic')
# 'utf-8-sig' drops a leading byte order mark (if there is one) when r.text is decoded
r.encoding = 'utf-8-sig'

vocab = json.loads(r.text)

In [None]:
# display the parsed vocabulary
vocab