In [2]:
# Imports (third-party packages beautifulsoup4, selectolax, python-dateutil and nltk must be installed)
import urllib
from bs4 import BeautifulSoup 
from selectolax.parser import HTMLParser

from urllib.request import urlparse, urljoin
import json 
from datetime import datetime
from dateutil.parser import parse
import nltk

In [3]:
class Crawler(object):
    """Keyword-focused crawler that stays on the seed page's site.

    Starting from a seed URL, the crawler follows ``<a href>`` links whose
    anchor text or URL contains at least one of ``keywords`` and whose host
    is the seed host (or a sub-domain of it). Accepted links accumulate in
    ``internal_urls`` and are available through :meth:`get_links`.

    Attributes:
        links: links accepted from the most recently fetched page.
        keywords: iterable of substrings a link must contain to be followed.
        max_urls: maximum number of pages fetched per crawl session.
        internal_urls: set of every accepted link since the last ``reset()``.
        external_urls: reserved set; nothing in this class adds to it.
        total_urls_visited: number of pages fetched since the last ``reset()``.
    """

    def __init__(self, keywords, max_urls=50):
        """Store the keyword filter and page budget, then clear crawl state."""
        self.links = []
        self.keywords = keywords
        self.max_urls = max_urls
        self.reset()

    def reset(self):
        """Forget all collected links and reset the fetched-page counter."""
        self.internal_urls = set()
        self.external_urls = set()
        self.total_urls_visited = 0

    def get_links(self):
        """Return the set of accepted links collected so far."""
        return self.internal_urls

    def crawl(self, url):
        """Fetch ``url`` and recursively follow its accepted links (depth-first).

        At most ``max_urls`` pages are fetched between two ``reset()`` calls.
        Results are accumulated on the instance (see :meth:`get_links`).

        params:
            url (str): absolute URL of the page to start from.
        """
        # Check the budget BEFORE fetching so that exactly `max_urls` pages
        # are downloaded (checking only in the caller's loop costs one
        # extra request once the limit is reached).
        if self.total_urls_visited >= self.max_urls:
            return
        self.total_urls_visited += 1
        print("crawling", self.total_urls_visited, "/", self.max_urls)

        page_links = self.get_all_website_links_selectolax(url)
        # Kept as an attribute for backward compatibility; iteration uses the
        # local so that recursive calls cannot disturb this frame's loop.
        self.links = page_links

        for link in page_links:
            if self.total_urls_visited >= self.max_urls:
                break
            self.crawl(link)

    def is_valid(self, url):
        """Return True when ``url`` has both a scheme and a network location."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    @staticmethod
    def _is_same_site(netloc, domain_name):
        """Return True when ``netloc`` is ``domain_name`` or one of its sub-domains."""
        netloc = netloc.lower()
        domain_name = domain_name.lower()
        return netloc == domain_name or netloc.endswith("." + domain_name)

    def get_all_website_links_selectolax(self, url):
        """Download ``url`` and return the new, keyword-matching same-site links.

        Every returned link is also added to ``self.internal_urls``. Links are
        normalised to ``scheme://netloc/path`` (query string and fragment are
        dropped). If the page cannot be fetched or parsed, a message is
        printed and an empty set is returned.

        params:
            url (str): absolute URL of the page to download.
        returns:
            set of normalised link strings first seen on this page.
        """
        urls = set()

        domain_name = urlparse(url).netloc

        try:
            # `with` closes the HTTP response even when parsing fails.
            with urllib.request.urlopen(url) as response:
                tree = HTMLParser(response.read())
        except Exception as exc:
            # Best effort: an unreachable page must not stop the crawl, but
            # unlike a bare `except:` this lets KeyboardInterrupt through.
            print("skipping", url, "-", exc)
            return urls

        for a_tag in tree.css("a"):
            href = a_tag.attributes.get("href")
            if not href:
                # missing, empty or value-less href attribute
                continue

            try:
                parsed_href = urlparse(urljoin(url, href))
            except ValueError:
                # malformed href (e.g. broken IPv6 literal); ignore the link
                continue
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path

            if not self.is_valid(href):
                # not a valid URL
                continue
            if href in self.internal_urls:
                # already in the set
                continue
            if not self._is_same_site(parsed_href.netloc, domain_name):
                # External link. Compare hosts rather than searching for the
                # domain anywhere in the URL, which would also accept
                # foreign URLs that merely mention the domain in their path.
                continue

            a_tag_text = a_tag.text(deep=True, separator='', strip=False)
            if not any(word in a_tag_text or word in href for word in self.keywords):
                # neither the anchor text nor the URL contains a keyword
                continue

            urls.add(href)
            self.internal_urls.add(href)
        return urls

In [4]:
class NewsSource(object):
    """Base class for one news site: collects article links, scrapes, exports.

    Construction immediately runs the supplied crawler from ``seed_url`` and
    keeps the links that pass :meth:`filter_links`. Subclasses override
    :meth:`filter_links` and :meth:`scrape`; ``scrape`` is expected to store
    its result in ``self.output`` so that :meth:`export` can write it.
    """

    def __init__(self, journal, seed_url, crawler):
        """Crawl from ``seed_url`` and store the filtered links.

        params:
            journal (str): name of the news outlet.
            seed_url (str): page the crawl starts from.
            crawler: object providing ``reset()``, ``crawl(url)`` and
                ``get_links()``.
        """
        self.journal = journal
        # Nothing has been scraped yet; export() refuses to run until set.
        self.output = None

        print('Crawling for sources using seed...')
        print(seed_url)
        # The crawler may be shared between sources, so clear earlier results.
        crawler.reset()
        crawler.crawl(seed_url)
        print('Crawling done')

        print('Filtering links...')
        self.links = self.filter_links(crawler.get_links())
        print('Done.')

    def get_output(self):
        """Return the scraped data, or None if nothing has been stored yet."""
        return self.output

    def get_links(self):
        """Return the links kept by ``filter_links``."""
        return self.links

    def export(self, filename):
        """Write ``self.output`` as JSON to ``<filename>.json``.

        Prints a hint and writes nothing when ``self.output`` is None.
        The file is UTF-8 encoded and non-ASCII text is written as-is, so the
        result is human-readable and independent of the platform's default
        encoding.
        """
        if self.output is None:
            print('Run scrape first!')
            return

        with open(filename + '.json', 'w', encoding='utf-8') as outfile:
            json.dump(self.output, outfile, ensure_ascii=False)

    def filter_links(self, links):
        """Hook for subclasses; the base implementation keeps every link."""
        return links

    def scrape(self):
        """Announce the scrape; subclasses extend this to fetch articles."""
        print('Scraping', len(self.links), 'sources on', self.journal)

In [33]:
class LibertyTimes(NewsSource):
    """News source for Liberty Times article pages."""

    def __init__(self, seed_url, crawler):
        """Crawl from ``seed_url`` under the journal name 'LibertyTimes'."""
        super().__init__('LibertyTimes', seed_url, crawler)

    def filter_links(self, links):
        """Keep every link except those containing '/topic/'."""
        return {link for link in links if '/topic/' not in link}

    def filter_sentence(self, line):
        """Return False when ``line`` contains one of the boilerplate phrases."""
        filtered = ['請繼續往下閱讀...', '點我下載APP', '更多相關訊息', '更新時間']
        return not any(phrase in line for phrase in filtered)

    def scrape(self):
        """Download every stored link and collect the articles in ``self.output``.

        Each article is a dict with the keys 'content' (list of paragraph
        strings), 'headline', 'time-stamp', 'url' and 'journal'. 'headline'
        and 'time-stamp' are None when the page lacks the expected element.
        Links that cannot be downloaded are reported and skipped.
        """
        super().scrape()
        articles = []
        total = len(self.links)

        for i, URL in enumerate(self.links):
            print(i+1,'/',total,URL)
            try:
                # `with` closes the HTTP response once the body is read.
                with urllib.request.urlopen(URL) as r:
                    sll = HTMLParser(r.read())
            except OSError as exc:
                # URLError/HTTPError and socket errors are OSError subclasses;
                # one dead link should not discard the articles gathered so far.
                print('Skipping', URL, '-', exc)
                continue

            # css_first returns None when nothing matches, so guard before
            # calling .text() instead of crashing on unexpected page layouts.
            headline_node = sll.css_first('.whitecon > h1')
            headline = headline_node.text() if headline_node is not None else None
            timestamp_node = sll.css_first('.time')
            timestamp = timestamp_node.text().strip() if timestamp_node is not None else None
            print(headline)
            main_article = sll.css('.whitecon > .text > p')

            story = {}
            story['content'] = []
            story['headline'] = headline
            story['time-stamp'] = timestamp
            story['url'] = URL
            story['journal'] = self.journal

            for paragraph in main_article:
                line = paragraph.text(deep=True, separator='')
                line = line.replace('\n','')
                # Drop empty paragraphs and site boilerplate.
                if line and self.filter_sentence(line):
                    story['content'].append(line)

            articles.append(story)

        self.output = articles
            

In [66]:
class UDN(NewsSource):
    """News source for UDN story pages."""

    def __init__(self, seed_url, crawler):
        """Crawl from ``seed_url`` under the journal name 'UDN'."""
        super().__init__('UDN', seed_url, crawler)

    def filter_links(self, links):
        """Keep only links that contain 'udn.com/news/story/'."""
        return {link for link in links if 'udn.com/news/story/' in link}

    def scrape(self):
        """Download every stored link and collect the articles in ``self.output``.

        Each article is a dict with the keys 'content' (list of paragraph
        strings), 'headline', 'time-stamp', 'url' and 'journal'. 'headline'
        and 'time-stamp' are None when the page lacks the expected element.
        Links that cannot be downloaded are reported and skipped.
        """
        super().scrape()
        articles = []
        total = len(self.links)

        for i, URL in enumerate(self.links):
            print(i+1,'/',total,URL)
            try:
                # `with` closes the HTTP response once the body is read.
                with urllib.request.urlopen(URL) as r:
                    sll = HTMLParser(r.read())
            except OSError as exc:
                # URLError/HTTPError and socket errors are OSError subclasses;
                # one dead link should not discard the articles gathered so far.
                print('Skipping', URL, '-', exc)
                continue

            # css_first returns None when the selector matches nothing.
            headline = sll.css_first('.article-content__title')
            if headline is not None:
                headline = headline.text(deep=True, separator='')
            timestamp = sll.css_first('.article-content__time')
            if timestamp is not None:
                timestamp = timestamp.text(deep=True, separator='').strip()
            main_article = sll.css('.article-content__editor > p')
            print(headline)

            story = {}
            story['content'] = []
            story['headline'] = headline
            story['time-stamp'] = timestamp
            story['url'] = URL
            story['journal'] = self.journal

            for paragraph in main_article:
                line = paragraph.text(deep=True, separator='')
                line = line.replace('\n','')
                # Skip paragraphs that are empty once newlines are removed.
                if line:
                    story['content'].append(line)

            articles.append(story)

        self.output = articles

In [67]:
# Shared crawler: follow links mentioning any of these keywords, up to 100 pages per source.
crawler = Crawler(
    keywords=['武漢肺炎', '疫情', '病毒', '新冠肺炎'],
    max_urls=100,
)

In [68]:
# Crawl UDN starting from one seed story, scrape the articles found, and save them to udn.json.
udn_url = 'https://udn.com/news/story/120936/4387285?from=ddd-umaylikenews_ch2'
udn = UDN(seed_url=udn_url, crawler=crawler)
udn.scrape()
udn.export(filename='udn')

Crawling for sources using seed...
https://udn.com/news/story/120936/4387285?from=ddd-umaylikenews_ch2
crawling 1 / 100
crawling 2 / 100
crawling 3 / 100
crawling 4 / 100
crawling 5 / 100
crawling 6 / 100
crawling 7 / 100
crawling 8 / 100
crawling 9 / 100
crawling 10 / 100
crawling 11 / 100
crawling 12 / 100
crawling 13 / 100
crawling 14 / 100
crawling 15 / 100
crawling 16 / 100
crawling 17 / 100
crawling 18 / 100
crawling 19 / 100
crawling 20 / 100
crawling 21 / 100
crawling 22 / 100
crawling 23 / 100
crawling 24 / 100
crawling 25 / 100
crawling 26 / 100
crawling 27 / 100
crawling 28 / 100
crawling 29 / 100
crawling 30 / 100
crawling 31 / 100
crawling 32 / 100
crawling 33 / 100
crawling 34 / 100
crawling 35 / 100
crawling 36 / 100
crawling 37 / 100
crawling 38 / 100
crawling 39 / 100
crawling 40 / 100
crawling 41 / 100
crawling 42 / 100
crawling 43 / 100
crawling 44 / 100
crawling 45 / 100
crawling 46 / 100
crawling 47 / 100
crawling 48 / 100
crawling 49 / 100
crawling 50 / 100
crawli

In [36]:
# Crawl Liberty Times starting from one seed story and scrape the articles found.
lt_url = 'https://news.ltn.com.tw/news/world/breakingnews/3087611'
lt = LibertyTimes(seed_url=lt_url, crawler=crawler)
lt.scrape()
# Export is currently disabled; uncomment to write lt.json.
#lt.export('lt')

Crawling for sources using seed...
https://news.ltn.com.tw/news/world/breakingnews/3087611
crawling 1 / 100
crawling 2 / 100
crawling 3 / 100
crawling 4 / 100
crawling 5 / 100
crawling 6 / 100
crawling 7 / 100
crawling 8 / 100
crawling 9 / 100
crawling 10 / 100
crawling 11 / 100
crawling 12 / 100
crawling 13 / 100
crawling 14 / 100
crawling 15 / 100
crawling 16 / 100
crawling 17 / 100
crawling 18 / 100
crawling 19 / 100
crawling 20 / 100
crawling 21 / 100
crawling 22 / 100
crawling 23 / 100
crawling 24 / 100
crawling 25 / 100
crawling 26 / 100
crawling 27 / 100
crawling 28 / 100
crawling 29 / 100
crawling 30 / 100
crawling 31 / 100
crawling 32 / 100
crawling 33 / 100
crawling 34 / 100
crawling 35 / 100
crawling 36 / 100
crawling 37 / 100
crawling 38 / 100
crawling 39 / 100
crawling 40 / 100
crawling 41 / 100
crawling 42 / 100
crawling 43 / 100
crawling 44 / 100
crawling 45 / 100
crawling 46 / 100
crawling 47 / 100
crawling 48 / 100
crawling 49 / 100
crawling 50 / 100
crawling 51 / 100
