# Crawl vnexpress.net

In [1]:
import dateparser

# Sanity check for dateparser: feed it a Vietnamese-formatted timestamp
# (weekday name, d/m/yyyy date, hh:mm time, "(GMT+7)" offset) and print what
# it parses to. Presumably this is the format of the publish date shown on
# vnexpress.net article pages — confirm against the live markup.
text = 'Thứ hai, 18/9/2023, 16:26 (GMT+7)'
# parse() takes the raw string directly; no explicit format or locale is given.
date = dateparser.parse(text)
print(date)

2023-09-18 16:26:00+07:00


In [11]:
from calendar import c
import re
import requests
import json
from bs4 import BeautifulSoup
import crawl
from datetime import datetime, timedelta
import dateparser
import importlib
importlib.reload(crawl)


class VnexpressCrawler():
    """Crawler for the paginated article listings of one vnexpress.net section.

    ``run`` walks the listing pages, ``crawl_html`` extracts every article link
    found on one listing page, ``get_page_detail`` downloads the article itself,
    and each article is handed to ``crawl.create_article``.
    """

    base_url = 'https://vnexpress.net'
    # Path of the crawled section. Listing page N is fetched from
    # ``base_url + section_path + '-p' + str(N)``.
    section_path = '/kinh-doanh'
    # Seconds to wait on the network before requests gives up; without a
    # timeout a stalled connection would block the crawl forever.
    request_timeout = 30
    page = 1  # not read by any method of this class; kept for compatibility
    # When True, crawl_html() stops as soon as crawl.create_article() returns
    # False (presumably "article already stored" — confirm in crawl module).
    daily = False

    def __init__(self):
        """Create a crawler; no per-instance state is needed up front."""
        pass

    def get_response(self, url, params=None):
        """Send a GET request and return the ``requests`` response object.

        Args:
            url: Absolute URL to fetch.
            params: Optional query-string parameters; ``None`` means none.

        Returns:
            The response returned by ``requests.get`` (any status code).

        Raises:
            requests.exceptions.RequestException: On connection problems or
                when ``request_timeout`` seconds pass without an answer.
        """
        # NOTE(review): these look like response-side (CORS / body-encoding)
        # headers rather than request headers. They are sent unchanged because
        # it is unclear whether the site depends on them.
        headers = {
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Credentials': 'true',
            'Access-Control-Allow-Methods': 'GET, HEAD, POST, PUT, DELETE, TRACE, OPTIONS, PATCH',
            'Access-Control-Allow-Headers': """X-Real-IP,X-AGENT,Pragma,X-REFERER,X-AUTH-TOKEN,Accept-Encoding,channel,X-XSS-Protection,X-Content-Type-Options,Strict-Transport-Security,Content-Type,Authorization,Accept,Origin,User-Agent,DNT,Cache-Control,X-Mx-ReqToken,Keep-Alive,X-Requested-With,If-Modified-Since,token-id""",
            'Content-Encoding': 'gzip'
        }
        params = {} if params is None else params
        return requests.get(
            url,
            params=params,
            headers=headers,
            timeout=self.request_timeout,
        )

    def get_page_detail(self, url, params=None):
        """Download one article page and extract its publish date and body.

        Args:
            url: Article link, absolute or relative to ``base_url``.
            params: Optional query-string parameters; ``None`` means none.

        Returns:
            ``{'date': <parsed date or None>, 'content': <body text or ''>}``
            on a 2xx response, or an empty dict when the request failed.
        """
        # Imported here so the method carries its own dependency.
        from urllib.parse import urljoin

        params = {} if params is None else params
        # urljoin leaves absolute links (including other hosts) untouched and
        # resolves relative ones against the site root; plain concatenation
        # would corrupt both an off-site link and a path without leading '/'.
        url = urljoin(self.base_url, url)

        print('get_page_detail', url, params)
        response = self.get_response(url, params)
        if not 200 <= response.status_code < 300:
            # If unsuccessful, print the status code and reason for failure
            print(f"Request failed with status code {response.status_code}: {response.reason}")
            return {}

        soup = BeautifulSoup(response.content, 'html.parser')

        # Publish date as shown in the article header, if present.
        publish_date = None
        publishdate_elem = soup.select_one('.header-content span.date')
        if publishdate_elem:
            publish_date = dateparser.parse(publishdate_elem.get_text().strip())

        # Article body; stays an empty string when the container is missing.
        content_detail = ''
        content_detail_elem = soup.select_one('article.fck_detail')
        if content_detail_elem:
            content_detail = content_detail_elem.get_text()

        return {
            'date': publish_date,
            'content': content_detail
        }

    def crawl_html(self, url, params=None):
        """Crawl one listing page and hand every linked article to storage.

        Args:
            url: Absolute URL of the listing page.
            params: Optional query-string parameters; ``None`` means none.

        Returns:
            ``False`` when crawling should stop (request failed, the page lists
            no articles, or — in daily mode — ``crawl.create_article`` returned
            ``False``); ``True`` when the whole page was processed.
        """
        params = {} if params is None else params
        print('start:crawl_html:', url, params)
        response = self.get_response(url, params)
        if not 200 <= response.status_code < 300:
            # If unsuccessful, print the status code and reason for failure
            print(f"Request failed with status code {response.status_code}: {response.reason}")
            return False

        soup = BeautifulSoup(response.content, 'html.parser')

        # Every article on the listing is an <a> inside a .title-news element.
        title_elements = soup.select('.title-news a')
        if not title_elements:
            return False

        for title_element in title_elements:
            # NOTE(review): get_text() keeps surrounding whitespace/newlines.
            # Left untouched because stored titles may take part in duplicate
            # detection — confirm before stripping.
            title = title_element.get_text()
            link = title_element.get('href')
            if not link:
                # An anchor without a target cannot be crawled; skip it rather
                # than abort the whole page.
                continue

            print("Title:", title)
            print("Link:", link)
            data_detail = self.get_page_detail(link, {})
            item = {
                'domain': self.base_url + self.section_path,
                'title': title,
                'url': link,
                'date': data_detail.get('date'),
                'content': data_detail.get('content'),
            }

            if self.daily:
                # Daily mode: a False result ends the crawl (presumably the
                # article is already stored — verify in crawl.create_article).
                result = crawl.create_article(item, False)
                if result is False:
                    return False
            else:
                crawl.create_article(item, True)

        return True

    def run(self, page=None, daily=False):
        """Crawl listing pages one after another until told to stop.

        Args:
            page: Optional dict with keys ``'from'`` and ``'to'`` (inclusive
                page numbers). A missing or ``None`` ``'from'`` means page 1;
                a missing, ``None`` or negative ``'to'`` means no upper bound.
            daily: Stored on ``self.daily``; see ``crawl_html``.

        The loop ends when ``crawl_html`` returns ``False`` or the page range
        is exhausted. An empty range (``from`` greater than ``to``) crawls
        nothing instead of running past the bound.
        """
        self.daily = daily
        page = {} if page is None else page

        current = page.get('from')
        if current is None:
            current = 1
        last = page.get('to')
        unbounded = last is None or last < 0

        while unbounded or current <= last:
            listing_url = f'{self.base_url}{self.section_path}-p{current}'
            if self.crawl_html(listing_url, {}) is False:
                break
            current += 1


# Start a crawl in daily mode. No page range is given, so run() starts at
# listing page 1 and keeps going until crawl_html() returns False.
VnexpressCrawler().run(daily=True)
# Commented-out debugging calls kept for reference; `obj` and `crawl_json`
# are not defined in this cell (presumably left over from earlier experiments).
# obj.get_page_detail('/von-ngoai-bat-dau-tang-toc-185230818163354771.htm')
# crawl_json('https://finfo-api.vndirect.com.vn/v4/news?q=newsType:company_report~locale:VN~newsSource:VNDIRECT&sort=newsDate:desc~newsTime:desc&size=20&page=3', {})

start:crawl_html: https://vnexpress.net/kinh-doanh-p1 {}
Title: 
Được công nhận là nền kinh tế thị trường ý nghĩa gì với Việt Nam? 
Link: https://vnexpress.net/duoc-cong-nhan-la-nen-kinh-te-thi-truong-y-nghia-gi-voi-viet-nam-4656643.html
get_page_detail https://vnexpress.net/duoc-cong-nhan-la-nen-kinh-te-thi-truong-y-nghia-gi-voi-viet-nam-4656643.html {}
