In [8]:
from datetime import datetime, timedelta

def parse_date_publish(date_time_string):
    """Parse a publish timestamp such as ``"18/08/2023 15:00 GMT+7"``.

    Only the first two whitespace-separated tokens (date and time) are used.
    Any trailing timezone token (e.g. ``GMT+7``) is accepted but ignored, so
    the result is a naive datetime holding the wall-clock time as written.

    Args:
        date_time_string: Text in the form ``dd/mm/YYYY HH:MM [timezone]``.
            Leading/trailing whitespace and repeated spaces are tolerated.

    Returns:
        A naive ``datetime`` built from the date and time tokens.

    Raises:
        ValueError: If fewer than two tokens are present, or the date/time
            tokens do not match ``%d/%m/%Y %H:%M``.
    """
    tokens = date_time_string.split()
    if len(tokens) < 2:
        raise ValueError(
            f"expected '<dd/mm/YYYY> <HH:MM> [timezone]', got {date_time_string!r}"
        )
    # The timezone token was previously converted with int() but never used,
    # which only made the parser fail on inputs like "GMT" or a missing zone.
    date_string, time_string = tokens[:2]
    combined_string = f"{date_string} {time_string}"
    print('parse_date_publish', combined_string)
    # Parse the combined string into a datetime object
    return datetime.strptime(combined_string, "%d/%m/%Y %H:%M")

# Smoke check: feed one sample publish timestamp through the parser.
date_time_string = "18/08/2023 15:00 GMT+7"
parsed_datetime = parse_date_publish(date_time_string)

# Show the parsed value as "YYYY-MM-DD HH:MM:SS".
print(format(parsed_datetime, "%Y-%m-%d %H:%M:%S"))


parse_date_publish 18/08/2023 15:00
2023-08-18 15:00:00


In [31]:
import re
import requests
import json
from bs4 import BeautifulSoup
import crawl
from datetime import datetime, timedelta

import importlib
importlib.reload(crawl)

class CrawlThanhNien:
    """Crawler for listing pages and article pages on thanhnien.vn.

    ``run`` walks numbered listing pages, ``crawl_html`` extracts the article
    links from one listing page and stores each article through
    ``crawl.create_article``, and ``get_page_detail`` fetches the publish date
    and body text of a single article.
    """

    # Default source label; replaced per instance in __init__.
    magazine = 'thanhnien'
    # Prefix prepended to the relative article links found on listing pages.
    base_url = 'https://thanhnien.vn'
    # Listing page currently being crawled; reset at the start of run().
    page = 1
    # Seconds requests may wait on the server before raising, so a stalled
    # connection cannot hang the crawl indefinitely.
    request_timeout = 30

    def __init__(self, magazine, is_update = False):
        """Store the source label and the flag forwarded to ``crawl.create_article``."""
        self.magazine = magazine
        self.is_update = is_update

    def get_response(self, url, params=None):
        """Issue a GET request and return the raw ``requests`` response.

        Args:
            url: Absolute URL to fetch.
            params: Optional query-string parameters.

        Returns:
            The ``requests.Response``; the status code is not checked here.
        """
        # NOTE(review): these are CORS/response-style headers and probably have
        # no effect when sent on a request; kept unchanged so the server sees
        # exactly what it saw before - confirm whether they can be dropped.
        headers = {
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Credentials': 'true',
            'Access-Control-Allow-Methods': 'GET, HEAD, POST, PUT, DELETE, TRACE, OPTIONS, PATCH',
            'Access-Control-Allow-Headers': 'X-Real-IP,X-AGENT,Pragma,X-REFERER,X-AUTH-TOKEN,Accept-Encoding,channel,X-XSS-Protection,X-Content-Type-Options,Strict-Transport-Security,Content-Type,Authorization,Accept,Origin,User-Agent,DNT,Cache-Control,X-Mx-ReqToken,Keep-Alive,X-Requested-With,If-Modified-Since,token-id',
            'Content-Encoding': 'gzip'
        }
        # Make an HTTP GET request to the endpoint using the requests library
        return requests.get(
            url, params=params, headers=headers, timeout=self.request_timeout
        )

    def get_page_detail(self, url, params=None):
        """Fetch one article page and extract its publish date and body text.

        Args:
            url: Article path relative to ``base_url`` (e.g. ``/slug-123.htm``).
            params: Optional query-string parameters.

        Returns:
            ``{'date': datetime | None, 'content': str | None}`` on a 2xx
            response, or an empty dict when the request fails.
        """
        params = {} if params is None else params
        url = self.base_url + url
        print('get_page_detail', url, params)
        response = self.get_response(url, params)
        if response.status_code // 100 != 2:
            # If unsuccessful, print the status code and reason for failure
            print(f"Request failed with status code {response.status_code}: {response.reason}")
            return {}

        content_html = response.content
        print('content_html', content_html[:100])
        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(content_html, 'html.parser')
        publishdate_elem = soup.find('div', attrs={'data-role': 'publishdate'})
        content_detail_elem = soup.find('div', attrs={'class': 'detail__cmain-main'})

        if publishdate_elem:
            publishDate = parse_date_publish(publishdate_elem.get_text().strip())
        else:
            publishDate = None

        content_detail = content_detail_elem.get_text() if content_detail_elem else None

        return {
            'date': publishDate,
            'content': content_detail
        }

    def crawl_html(self, url, params=None):
        """Crawl one listing page and store every article linked from it.

        Args:
            url: Absolute URL of the listing page.
            params: Optional query-string parameters.

        Returns:
            ``False`` when crawling should stop: the request failed, the page
            lists no articles, or ``crawl.create_article`` returned ``False``
            for an article. ``True`` when every article on the page was
            handled and the next page should be tried.
        """
        params = {} if params is None else params
        print('crawl_html', url, params)
        response = self.get_response(url, params)
        if response.status_code // 100 != 2:
            # If unsuccessful, print the status code and reason for failure
            print(f"Request failed with status code {response.status_code}: {response.reason}")
            return False

        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the article title links
        title_elements = soup.find_all('a', class_='box-category-link-title')
        # An empty page marks the end of the listing. The previous check used
        # `is []`, which is never true, so run() could loop forever.
        if not title_elements:
            return False

        for title_element in title_elements:
            title = title_element.get_text()
            link = title_element['href']

            print("Title:", title)
            print("Link:", link)

            data_detail = self.get_page_detail(link, {})
            print('data_detail', data_detail)
            item = {
                'domain': 'https://thanhnien.vn/kinh-te.htm',
                'title': title,
                'url': link,
                'date': data_detail.get('date'),
                'content': data_detail.get('content'),
            }
            result = crawl.create_article(item, self.is_update)
            # Previously the method returned after the first article, so the
            # rest of the page was skipped. Now only an explicit False stops
            # the crawl, which is the same value run() has always reacted to.
            # NOTE(review): the meaning of create_article's return value is
            # not visible here - confirm that False means "stop crawling".
            if result == False:  # noqa: E712 - keep the original equality semantics
                return False
        return True

    def run(self, page = None, is_update = False):
        """Crawl listing pages in order until ``crawl_html`` signals a stop.

        Args:
            page: First listing page to fetch; defaults to 1.
            is_update: Currently unused; the flag given to ``__init__`` is the
                one passed to ``crawl.create_article``.
        """
        self.page = 1 if page is None else page
        while True:
            keep_going = self.crawl_html('https://thanhnien.vn/timelinelist/18549/' + str(self.page) + '.htm', {})
            if not keep_going:
                break
            self.page += 1


# Build the crawler without update mode and start from the first listing page.
obj = CrawlThanhNien(magazine='thanhnien', is_update=False)
obj.run()
# obj.get_page_detail('/von-ngoai-bat-dau-tang-toc-185230818163354771.htm')
# crawl_json('https://finfo-api.vndirect.com.vn/v4/news?q=newsType:company_report~locale:VN~newsSource:VNDIRECT&sort=newsDate:desc~newsTime:desc&size=20&page=3', {})

crawl_html https://thanhnien.vn/timelinelist/18549/1.htm {}
Title: Cổ phiếu VFS tại Việt Nam trùng tên chứng khoán VinFast trên sàn Nasdaq là ai?
Link: /co-phieu-vfs-tai-viet-nam-trung-ten-chung-khoan-vinfast-tren-san-nasdaq-la-ai-185230820085612713.htm
get_page_detail https://thanhnien.vn/co-phieu-vfs-tai-viet-nam-trung-ten-chung-khoan-vinfast-tren-san-nasdaq-la-ai-185230820085612713.htm {}
content_html b'<!DOCTYPE html>\n<html lang="vi">\n<head>\n    <meta http-equiv="Content-Type" content="text/html; char'
parse_date_publish 20/08/2023 11:50
data_detail {'date': datetime.datetime(2023, 8, 20, 11, 50), 'content': '\n\n\n\n\n\n\n\nMai Phương\n-  maiphuongthanhnien@gmail.com \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                20/08/2023 11:50 GMT+7\n            \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\r\n                            