In [1]:
from scrapy.crawler import CrawlerProcess
from scrapy import Spider
from scrapy.utils.project import get_project_settings

# Spider definition
class BlogSpider(Spider):
    """Crawl the newbreak.church blog listing and yield one item per post card.

    Each item is a dict with the keys ``title``, ``link``, ``date`` and
    ``image``. A value is ``None`` when the matching element is absent from
    the card.
    """

    name = 'blog_spider'
    start_urls = ['https://newbreak.church/blog/?gad_source=1&gclid=Cj0KCQiA7NO7BhDsARIsADg_hIb6THWfJcUdxD9HMXljPV1Vyxk0JRYy_zhQUtnw8Hmr-s6GB19FuZAaAmuIEALw_wcB']
    # Send a browser-like User-Agent with every request made by this spider.
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
        }
    }

    def parse(self, response):
        """Yield one dict per post card, then a request for the next listing page.

        Args:
            response: the Scrapy response for one blog listing page.

        Yields:
            Item dicts for every ``div.fl-post-column`` on the page, followed
            by a follow-up request when a "next page" link is present.
        """
        # Collect the list of post cards on this page
        for blog in response.css('div.fl-post-column'):
            yield {
                'title': blog.css('h2.fl-post-grid-title a::text').get(),
                'link': blog.css('h2.fl-post-grid-title a::attr(href)').get(),
                'date': blog.css('span.fl-post-grid-date::text').get(),
                'image': blog.css('img::attr(src)').get(),
            }

        # Pagination (next page). The "next" anchor carries the two classes
        # "next" and "page-numbers"; the previous selector ('a.next-page')
        # looked for a single hyphenated class and is kept only as a fallback.
        next_page = response.css(
            'a.next.page-numbers::attr(href), a.next-page::attr(href)'
        ).get()
        if next_page:
            yield response.follow(next_page, self.parse)

# Helper for running a Scrapy spider from inside the notebook
def run_spider(spider):
    """Run ``spider`` in a new CrawlerProcess built from the project settings.

    Args:
        spider: the spider class to hand to ``CrawlerProcess.crawl``.

    NOTE(review): ``start()`` is expected to block until the crawl finishes,
    and the underlying Twisted reactor can presumably be started only once per
    kernel - confirm whether re-running this cell needs a kernel restart.
    """
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl(spider)
    crawler_process.start()

# Run the spider
run_spider(BlogSpider)

2025-01-02 09:33:43 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: scrapybot)
2025-01-02 09:33:43 [scrapy.utils.log] INFO: Versions: lxml 5.3.0.0, libxml2 2.12.9, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.7.0, Python 3.13.0 (v3.13.0:60403a5409f, Oct  7 2024, 00:37:40) [Clang 15.0.0 (clang-1500.3.9.4)], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.3, Platform macOS-14.2-arm64-arm-64bit-Mach-O
2025-01-02 09:33:43 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2025-01-02 09:33:43 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2025-01-02 09:33:43 [scrapy.extensions.telnet] INFO: Telnet Password: 0f07d86e497135fd
2025-01-02 09:33:43 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.ex

In [1]:
import requests
from bs4 import BeautifulSoup

# Headers sent with every page request made by the helpers below
_REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0'}


def _fetch_soup(url, timeout=10):
    """Download ``url`` and return its body parsed with BeautifulSoup."""
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, headers=_REQUEST_HEADERS, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')


def _text_or_none(tag):
    """Return the stripped text of ``tag``, or None when the tag was not found."""
    return tag.get_text(strip=True) if tag is not None else None


def _attr_or_none(tag, attribute):
    """Return ``tag[attribute]``, or None when the tag or the attribute is missing."""
    return tag.get(attribute) if tag is not None else None


def _extract_posts(soup):
    """Return one dict (title, link, date, image) per post card found in ``soup``."""
    return [
        {
            'title': _text_or_none(card.find('h2', class_='fl-post-grid-title')),
            'link': _attr_or_none(card.find('a', rel='bookmark'), 'href'),
            'date': _text_or_none(card.find('span', class_='fl-post-grid-date')),
            'image': _attr_or_none(card.find('img'), 'src'),
        }
        for card in soup.find_all('div', class_='fl-post-column')
    ]


# Handle a single blog listing page
def scrape_blog_page(url, timeout=10):
    """Scrape the post cards from one blog listing page.

    Args:
        url: address of the listing page.
        timeout: seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys 'title', 'link', 'date' and 'image'.
        A value is None when the matching element is missing from a card
        (previously a missing element raised and aborted the whole crawl).
    """
    return _extract_posts(_fetch_soup(url, timeout=timeout))


# Handle pagination
def scrape_all_blogs(base_url, timeout=10):
    """Scrape every listing page reachable from ``base_url`` via "next" links.

    Each page is downloaded once and used both for its posts and for finding
    the next page.

    Args:
        base_url: address of the first listing page.
        timeout: seconds to wait for each page before giving up.

    Returns:
        The posts of all visited pages, in visiting order.
    """
    from urllib.parse import urljoin

    all_blogs = []
    visited = set()
    next_page = base_url

    # The visited set stops the loop if the pagination ever links back to a
    # page that was already scraped.
    while next_page and next_page not in visited:
        visited.add(next_page)
        print(f"Scraping: {next_page}")
        soup = _fetch_soup(next_page, timeout=timeout)
        all_blogs.extend(_extract_posts(soup))

        # Find the link to the next page (pagination)
        next_href = _attr_or_none(soup.find('a', class_='next page-numbers'), 'href')
        # urljoin leaves absolute links untouched and resolves relative ones.
        next_page = urljoin(next_page, next_href) if next_href else None

    return all_blogs

# Start URL (first page of the blog listing)
start_url = 'https://newbreak.church/blog/?gad_source=1&gclid=Cj0KCQiA7NO7BhDsARIsADg_hIb6THWfJcUdxD9HMXljPV1Vyxk0JRYy_zhQUtnw8Hmr-s6GB19FuZAaAmuIEALw_wcB'
blogs = scrape_all_blogs(start_url)

# Print the results, one post dict per line
for blog in blogs:
    print(blog)

Scraping: https://newbreak.church/blog/?gad_source=1&gclid=Cj0KCQiA7NO7BhDsARIsADg_hIb6THWfJcUdxD9HMXljPV1Vyxk0JRYy_zhQUtnw8Hmr-s6GB19FuZAaAmuIEALw_wcB
Scraping: https://newbreak.church/blog/page/2/?gad_source=1
Scraping: https://newbreak.church/blog/page/3/?gad_source=1
Scraping: https://newbreak.church/blog/page/4/?gad_source=1
Scraping: https://newbreak.church/blog/page/5/?gad_source=1
Scraping: https://newbreak.church/blog/page/6/?gad_source=1
Scraping: https://newbreak.church/blog/page/7/?gad_source=1
Scraping: https://newbreak.church/blog/page/8/?gad_source=1
Scraping: https://newbreak.church/blog/page/9/?gad_source=1
Scraping: https://newbreak.church/blog/page/10/?gad_source=1
Scraping: https://newbreak.church/blog/page/11/?gad_source=1
Scraping: https://newbreak.church/blog/page/12/?gad_source=1
Scraping: https://newbreak.church/blog/page/13/?gad_source=1
Scraping: https://newbreak.church/blog/page/14/?gad_source=1
Scraping: https://newbreak.church/blog/page/15/?gad_source=1
Sc

In [2]:
import requests
from bs4 import BeautifulSoup

# Send the GET request; the timeout keeps the cell from hanging forever on an
# unresponsive server.
response = requests.get('https://www.nitrotechasia.com/', timeout=10)

# Only parse the body when the request succeeded
if response.status_code == 200:
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Print the page title (soup.title is None when the page has no <title>)
    print(soup.title.string if soup.title is not None else None)

    # Or access specific elements: print the href of every anchor
    for link in soup.find_all('a'):
        print(link.get('href'))
else:
    # Report the failure instead of leaving the cell output silently empty
    print(f"Request failed with status code {response.status_code}")

2025-01-02 14:05:18 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): www.nitrotechasia.com:443
2025-01-02 14:05:19 [urllib3.connectionpool] DEBUG: https://www.nitrotechasia.com:443 "GET / HTTP/11" 200 None


CÔNG TY TNHH NITRO TECH ASIA INC
https://www.nitrotechasia.com
#home
#about
#services
#blog
#info
#contact
https://www.nitrotechasia.com
https://www.nitrotechasia.com?lang=jp
#
#
#
#
#
#
https://www.facebook.com/photo.php?fbid=897887908734218&set=a.578872513969094&type=3&ref=embed_post
https://www.facebook.com/photo.php?fbid=897887908734218&set=a.578872513969094&type=3&ref=embed_post
https://www.facebook.com/photo.php?fbid=897887908734218&set=a.578872513969094&type=3&ref=embed_post
https://www.facebook.com/watch/?v=286295107655119
https://www.facebook.com/watch/?v=286295107655119
https://www.facebook.com/watch/?v=286295107655119
https://www.facebook.com/story.php?story_fbid=896686225521053&id=100055390520956&mibextid=I6gGtw
https://www.facebook.com/story.php?story_fbid=896686225521053&id=100055390520956&mibextid=I6gGtw
https://www.facebook.com/story.php?story_fbid=896686225521053&id=100055390520956&mibextid=I6gGtw
https://www.designone.jp
#danang
#hue
#gmail
#messenger
https://www.mess

In [1]:
import requests

# Send the GET request; the timeout keeps the cell from hanging forever on an
# unresponsive server.
response = requests.get('https://www.mnot.net/blog/pinned.json', timeout=10)

# Only decode the body when the request succeeded
if response.status_code == 200:
    # Parse the JSON payload
    data = response.json()

    # Print the decoded JSON (a dictionary)
    print(data)
else:
    # Report the failure instead of leaving the cell output silently empty
    print(f"Request failed with status code {response.status_code}")

{'/blog/2024/04/29/power': {'path': '/blog/2024/04/29/power', 'hits': 401, 'visitors': 287, 'title': 'No One Should Have That Much Power', 'date': '29 April 2024'}, '/blog/2023/12/19/standards-and-centralization': {'path': '/blog/2023/12/19/standards-and-centralization', 'hits': 108, 'visitors': 105, 'title': 'RFC 9518 - What Can Internet Standards Do About Centralisation?', 'date': '19 December 2023'}, '/blog/2023/01/05/law-school': {'path': '/blog/2023/01/05/law-school', 'hits': 28, 'visitors': 27, 'title': 'What I Learned in Law School', 'date': ' 5 January 2023'}, '/blog/2022/06/08/http-extensions': {'path': '/blog/2022/06/08/http-extensions', 'hits': 53, 'visitors': 52, 'title': 'Yet More New HTTP Specs', 'date': ' 8 June 2022'}, '/blog/2022/06/06/http-core': {'path': '/blog/2022/06/06/http-core', 'hits': 90, 'visitors': 89, 'title': 'A New Definition of HTTP', 'date': ' 6 June 2022'}, '/blog/2020/08/28/for_the_users': {'path': '/blog/2020/08/28/for_the_users', 'hits': 98, 'visito

In [1]:
import requests

# Send the GET request that downloads the image; the timeout keeps the cell
# from hanging forever on an unresponsive server.
response = requests.get('https://hoanghamobile.com/tin-tuc/wp-content/uploads/2024/01/anh-nen-cute.jpg', timeout=10)

# Only save the body when the request succeeded
if response.status_code == 200:
    # Save the image to a file
    with open('downloaded_image.jpg', 'wb') as f:
        f.write(response.content)  # write the raw bytes to the file
else:
    # Nothing is written on failure, so say why the file is missing
    print(f"Request failed with status code {response.status_code}")

In [1]:
from scrapy.crawler import CrawlerProcess
from scrapy import Spider
from scrapy.utils.project import get_project_settings

class PanigateSpider(Spider):
    """Crawl a truyenfull.io genre listing, following its pagination links.

    Each yielded item is a dict with the keys ``title``, ``author``,
    ``book_url`` and ``latest_chapter``. A value is ``None`` when the
    matching element is not found in the row.
    """

    name = 'panigate_spider'
    start_urls = ['https://truyenfull.io/the-loai/tien-hiep/']
    # Send a browser-like User-Agent with every request made by this spider.
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
        }
    }

    def parse(self, response):
        """Yield one dict per book row, then a request for the next page.

        Args:
            response: the Scrapy response for one listing page.

        Yields:
            Item dicts for the rows that contain a book title link, followed
            by a follow-up request when a "next" pagination link is present.
        """
        # Iterate through each book row
        for book in response.css('div.row'):
            title = book.css('h3.truyen-title a::text').get()
            book_url = book.css('h3.truyen-title a::attr(href)').get()

            # The selector matches every div.row on the page, so rows without
            # a truyen-title link are skipped instead of being emitted as
            # items whose title and URL are both None.
            if title is None and book_url is None:
                continue

            # NOTE(review): this expects the author name inside a <span> that
            # directly follows the pencil icon - confirm against the live page.
            author = book.css('span.author span.glyphicon-pencil + span::text').get()
            latest_chapter = book.css('div.col-xs-2 a::attr(href)').get()

            yield {
                'title': title,
                'author': author,
                'book_url': book_url,
                'latest_chapter': latest_chapter,
            }

        # Pagination handling
        next_page = response.css('li.next a::attr(href)').get()
        if next_page:
            yield response.follow(next_page, self.parse)

# Helper for running a Scrapy spider from inside the notebook
def run_spider(spider):
    """Crawl with ``spider`` in a fresh CrawlerProcess configured from the project settings.

    Args:
        spider: the spider class passed on to ``CrawlerProcess.crawl``.
    """
    project_settings = get_project_settings()
    runner = CrawlerProcess(project_settings)
    runner.crawl(spider)
    runner.start()

# Run the spider
run_spider(PanigateSpider)


2025-01-02 16:32:44 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: scrapybot)
2025-01-02 16:32:44 [scrapy.utils.log] INFO: Versions: lxml 5.3.0.0, libxml2 2.12.9, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.7.0, Python 3.13.0 (v3.13.0:60403a5409f, Oct  7 2024, 00:37:40) [Clang 15.0.0 (clang-1500.3.9.4)], pyOpenSSL 24.2.1 (OpenSSL 3.3.2 3 Sep 2024), cryptography 43.0.3, Platform macOS-14.2-arm64-arm-64bit-Mach-O
2025-01-02 16:32:44 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2025-01-02 16:32:44 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2025-01-02 16:32:44 [scrapy.extensions.telnet] INFO: Telnet Password: 1de2cc6298b7928c
2025-01-02 16:32:44 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.ex