In [None]:
import json
import os.path as osp
import platform
import time
import re
import csv
import random
import traceback
import logging
import pandas as pd

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.common.action_chains import ActionChains

In [23]:
class Collect_Xamvn_Info:
    def __init__(self, no_gui=False, proxy=None):
        
        self.result = pd.DataFrame({'url': [], 'name': [], 'reply_num': [], 'view_num': []})
        executable = ''
        
        #LƯU Ý
        # tự chỉnh đường dẫn theo máy 
        if platform.system() == 'Windows':
            print('Detected OS : Windows')
            executable = 'chromedriver.exe'
        elif platform.system() == 'Linux':
            print('Detected OS : Linux')
            executable = './chromedriver/chromedriver_linux'
        elif platform.system() == 'Darwin':
            print('Detected OS : Mac')
            executable = './chromedriver/chromedriver_mac'
        else:
            raise OSError('Unknown OS Type')

        if not osp.exists(executable):
            raise FileNotFoundError(
                'Chromedriver file should be placed at {}'.format(executable))

        chrome_options = Options()
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        if no_gui:
            chrome_options.add_argument('--headless')
        if proxy:
            chrome_options.add_argument("--proxy-server={}".format(proxy))
            
        self.browser = webdriver.Chrome(executable_path=executable, chrome_options=chrome_options)    

        #auto download driver nhưng mà web chính thức chưa có chrome driver mới nhất version 116 nên phải làm bằng cơm :D
        
#         self.browser = webdriver.Chrome(
#             ChromeDriverManager().install(), chrome_options=chrome_options)

#         browser_version = 'Failed to detect version'
#         chromedriver_version = 'Failed to detect version'
#         major_version_different = False

#         if 'browserVersion' in self.browser.capabilities:
#             browser_version = str(self.browser.capabilities['browserVersion'])

#         if 'chrome' in self.browser.capabilities:
#             if 'chromedriverVersion' in self.browser.capabilities['chrome']:
#                 chromedriver_version = str(
#                     self.browser.capabilities['chrome']['chromedriverVersion']).split(' ')[0]

#         if browser_version.split('.')[0] != chromedriver_version.split('.')[0]:
#             major_version_different = True

#         print('_________________________________')
#         print('Current web-browser version:\t{}'.format(browser_version))
#         print('Current chrome-driver version:\t{}'.format(chromedriver_version))
#         if major_version_different:
#             print('warning: Version different')
#             print(
#                 'Download correct version at "http://chromedriver.chromium.org/downloads" and place in "./chromedriver"')
#         print('_________________________________')

    def thread_link_crawl(self, url, num_pages):
        """Crawl infos of posts with a specific number of pages in a url thread
        
        {url} (Thread): included: 
        https://xamvn.vc/box/massage-bon-mua.165/
        https://xamvn.vc/box/an-choi-huong-thu.174/
        https://xamvn.vc/box/quan-con-so-say.146/
        https://xamvn.vc/whats-new/posts/13844728/
        
        {num_pages}: The number of pages you want to crawl in each site 
        Due to cloudflare block, it is changed to page number
        """
        
        #for page_num in range(1, num_pages + 1):
        #bị cloudflare rùi nên cứ phải tắt đi bật lại driver liên tục :v
        self.browser.get(url + f"page-{num_pages}")
        #chỉnh lại số trừ theo tốc độ mạng hiện tại lúc chạy
        random_time1 = 9.5 - random.randint(4, 7)
        time.sleep(random_time1)

        #scroll down 
        total_height = int(self.browser.execute_script("return document.body.scrollHeight"))
        for i in range(1, total_height, 600):
            self.browser.execute_script("window.scrollTo(0, {});".format(i))
            time.sleep(0.3)

        full_page_html = self.browser.page_source

        try:
            result = self.extract_info(full_page_html)
            self.result = pd.concat([self.result, result], ignore_index=True)
        except Exception as e:
            print("An exception occurred: check internet if needed") 
            logging.error(traceback.format_exc())

    def extract_info(self, html_doc): 
        """Extract 
        [post_name; 
        url;
        reply_number;
        view_number] of each post in one html page
        
        {html_doc}: the full page source of the targeted site 
        """
        
        # parse html doc
        soup = BeautifulSoup(html_doc, "html.parser")
        
        names = []
        urls = []
        reply_num_list = []
        view_num_list = []
        
        all_posts_name_and_url = soup.find_all('div', class_ = 'structItem-cell structItem-cell--main')
        for post in all_posts_name_and_url:
            a_href = post.find_all('a', href=True)

            for i in a_href:
                if re.search("data-preview-url", str(i)):
                    url_and_name = i

            url = url_and_name["href"]
            url = "xamvn.vc" + url
            pattern = r'<a[^>]*>(.*?)</a>'
            match = re.search(pattern, str(url_and_name))
            name = match.group(1)[:-1]
            names.append(name)
            urls.append(url)

        all_posts_reply_num = soup.find_all("dl", class_ = "pairs pairs--justified")
        for post in all_posts_reply_num:
            reply_num = re.findall("\d+", str(post))[0]
            reply_num_list.append(int(reply_num))

        all_posts_view_num = soup.find_all("dl", class_ = "pairs pairs--justified structItem-minor")
        for post in all_posts_view_num:
            view_num = re.findall("\d+\w*", str(post))[0]
            view_num = view_num.replace("K", "000")
            view_num_list.append(int(view_num))
        
        result = pd.DataFrame({'url': urls, 'name': names, 'reply_num': reply_num_list, 'view_num': view_num_list})
        
        return result

#  Bị cloudflare chặn nếu get url liên tục nên phải tắt đi bật lại driver liên tục :v

In [25]:
df = pd.DataFrame({'url': [], 'name': [], 'reply_num': [], 'view_num': []})
no_gui = False

for page_number in range(1, 6):
    collect = Collect_Xamvn_Info(no_gui=no_gui)
    collect.thread_link_crawl("https://xamvn.vc/box/massage-bon-mua.165/", page_number)
    temp_df = collect.result
    collect.browser.quit()
    df = pd.concat([df, temp_df], ignore_index=True)

Detected OS : Windows


  self.browser = webdriver.Chrome(executable_path=executable, chrome_options=chrome_options)
  self.browser = webdriver.Chrome(executable_path=executable, chrome_options=chrome_options)


Detected OS : Windows
Detected OS : Windows
Detected OS : Windows
Detected OS : Windows


In [26]:
for page_number in range(1, 6):
    collect = Collect_Xamvn_Info(no_gui=no_gui)
    collect.thread_link_crawl("https://xamvn.vc/box/an-choi-huong-thu.174/", page_number)
    temp_df = collect.result
    collect.browser.quit()
    df = pd.concat([df, temp_df], ignore_index=True)

Detected OS : Windows


  self.browser = webdriver.Chrome(executable_path=executable, chrome_options=chrome_options)
  self.browser = webdriver.Chrome(executable_path=executable, chrome_options=chrome_options)


Detected OS : Windows
Detected OS : Windows
Detected OS : Windows
Detected OS : Windows


In [27]:
for page_number in range(1, 6):
    collect = Collect_Xamvn_Info(no_gui=no_gui)
    collect.thread_link_crawl("https://xamvn.vc/box/quan-con-so-say.146/", page_number)
    temp_df = collect.result
    collect.browser.quit()
    df = pd.concat([df, temp_df], ignore_index=True)

Detected OS : Windows


  self.browser = webdriver.Chrome(executable_path=executable, chrome_options=chrome_options)
  self.browser = webdriver.Chrome(executable_path=executable, chrome_options=chrome_options)


Detected OS : Windows
Detected OS : Windows
Detected OS : Windows
Detected OS : Windows


In [28]:
df2 = pd.DataFrame({'url': [], 'name': [], 'reply_num': [], 'view_num': []})
no_gui = False

for page_number in range(1, 6):
    collect = Collect_Xamvn_Info(no_gui=no_gui)
    collect.thread_link_crawl("https://xamvn.vc/whats-new/posts/", page_number)
    temp_df = collect.result
    collect.browser.quit()
    df2 = pd.concat([df2, temp_df], ignore_index=True)

Detected OS : Windows


  self.browser = webdriver.Chrome(executable_path=executable, chrome_options=chrome_options)
  self.browser = webdriver.Chrome(executable_path=executable, chrome_options=chrome_options)


Detected OS : Windows
Detected OS : Windows
Detected OS : Windows
Detected OS : Windows


# Xuất file csv

In [31]:
df.to_csv("mại dâm.csv")

In [32]:
df2.to_csv("tổng hợp.csv")