In [None]:
import requests
from bs4 import BeautifulSoup
import psycopg2
from collections import deque
import re

In [None]:
# Open the PostgreSQL connection used by the rest of the notebook.
# NOTE(review): the "?" values look like redacted placeholders; supply real settings
# before running, preferably from environment variables rather than literals.
conn = psycopg2.connect(database="?", user="?", password="?", host="?", port="?")
# Autocommit mode: psycopg2 sends each statement outside an explicit transaction,
# so every execute() takes effect immediately.
conn.autocommit = True
# Module-level cursor; Category.crawl_insert and the DDL cell execute through it.
cur = conn.cursor()

In [None]:
# Fully qualified name of the destination table. Category.crawl_insert interpolates
# this same global into its INSERT statement.
tablename = 'public.tiki_category'
# DDL for the destination table. "if not exists" makes re-running this cell harmless.
# Columns: id is a surrogate serial key, category_id/parent_id hold the ids parsed
# from category URLs, and batch records the insertion timestamp by default.
query = f'''
    create table if not exists {tablename} (
        id serial primary key,
        category_id integer,
        category_name varchar,
        url varchar,
        parent_id integer,
        batch timestamp default current_timestamp
    );
'''
cur.execute(query)
# With autocommit enabled on conn there should be nothing pending here; the call is
# kept so the cell also works if autocommit is switched off.
conn.commit()

In [None]:
class Category:
    """A single crawled category and the logic to store it in the database.

    Attributes:
        id: Value supplied by the caller; nothing in this class updates it.
        cate_id: Category identifier; overwritten with the value returned by
            the database after a successful insert.
        name: Display name of the category.
        url: Page URL of the category.
        parent_id: Identifier of the parent category.
    """

    def __init__(self, id, cate_id, name, url, parent_id):
        """Store the given field values on the instance unchanged."""
        self.id, self.cate_id = id, cate_id
        self.name, self.url = name, url
        self.parent_id = parent_id

    def crawl_insert(self):
        """Insert this category into the table named by the global ``tablename``.

        Uses the module-level cursor ``cur`` with bound parameters. On success
        ``cate_id`` is replaced by the ``category_id`` the database returns.
        Any exception is printed and swallowed, leaving the instance untouched.
        """
        try:
            # Built inside the try so a failure while formatting is reported the
            # same way as a database error.
            query = f'''
                insert into {tablename} (category_id, category_name, url, parent_id) 
                values(%s, %s, %s, %s)
                returning category_id;
            '''
            cur.execute(query, (self.cate_id, self.name, self.url, self.parent_id))
            returned_row = cur.fetchone()
            self.cate_id = returned_row[0]
        except Exception as exc:
            print(exc)

In [None]:
def get_web(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds handed to ``requests.get`` as its timeout. Without
            one, requests waits indefinitely and a single stalled connection
            would freeze the whole crawl. Defaults to 30.

    Returns:
        A ``BeautifulSoup`` tree built from the raw response bytes with the
        built-in ``'html.parser'``.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts); these
        are not caught here.
    """
    # Browser-like User-Agent sent with every request.
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    response = requests.get(url, headers=headers, timeout=timeout)
    # The HTTP status is not inspected: error pages are parsed like any other page.
    return BeautifulSoup(response.content, 'html.parser')

In [None]:
def get_main():
    """Collect and store the top-level categories linked from the Tiki home page.

    Every matching menu link becomes a ``Category`` that is inserted via
    ``crawl_insert`` and appended to the result. A link that cannot be parsed
    (missing ``href``, no numeric id in the URL) is reported and skipped
    without affecting the links after it.

    Returns:
        list of ``Category`` objects, in page order.
    """
    soup = get_web('https://tiki.vn')
    main_category_list = []

    try:
        # attrs must be a dict. A set literal ({'class', ...}) is interpreted by
        # BeautifulSoup as a list of alternative class values and would also
        # match class="class".
        menu_links = soup.find_all('a', {'class': "MenuItem__MenuLink-sc-181aa19-1 fKvTQu"})
    except Exception as err:
        print(err)
        return main_category_list

    for link in menu_links:
        # Handle errors per link so one malformed entry does not drop the rest.
        try:
            main_name = link.text
            main_url = link['href']
            # First run of digits (not starting with 0) found in the URL.
            # NOTE(review): digits in the slug ahead of the id would win; confirm
            # against real category URLs.
            main_id = re.findall('([1-9][0-9]*)', main_url)[0]
            # Top-level categories are stored with themselves as parent.
            main_parent_id = main_id

            main_cate = Category(None, main_id, main_name, main_url, main_parent_id)
            main_cate.crawl_insert()
            main_category_list.append(main_cate)
        except Exception as err:
            print(err)

    return main_category_list

In [None]:
def get_sub(parent_cate):
    """Collect and store the child categories listed on ``parent_cate``'s page.

    Args:
        parent_cate: Object exposing ``url`` (page to fetch) and ``cate_id``
            (stored as ``parent_id`` on each child).

    Returns:
        list of ``Category`` objects for the children whose ``cate_id`` is not
        ``None`` after ``crawl_insert``. A failed page fetch yields an empty
        list; a malformed list item is reported and skipped.
    """
    sub_category_list = []

    try:
        # Fetch inside the try: one unreachable page must not abort the whole crawl.
        soup = get_web(parent_cate.url)
        child_nodes = soup.find_all('div', {'class': "list-group-item is-child"})
    except Exception as err:
        print(err)
        return sub_category_list

    for node in child_nodes:
        # Handle errors per item so one malformed entry does not drop its siblings.
        try:
            # Keep the text before the long run of spaces that follows the name.
            # No manual quote doubling: Category.crawl_insert binds the name as a
            # query parameter, so escaping here would store "''" literally.
            sub_name = node.a.text.strip().split('                                ')[0]
            sub_url = 'https://tiki.vn' + node.a.get('href')
            # First run of digits (not starting with 0) found in the URL.
            sub_id = re.findall('([1-9][0-9]*)', sub_url)[0]
            sub_parent_id = parent_cate.cate_id

            sub_cate = Category(None, sub_id, sub_name, sub_url, sub_parent_id)
            sub_cate.crawl_insert()
            if sub_cate.cate_id is not None:
                sub_category_list.append(sub_cate)
        except Exception as err:
            print(err)

    return sub_category_list

In [None]:
def get_all(main_cate):
    """Walk the category tree breadth-first, starting from ``main_cate``.

    Each category popped from the queue is expanded with ``get_sub`` and its
    children are appended to the queue. A page URL is expanded at most once,
    so a category that is reachable again (for example through a page that
    links back to an ancestor) cannot make the crawl loop forever.

    Args:
        main_cate: Iterable of category objects exposing ``url``.

    Returns:
        None. The work happens through the side effects of ``get_sub``.
    """
    queue = deque(main_cate)
    # Keyed by URL because the URL is what actually gets fetched.
    expanded_urls = set()
    while queue:
        parent_cate = queue.popleft()
        if parent_cate.url in expanded_urls:
            continue
        expanded_urls.add(parent_cate.url)
        queue.extend(get_sub(parent_cate))

In [None]:
%%time
# Entry point of the crawl (timed by the %%time cell magic above).
# Step 1: fetch the home page and store the top-level categories.
main_cate = get_main()
# Step 2: expand every category through a FIFO queue; get_all returns nothing,
# the results are the rows written by Category.crawl_insert.
get_all(main_cate)
print('Crawl done!')