In [1]:
import re
import sqlite3
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Fetch a page and parse it into a BeautifulSoup tree
def get_url(url, timeout = 30):
    """Download ``url`` and return its HTML parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``'html.parser'`` parser.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx HTTP status (so an error page is never parsed as if it
            were the requested content).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

In [48]:
# Exploratory check: fetch the Tiki home page and list every <a> element that
# carries the main-menu CSS class, to confirm which tag holds the top-level
# categories (the cell output below shows the matching anchors).
link = get_url('https://tiki.vn/')
link.find_all('a', {'class' : 'MenuItem__MenuLink-sc-181aa19-1 fKvTQu'})

[<a class="MenuItem__MenuLink-sc-181aa19-1 fKvTQu" href="https://tiki.vn/dien-thoai-may-tinh-bang/c1789?src=c.1789.hamburger_menu_fly_out_banner"><span class="icon-wrap"><i class="icon-main-menu-cellphone lv1-icon tikicon"></i></span><span class="text">Điện Thoại - Máy Tính Bảng</span></a>,
 <a class="MenuItem__MenuLink-sc-181aa19-1 fKvTQu" href="https://tiki.vn/tivi-thiet-bi-nghe-nhin/c4221?src=c.4221.hamburger_menu_fly_out_banner"><span class="icon-wrap"><i class="icon-main-menu-tv lv1-icon tikicon"></i></span><span class="text">Điện Tử - Điện Lạnh</span></a>,
 <a class="MenuItem__MenuLink-sc-181aa19-1 fKvTQu" href="https://tiki.vn/thiet-bi-kts-phu-kien-so/c1815?src=c.1815.hamburger_menu_fly_out_banner"><span class="icon-wrap"><i class="icon-main-menu-headphone lv1-icon tikicon"></i></span><span class="text">Phụ Kiện - Thiết Bị Số</span></a>,
 <a class="MenuItem__MenuLink-sc-181aa19-1 fKvTQu" href="https://tiki.vn/laptop-may-vi-tinh/c1846?src=c.1846.hamburger_menu_fly_out_banner"><sp

In [12]:
# Open the SQLite database file 'tiki.db' and create a cursor on it.
# `conn` and `cur` are module-level globals: create_table() uses `conn`,
# and Category.save_into_db() uses both.
conn = sqlite3.connect('tiki.db')
cur = conn.cursor()

In [90]:
# Create the `categories` table (does nothing when it already exists)
def create_table():
    """Create the ``categories`` table on the module-level ``conn``.

    Columns: ``id`` (auto-incrementing primary key), ``name``, ``url`` and
    ``parent_id``. The statement uses ``IF NOT EXISTS``, so calling this
    function repeatedly is safe.

    SQLite errors are reported on stdout instead of being raised; any other
    exception (for example ``conn`` not being defined yet) propagates so the
    real cause is visible.
    """
    query = '''
    CREATE TABLE IF NOT EXISTS categories 
    (id INTEGER PRIMARY KEY AUTOINCREMENT,
    name VARCHAR(256),
    url TEXT,
    parent_id INTEGER
    )'''
    try:
        # The connection as a context manager commits on success and rolls
        # back if the statement fails.
        with conn:
            conn.execute(query)
    except sqlite3.Error as err:
        print('Error', err)

# Make sure the categories table exists before any Category is saved.
create_table()

In [91]:
class Category():
    """A single Tiki category that can be stored in the ``categories`` table.

    Attributes are set directly from the constructor arguments:
    ``name`` (display name), ``url`` (link to the category page),
    ``parent_id`` (id of the parent category, ``None`` when not given) and
    ``cat_id`` (database id, ``None`` until the row has been saved).
    """

    def __init__(self, name, url, parent_id = None, cat_id = None):
        self.cat_id = cat_id
        self.name = name
        self.url = url
        self.parent_id = parent_id

    def __repr__(self):
        return f"ID: {self.cat_id}, Name: {self.name}, URL: {self.url}, Parent: {self.parent_id}"

    def save_into_db(self):
        """Insert this category as a new row and record its id in ``cat_id``.

        Uses the module-level ``cur`` and ``conn``. Every call performs an
        INSERT, so saving the same category twice creates two rows.

        SQLite errors are rolled back and reported on stdout (with the error
        detail); in that case ``cat_id`` is left unchanged. Other exceptions
        propagate.
        """
        query = '''
        INSERT INTO categories
        (name, url, parent_id)
        VALUES (?,?,?)'''

        value = (self.name, self.url, self.parent_id)

        try:
            cur.execute(query, value)
            conn.commit()
        except sqlite3.Error as err:
            # Undo the partial transaction so later inserts start clean.
            conn.rollback()
            print('Error', err)
        else:
            # Only expose the id once the row is actually committed.
            self.cat_id = cur.lastrowid

In [80]:
# Scrape the top-level categories from the Tiki home page
def main(save_db = False):
    """Return the main categories found in the Tiki home-page menu.

    Fetches ``https://tiki.vn/`` and builds one ``Category`` per menu anchor,
    using the text of its ``<span class="text">`` as the name and its ``href``
    as the URL.

    Args:
        save_db: When true, each category is also written to the database via
            ``Category.save_into_db`` before being added to the result.

    Returns:
        A list of ``Category`` objects, in page order. Anchors that lack the
        text span or an ``href`` are skipped instead of aborting the scrape.
    """
    soup = get_url('https://tiki.vn/')

    result = []

    for a in soup.find_all('a', {'class': 'MenuItem__MenuLink-sc-181aa19-1 fKvTQu'}):
        label = a.find('span', {'class': 'text'})
        url = a.get('href')
        # A menu entry without a label or a link cannot be stored or crawled.
        if label is None or not url:
            continue

        main_cat = Category(label.text, url)

        if save_db:
            main_cat.save_into_db()
        result.append(main_cat)

    return result

In [97]:
# Scrape the top-level categories and persist each one to the database.
# NOTE(review): save_into_db always INSERTs, so re-running this cell adds the
# same categories again under new ids.
main_categories = main(save_db = True)

In [75]:
# Sanity check: how many top-level categories were scraped.
print(len(main_categories))
# The recorded run found 16 main categories.

16


In [157]:
import re

def sub_categories(parent_category, save_db = False):
    """Scrape the direct child categories listed on a category page.

    Fetches ``parent_category.url`` and creates one ``Category`` for every
    ``<div class="list-group-item is-child">`` element, taking the name and
    link from the div's first ``<a>``. Each child gets
    ``parent_category.cat_id`` as its ``parent_id``.

    Args:
        parent_category: Object exposing ``url`` and ``cat_id``.
        save_db: When true, each child is written to the database via
            ``Category.save_into_db`` before being added to the result.

    Returns:
        A list of ``Category`` objects. Any exception while fetching or
        parsing is printed and whatever was collected so far is returned
        (best-effort, so one failing page does not stop a crawl).
    """
    parent_url = parent_category.url
    result = []

    try:
        soup = get_url(parent_url)
        div_children = soup.find_all('div', {'class': 'list-group-item is-child'})

        for div in div_children:
            anchor = div.a
            href = anchor.get('href') if anchor is not None else None
            # Skip a malformed entry instead of aborting the remaining ones.
            if not href:
                continue

            name = anchor.text.strip()

            # Drop the "(<count>)" suffix together with the whitespace before it.
            # NOTE(review): the pattern is not anchored, so a digit run
            # surrounded by non-word characters elsewhere in the name is
            # removed too - confirm against real category names.
            name = re.sub(r'\W+\d+\W','', name) #remove digit + '()'
            name = re.sub(r'\s{2,}','', name) #remove empty space 

            # urljoin copes with root-relative ("/x"), relative ("x") and
            # absolute hrefs without producing "https://tiki.vn//x".
            sub_url = urljoin('https://tiki.vn/', href)
            cat = Category(name, sub_url, parent_category.cat_id)

            if save_db:
                cat.save_into_db()

            result.append(cat)

    except Exception as err:
        print("Error with children categories", err)

    return result

In [163]:
def get_all(categories, save_db = True, delay = 2):
    """Recursively crawl the sub-categories of every category given.

    For each item, ``sub_categories`` is called, the children are printed,
    and the function recurses into them (depth-first). Recursion ends when a
    category has no children.

    Args:
        categories: Iterable of category objects accepted by
            ``sub_categories``. An empty iterable is a no-op.
        save_db: Forwarded to ``sub_categories``; defaults to ``True`` so
            crawled children are stored.
        delay: Seconds to pause after each page fetch, to avoid hammering the
            site.

    Returns:
        None.

    NOTE(review): there is no visited-set; if a page ever listed one of its
    ancestors as a child, the recursion would not terminate.
    """
    for item in categories:
        children_categories = sub_categories(item, save_db = save_db)
        time.sleep(delay)
        print(children_categories)
        get_all(children_categories, save_db = save_db, delay = delay)

In [None]:
# Crawl every sub-category level below the main categories; each level's
# children are printed as they are found (see the output below).
get_all(main_categories)

[ID: 17, Name: Điện Thoại - Máy Tính Bảng, URL: https://tiki.vn/dien-thoai-may-tinh-bang/c1789?src=c.1789.hamburger_menu_fly_out_banner, Parent: None, ID: 18, Name: Điện Tử - Điện Lạnh, URL: https://tiki.vn/tivi-thiet-bi-nghe-nhin/c4221?src=c.4221.hamburger_menu_fly_out_banner, Parent: None, ID: 19, Name: Phụ Kiện - Thiết Bị Số, URL: https://tiki.vn/thiet-bi-kts-phu-kien-so/c1815?src=c.1815.hamburger_menu_fly_out_banner, Parent: None, ID: 20, Name: Laptop - Thiết bị IT, URL: https://tiki.vn/laptop-may-vi-tinh/c1846?src=c.1846.hamburger_menu_fly_out_banner, Parent: None, ID: 21, Name: Máy Ảnh - Quay Phim, URL: https://tiki.vn/may-anh/c1801?src=c.1801.hamburger_menu_fly_out_banner, Parent: None, ID: 22, Name: Điện Gia Dụng, URL: https://tiki.vn/dien-gia-dung/c1882?src=c.1882.hamburger_menu_fly_out_banner, Parent: None, ID: 23, Name: Nhà Cửa Đời Sống, URL: https://tiki.vn/nha-cua-doi-song/c1883?src=c.1883.hamburger_menu_fly_out_banner, Parent: None, ID: 24, Name: Hàng Tiêu Dùng - Thực Phẩ