In [None]:
from bs4 import BeautifulSoup
import time
import pandas as pd
import requests
import sqlite3

In [None]:
#Function to get main url Tiki
def get_url(url):
    request = requests.get(url)
    soup = BeautifulSoup(request.text, 'html.parser')
    return soup

In [None]:
#Finding tag for categories 
link = get_url('https://tiki.vn/')
link.find_all('a', {'class' : 'MenuItem__MenuLink-sc-181aa19-1 fKvTQu'})

In [None]:
conn = sqlite3.connect('tiki.db')
cur = conn.cursor()

In [None]:
#Creating a table named Categories
def create_table():
    query = '''
    CREATE TABLE IF NOT EXISTS categories 
    (id INTEGER PRIMARY KEY AUTOINCREMENT,
    name VARCHAR(256),
    url TEXT,
    parent_id INTEGER
    )'''
    try:
        conn.execute(query)
        conn.commit()
    except Exception as err:
        print('Error', err)

create_table()

In [None]:
class Category():
    def __init__(self, name, url, parent_id = None, cat_id = None):
        self.cat_id = cat_id
        self.name = name
        self.url = url
        self.parent_id = parent_id
        
    
    def __repr__(self):
        return f"ID: {self.cat_id}, Name: {self.name}, URL: {self.url}, Parent: {self.parent_id}"
    
    def save_into_db(self):
        query = '''
        INSERT INTO categories
        (name, url, parent_id)
        VALUES (?,?,?)'''
        
        value = (self.name, self.url, self.parent_id)
        
        try:
            cur.execute(query, value)
            self.cat_id = cur.lastrowid
            conn.commit()
        except Exception as err:
            print("Error")

In [None]:
#Function to get main categories
def main(save_db = False):
    soup = get_url('https://tiki.vn/')
    
    result = []
    
    for a in soup.find_all('a', {'class': 'MenuItem__MenuLink-sc-181aa19-1 fKvTQu'}):
        name = a.find('span', {'class': 'text'}).text
        url = a['href']
        main_cat = Category(name, url)
        
        if save_db:
            main_cat.save_into_db()
        result.append(main_cat)
        
    return result

In [None]:
main_categories = main(save_db = True)

In [None]:
print(len(main_categories))
#16 main categories

In [None]:
import re

def sub_categories(parent_category, save_db = False):
    parent_url = parent_category.url
    result = []
    
    try:
        soup = get_url(parent_url)
        div_children = soup.find_all('div', {'class': 'list-group-item is-child'})
        
        for div in div_children:
            name = div.a.text.strip()
            
            name = re.sub('\s{2,}','',name)
            
            sub_url = 'https://tiki.vn/' + div.a['href']
            cat = Category(name, sub_url, parent_category.cat_id)
            
            if save_db:
                cat.save_into_db()
                
            result.append(cat)
                
    except Exception as err:
        print("Error with children categories", err)
    
    return result

In [None]:
def get_all(categories):
    if len(categories) == 0:
        return
    
    for item in categories:
        children_categories = sub_categories(item, save_db = True)
        print(children_categories)
        get_all(children_categories)

In [None]:
get_all(main_categories)

In [634]:
def product_table():
    query = '''
    CREATE TABLE IF NOT EXISTS product
    (Product_ID INTEGER PRIMARY KEY AUTOINCREMENT, 
    Cat_id INT, 
    Title VARCHAR(256),
    Brand VARCHAR(30),
    Original INTEGER,
    Discounted SMALLINT,
    Final INTEGER,
    Link TEXT)''' # FOREIGN KEY (Cat_id) REFRENCES categories (ID) -> see below for explanation 
    #Tom suggested I search up how to do one to many relationships in sqlite python...
    
    try:
        conn.execute(query)
        conn.commit()
    except Exception as err:
        print('Error', err)

In [635]:
product_table()

In [None]:
cur.execute('DROP table product')

In [None]:
class Product():
    def __init__(self, title, brand, original, discounted, final, link, product_id = None, cat_id = None): 
        self.title = title
        self.brand = brand
        self.original = original
        self.discounted = discounted
        self.final = final
        self.link = link 
        self.product_id = product_id 
        self.cat_id = cat_id
    
    def __repr__(self):
        return f'Title: {self.title}, Brand: {self.brand}, Original: {self.original}, Discounted: {self.discounted}, Final: {self.final}, Link: {self.link}'
    
    #Cat_id: {self.cat_id}, 
    
    def save_into_db(self):
        query = '''
        INSERT INTO product
        (Title, Brand, Original, Discounted, Final, Link)
        VALUES (?,?,?,?,?,?);'''
        
        value = (self.cat_id, self.title,self.brand, self.original, self.discounted, self.final, self.link)
        
        #cur.lastrowid autoincrement for Primary key 
        try:
            cur.execute(query,value)
            self.product_id = cur.lastrowid
            conn.commit()
            
        except Exception as err:
            print("Error", err)

In [None]:
def get_all_url_id():
    sub_catagories = pd.read_sql('''
    SELECT DISTINCT a.url, a.id
    FROM categories AS a
    LEFT JOIN categories as b ON a.id = b.parent_id
    WHERE b.id IS NULL
    ORDER BY a.name ASC''', conn)
    
    return sub_catagories

In [None]:
sub_catagories = get_all_url_id()

In [637]:
sub_catagories.url

0       https://tiki.vn//3d/c23742?src=c.1883.hamburge...
1       https://tiki.vn//access-point-diem-truy-cap/c4...
2       https://tiki.vn//access/c14884?src=c.8322.hamb...
3       https://tiki.vn//action-adventure/c142?src=c.8...
4       https://tiki.vn//action-adventure/c11017?src=c...
                              ...                        
2675    https://tiki.vn//o-cam-dien/c2021?src=c.1883.h...
2676    https://tiki.vn//o-khoa-vali/c8389?src=c.914.h...
2677    https://tiki.vn//o-khoa/c2154?src=c.1883.hambu...
2678    https://tiki.vn//o-dia-quang/c28908?src=c.1846...
2679    https://tiki.vn//on-ap-bien-ap/c11854?src=c.18...
Name: url, Length: 2680, dtype: object

In [624]:
sub_categories

Unnamed: 0,url
0,https://tiki.vn//3d/c23742?src=c.1883.hamburge...
1,https://tiki.vn//access-point-diem-truy-cap/c4...
2,https://tiki.vn//access/c14884?src=c.8322.hamb...
3,https://tiki.vn//action-adventure/c142?src=c.8...
4,https://tiki.vn//action-adventure/c11017?src=c...
...,...
2675,https://tiki.vn//o-cam-dien/c2021?src=c.1883.h...
2676,https://tiki.vn//o-khoa-vali/c8389?src=c.914.h...
2677,https://tiki.vn//o-khoa/c2154?src=c.1883.hambu...
2678,https://tiki.vn//o-dia-quang/c28908?src=c.1846...


In [None]:
links = []
#Getting all the links, I don't think this is the right way... 
for url in sub_categories['url']:
    links.append(url)

In [None]:
print(links)

In [638]:
def get_product(save_db = False):
    result = []
    try:
        
        for product in products:
            title = product['data-title']
            brand = product['data-brand']
            original = product.find('span', {'class': 'sale-tag sale-tag-square'}).text.replace('đ','')
            discounted = product.find('span', {'class': 'sale-tag sale-tag-square'}).text.replace('%','')
            final = product['data-price']
            link = 'tiki.vn' + product.a['href']
            
            #sub_catagories.id, somehow, should link to the category table... how? I don't know
            item = Product(sub_catagories.id, title, brand, original, discounted, final, link)
            if save_db:
                item.save_into_db()
            result.append(item)
    except Exception as err:
        print('F', err)

In [None]:
# data = []
# for url in links:
#     try:
#         r = requests.get(url)
#         soup = BeautifulSoup(r, 'html.parser')
#         products = soup.find_all('div', {'class': 'product-item'})
#     except Exception as err:
#         print('Error', err)
#         try:
#             for product in products:
#                 title = product['data-title']
#                 brand = product['data-brand']
#                 original = product.find('span', {'class': 'sale-tag sale-tag-square'}).text.replace('đ','')
#                 discounted = product.find('span', {'class': 'sale-tag sale-tag-square'}).text.replace('%','')
#                 final = product['data-price']
#                 link = 'tiki.vn' + product.a['href']
        
#                 items = Product(title, brand, original, discounted, final, link)
#                 items.save_into_db()
#             data.append(items)
#         except Exception as err:
#             print('F',err)
            