In [13]:
import pandas as pd
import json
import requests 
import time
import random
import os
from urllib.parse import urlparse

# Crawl Product ID

In [14]:
# HTTP headers sent with every TIKI API request; they imitate a desktop Chrome browser.
# The guest token can be overridden with the TIKI_GUEST_TOKEN environment variable so
# that swapping the token does not require editing the notebook. The original token
# stays as the default, so behavior is unchanged when the variable is not set.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'Accept':'application/json, text/plain, */*',
    'Accept-Language': 'en-US,en;q=0.9,vi;q=0.8,th;q=0.7,zh-CN;q=0.6,zh;q=0.5',
    'X-Guest-Token': os.environ.get('TIKI_GUEST_TOKEN', 'Ae1PtIuUDbozkgpQ6VOW57jKqvchZxF2'),
    'Connection' : 'keep-alive',
    'TE': 'Trailers'
}

In [15]:
# Crawl the product IDs of every sub-category URL listed below (all sub-categories of one
# root category). A sub-category that already has fewer than 50 pages is crawled directly;
# its child categories are not visited.

# Testing with 3 links for this new script
list_url = ['https://tiki.vn/giay-the-thao-nam-co-cao/c49622',
            'https://tiki.vn/giay-luoi-vai-nam/c49624',
            'https://tiki.vn/giay-tay-nam-khong-day/c49632'
            ]
root_category = 'GIAY-NAM'

LISTING_API_URL = 'https://tiki.vn/api/personalish/v1/blocks/listings'
MAX_RETRIES = 3        # extra attempts for a page after its first request fails
REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled connection blocks the crawl forever

list_page_50 = []  # URLs whose listing reports exactly 50 pages (the API cap); handled later with filters
# [category_id, category_name, page] for every page that could not be fetched.
# Created once, outside the loop, so failures of earlier categories are not thrown away.
page_error = []

for url in list_url:
    # The URL path has the form '/<category-name>/c<category-id>'
    path_segments = urlparse(url).path.split('/')
    category_name = path_segments[1]
    category_id = path_segments[2][1:]   # drop the leading 'c'

    product_data = []  # rows of product_id / category_name / max_page for this category

    params = {                     # query parameters; only 'page' changes while looping
        'limit':'40',
        'include': 'advertisement',
        'aggregations': '2',
        'trackity_id': '74634137-8702-8b12-fe42-46a46a2a2573',
        'category': category_id,
        'page': '1',
        'urlKey': category_name
    }

    # Probe request: only used to learn how many pages the category has.
    last_page = None
    try:
        response = requests.get(LISTING_API_URL, headers=headers, params=params, timeout=REQUEST_TIMEOUT)
        if response.status_code == 200:
            # 'paging' may be missing or null; treat that like a failed request
            last_page = (response.json().get('paging') or {}).get('last_page')
    except (requests.RequestException, ValueError) as exc:   # ValueError: body is not valid JSON
        print('Request error for CATEGORY {}: {}'.format(category_name, exc))

    if last_page is None:
        # Record the category (as page 1) instead of crashing the whole crawl
        print('Cannot read total pages of CATEGORY', category_name)
        page_error.append([category_id, category_name, 1])
        continue

    print('CATEGORY ', category_name)
    print('total pages',last_page)

    if last_page == 50:
        # The API shows at most 50 pages, so this category is probably truncated:
        # set it aside and split it with filters in a later cell.
        list_page_50.append(url)
        continue

    for i in range(1, last_page + 1):   # crawl page 1 .. last_page
        params['page'] = i
        # attempt 0 is the normal request, attempts 1..MAX_RETRIES are retries
        for attempt in range(MAX_RETRIES + 1):
            records = None
            try:
                response = requests.get(LISTING_API_URL, headers=headers, params=params, timeout=REQUEST_TIMEOUT)
                if response.status_code == 200:
                    records = response.json().get('data') or []   # tolerate a missing/null 'data'
            except (requests.RequestException, ValueError) as exc:
                print('Request error on page {}: {}'.format(i, exc))

            if records is not None:
                if attempt == 0:
                    print('request done page', i)
                else:
                    print('Request done page {} at {} attempt'.format(i, attempt))
                product_data.extend(
                    {'product_id': record.get('id'), 'category_name': category_name, 'max_page': last_page}
                    for record in records
                )
                break

            time.sleep(random.uniform(1, 2))   # short pause before retrying the same page
        else:
            # every attempt failed: remember the page so it can be crawled again later
            print(f'Reached the maximum number of request attempts for page {i}')
            page_error.append([category_id, category_name, i])

        # randrange(1, 2) always returned 1; uniform() gives a real random delay of 1-2 s
        time.sleep(random.uniform(1, 2))

    # Export: one CSV per sub-category; the files are concatenated in a later step
    product_df = pd.DataFrame(product_data,columns = ['product_id','category_name','max_page'])
    output_file_name = f'{root_category}-{category_name}.csv'
    product_df.to_csv(output_file_name)


CATEGORY  giay-the-thao-nam-co-cao
total pages 22
request done page 1
request done page 2
request done page 3
request done page 4
request done page 5
request done page 6
request done page 7
request done page 8
request done page 9
request done page 10
request done page 11
request done page 12
request done page 13
request done page 14
request done page 15
request done page 16
request done page 17
request done page 18
request done page 19
request done page 20
request done page 21
request done page 22
CATEGORY  giay-luoi-vai-nam
total pages 11
request done page 1
request done page 2
request done page 3
request done page 4
request done page 5
request done page 6
request done page 7
request done page 8
request done page 9
request done page 10
request done page 11
CATEGORY  giay-tay-nam-khong-day
total pages 50


# Crawl IDs for categories that reach the 50-page limit

In [17]:
# Price ranges ('min,max' strings sent as the `price` query parameter) used to split a
# category that reaches the 50-page cap into smaller groups.
# The last range used to be '550000, 100000000': it contained a stray space and
# overlapped the previous range at 550000.
price_range = [
    '500,15000', '15001,30000', '30001,50000', '50001,80000', '80001,150000',
    '150001,250000', '250001,350000', '350001,550000', '550001,100000000'
                ]

LISTING_API_URL = 'https://tiki.vn/api/personalish/v1/blocks/listings'
MAX_RETRIES = 3        # extra attempts for a page after its first request fails
REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled connection blocks the crawl forever

still_50_page = []   # [category_name, price] pairs that still report 50 pages after filtering

# This cell appends to `page_error`; create it only if no earlier cell has done so.
try:
    page_error
except NameError:
    page_error = []

for url in list_page_50:
    # The URL path has the form '/<category-name>/c<category-id>'
    path_segments = urlparse(url).path.split('/')
    category_name = path_segments[1]
    category_id = path_segments[2][1:]   # drop the leading 'c'
    product_data = []

    for price in price_range:
        # One params dict per price range. It is reused for every page AND every retry,
        # so the price filter can never be dropped halfway through a range.
        params = {
            'limit': '40',
            'include': 'advertisement',
            'aggregations': '2',
            'trackity_id': '74634137-8702-8b12-fe42-46a46a2a2573',
            'category': category_id,
            'page': '1',
            'urlKey': category_name,
            'price': price
        }

        # Probe request: learn how many pages this price range has.
        last_page = None
        try:
            response = requests.get(LISTING_API_URL, headers=headers, params=params, timeout=REQUEST_TIMEOUT)
            if response.status_code == 200:
                last_page = (response.json().get('paging') or {}).get('last_page')
        except (requests.RequestException, ValueError) as exc:   # ValueError: body is not valid JSON
            print('Request error for CATEGORY {} with price range {}: {}'.format(category_name, price, exc))

        if last_page is None:
            # Record the range (as page 1) rather than skipping it without a trace
            print('Cannot read total pages of CATEGORY', category_name, 'with price range: ', price)
            page_error.append([category_id, category_name, 1, price])
            continue

        print('CATEGORY', category_name, 'with price range: ', price)
        print('total pages', last_page)

        if last_page == 50:
            # Still at the API cap: this range needs an even finer split
            still_50_page.append([category_name, price])
            continue

        for i in range(1, last_page + 1):
            params['page'] = i
            # attempt 0 is the normal request, attempts 1..MAX_RETRIES are retries
            for attempt in range(MAX_RETRIES + 1):
                records = None
                try:
                    response = requests.get(LISTING_API_URL, headers=headers, params=params, timeout=REQUEST_TIMEOUT)
                    if response.status_code == 200:
                        records = response.json().get('data') or []   # tolerate a missing/null 'data'
                except (requests.RequestException, ValueError) as exc:
                    print('Request error on page {}: {}'.format(i, exc))

                if records is not None:
                    if attempt == 0:
                        print('request done page', i)
                    else:
                        print('Request done page {} at {} attempt'.format(i, attempt))
                    product_data.extend(
                        {'product_id': record.get('id'), 'category_name': category_name, 'max_page': last_page}
                        for record in records
                    )
                    break

                time.sleep(1)   # short pause before retrying the same page
            else:
                # every attempt failed: remember the page (and its price range) for a later re-crawl
                print(f'Reached the maximum number of request attempts for page {i}')
                page_error.append([category_id, category_name, i, price])

            time.sleep(1)   # pause before moving to the next page

    # Export: one CSV per sub-category (all price ranges together); files are concatenated in a later step
    product_df = pd.DataFrame(product_data,columns = ['product_id','category_name','max_page'])
    output_file_name = f'{root_category}-{category_name}.csv'
    product_df.to_csv(output_file_name)

CATEGORY giay-tay-nam-khong-day with price range:  500,15000
total pages 1
request done page 1
CATEGORY giay-tay-nam-khong-day with price range:  15001,30000
total pages 1
request done page 1
CATEGORY giay-tay-nam-khong-day with price range:  30001,50000
total pages 1
request done page 1
CATEGORY giay-tay-nam-khong-day with price range:  50001,80000
total pages 1
request done page 1
CATEGORY giay-tay-nam-khong-day with price range:  80001,150000
total pages 3
request done page 1
request done page 2
request done page 3
CATEGORY giay-tay-nam-khong-day with price range:  150001,250000
total pages 26
request done page 1
request done page 2
request done page 3
request done page 4
request done page 5
request done page 6
request done page 7
request done page 8
request done page 9
request done page 10
request done page 11
request done page 12
request done page 13
request done page 14
request done page 15
request done page 16
request done page 17
request done page 18
request done page 19
reques

# PRODUCT INFO CRAWL

In [18]:
def _nested_get(payload, key, sub_key):
    """Return ``payload[key][sub_key]``, or None when ``key`` is absent or not a dict."""
    inner = payload.get(key)
    return inner.get(sub_key) if isinstance(inner, dict) else None


def product_parser(json):
    """Flatten one product-detail payload into a dict of selected fields.

    Parameters
    ----------
    json : dict
        Decoded JSON body of a product response. The parameter name shadows the
        ``json`` module inside this function only; it is kept unchanged so existing
        callers keep working.

    Returns
    -------
    dict
        One entry per extracted field. A missing field is stored as ``None``, except
        the three fields noted below, which keep their original ``{}`` default.
        ``max_image_key`` is only present when the payload has a non-empty
        ``images`` list.

    Nested fields (brand, categories, ...) are read defensively: a missing key, an
    explicit null or a non-dict value yields ``None`` instead of raising
    ``AttributeError`` and losing the whole product.
    """
    pInfo = {}

    pInfo['id'] = json.get('id')
    pInfo['sku'] = json.get('sku')
    pInfo['name'] = json.get('name')
    pInfo['discription'] = json.get('short_description')   # key spelling kept: it becomes the output column name
    pInfo['original_price'] = json.get('original_price')
    pInfo['list_price'] = json.get('list_price')
    pInfo['price'] = json.get('price')
    pInfo['alltime_quantity_sold'] = json.get('all_time_quantity_sold',{})   # {} default kept from the original
    pInfo['quantity_sold'] = _nested_get(json, 'quantity_sold', 'value')
    pInfo['inventory_status'] = json.get('inventory_status')
    pInfo['fulfillment_type'] = _nested_get(json, 'inventory', 'fulfillment_type')
    pInfo['brand'] = _nested_get(json, 'brand', 'name')
    pInfo['category_id'] = _nested_get(json, 'categories', 'id')
    pInfo['category'] = _nested_get(json, 'categories', 'name')
    pInfo['review_count'] = json.get('review_count')
    pInfo['rating_average'] = json.get('rating_average')
    pInfo['favourite_count'] = json.get('favourite_count')
    pInfo['ASA_cashback'] = _nested_get(json, 'asa_cashback_widget', 'text')
    pInfo['pay_later'] = json.get('installment_info_v2',{})                  # {} default kept from the original
    pInfo['current_seller'] = _nested_get(json, 'current_seller', 'name')
    pInfo['date_created'] = json.get('day_ago_created')
    pInfo['video_url'] = json.get('video_url',{})                            # {} default kept from the original

    # Number of images attached to the product (only recorded when there is at least one)
    images = json.get('images')
    if images:
        pInfo['max_image_key'] = len(images)

    return pInfo

In [40]:
# Folder that holds the per-category ID files.
# NOTE(review): absolute local path - adjust it when running on another machine.
folder = r'D:\TIKI\New_crawling_optimized\GIAY-NAM'
category_list = []

# Merge all CSV files in the root folder into one long table of IDs within the category.
# Only '*.csv' entries are read, so stray files or sub-folders in the directory do not
# break pd.read_csv; sorting makes the merge order reproducible between runs.
for file_name in sorted(os.listdir(folder)):
    if not file_name.lower().endswith('.csv'):
        continue
    file_path = os.path.join(folder,file_name)
    id_list = pd.read_csv(file_path,usecols=['product_id','category_name'])
    category_list.append(id_list)

if not category_list:
    # pd.concat([]) would raise a ValueError anyway; say what is actually wrong
    raise ValueError('No CSV files found in {}'.format(folder))

category_list_df = pd.concat(category_list)
category_list_df = category_list_df.drop_duplicates()   # drop duplicated (product_id, category_name) rows


In [43]:
# Finalize the list of IDs and print how many unique IDs the root category contains.
# The frame is only de-duplicated on (product_id, category_name) pairs, so an ID listed
# under two sub-categories would otherwise be crawled twice; keep each ID once.
category_list_final = category_list_df['product_id'].drop_duplicates().to_list()
print(len(category_list_final))

2994


In [44]:
# Headers and query parameters for the product-detail requests.
# The guest token can be overridden with the TIKI_GUEST_TOKEN environment variable so
# that swapping the token does not require editing the notebook. The original token
# stays as the default, so behavior is unchanged when the variable is not set.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'Accept':'application/json, text/plain, */*',
    'Accept-Language': 'en-US,en;q=0.9,vi;q=0.8,th;q=0.7,zh-CN;q=0.6,zh;q=0.5',
    'X-Guest-Token': os.environ.get('TIKI_GUEST_TOKEN', 'Ae1PtIuUDbozkgpQ6VOW57jKqvchZxF2'),
    'Connection' : 'keep-alive',
    'TE': 'Trailers'
}

# Sent unchanged with every product-detail request
params = {
    'limit':'5',
    'include': 'comments,contribute_info,attribute_vote_summary',
}

- After several runs, I see that the TIKI API blocks requests for about 20-30 seconds after roughly every 120 IDs crawled, so the crawler should pause for that period.
- To handle this, I optimize the crawl by introducing a pause whenever that block occurs (about every 120 IDs).

In the previous run, when I simply crawled from the first ID to the last, crawling 2994 IDs took 31m15s.
Let's see whether this new method reduces the time.

In [69]:
import time

PRODUCT_API_URL = 'https://tiki.vn/api/v2/products/{}'
REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled connection blocks the crawl forever

product_info = []   # parsed product records
error_list = []     # IDs that could not be crawled in the first pass

# First pass: walk through every ID collected before.
# (The loop variable is `product_id`, not `id`, to avoid shadowing the builtin.)
for product_id in category_list_final:
    try:
        response = requests.get(PRODUCT_API_URL.format(product_id), headers=headers, params=params, timeout=REQUEST_TIMEOUT)
        if response.status_code == 200:
            print('crawling product id {}'.format(product_id))
            product_info.append(product_parser(response.json()))
        else:
            # A non-200 answer used to be dropped silently; queue the ID for the
            # re-crawl pass, exactly like the re-crawl loop below treats it.
            print('cannot crawl product id {}!'.format(product_id))
            error_list.append(product_id)

    except Exception as e:
        print('An error occurred while crawling product id {}: {}'.format(product_id, e))
        error_list.append(product_id)            # keep the ID for the re-crawl pass
        time.sleep(random.uniform(10, 15))       # back off for a while before the next request


# Second pass: re-crawl the failed IDs until none are left or a whole round makes no progress.
while error_list:
    error_list_remained = []    # IDs that still fail in this round

    for product_id in error_list:
        try:
            response = requests.get(PRODUCT_API_URL.format(product_id), headers=headers, params=params, timeout=REQUEST_TIMEOUT)
            if response.status_code == 200:
                print('Succeed re-crawling product id {}'.format(product_id))
                product_info.append(product_parser(response.json()))
            else:
                print('cannot crawl product id {}!'.format(product_id))
                error_list_remained.append(product_id)
        except Exception as e:
            print('An error occurred while crawling product id {}: {}'.format(product_id, e))
            error_list_remained.append(product_id)

    if len(error_list_remained) == len(error_list):
        # Nothing succeeded in this round: stop instead of looping forever on IDs
        # that apparently cannot be crawled.
        print('break re-crawling due to failing to crawl these ID', error_list)
        break

    else:
        error_list = error_list_remained    # next round only retries what is still failing
        time.sleep(1)

product_info_df = pd.DataFrame(product_info)


crawling product id 161496880
crawling product id 161490177
crawling product id 101291470
crawling product id 199104164
crawling product id 101292156
crawling product id 208601178
crawling product id 182317830
crawling product id 185146153
crawling product id 225947937
crawling product id 199103927
crawling product id 168953188
crawling product id 168946336
crawling product id 199104161
crawling product id 199104159
crawling product id 4017903
crawling product id 191908085
crawling product id 153259212
crawling product id 190195327
crawling product id 106320873
crawling product id 199104094
crawling product id 199104089
crawling product id 115241354
crawling product id 132372284
crawling product id 191907702
crawling product id 100420464
crawling product id 125779590
crawling product id 176254517
crawling product id 182124443
crawling product id 193736063
crawling product id 208017588
crawling product id 242715109
crawling product id 153257484
crawling product id 152347892
crawling pro

It takes 24m54s. The crawl time drops a little once this pattern is handled: because the crawler pauses when the block occurs, we no longer need to re-crawl a long list of errored IDs afterwards.

In [None]:
# Save the crawled product details to a CSV file in the current working directory.
product_info_df.to_csv(path_or_buf='giay-nam-info-optimized.csv')