In [3]:
import requests
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
from time import time

def get_url(url):
    """Get parsed HTML from url
      Input: url to the webpage
      Output: Parsed HTML text of the webpage
    """
    # Send Get request to server to get data
    r = requests.get(url)

    # Parse HTML text
    soup = BeautifulSoup(r.text, 'html.parser')
    
    # return soup object
    return soup

def scrape_tiki(url='https://tiki.vn/laptop-may-vi-tinh-linh-kien/c1846?_lc=Vk4wMzkwMTIwMDI%3D&src=c.1846.hamburger_menu_fly_out_banner&is_cross_border=1'):
    """Scrape info of products in Laptop-PC"""
    
    # Begin with page 1
    page = 1
    
    # Get parsed HTML by calling get_url function
    soup = get_url(url)
           
    # Find all products on page 1
    products = soup.find_all('div', {'class':'product-item'})
    
    # Store product info in a list
    data = []
    
    # As long as product is not none
    while products != None:
        # Update url to contain '&page=', convert page to string for string concatenation
        url = url + '&page=' + str(page)
        
        # Get parsed HTML of current page
        soup = get_url(url)
        
        # Find all products on current page
        products = soup.find_all('div', {'class':'product-item'})
        
        # Extract info of each product
        for product in products:    
            d = {'Category':'',
             'Name':'',
             'Final_price':'',
             'Regular_price':'',
             'Discount_percent':'',
             'Installment':'',
             'Cross_border':'',
             'Sponsor':'',
             'Reviews':'',
             'Rating':'',
             'Rating_by_stars':'',
             'Url':'',
             'Image_url':''}
            
            # User try block for not terminate whole program due to error 
            try:
                # Extract product's category
                d['Category'] = product['data-category']
                #print(d['Category'])
                
                # Extract product's name
                d['Name'] = product['data-title']
                #print(d['Name'])
                
                # Extract product's url, prefix with 'https://tiki.vn'
                d['Url'] = 'https://tiki.vn' + product.a['href']
                #print(d['Url'])
                
                # Extract product's image url
                d['Image_url'] = product.find('img', {'class':'product-image img-responsive'})['src']
                #print(d['Image'])
                
                # Use try block for catching case No Discount
                try:
                    # Extract all text of <p class='price-sale'>, split it and convert to a list of integers, extract values of final rice, regular price and discount percent respectively
                    d['Final_price'] = int(list(product.find('p', {'class':'price-sale'}).text.split())[0].replace('đ', '').replace('.', '', -1))
                    #print(d['Final_Price'])
                    d['Regular_price'] = int(list(product.find('p', {'class':'price-sale'}).text.split())[2].replace('đ', '').replace('.', '', -1))
                    #print(d['Regular_Price'])
                    d['Discount_percent'] = int(list(product.find('p', {'class':'price-sale'}).text.split())[1].replace('-', '').replace('%',''))
                except:
                    # If there is no discount, set the regular price = final price, set discount percent = 0
                    d['Regular_price'] = d['Final_price']
                    d['Discount_percent'] = 0
                
                # If installment is available, set installment = YES if not set installment = NO
                if product.find('p', {'class':'installment'}):
                    installment = 'YES'
                else:
                    installment = 'NO'
                d['Installment'] = installment
                #print(d['Installment'])
                
                # If category's text include string 'Quốc Tế', set is_cross_border = YES, if not set is_cross_border = NO
                if 'Quốc Tế' in d['Category']:
                    is_cross_border = 'YES'
                else:
                    is_cross_border = 'NO'
                d['Cross_border'] = is_cross_border
                #print(d['Cross_border'])
                
                # If text in <div class='ship-label-wrapper'> include string 'Tài trợ', sponsor = YES, if not set sponsor = NO
                if 'Tài trợ' in product.find('div', {'class':'ship-label-wrapper'}).text:
                    sponsor = 'YES'
                else:
                    sponsor = 'NO'
                #print(sponsor)

                d['Sponsor'] = sponsor
                
                # Use try block to catch case No rating
                try:
                    # Caculate the rating base on the width of star bar
                    # Extract the width, select only the number and convert to integer
                    d['Rating'] = int(product.find('span', {'class':'rating-content'}).span['style'][6:-1])
                    # Calculate number of stars base on rating value
                    d['Rating_by_stars'] = d['Rating'] * 5 / 100
                except:
                    # If there's no rating, set rating = 0
                    d['Rating'] = 0

                # No rating means no review, no rating == no review == no star
                if d['Rating'] == 0:
                    d['Reviews'] = 0
                    d['Rating_by_stars'] = 0
                else:
                    # Get the text in <p class='review'>, format it to get only number then convert to int
                    d['Reviews'] = int(product.find('p', {'class':'review'}).text.replace('(', '').split()[0])
                
                # add product's info in data list
                data.append(d)
            
            # Print some useful information for debugging in case cannot extract data 
            except:
                print('Cannot retrieve information')
                #print(page)
                #print(d['Name'])
                #print(d['Discount_percent'])
        
        #Avoid getting banned by Tiki
        sleep(5)
        
        # Increase page
        page += 1
        print(time())
    
    # Return data list
    return data

# Call scrape_tiki function
data = scrape_tiki()

# Create a data frame by pandas
product_info = pd.DataFrame(data = data, columns = data[0].keys())

product_info.to_pickle("./result.pkl")
product_info.to_csv("./result.csv", index=False)

# print(scrape_tiki())

1596292425.242922
1596292431.102572
1596292437.1091259
1596292443.077297
1596292449.156904
1596292455.332129
1596292461.2880042
1596292467.286527
1596292473.2209332
1596292479.170172
1596292485.25306
1596292491.1105778
1596292496.973241
1596292503.174031
1596292509.1039538
1596292515.23798
1596292521.3329039
1596292527.338634
1596292533.22643
1596292539.231663
1596292545.157984
1596292550.8852859
1596292556.5512059
1596292562.21392
1596292567.86496
1596292573.745601
1596292579.3673558
1596292585.029959
1596292590.7899342
1596292596.590427
1596292602.334404
1596292608.002662
1596292613.727547
1596292619.8203268
1596292625.629963
1596292631.877307
1596292637.5441248
1596292643.1945758
1596292648.8933172
1596292654.665418
1596292660.4503691
1596292666.1483
1596292671.867672
1596292677.4723399
1596292683.153174
1596292688.97926
1596292694.591906
1596292700.228301
1596292705.949426
1596292711.7549708
1596292717.596839
1596292723.427249
1596292729.04355
1596292734.826189
1596292740.518042
15

ChunkedEncodingError: ('Connection broken: IncompleteRead(0 bytes read)', IncompleteRead(0 bytes read))