# Tiki Product & Price Crawling

In [1]:
# Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datalab.context import Context
import datalab.storage as storage
import google.datalab.bigquery as bq
import time

In [2]:
# Accumulator lists for the scraped product titles and prices
title, price = [], []

In [3]:
# Web scraping function
def scrap(WebUrl, timeout=30):
    """Fetch a page and collect the product titles and prices found in it.

    Every ``<p class="title">`` element contributes its stripped text to the
    module-level ``title`` list, and every ``<span class="final-price">``
    element contributes the first whitespace-separated token of its text to
    the module-level ``price`` list. Both lists are appended to, so repeated
    calls accumulate results.

    Parameters
    ----------
    WebUrl : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` may block indefinitely on an unresponsive host.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with an error status (4xx/5xx).
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    IndexError
        If a price element contains no text.
    """
    response = requests.get(WebUrl, timeout=timeout)
    # Fail loudly instead of silently parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Product titles
    for tag in soup.find_all('p', {'class': 'title'}):
        title.append(tag.text.strip())

    # Product prices: keep only the first token of the element's text
    for tag in soup.find_all('span', {'class': 'final-price'}):
        price.append(tag.text.split()[0])

In [4]:
# Run the scraper on the Tiki category page.
# The accumulator lists are emptied first so that re-running this cell
# does not append the same products a second time.
title.clear()
price.clear()
scrap('https://tiki.vn/dien-thoai-may-tinh-bang/c1789?src=mega-menu')

In [5]:
# Assemble the scraped lists into a DataFrame and display it
data = dict(product=title, price=price)
df = pd.DataFrame(data)
df

Unnamed: 0,price,product
0,20.690.000,Điện Thoại iPhone X 64GB VN/A - Hàng Chính...
1,7.490.000,Điện Thoại iPhone 6s 32GB - Hàng Chính Hãng...
2,17.810.000,Điện Thoại Samsung Galaxy S10 Plus...
3,2.290.000,Điện Thoại Xiaomi Redmi 6 (3/32GB) - Hàng...
4,2.750.000,Điện Thoại OPPO A3s (16GB/2GB) - Hàng Chính...
5,327.000,Điện Thoại Nokia 105 Single Sim (2017) -...
6,7.150.000,iPad WiFi 32GB New 2018 - Hàng Chính Hãng
7,2.490.000,Máy Đọc Sách Kindle Paperwhite 2018 (7th) -...
8,1.890.000,Điện Thoại Xiaomi Redmi 6A - Hàng Chính Hãng
9,270.000,Điện Thoại Trẻ Em MKIDS - Hàng Chính Hãng...


In [6]:
# Add a column holding each product name as UTF-8 encoded bytes
df['product_encoded'] = df['product'].map(lambda name: name.encode('utf-8'))

In [7]:
# Fix Price
def fix_price(x):
    """Convert a dot-grouped price string into an integer.

    Dots are treated as thousands separators and removed, so
    ``'20.690.000'`` becomes ``20690000``. Surrounding whitespace and a
    trailing dong marker (``₫``, ``đ`` or ``Đ``) are tolerated.

    Parameters
    ----------
    x : str
        Price text such as ``'7.490.000'``.

    Returns
    -------
    int
        The price as a whole number.

    Raises
    ------
    ValueError
        If nothing numeric remains after cleaning (e.g. an empty string).
    """
    digits = x.replace('.', '').strip()
    # Drop a currency marker that may follow the number:
    # U+20AB (dong sign), U+0111 / U+0110 (lower/upper case d with stroke).
    digits = digits.rstrip('\u20ab\u0111\u0110').strip()
    return int(digits)
# Store the integer form of every price in a new column
df['price_fixed'] = df['price'].map(fix_price)

In [8]:
# Select the columns for the BigQuery test upload.
# An alternative selection is kept here, disabled:
# df_to_bigquery = df[['product_encoded','price_fixed']]
df_to_bigquery = df.loc[:, ['price', 'price_fixed']]

In [9]:
# Upload the frame to BigQuery and report how long the transfer took.
# perf_counter() is the clock intended for measuring durations; unlike
# time.time() it is not affected by system clock adjustments mid-upload.
start = time.perf_counter()
df_to_bigquery.to_gbq(
    'tiki_dataset.tiki_table',
    Context.default().project_id,
    chunksize=10000,
    if_exists='replace',
    verbose=False,
)
end = time.perf_counter()
print('Task completed in: {}'.format(end - start))

Task completed in: 140.18724036216736


In [10]:
# Display the frame that was passed to to_gbq
df_to_bigquery

Unnamed: 0,price,price_fixed
0,20.690.000,20690000
1,7.490.000,7490000
2,17.810.000,17810000
3,2.290.000,2290000
4,2.750.000,2750000
5,327.000,327000
6,7.150.000,7150000
7,2.490.000,2490000
8,1.890.000,1890000
9,270.000,270000
