# Tiki Product & Price Crawling

In [2]:
# Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datalab.context import Context
import google.datalab.storage as storage
import google.datalab.bigquery as bq
import time
from io import BytesIO

In [3]:
# Derive the Cloud Storage bucket and object names from the active Datalab project
project = Context.default().project_id
bucket_name = '{}-datalab'.format(project)
bucket_path = 'gs://{}'.format(bucket_name)
bucket_object = '{}/tiki.csv'.format(bucket_path)
print('Bucket: {}'.format(bucket_name))
print('Object: {}'.format(bucket_object))

Bucket: tiki-web-crawler-datalab
Object: gs://tiki-web-crawler-datalab/tiki.csv


In [4]:
# Read bucket_object (csv)
%gcs read --object 'gs://tiki-web-crawler-datalab/tiki.csv' --variable csv_as_bytes

In [5]:
# Parse the downloaded CSV bytes into a DataFrame and print its summary
csv_buffer = BytesIO(csv_as_bytes)
df = pd.read_csv(csv_buffer)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 2 columns):
product        96 non-null object
price_fixed    96 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.6+ KB


In [6]:
# Module-level lists that scrap() appends product titles and prices to
title, price = [], []

In [7]:
# Web scrap function
def scrap(WebUrl, timeout=30):
    """Scrape one listing page and record its product titles and prices.

    Titles are read from every ``<p class="title">`` element. Prices are the
    first whitespace-separated token of every ``<span class="final-price">``
    element. The results are appended to the module-level ``title`` and
    ``price`` lists.

    Args:
        WebUrl: URL of the listing page to download.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: if the download fails, times
            out, or the server answers with an HTTP error status.
        ValueError: if the page yields a different number of titles and
            prices, which would misalign products with their prices.
    """
    # Fail fast on network problems instead of hanging or parsing an error page
    response = requests.get(WebUrl, timeout=timeout)
    response.raise_for_status()

    # Parser
    soup = BeautifulSoup(response.text, "html.parser")

    # Collect into local lists first so that an exception part-way through
    # leaves the shared title/price lists untouched
    page_titles = [tag.text.strip()
                   for tag in soup.find_all('p', {'class': 'title'})]
    page_prices = [tag.text.split()[0]
                   for tag in soup.find_all('span', {'class': 'final-price'})]

    if len(page_titles) != len(page_prices):
        raise ValueError(
            'Found {} titles but {} prices on {}'.format(
                len(page_titles), len(page_prices), WebUrl))

    # Find and add product titles and prices to the shared arrays
    title.extend(page_titles)
    price.extend(page_prices)

In [8]:
# Run Web scrap function
# Empty the accumulators in place first so that re-running this cell (for
# example after changing the page number) does not append the same products
# a second time.
del title[:]
del price[:]
scrap('https://tiki.vn/dien-thoai-may-tinh-bang/c1789?src=mega-menu&page=3')

In [18]:
# Build a DataFrame from the scraped title/price lists and print its summary
data = dict(product=title, price=price)
df_new = pd.DataFrame(data)
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 2 columns):
price      48 non-null object
product    48 non-null object
dtypes: object(2)
memory usage: 848.0+ bytes


In [10]:
# Fix Price
def fix_price(x):
    """Return the integer value of a price string after removing its dots.

    For example, '4.990.000' becomes 4990000.
    """
    digits_only = x.replace('.', '')
    return int(digits_only)
# Add an integer price column computed from the scraped price strings
scraped_prices = df_new['price']
df_new['price_fixed'] = scraped_prices.apply(fix_price)

In [11]:
# Update bucket_object data with new data.
# pd.concat is used because DataFrame.append is deprecated and was removed in
# pandas 2.0; ignore_index=True renumbers the combined rows from 0.
df_to_bigquery = pd.concat(
    [df, df_new[['product', 'price_fixed']]],
    ignore_index=True,
)
df_to_bigquery.info()

In [None]:
# Version 1 (disabled, kept for reference) - upload with DataFrame.to_gbq; long load time
# start = time.time()
# df_to_bigquery.to_gbq('tiki_dataset.tiki_table', 
#                  Context.default().project_id,
#                  chunksize=10000, 
#                  if_exists='replace',
#                  verbose=False)
# end = time.time()
# print('Task completed in: {}'.format(end-start))

In [None]:
# Version 2 (disabled, kept for reference) - upload through the datalab storage/bigquery APIs; not working
# bucket_name = Context.default().project_id + '-datalab'
# bucket_path = 'gs://' + bucket_name
# bucket_object = bucket_path + '/tiki_crawler.txt'
# bigquery_dataset_name = 'tiki_dataset'
# bigquery_table_name = 'tiki_table'

# # Define storage bucket
# bucket = storage.Bucket(bucket_name)

# # Create storage bucket if it does not exist
# if not bucket.exists():
#     bucket.create()

# dataset = bq.Dataset(bigquery_dataset_name)
# table = bq.Table(bigquery_dataset_name + '.' + bigquery_table_name)

# # Create BigQuery dataset
# if not dataset.exists():
#     dataset.create()

# # Create or overwrite the existing table if it exists
# table_schema = bq.Schema.from_data(df_to_bigquery)
# table.create(schema = table_schema, overwrite = True)

# # Write the DataFrame to GCS (Google Cloud Storage)
# %storage write --variable df_to_bigquery --object $bucket_object

# # Write the DataFrame to a BigQuery table
# table.insert(df_to_bigquery)

In [15]:
# Replace old bucket_object with new one
df_to_bigquery.to_csv('tiki.csv',index=False)

!gsutil cp 'tiki.csv' $bucket_object

Copying file://tiki.csv [Content-Type=text/csv]...
/ [1 files][  8.3 KiB/  8.3 KiB]                                                
Operation completed over 1 objects/8.3 KiB.                                      


In [None]:
# Load the updated bucket_object into BigQuery

In [17]:
%%bq load --mode overwrite --path $bucket_object --table tiki_dataset.tiki_table --skip 1