#  Weekly project

## 1. Scraping the first page

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_url(url, timeout=30):
    """Download a web page and return it as parsed HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled connection.

    Returns:
        A ``BeautifulSoup`` tree built from the response text with the
        built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.

    Note:
        The HTTP status code is not checked; an error page is parsed like any
        other page.
    """
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup
def scrape_tiki(url = 'https://tiki.vn/dien-thoai-may-tinh-bang/c1789?'):
    """Scrape the product tiles found on a single tiki.vn listing page.

    Args:
        url: Listing page to scrape.

    Returns:
        A list of dicts, one per ``div.product-item`` tag, with the keys
        ``product_id``, ``seller_id``, ``title``, ``price`` and ``image_url``
        (all strings). ``image_url`` is ``""`` when the tile has no usable
        image. Tiles missing any of the other attributes are skipped and
        "We got an error" is printed for each of them.
    """
    soup = get_url(url)
    items = soup.find_all('div', {"class": "product-item"})
    data = []
    for item in items:
        # Only a missing data-* attribute is treated as a bad tile; a bare
        # ``except:`` would also hide KeyboardInterrupt and programming errors.
        try:
            dic = {
                "product_id": item["data-id"],
                "seller_id": item["data-seller-product-id"],
                "title": item["data-title"],
                "price": item["data-price"],
                "image_url": "",
            }
        except KeyError:
            print("We got an error")
            continue

        # The image is optional: look the span up once and keep the tile even
        # when the <img> tag or its src attribute is absent.
        image_span = item.find("span", {"class": "image"})
        img = image_span.img if image_span is not None else None
        if img is not None:
            dic["image_url"] = img.get("src", "")

        data.append(dic)

    return data

In [None]:
# Scrape the first listing page with the default URL and display the records
data = scrape_tiki()
data

In [None]:
# Use an explicit column list so this cell also works when the scrape returned
# no items (indexing data[0] raises IndexError on an empty list).
page1 = pd.DataFrame(
    data,
    columns=["product_id", "seller_id", "title", "price", "image_url"],
)

In [None]:
# Preview the first rows of the page-1 DataFrame
page1.head()

## 2. Scraping all pages

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from random import randint

def get_url(url, timeout=30):
    """Get parsed HTML from url

    Input:
      url: address of the webpage
      timeout: seconds to wait for the server before giving up; without a
        timeout requests.get can block forever on a stalled connection
    Output: Parsed HTML text of the webpage (a BeautifulSoup object)
    Raises: requests.exceptions.RequestException if the request fails or
      times out

    The HTTP status code is not checked; an error page is parsed like any
    other page.
    """
    # Send GET request, bounded by the timeout
    r = requests.get(url, timeout=timeout)

    # Parse HTML text
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

def scrape_tiki_all(url = "https://tiki.vn/dien-thoai-may-tinh-bang/c1789?", max_pages = None):
    """Scrape listing info from every page of a tiki.vn category

    Input:
      url: url of a category. Default: https://tiki.vn/dien-thoai-may-tinh-bang/c1789?
      max_pages: optional upper bound on the number of pages to fetch.
        None (default) keeps going until a page contains no items. At least
        one page is always fetched.
    Output: A list of dicts, one per listing item, with the keys product_id,
      seller_id, title, price and image_url (all strings). image_url is ""
      when the item has no usable image; items missing any of the other
      attributes are skipped.

    After each page a progress line "<page> <items collected so far>" is
    printed.
    """
    url_base = url

    # Keep "&page=" for urls that already carry a "?" (as the default does);
    # otherwise the query string has to be opened with "?" first.
    page_param = "&page=" if "?" in url_base else "?page="

    # List containing data of all items
    data = []

    page = 1

    while True:

        # Get parsed HTML
        soup = get_url(url)

        # Find all tags that contain required info
        items = soup.find_all('div', {"class": "product-item"})

        # Extract information of each tag
        for item in items:

            # Only a missing data-* attribute marks a bad item; a bare
            # ``except:`` would also hide KeyboardInterrupt and real bugs.
            try:
                dic = {
                    "product_id": item["data-id"],
                    "seller_id": item["data-seller-product-id"],
                    "title": item["data-title"],
                    "price": item["data-price"],
                    "image_url": "",
                }
            except KeyError:
                # Skip if error and print error message
                print("We got an error")
                continue

            # There are some items without img tag: look the span up once and
            # keep the item even when the <img> or its src is absent.
            image_span = item.find("span", {"class": "image"})
            img = image_span.img if image_span is not None else None
            if img is not None:
                dic["image_url"] = img.get("src", "")

            # Append the dictionary to data list
            data.append(dic)

        # print out the page number and items to keep track
        print(page, len(data))

        # Stop on the first empty page, or once the optional page cap is hit.
        # Breaking here also avoids sleeping after the final request.
        if not items or (max_pages is not None and page >= max_pages):
            break

        # increment page
        page += 1

        # create the url of the next page
        url = url_base + page_param + str(page)

        # control the scraping speed
        time.sleep(randint(1,4))

    return data

In [2]:
# Test the scraper: crawl the default category page by page.
# scrape_tiki_all prints "<page> <items collected so far>" after each page.
phone = scrape_tiki_all()
phone

1 49
2 97
3 145
4 193
5 241
6 289
7 337
8 385
9 433
10 452
11 452


[{'product_id': '53516830',
  'seller_id': '53793247',
  'title': 'Điện Thoại Samsung Galaxy A11 (32GB/3GB) - Hàng Chính Hãng',
  'price': '3220000',
  'image_url': 'https://salt.tikicdn.com/cache/280x280/ts/product/98/b8/89/c078436f8c04567119078c0e7adfc255.jpg'},
 {'product_id': '48524359',
  'seller_id': '55994540',
  'title': 'Điện Thoại Vsmart Joy 3 - Hàng chính hãng',
  'price': '1958000',
  'image_url': 'https://salt.tikicdn.com/cache/280x280/ts/product/5a/7b/e1/5acd19c60380413b3e72ac3460da0f62.jpg'},
 {'product_id': '53090589',
  'seller_id': '53090591',
  'title': 'Điện Thoại iPhone  SE 64GB ( 2020)  -  Hàng  Chính Hãng',
  'price': '11290000',
  'image_url': 'https://salt.tikicdn.com/cache/280x280/ts/product/16/15/72/7363f6268c34387efc2d4079c537e64e.jpg'},
 {'product_id': '46753117',
  'seller_id': '54749371',
  'title': 'Điện Thoại Vsmart Active 3 - Hàng Chính Hãng',
  'price': '2850000',
  'image_url': 'https://salt.tikicdn.com/cache/280x280/ts/product/21/b2/3a/3cf0cf3433d71

In [3]:
# Save data to a DataFrame. The columns are listed explicitly so the cell also
# works when the scrape returned no items (phone[0] raises IndexError on an
# empty list).
phone_product = pd.DataFrame(
    phone,
    columns=["product_id", "seller_id", "title", "price", "image_url"],
)
phone_product

Unnamed: 0,product_id,seller_id,title,price,image_url
0,53516830,53793247,Điện Thoại Samsung Galaxy A11 (32GB/3GB) - Hàn...,3220000,https://salt.tikicdn.com/cache/280x280/ts/prod...
1,48524359,55994540,Điện Thoại Vsmart Joy 3 - Hàng chính hãng,1958000,https://salt.tikicdn.com/cache/280x280/ts/prod...
2,53090589,53090591,Điện Thoại iPhone SE 64GB ( 2020) - Hàng C...,11290000,https://salt.tikicdn.com/cache/280x280/ts/prod...
3,46753117,54749371,Điện Thoại Vsmart Active 3 - Hàng Chính Hãng,2850000,https://salt.tikicdn.com/cache/280x280/ts/prod...
4,32033717,32033719,Điện Thoại iPhone 11 64GB - Hàng Chính Hãng,18990000,https://salt.tikicdn.com/cache/280x280/ts/prod...
...,...,...,...,...,...
447,47211298,47211304,FD V8H 2.4G Wireless Mute Mouse Plug & Play Sl...,349000,https://salt.tikicdn.com/cache/280x280/ts/prod...
448,58054536,58054537,"Điện Thoại Cổ Điển Đẹp DT151, Bản Dây Line",1950000,https://salt.tikicdn.com/cache/280x280/ts/prod...
449,58841752,58841753,Điện Thoại Cổ Điển DT40S - Bản Dùng Sim Cố Địn...,3200000,https://salt.tikicdn.com/cache/280x280/ts/prod...
450,58459136,58459138,Điện Thoại OPPO RENO 4 PRO (8GB/256GB) - Hàng...,11990000,https://salt.tikicdn.com/cache/280x280/ts/prod...


In [None]:
# Export the DataFrame to a pickle file in the current directory
phone_product.to_pickle("./phone_product.pkl")

In [None]:
# Export the DataFrame to a CSV file in the current directory, without the index column
phone_product.to_csv("./phone_product.csv", index = False)