In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
import re
import datetime

### Scraping MKW.hk
#### One period-only

In [2]:
URL = "http://www.mkw.hk/"
html = requests.get(URL)
URL_source_code=html.text

In [3]:
print(html)

<Response [200]>


In [4]:
soup=BeautifulSoup(URL_source_code,"html.parser")
soup

﻿<!DOCTYPE html>

<!--[if IE]><![endif]-->
<!--[if IE 8 ]><html dir="ltr" lang="en" class="ie8"><![endif]-->
<!--[if IE 9 ]><html dir="ltr" lang="en" class="ie9"><![endif]-->
<!--[if (gt IE 9)|!(IE)]><!-->
<html dir="ltr" lang="en">
<!--<![endif]-->
<head>
<meta charset="utf-8"/>
<title>旺角二手機網-全港最高回收價</title>
<base href="http://mkw.hk/"/>
<meta content="旺角二手機網" name="description"/>
<meta content="手機回收價  iphone回收價  手機回收 iphone13回收價  iPhone 11pro/max回收價  iphone xs 回收價 iphone8回收價  iphone x回收 Samsung回收價 二手電話回收價   iphone12回收價   手機回收站 二手手機回收價 手機trade in 舊手機回收價  LG 華為 sony   先達手機回收 全新手機回收價  iPhone12/12pro /max回收價 iPad回收價 iPhone12回收二手" name="keywords"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<link href="http://mkw.hk/image/catalog/new/MK.jpg" rel="icon"/>
<script src="catalog/view/javascript/jquery/jquery-2.1.1.min.js" type="text/javascript"></script>
<link href="catalog/view/javascript/bootstrap/css/bootstrap.min.css" media="screen" rel="stylesheet"/>
<script src="catalog/view

In [5]:
yy = re.findall('\d+\年', str(soup)) 
mm = re.findall('\d+\月', str(soup)) 
dd = re.findall('\d+\日', str(soup)) 

dated = '-'.join([yy[0][:-1],mm[0][:-1],dd[0][:-1]])
dated = datetime.datetime.strptime(dated, '%Y-%m-%d')
print(dated)

2022-05-01 00:00:00


In [6]:
brand, model, price, date = [], [], [], []

for rows in soup.find_all('tr')[7:]:
    #print(rows)
    try: 
        columns = rows.find_all('td')
        brand.append(columns[0].text.replace('\n',''))
        model.append(columns[1].text.replace('\n',''))
        price.append(columns[2].text.replace('HK$','').replace('HK',''))  
        date.append(dated)
       
    except: model.append(0) or price.append(0) or date.append(0)
        
print(len(brand),len(model),len(price), len(date))

288 288 288 288


In [7]:
MK = pd.DataFrame({'Brand':brand, 'Model':model, 'Price':price, 'Date':date})

In [8]:
MK = MK[MK.Price != 0]

In [9]:
MK.head()  # can use Regex to filter models

Unnamed: 0,Brand,Model,Price,Date
1,APPLE,iPhone 13 128g,5000,2022-05-01 00:00:00
2,APPLE,iPhone 13 256g,5600,2022-05-01 00:00:00
3,APPLE,iPhone 13 256g,5950,2022-05-01 00:00:00
5,APPLE,iPhone 13 pro 128GB,6900,2022-05-01 00:00:00
6,APPLE,iPhone 13 pro 256GB,7700,2022-05-01 00:00:00


### Scraping via Web Archive
#### Multiple periods

In [10]:
# Historical prices to be scraped from Wayback Archive using its API to find relevant dates
# https://archive.org/help/wayback_api.php

In [11]:
# define scraping period
dateRange = pd.Series(pd.bdate_range(start='2019-09-30', end='2022-05-03', freq='M')).dt.strftime('%Y%m%d')

In [12]:
# Using Web Archive API to identify available URLs to be plugged into Beautiful Soup

target = 'http://archive.org/wayback/available?url=http://www.mkw.hk/&timestamp='
headers = {"Accept": "application/json"}
URL_list = []

for date in dateRange:
    try:
        res =requests.get(''.join([target,date]), headers=headers )
        #print(res)
        d = json.loads(res.text)
        URL_list.append(d['archived_snapshots']['closest']['url'])
    except: pass

URL_list  # all links to be scraped

['http://web.archive.org/web/20190924052348/http://mkw.hk:80/',
 'http://web.archive.org/web/20191022175922/http://mkw.hk:80/',
 'http://web.archive.org/web/20191122141143/http://mkw.hk:80/',
 'http://web.archive.org/web/20200109222027/http://www.mkw.hk:80/',
 'http://web.archive.org/web/20200209045102/http://www.mkw.hk:80/',
 'http://web.archive.org/web/20200221025946/http://mkw.hk:80/',
 'http://web.archive.org/web/20200322063612/http://www.mkw.hk:80/',
 'http://web.archive.org/web/20200429115058/http://mkw.hk:80/',
 'http://web.archive.org/web/20200529174855/http://mkw.hk:80/',
 'http://web.archive.org/web/20200629001159/http://mkw.hk:80/',
 'http://web.archive.org/web/20200729052547/http://mkw.hk:80/',
 'http://web.archive.org/web/20200919190139/http://mkw.hk/',
 'http://web.archive.org/web/20200919190139/http://mkw.hk/',
 'http://web.archive.org/web/20201031223605/http://mkw.hk/',
 'http://web.archive.org/web/20201127033029/http://mkw.hk/',
 'http://web.archive.org/web/20210116060

In [13]:
# Execute Beautiful Soup and transfer to pandas DF

brandMaster, modelMaster, priceMaster, dateMaster = [], [], [], []

for URL in URL_list:
    html = requests.get(URL)
    URL_source_code=html.text
    soup=BeautifulSoup(URL_source_code,"html.parser")

    # Date finder
    yy = re.findall('\d+\年', str(soup)) 
    mm = re.findall('\d+\月', str(soup)) 
    dd = re.findall('\d+\日', str(soup)) 

    dated = '-'.join([yy[0][:-1],mm[0][:-1],dd[0][:-1]])
    dated = datetime.datetime.strptime(dated, '%Y-%m-%d')

    # Scrape and build price and model dataset
    brand, model, price, date = [], [], [], []

    for rows in soup.find_all('tr')[7:]:
        try: 
            columns = rows.find_all('td')
            brand.append(columns[0].text.replace('\n',''))
            model.append(columns[1].text.replace('\n',''))
            price.append(columns[2].text.replace('HK$','').replace('HK','').replace('\n',''))
            date.append(dated)
        except: model.append(0) or price.append(0) or date.append(0)  # for single-column rows
    
    brandMaster.extend(brand)
    modelMaster.extend(model)
    priceMaster.extend(price)
    dateMaster.extend(date)
    #print(len(brand),len(model),len(price), len(date))  # check dataset is complete 

MKW = pd.DataFrame({'Brand':brandMaster, 'Model':modelMaster, 'Price':priceMaster, 'Date':dateMaster})

# Clean up blank space values (https://stackoverflow.com/questions/13445241/replacing-blank-values-white-space-with-nan-in-pandas)
for i in MKW.columns:
    MKW[i][MKW[i].apply(lambda i: True if re.search('^\s*$', str(i)) else False)]=0

MKW = MKW[MKW.Price != 0]  # strip out non pricing data

In [14]:
MKW.info(), MKW.head()  # scraped results

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7954 entries, 1 to 8740
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Brand   7954 non-null   object
 1   Model   7954 non-null   object
 2   Price   7954 non-null   object
 3   Date    7954 non-null   object
dtypes: object(4)
memory usage: 310.7+ KB


(None,
    Brand                     Model Price                 Date
 1  APPLE  iPhone 5 16GB/32GB/64GB     50  2019-09-08 00:00:00
 3  APPLE            iPhone 5s 16GB   100  2019-09-08 00:00:00
 4  APPLE      iPhone 5s 32GB 64GB    150  2019-09-08 00:00:00
 6  APPLE           iPhone 5c 16GB    100  2019-09-08 00:00:00
 7  APPLE            iPhone 5c 32GB   100  2019-09-08 00:00:00)

In [15]:
# Remove duplicates since some of data scraped from archived pages of same day (diff time)
print('Duplicated rows:',MKW.duplicated().sum())
MKW.drop_duplicates(inplace=True)
print('Post removing duplicates (rows, cols):',MKW.shape)

Duplicated rows: 5272
Post removing duplicates (rows, cols): (2682, 4)


In [16]:
MKW.reset_index(inplace=True)
MKW = MKW.iloc[:,1:]

In [17]:
MKW.head()

Unnamed: 0,Brand,Model,Price,Date
0,APPLE,iPhone 5 16GB/32GB/64GB,50,2019-09-08 00:00:00
1,APPLE,iPhone 5s 16GB,100,2019-09-08 00:00:00
2,APPLE,iPhone 5s 32GB 64GB,150,2019-09-08 00:00:00
3,APPLE,iPhone 5c 16GB,100,2019-09-08 00:00:00
4,APPLE,iPhone 5c 32GB,100,2019-09-08 00:00:00


In [18]:
MKW.to_csv('supplier_px.csv')  # export