Họ và tên: **Nguyễn Ngọc Băng Tâm**

MSSV: **1712747**

In [1]:
import os

from requests_html import HTMLSession
import requests
import pandas as pd
import re
import time
import json

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import urllib.robotparser # Kiểm tra file robot.txt có được phép crawl không

## 3. Parse HTML from urls

### a. Tạo robot parser

In [2]:
# Shared robots.txt parser for soundcloud.com; the crawl function below consults it
# (through can_fetch) before requesting each url.
rp = urllib.robotparser.RobotFileParser('https://soundcloud.com/robots.txt')
rp.read()  # download and parse the robots.txt rules once, up front

### b. Parse html từ url của user / track / playlist

- Hàm `parse_html_from_url` sẽ parse ra các đối tượng dữ liệu dựa vào template `data_regex` truyền vào và lưu xuống file csv.
- Hàm `preprocess_to_json_string` thực hiện tiền xử lý chuỗi ký tự parse từ file mã nguồn và đưa về format chuỗi json để lưu xuống dataframe

**Lưu ý:** Trong quá trình download mã nguồn trang web, có những trang khá phức tạp (nhiều track hoặc người dùng nhập nhiều định dạng hơn) nên cần thời gian chờ lâu hơn mới tải xong toàn bộ JavaScript. Do mã nguồn hiện tại quy định thời gian chờ mặc định `sleep_time=1` (giây) nên sẽ có lúc không parse được HTML; khi đó url tương ứng được in ra kèm thông báo `has problems`.

In [3]:
def parse_html_from_url(driver, data_type, data_regex, sleep_time=1):
    """
    Crawl every url listed in Crawl_urls/unique_popular_{data_type}.txt, extract
    one JSON object per page with a regex and save all of them to
    New_data/{data_type}.csv (tab separated, utf-8).

    - driver: a browser driver object (firefox, chrome,...)
    - data_type: users / tracks / playlists (also selects the input and output file names)
    - data_regex: regex locating the embedded JSON for that data type in the page source
    - sleep_time: seconds to wait after loading a page before reading its source,
      so that the page's JavaScript has time to run; it also spaces out requests

    Urls that robots.txt disallows, pages where the regex finds nothing and pages
    whose match cannot be converted to JSON are reported with print() and skipped,
    so a single bad page does not discard the records already collected.
    Returns None.
    """

    records = []

    with open(f'Crawl_urls/unique_popular_{data_type}.txt', 'r') as inp:
        for line in inp:
            # Lines read from the file keep their trailing newline; strip it so
            # the robots check, the request and the log messages see the bare url.
            url = line.strip()
            if not url:
                continue  # ignore blank lines

            if not rp.can_fetch('*', url):
                print(f"Can't fetch {url}")
                continue

            driver.get(url)
            # Wait after the page load: driver.page_source is a snapshot, so the
            # pause has to come before reading it to let late JavaScript finish.
            time.sleep(sleep_time)
            page_src = driver.page_source

            # locate the embedded JSON with the regex
            match = re.search(data_regex, page_src)
            if match is None:
                print(f'{url} has problems')
                continue

            try:
                json_str = preprocess_to_json_string(match.group(0), data_type, page_src)
                # convert the JSON string to a dictionary
                records.append(json.loads(json_str))
            except (ValueError, TypeError, AttributeError) as err:
                # ValueError covers malformed JSON (json.JSONDecodeError);
                # TypeError/AttributeError cover a None result or a page layout
                # the preprocessing step does not recognise.
                print(f'{url} has problems: {err}')

    # The output folder may not exist yet on a fresh checkout.
    os.makedirs('New_data', exist_ok=True)

    df = pd.json_normalize(records)
    df = df.reset_index()
    df.to_csv(f'New_data/{data_type}.csv', sep='\t', index=False, header=True, encoding='utf-8')

In [4]:
def preprocess_to_json_string(raw_str, data_type, page_src):
    """
    Preprocess the raw regex match taken from a page into a JSON object string.

    - raw_str: text matched by the data regex; its first character ('[') is
      always dropped, and the tail is cut according to data_type
    - data_type: "users" / "tracks" / "playlists"
    - page_src: html source of the url (after enabling JS); only read for
      "playlists", to collect the track ids

    Result per data type:
    - "users": raw_str without its first and last character
    - "tracks": everything before the "media" key (and the comma preceding it),
      closed with '}'
    - "playlists": everything before the "user" key, followed by a
      "track_counts" field (occurrences of "kind":"track" in raw_str) and a
      "tracks" field (comma-joined track ids found in page_src)

    Returns the JSON string, or None (after printing "Wrong data type") when
    data_type is not one of the three values above.
    Raises ValueError when the key used as cut point ("user" for playlists,
    "media" for tracks) does not occur in raw_str.
    """

    res = None

    if data_type == "users":
        res = raw_str[1:-1] # remove '[' and ']' character
    elif data_type == "playlists":

        pos = re.search(r'"user"', raw_str)
        if pos is None:
            raise ValueError('"user" key not found in playlist data')

        # ids of all track objects present anywhere in the page source
        track_ids = re.findall(r'"id":(\d+),"kind":"track"', page_src)

        # number of track objects inside the matched playlist string itself
        track_cnt = len(re.findall(r'"kind":"track"', raw_str))

        # keep the fields before "user", then append the two computed fields
        res = (raw_str[1:pos.start()]
               + '"track_counts":' + str(track_cnt)
               + ',"tracks" : "' + ','.join(track_ids) + '"}')

    elif data_type == "tracks":

        pos = re.search(r'"media"', raw_str)
        if pos is None:
            raise ValueError('"media" key not found in track data')

        # pos.start() - 1 also drops the comma before "media"; '}' closes the dict
        res = raw_str[1:pos.start() - 1] + '}'

    else:
        print("Wrong data type")

    return res

In [6]:
# Run Chrome without opening a visible window.
chrome_options = Options()
chrome_options.add_argument("--headless")

# './chromedriver' is resolved relative to the current working directory.
# NOTE(review): passing the driver path directly is the Selenium 3 style call —
# confirm the installed Selenium version still accepts it.
browser = webdriver.Chrome(executable_path='./chromedriver', options=chrome_options)

#### Parse HTML đối với file `Crawl_urls/unique_popular_playlists.txt`

In [7]:
# Raw string so that \[ and \] reach the regex engine untouched (in a plain string
# literal they are invalid escape sequences and raise a warning).
# Matches from '[{"artwork_url"' up to the last '"}]' on the same line.
playlist_regex = r'\[{"artwork_url".+"}\]'
parse_html_from_url(browser, 'playlists', playlist_regex)

https://soundcloud.com/18-user-user/sets/die-very-rough
 has problems
https://soundcloud.com/18-user-user/sets/big-bootie-mix-bottie
 has problems
https://soundcloud.com/2020-songs/sets/what-do-you-know-about-love-pop-smoke
 has problems
https://soundcloud.com/dll-repost-2/sets/what-do-you-know-about-love-pop-smoke-2
 has problems
https://soundcloud.com/2020-songs/sets/whoopty-cj
 has problems
https://soundcloud.com/18-user-user/sets/big-bootie-mix
 has problems
https://soundcloud.com/2020-songs/sets/easy-life-sangria-ft-arlo-parks
 has problems
https://soundcloud.com/18-user-user/sets/drip-like-me
 has problems
https://soundcloud.com/2020-songs/sets/tik-tok
 has problems
https://soundcloud.com/319963465-79694589-weeee/sets/trippie-redd
 has problems
https://soundcloud.com/user-1920291817281910/sets/laugh-now-cry-later
 has problems
https://soundcloud.com/user61569247911/sets/juice-wrld
 has problems
https://soundcloud.com/2020-songs/sets/savage-love
 has problems
https://soundcloud.co

#### Parse HTML đối với file `Crawl_urls/unique_popular_users.txt`

In [8]:
# Raw string so that \[ and \] reach the regex engine untouched (in a plain string
# literal they are invalid escape sequences and raise a warning).
# Matches from '[{"avatar_url"' up to the last '"}]' on the same line.
usr_regex = r'\[{"avatar_url".+"}\]'
parse_html_from_url(browser, 'users', usr_regex)

https://soundcloud.com/319963465-79694589-weeee
 has problems
https://soundcloud.com/156667773287464638-i
 has problems
https://soundcloud.com/18-user-user
 has problems
https://soundcloud.com/dll-repost-2
 has problems
https://soundcloud.com/2020-songs
 has problems
https://soundcloud.com/user-1920291817281910
 has problems
https://soundcloud.com/duck-playz-214713788
 has problems


#### Parse HTML đối với file `Crawl_urls/unique_popular_tracks.txt`

In [9]:
# Raw string so that \[ and \] reach the regex engine untouched (in a plain string
# literal they are invalid escape sequences and raise a warning).
# Matches from '[{"artwork_url"' up to the last '"}]' on the same line.
track_regex = r'\[{"artwork_url".+"}\]'
parse_html_from_url(browser, 'tracks', track_regex)