In [1]:
import re
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import simplejson as json



In [2]:
res = requests.get("https://piaofang.maoyan.com/dashboard/movie")
res.encoding = 'utf-8'
soup = bs(res.text, 'html.parser')
soup

<!DOCTYPE html>

<html class="mobile" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="initial-scale=1, width=device-width, maximum-scale=1, user-scalable=no" name="viewport"/>
<link href="https://s3plus.meituan.net" rel="dns-prefetch"/>
<link href="https://p0.meituan.net" rel="dns-prefetch"/>
<link href="https://s3plus.meituan.net/v1/mss_e2821d7f0cfe4ac1bf9202ecf9590e67/cdn-prod/file:d6a62569/common.css" rel="stylesheet"/>
<style>
              html, body{
                  max-width: 430px;
                  margin: 0 auto;
              }
              .m-page{
                  display:flex;
                  align-items:center;
                  justify-content:center;
                  flex-direction: column;
                  min-height: 100vh;
              }
              .m-page .hint{
                  background: url(https://s0.meituan.net/bs/file/?f=myfe/piaofang:img/news/comments-none-ccd31fb5.png) no-repeat center .4rem;
                  background-size: 2rem;
 

In [3]:
browserOptions = Options()
#browserOptions.add_argument("--headless")

capa = DesiredCapabilities.CHROME
capa["pageLoadStrategy"] = "none"
capa["goog:loggingPrefs"] = {"performance": "ALL"}
driver = webdriver.Chrome(desired_capabilities=capa)
wait = WebDriverWait(driver, 20)

In [4]:
#create snapshot of the entire page to prevent it from constantly changing
driver.get("https://piaofang.maoyan.com/dashboard/movie")
test = None
while not test:
    try:
        test = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'moviename-td')))
    except:
        driver.refresh();
        
logs_raw = driver.get_log("performance")
logs = [json.loads(lr["message"])["message"] for lr in logs_raw]

def log_filter(log_):
    return (
        # is an actual response
        log_["method"] == "Network.responseReceived"
        # and json
        and "json" in log_["params"]["response"]["mimeType"]
    )

responses = []

for log in filter(log_filter, logs):
    request_id = log["params"]["requestId"]
    resp_url = log["params"]["response"]["url"]
    print(f"Caught {resp_url}")
    response = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id})
    responses.append(response)
    

    

Caught https://piaofang.maoyan.com/dashboard-ajax/movie?orderType=0&uuid=4dfdc0c0-7497-4046-9f88-3a276587bf4a&timeStamp=1644658672104&User-Agent=TW96aWxsYS81LjAgKE1hY2ludG9zaDsgSW50ZWwgTWFjIE9TIFggMTBfMTVfNykgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzk3LjAuNDY5Mi45OSBTYWZhcmkvNTM3LjM2&index=966&channelId=40009&sVersion=2&signKey=3a2bc2d7e69356ba4a30b3803187e1b2


In [5]:
body0 = json.loads(responses[0]['body'])
movieList = body0['movieList']['list']
date = body0['calendar']['today']
font_url = body0['fontStyle'].split('"')[-2]
font_url

'//s3plus.meituan.net/v1/mss_73a511b8f91f43d0bdae92584ea6330b/font/2ab02453.woff'

In [6]:
# Get reference fonts from the file tree
from fontTools.ttLib import TTFont
headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) "
              "Chrome/66.0.3359.139 Safari/537.36 "
    }

woff_url = 'http:' + font_url
response_woff = requests.get(woff_url, headers=headers).content

print("Woff retrieval succuessful: " + str(len(response_woff) > 0))

with open('fonts.woff', 'wb') as f:
    f.write(response_woff)

# Already saved, DO NOT RUN THE FOLLOWING 
#with open('basefonts.woff', 'wb') as f:
    #f.write(response_woff)
    
#font1 = TTFont('basefonts.woff')
#font1.saveXML('basefonts.xml')

Woff retrieval succuessful: True


In [7]:
import pandas as pd
df = pd.DataFrame.from_records(movieList)

In [8]:
df['boxSplitUnit'][4]

{'num': '&#xf56b;&#xe66a;&#xe43f;&#xe808;.&#xf56b;&#xe46d;', 'unit': '万'}

In [9]:
# Now starts the tedius process of mapping the new fonts to the contours and then to the numbers
from xml.etree import ElementTree
from difflib import get_close_matches
from difflib import SequenceMatcher
from fontTools.misc.xmlWriter import XMLWriter
import re

# Create Bijection between numbers and unicode 
unicodeToInt = {"uniF581": 0, "uniF56B": 1, "uniE46D": 2, "uniE13D": 3, "uniEAA8": 4, 
                "uniE808": 5, "uniE1D4": 6, "uniE66A": 7, "uniE43F": 8, "uniECDC": 9}

def hexToUnicode(hexa):
    return 'uni' + hexa[3:].upper()

def uniToHex(uni):
    return "&#x" + uni[3:].lower()

def elemToStr(elem):
    return ElementTree.tostring(elem, encoding='utf8', method='xml')

def trimXML(xml):
    return ''.join(xml.split())

#If the regular equality doesn't work we then have to use fuzzy matching
xRg = re.compile(r'x="([0-9]+)"')
yRg = re.compile(r'y="([0-9]+)"')
def getCoFromXML(xml):
    stringToMatch = trimXML(xml)
    x = [m.group(1) for m in (xRg.match(line) for line in stringToMatch) if m]
    y = [m.group(1) for m in (yRg.match(line) for line in stringToMatch) if m]
    return {"x": x, "y": y}
    
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()
    

In [10]:
# Map contours to numbers
baseFonts = TTFont('basefonts.woff')
# The follwing Code is for fuzzy matching, which we fortunately don't need to implement.
# conStrToInt = {}
# print(len(unicodeToInt))
# for j in unicodeToInt:
#     currGlyph = baseFonts['glyf'][j]
#     writer = XMLWriter('curr_glyph.xml')
#     currGlyph.toXML(writer, baseFonts)
#     with open('curr_glyph.xml', 'r') as f:
#         currXML = f.read()
#     conStrToInt[trimXML(currXML)] = unicodeToInt[j]
currentFonts = TTFont('fonts.woff')
hexToInt = {}

for j in currentFonts.getGlyphNames()[1:-1]:
    currGlyph = currentFonts['glyf'][j]
    writer = XMLWriter('curr_glyph.xml')
    currGlyph.toXML(writer, currentFonts)
    with open('curr_glyph.xml', 'r') as f:
        currXML = f.read()
        
    for i in unicodeToInt:
        baseGlyph = baseFonts['glyf'][i]
        if baseGlyph == currGlyph:
            hexToInt[uniToHex(j)] = unicodeToInt[i]
            break

hexToInt

2 extra bytes in post.stringData array
2 extra bytes in post.stringData array


{'&#xe13d': 3,
 '&#xe1d4': 6,
 '&#xe43f': 8,
 '&#xe46d': 2,
 '&#xe66a': 7,
 '&#xe808': 5,
 '&#xeaa8': 4,
 '&#xecdc': 9,
 '&#xf56b': 1,
 '&#xf581': 0}

In [11]:
unitLookup = {'百': 100, '千': 1000, '万': 10000, '亿': 1*10**8}

#converts the weird character to a float
def convertToFloat(string):
    spCharLst = string.split(';')
    result = ''
    for i in spCharLst:
        if len(i) > 7: #has a dot in front
            result += '.' + str(hexToInt[i[1:]])
        elif len(i) == 7: #in case of bad parsing
            result += str(hexToInt[i])
    return float(result)

#helper function for converting the entire block to a single int
def convertDictToInt(dictionary):
    return int(convertToFloat(dictionary['num']) * unitLookup[dictionary['unit']])


In [12]:
df['boxSplitUnit'] = df['boxSplitUnit'].apply(convertDictToInt)
df['splitBoxSplitUnit'] = df['splitBoxSplitUnit'].apply(convertDictToInt)
df['movieInfo'] = df['movieInfo'].apply(lambda x : x['movieName'])
df

Unnamed: 0,avgSeatView,avgShowView,boxRate,boxSplitUnit,movieInfo,showCount,showCountRate,splitBoxRate,splitBoxSplitUnit,sumBoxDesc,sumSplitBoxDesc
0,9.2%,15,32.9%,76452400,长津湖之水门桥,116414,27.9%,33.0%,71126900,32.33亿,29.79亿
1,10.0%,14,25.7%,59664700,这个杀手不太冷静,100584,24.1%,25.6%,55056800,19.31亿,17.63亿
2,10.6%,12,15.1%,35198700,奇迹·笨小孩,67643,16.2%,15.1%,32503300,9.65亿,8.83亿
3,15.3%,17,12.6%,29443300,熊出没·重返地球,41924,10.0%,12.7%,27400900,7.77亿,7.11亿
4,9.2%,10,7.6%,17851200,狙击手,43443,10.4%,7.6%,16467100,4.08亿,3.75亿
5,10.0%,10,2.4%,5750000,喜羊羊与灰太狼之筐出未来,15001,3.5%,2.4%,5299100,1.36亿,1.24亿
6,4.9%,5,1.2%,2990299,四海,15015,3.5%,1.2%,2760100,5.11亿,4.61亿
7,6.0%,7,1.2%,2852000,好想去你的世界爱你,11500,2.7%,1.2%,2607000,801.1万,744.7万
8,13.8%,14,0.3%,774900,汪汪队立大功大电影,1526,0.3%,0.3%,698700,7934.8万,7022.3万
9,3.8%,4,0.1%,457600,小虎墩大英雄,2816,0.6%,0.1%,420500,1987.5万,1844.9万


In [13]:
#大盘
body0['movieList']['nationBoxInfo']

{'nationBoxSplitUnit': {'num': '&#xe46d;&#xe13d;&#xe46d;&#xf56b;&#xe808;.&#xe66a;',
  'unit': '万'},
 'nationSplitBoxSplitUnit': {'num': '&#xe46d;&#xf56b;&#xe808;&#xf581;&#xf56b;.&#xf581;',
  'unit': '万'},
 'showCountDesc': '41.7万',
 'title': '实时大盘',
 'viewCountDesc': '524.5万'}