In [1]:
import re
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import simplejson as json
from datetime import datetime



## Get website response and font data

In [2]:
browserOptions = Options()
#browserOptions.add_argument("--headless")

capa = DesiredCapabilities.CHROME
capa["pageLoadStrategy"] = "none"
capa["goog:loggingPrefs"] = {"performance": "ALL"}
driver = webdriver.Chrome(desired_capabilities=capa)
wait = WebDriverWait(driver, 20)

#create snapshot of the entire page to prevent it from constantly changing
driver.get("https://piaofang.maoyan.com/dashboard/movie")
test = None
while not test:
    try:
        test = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'moviename-td')))
    except:
        driver.refresh();

now = datetime.now().strftime("%d/%m/%Y %H:%M:%S") # get exact datetime at the time of scrape
driver.get_screenshot_as_file("logs/" + now + "/screenshot.png") # save screenshot to sanity check later

logs_raw = driver.get_log("performance")
logs = [json.loads(lr["message"])["message"] for lr in logs_raw]

def log_filter(log_):
    return (
        # is an actual response
        log_["method"] == "Network.responseReceived"
        # and json
        and "json" in log_["params"]["response"]["mimeType"]
    )

responses = []

for log in filter(log_filter, logs):
    request_id = log["params"]["requestId"]
    resp_url = log["params"]["response"]["url"]
    print(f"Caught {resp_url}")
    response = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id})
    responses.append(response)

Caught https://piaofang.maoyan.com/dashboard-ajax/movie?orderType=0&uuid=c102fbc7-0f88-488a-a1ec-a6d724dcf372&timeStamp=1645843009691&User-Agent=TW96aWxsYS81LjAgKE1hY2ludG9zaDsgSW50ZWwgTWFjIE9TIFggMTBfMTVfNykgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzk3LjAuNDY5Mi45OSBTYWZhcmkvNTM3LjM2&index=12&channelId=40009&sVersion=2&signKey=c3669ecbc601e0eaa51195998622f283


In [3]:
# Get this instance's font file from backend server
body0 = json.loads(responses[0]['body'])
movieList = body0['movieList']['list']
date = body0['calendar']['today']
font_url = body0['fontStyle'].split('"')[-2]

# Get reference fonts from the file tree
from fontTools.ttLib import TTFont
headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) "
              "Chrome/66.0.3359.139 Safari/537.36 "
    }

woff_url = 'http:' + font_url
response_woff = requests.get(woff_url, headers=headers).content

print("Woff retrieval succuessful: " + str(len(response_woff) > 0))

with open('temp/fonts.woff', 'wb') as f:
    f.write(response_woff)

Woff retrieval succuessful: True


## Getting digits from the font data using pytesseract

In [8]:
from fontTools.ttLib import TTFont
from PIL import ImageFont, Image, ImageDraw, ImageOps
import pytesseract
import cv2
import numpy as np
import random

def uniToHex(uni):
    return "&#x" + uni[3:].lower()

def uni_2_png_stream(txt: str, font: str, img_size=512, font_size=0.7, invert=False):
    img = Image.new('1', (img_size, img_size), 255) 
    draw = ImageDraw.Draw(img)
    font = ImageFont.truetype(font, int(img_size * font_size))
    
    txt = chr(txt)
    x, y = draw.textsize(txt, font=font) 
    draw.text(((img_size - x) // 2, (img_size - y) // 2), txt, font=font, fill=0)
    if invert:
        img = img.convert('L')
        img = ImageOps.invert(img)
        img = img.convert('1')
    #img.save(txt + '.png')
    return img 

def predict_neural(unicode, fontFile):
    image = uni_2_png_stream(int(unicode[3:], 16), fontFile, img_size=28, font_size=0.5, invert=True)
    image.save(str(unicodeToInt[unicode]) + '_neuro.png')
    matrix_form = np.array(image)
    weighted_predictions = np.ndarray.flatten(neural_network.run(matrix_form))
    most_possible = np.argmax(weighted_predictions)
    return most_possible

def predict_tesseract(unicode, fontFile, fontSize=0.5):
    image = uni_2_png_stream(int(unicode[3:], 16), fontFile, img_size=1024, font_size=fontSize)
    image.save('logs/' + now + '/' + str(unicode) + '.png')
    text = pytesseract.image_to_string(image, lang="eng", config="--psm 10 outputbase digits -c tessedit_char_whitelist=0123456789")
    return text

def predict_tesseract_definite(unicode, fontFile):
    result, size = '', 1
    while not result and size >= 0:
        result = predict_tesseract(x, filename, fontSize=size)
        size -= 0.01
    return result

In [9]:
# Map contours to numbers - the prediction phase may be very slow
filename = 'temp/fonts.woff'
f = TTFont(filename)
hexToInt = {}
for x in f.getGlyphNames()[1:-1]:
    predict = predict_tesseract_definite(x, filename)
    hexToInt[uniToHex(x)] = int(predict)

hexToInt

2 extra bytes in post.stringData array


{'&#xe7cf': 4,
 '&#xe849': 7,
 '&#xe8ab': 3,
 '&#xe8fb': 1,
 '&#xebe5': 8,
 '&#xf026': 9,
 '&#xf044': 0,
 '&#xf3a9': 2,
 '&#xf3d4': 5,
 '&#xf7ef': 6}

## Parsing the data into pandas dataframe

In [10]:
import pandas as pd
df = pd.DataFrame.from_records(movieList)

In [11]:
unitLookup = {'百': 100, '千': 1000, '万': 10000, '亿': 1*10**8}

#converts the weird character to a float
def convertToFloat(string):
    spCharLst = string.split(';')
    result = ''
    for i in spCharLst:
        if len(i) > 7: #has a dot in front
            result += '.' + str(hexToInt[i[1:]])
        elif len(i) == 7: #in case of bad parsing
            result += str(hexToInt[i])
    return float(result)

#helper function for converting the entire block to a single int
def convertDictToInt(dictionary):
    return int(convertToFloat(dictionary['num']) * unitLookup[dictionary['unit']])


In [12]:
df['boxSplitUnit'] = df['boxSplitUnit'].apply(convertDictToInt)
df['splitBoxSplitUnit'] = df['splitBoxSplitUnit'].apply(convertDictToInt)
df['movieInfo'] = df['movieInfo'].apply(lambda x : x['movieName'])
df

Unnamed: 0,avgSeatView,avgShowView,boxRate,boxSplitUnit,movieInfo,showCount,showCountRate,splitBoxRate,splitBoxSplitUnit,sumBoxDesc,sumSplitBoxDesc
0,1.1%,2,25.9%,5889900,长津湖之水门桥,71262,18.4%,26.8%,5611900,38.02亿,35.08亿
1,0.7%,1,12.1%,2754500,这个杀手不太冷静,67213,17.4%,12.1%,2531300,24.31亿,22.23亿
2,1.1%,2,11.6%,2639600,奇迹·笨小孩,49181,12.7%,11.5%,2406100,12.60亿,11.52亿
3,2.7%,3,11.0%,2511700,花束般的恋爱,24128,6.2%,10.6%,2230700,2269.6万,1999.6万
4,2.1%,3,9.7%,2213400,熊出没·重返地球,24779,6.4%,9.7%,2038300,9.08亿,8.33亿
5,2.6%,3,7.4%,1686399,我们的冬奥,16938,4.3%,7.3%,1530900,4190.3万,3840.9万
6,0.8%,1,4.1%,948300,狙击手,27165,7.0%,4.1%,864800,5.38亿,4.94亿
7,1.2%,2,3.9%,892099,尼罗河上的惨案,11997,3.1%,3.8%,806300,5038.7万,4539.6万
8,0.6%,1,3.4%,789100,纽约的一个雨天,31377,8.1%,3.4%,710699,270.9万,244.1万
9,0.5%,1,2.9%,664500,我心飞扬,27986,7.2%,2.9%,605900,349.4万,316.4万


In [13]:
#大盘
body0['movieList']['nationBoxInfo']

{'nationBoxSplitUnit': {'num': '&#xf3a9;&#xf3a9;&#xf7ef;&#xf026;.&#xe8fb;',
  'unit': '万'},
 'nationSplitBoxSplitUnit': {'num': '&#xf3a9;&#xf044;&#xebe5;&#xe849;.&#xf3d4;',
  'unit': '万'},
 'showCountDesc': '38.5万',
 'title': '实时大盘',
 'viewCountDesc': '55.4万'}

In [14]:
# Comparing with the screenshot earlier
from IPython.display import Image as displayImage
displayImage(filename="logs/" + now + "/screenshot.png") 

FileNotFoundError: [Errno 2] No such file or directory: 'logs/25/02/2022 18:36:50/screenshot.png'