In [61]:
import re
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import simplejson as json



In [62]:
res = requests.get("https://piaofang.maoyan.com/dashboard/movie")
res.encoding = 'utf-8'
soup = bs(res.text, 'html.parser')

In [63]:
browserOptions = Options()
#browserOptions.add_argument("--headless")

capa = DesiredCapabilities.CHROME
capa["pageLoadStrategy"] = "none"
capa["goog:loggingPrefs"] = {"performance": "ALL"}
driver = webdriver.Chrome(desired_capabilities=capa)
wait = WebDriverWait(driver, 20)

In [64]:
#create snapshot of the entire page to prevent it from constantly changing
driver.get("https://piaofang.maoyan.com/dashboard/movie")
test = None
while not test:
    try:
        test = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'moviename-td')))
    except:
        driver.refresh();
        
logs_raw = driver.get_log("performance")
logs = [json.loads(lr["message"])["message"] for lr in logs_raw]

def log_filter(log_):
    return (
        # is an actual response
        log_["method"] == "Network.responseReceived"
        # and json
        and "json" in log_["params"]["response"]["mimeType"]
    )

responses = []

for log in filter(log_filter, logs):
    request_id = log["params"]["requestId"]
    resp_url = log["params"]["response"]["url"]
    print(f"Caught {resp_url}")
    response = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id})
    responses.append(response)
    

    

Caught https://piaofang.maoyan.com/dashboard-ajax/movie?orderType=0&uuid=d6742ad8-ec93-4d1a-9006-1ad18986c8c1&timeStamp=1645836100803&User-Agent=TW96aWxsYS81LjAgKE1hY2ludG9zaDsgSW50ZWwgTWFjIE9TIFggMTBfMTVfNykgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzk3LjAuNDY5Mi45OSBTYWZhcmkvNTM3LjM2&index=397&channelId=40009&sVersion=2&signKey=771b44a6cf576dbdbde359a7955fd467


In [65]:
body0 = json.loads(responses[0]['body'])
movieList = body0['movieList']['list']
date = body0['calendar']['today']
font_url = body0['fontStyle'].split('"')[-2]
font_url

'//s3plus.meituan.net/v1/mss_73a511b8f91f43d0bdae92584ea6330b/font/6458c11c.woff'

In [66]:
# Get reference fonts from the file tree
from fontTools.ttLib import TTFont
headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) "
              "Chrome/66.0.3359.139 Safari/537.36 "
    }

woff_url = 'http:' + font_url
response_woff = requests.get(woff_url, headers=headers).content

print("Woff retrieval succuessful: " + str(len(response_woff) > 0))

with open('temp/fonts.woff', 'wb') as f:
    f.write(response_woff)

# Already saved, DO NOT RUN THE FOLLOWING 
#with open('basefonts.woff', 'wb') as f:
    #f.write(response_woff)
    
#font1 = TTFont('basefonts.woff')
#font1.saveXML('basefonts.xml')

Woff retrieval succuessful: True


In [67]:
import pandas as pd
df = pd.DataFrame.from_records(movieList)

In [68]:
df['boxSplitUnit'][4]

{'num': '&#xe252;&#xe0db;&#xed49;.&#xe252;&#xe75c;', 'unit': '万'}

In [69]:
# Now starts the tedius process of mapping the new fonts to the contours and then to the numbers
from xml.etree import ElementTree
from difflib import get_close_matches
from difflib import SequenceMatcher
from fontTools.misc.xmlWriter import XMLWriter
import re

# Create Bijection between numbers and unicode 
unicodeToInt = {"uniF581": 0, "uniF56B": 1, "uniE46D": 2, "uniE13D": 3, "uniEAA8": 4, 
                "uniE808": 5, "uniE1D4": 6, "uniE66A": 7, "uniE43F": 8, "uniECDC": 9}

def hexToUnicode(hexa):
    return 'uni' + hexa[3:].upper()

def uniToHex(uni):
    return "&#x" + uni[3:].lower()

def base10toHex(num):
    return "&#x" + str(int(num, 16))

def elemToStr(elem):
    return ElementTree.tostring(elem, encoding='utf8', method='xml')

def trimXML(xml):
    return ''.join(xml.split())

#If the regular equality doesn't work we then have to use fuzzy matching
xRg = re.compile(r'x=\"([0-9]+)\"')
yRg = re.compile(r'y=\"([0-9]+)\"')
def getCoFromXML(xml):
    stringToMatch = trimXML(xml)
    print(stringToMatch)
    x = xRg.match(stringToMatch)
    y = yRg.match(stringToMatch)
    return {"x": x, "y": y}
    
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()
    

## Getting digits from the font data
with neural networks

In [33]:
from fontTools.ttLib import TTFont
from PIL import ImageFont, Image, ImageDraw, ImageOps
import pytesseract
import cv2
import numpy as np
import digit_recognition as dr
from wand.image import Image as wImage
import random

In [3]:
# Creating training sets and labels

def distortImage(img):
    scale = random.uniform(0.6, 1.4)
    img.distort('scale_rotate_translate', (scale, 0,))
    img.morphology(method='erode', kernel='octagon:1', iterations=1)
    img.save(filename='test.png')
    return img

# Reference Images
basefontimgs = []
basefontlabels = []
for i in range(10):
    with Image.open("training_images/" + str(i) + ".png") as im:
        basefontimgs.append(np.array(im))
        scale = 0.5
        
    basefontlabels.append([0.01 if x != i else 0.09 for x in range(10)])


with wImage(filename=("training_images/" + str(4) + ".png")) as img:
    distortImage(img)


In [23]:
# Import data
train_images, train_labels, test_images, test_labels = dr.pre_processing()

Data has been downloaded
Data preprocessed successfully


In [32]:
Image.fromarray(np.array(train_images[0]).astype(np.uint8)).save("test.png")

In [38]:
# Build neural network (with two hidden layers of 200/100 nodes respectively)
neural_network = dr.NeuralNetwork([784,200,100,10], bias=True)
neural_network.train(train_images, train_labels, epochs=10)
neural_network.evaluate(test_images, test_labels)

Network created successfully
Training...
Epoch: 1 (of 10)
Epoch: 2 (of 10)
Epoch: 3 (of 10)
Epoch: 4 (of 10)
Epoch: 5 (of 10)
Epoch: 6 (of 10)
Epoch: 7 (of 10)
Epoch: 8 (of 10)
Epoch: 9 (of 10)
Epoch: 10 (of 10)
Network trained successfully
Accuracy: 97.23%
Recall for 0: 98.88%
Precision for 0: 97.29%
Recall for 1: 99.21%
Precision for 1: 98.60%
Recall for 2: 97.38%
Precision for 2: 97.38%
Recall for 3: 96.83%
Precision for 3: 96.93%
Recall for 4: 97.15%
Precision for 4: 96.46%
Recall for 5: 97.20%
Precision for 5: 96.23%
Recall for 6: 96.76%
Precision for 6: 98.51%
Recall for 7: 96.11%
Precision for 7: 97.82%
Recall for 8: 96.71%
Precision for 8: 96.91%
Recall for 9: 95.84%
Precision for 9: 95.93%


In [None]:
# train specifically for our basefonts
basefontimgs = []
basefontlabels = []
for i in range(10):
    with Image.open("training_images/" + str(i) + ".png") as im:
        basefontimgs.append(np.array(im))
    basefontlabels.append([0.01 if x != i else 0.09 for x in range(10)])
neural_network.train(basefontimgs, basefontlabels, epochs = 100)

In [None]:
basefontlabels

In [86]:
def uni_2_png_stream(txt: str, font: str, img_size=512, font_size=0.7, invert=False):
    img = Image.new('1', (img_size, img_size), 255) 
    draw = ImageDraw.Draw(img)
    font = ImageFont.truetype(font, int(img_size * font_size))
    
    txt = chr(txt)
    x, y = draw.textsize(txt, font=font) 
    draw.text(((img_size - x) // 2, (img_size - y) // 2), txt, font=font, fill=0)
    if invert:
        img = img.convert('L')
        img = ImageOps.invert(img)
        img = img.convert('1')
    #img.save(txt + '.png')
    return img 

def predict_neural(unicode, fontFile):
    image = uni_2_png_stream(int(unicode[3:], 16), fontFile, img_size=28, font_size=0.5, invert=True)
    image.save(str(unicodeToInt[unicode]) + '_neuro.png')
    matrix_form = np.array(image)
    weighted_predictions = np.ndarray.flatten(neural_network.run(matrix_form))
    most_possible = np.argmax(weighted_predictions)
    return most_possible

def predict_tesseract(unicode, fontFile, fontSize=0.5):
    image = uni_2_png_stream(int(unicode[3:], 16), fontFile, img_size=1024, font_size=fontSize)
    image.save(str(unicode) + '.png')
    text = pytesseract.image_to_string(image, lang="eng", config="--psm 10 outputbase digits -c tessedit_char_whitelist=0123456789")
    return text

def predict_tesseract_definite(unicode, fontFile):
    result, size = '', 1
    while not result and size >= 0:
        result = predict_tesseract(x, filename, fontSize=size)
        size -= 0.01
    return result

In [None]:
# Testing different methods
filename = 'persistence/basefonts.woff'  # 字体文件的路径信息
f = TTFont(filename)
for x in f.getGlyphNames()[1:-1]:
    print(unicodeToInt[x])
    print(predict_neural(x, filename))
    print(predict_tesseract_definite(x, filename))

In [90]:
# Map contours to numbers
filename = 'temp/fonts.woff'
f = TTFont(cfFileName)
hexToInt = {}

for x in f.getGlyphNames()[1:-1]:
    predict = predict_tesseract_definite(x, filename)
    hexToInt[uniToHex(x)] = int(predict)

hexToInt

2 extra bytes in post.stringData array


{'&#xe0db': 3,
 '&#xe252': 1,
 '&#xe3d4': 2,
 '&#xe6ee': 6,
 '&#xe75c': 0,
 '&#xe9b5': 7,
 '&#xed16': 4,
 '&#xed49': 9,
 '&#xee2f': 8,
 '&#xf089': 5}

In [91]:
unitLookup = {'百': 100, '千': 1000, '万': 10000, '亿': 1*10**8}

#converts the weird character to a float
def convertToFloat(string):
    spCharLst = string.split(';')
    result = ''
    for i in spCharLst:
        if len(i) > 7: #has a dot in front
            result += '.' + str(hexToInt[i[1:]])
        elif len(i) == 7: #in case of bad parsing
            result += str(hexToInt[i])
    return float(result)

#helper function for converting the entire block to a single int
def convertDictToInt(dictionary):
    return int(convertToFloat(dictionary['num']) * unitLookup[dictionary['unit']])


In [92]:
df['boxSplitUnit'] = df['boxSplitUnit'].apply(convertDictToInt)
df['splitBoxSplitUnit'] = df['splitBoxSplitUnit'].apply(convertDictToInt)
df['movieInfo'] = df['movieInfo'].apply(lambda x : x['movieName'])
df

Unnamed: 0,avgSeatView,avgShowView,boxRate,boxSplitUnit,movieInfo,showCount,showCountRate,splitBoxRate,splitBoxSplitUnit,sumBoxDesc,sumSplitBoxDesc
0,0.9%,2,26.0%,4615400,长津湖之水门桥,71211,18.5%,26.7%,4349900,38.01亿,35.07亿
1,2.4%,3,12.7%,2253500,花束般的恋爱,23508,6.1%,12.3%,2001000,2243.8万,1976.7万
2,0.9%,2,11.6%,2073700,奇迹·笨小孩,49053,12.7%,11.6%,1885600,12.59亿,11.51亿
3,0.5%,1,11.5%,2039400,这个杀手不太冷静,67211,17.4%,11.5%,1867600,24.31亿,22.22亿
4,1.3%,2,7.8%,1391000,熊出没·重返地球,24620,6.3%,7.8%,1278000,9.07亿,8.32亿
5,1.7%,2,6.4%,1145300,我们的冬奥,16785,4.3%,6.4%,1039500,4136.2万,3791.8万
6,1.0%,2,4.3%,771200,尼罗河上的惨案,12036,3.1%,4.2%,697000,5026.6万,4528.7万
7,0.6%,1,4.0%,721700,狙击手,26982,7.0%,4.0%,658199,5.38亿,4.94亿
8,0.5%,1,3.9%,693100,纽约的一个雨天,31601,8.2%,3.8%,623300,261.3万,235.4万
9,0.4%,1,3.5%,623300,我心飞扬,28654,7.4%,3.5%,568400,345.3万,312.6万


In [93]:
#大盘
body0['movieList']['nationBoxInfo']

{'nationBoxSplitUnit': {'num': '&#xe252;&#xe9b5;&#xe9b5;&#xe3d4;.&#xe6ee;',
  'unit': '万'},
 'nationSplitBoxSplitUnit': {'num': '&#xe252;&#xe6ee;&#xe3d4;&#xe0db;.&#xe3d4;',
  'unit': '万'},
 'showCountDesc': '38.5万',
 'title': '实时大盘',
 'viewCountDesc': '43.1万'}