In [101]:
import re
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import simplejson as json

from fontTools.ttLib import TTFont
from PIL import ImageFont, Image, ImageDraw
import pytesseract
import cv2
import numpy as np
import digit_recognition as dr

In [None]:
res = requests.get("https://piaofang.maoyan.com/dashboard/movie")
res.encoding = 'utf-8'
soup = bs(res.text, 'html.parser')

In [None]:
browserOptions = Options()
#browserOptions.add_argument("--headless")

capa = DesiredCapabilities.CHROME
capa["pageLoadStrategy"] = "none"
capa["goog:loggingPrefs"] = {"performance": "ALL"}
driver = webdriver.Chrome(desired_capabilities=capa)
wait = WebDriverWait(driver, 20)

In [None]:
#create snapshot of the entire page to prevent it from constantly changing
driver.get("https://piaofang.maoyan.com/dashboard/movie")
test = None
while not test:
    try:
        test = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'moviename-td')))
    except:
        driver.refresh();
        
logs_raw = driver.get_log("performance")
logs = [json.loads(lr["message"])["message"] for lr in logs_raw]

def log_filter(log_):
    return (
        # is an actual response
        log_["method"] == "Network.responseReceived"
        # and json
        and "json" in log_["params"]["response"]["mimeType"]
    )

responses = []

for log in filter(log_filter, logs):
    request_id = log["params"]["requestId"]
    resp_url = log["params"]["response"]["url"]
    print(f"Caught {resp_url}")
    response = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id})
    responses.append(response)
    

    

In [None]:
body0 = json.loads(responses[0]['body'])
movieList = body0['movieList']['list']
date = body0['calendar']['today']
font_url = body0['fontStyle'].split('"')[-2]
font_url

In [None]:
# Get reference fonts from the file tree
from fontTools.ttLib import TTFont
headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) "
              "Chrome/66.0.3359.139 Safari/537.36 "
    }

woff_url = 'http:' + font_url
response_woff = requests.get(woff_url, headers=headers).content

print("Woff retrieval succuessful: " + str(len(response_woff) > 0))

with open('fonts.woff', 'wb') as f:
    f.write(response_woff)

# Already saved, DO NOT RUN THE FOLLOWING 
#with open('basefonts.woff', 'wb') as f:
    #f.write(response_woff)
    
#font1 = TTFont('basefonts.woff')
#font1.saveXML('basefonts.xml')

In [None]:
import pandas as pd
df = pd.DataFrame.from_records(movieList)

In [None]:
df['boxSplitUnit'][4]

In [None]:
# Now starts the tedius process of mapping the new fonts to the contours and then to the numbers
from xml.etree import ElementTree
from difflib import get_close_matches
from difflib import SequenceMatcher
from fontTools.misc.xmlWriter import XMLWriter
import re

# Create Bijection between numbers and unicode 
unicodeToInt = {"uniF581": 0, "uniF56B": 1, "uniE46D": 2, "uniE13D": 3, "uniEAA8": 4, 
                "uniE808": 5, "uniE1D4": 6, "uniE66A": 7, "uniE43F": 8, "uniECDC": 9}

def hexToUnicode(hexa):
    return 'uni' + hexa[3:].upper()

def uniToHex(uni):
    return "&#x" + uni[3:].lower()

def elemToStr(elem):
    return ElementTree.tostring(elem, encoding='utf8', method='xml')

def trimXML(xml):
    return ''.join(xml.split())

#If the regular equality doesn't work we then have to use fuzzy matching
xRg = re.compile(r'x=\"([0-9]+)\"')
yRg = re.compile(r'y=\"([0-9]+)\"')
def getCoFromXML(xml):
    stringToMatch = trimXML(xml)
    print(stringToMatch)
    x = xRg.match(stringToMatch)
    y = yRg.match(stringToMatch)
    return {"x": x, "y": y}
    
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()
    

## Getting digits from the font data

In [125]:
# Import data
train_images, train_labels, test_images, test_labels = dr.pre_processing()

# Build neural network (with two hidden layers of 200/100 nodes respectively)
neural_network = dr.NeuralNetwork([784,200,100,10], bias=True)
neural_network.train(train_images, train_labels, epochs=1)
neural_network.evaluate(test_images, test_labels)

Data has been downloaded
Data preprocessed successfully
Network created successfully
Training...
Epoch: 1 (of 1)
Network trained successfully
Accuracy: 94.51%
Recall for 0: 98.88%
Precision for 0: 95.85%
Recall for 1: 98.85%
Precision for 1: 97.06%
Recall for 2: 90.89%
Precision for 2: 96.70%
Recall for 3: 94.26%
Precision for 3: 92.43%
Recall for 4: 93.79%
Precision for 4: 93.60%
Recall for 5: 91.48%
Precision for 5: 95.33%
Recall for 6: 96.45%
Precision for 6: 94.87%
Recall for 7: 91.63%
Precision for 7: 97.82%
Recall for 8: 93.22%
Precision for 8: 93.80%
Recall for 9: 95.04%
Precision for 9: 88.14%


In [129]:
# train specifically for our basefonts
basefontimgs = []
basefontlabels = []
for i in range(10):
    with Image.open("training_images/" + str(i) + ".png") as im:
        basefontimgs.append(np.array(im))
    basefontlabels.append([0.01 if x != i else 0.09 for x in range(10)])
neural_network.train(basefontimgs, basefontlabels, epochs = 500)

Training...
Epoch: 1 (of 500)
Epoch: 2 (of 500)
Epoch: 3 (of 500)
Epoch: 4 (of 500)
Epoch: 5 (of 500)
Epoch: 6 (of 500)
Epoch: 7 (of 500)
Epoch: 8 (of 500)
Epoch: 9 (of 500)
Epoch: 10 (of 500)
Epoch: 11 (of 500)
Epoch: 12 (of 500)
Epoch: 13 (of 500)
Epoch: 14 (of 500)
Epoch: 15 (of 500)
Epoch: 16 (of 500)
Epoch: 17 (of 500)
Epoch: 18 (of 500)
Epoch: 19 (of 500)
Epoch: 20 (of 500)
Epoch: 21 (of 500)
Epoch: 22 (of 500)
Epoch: 23 (of 500)
Epoch: 24 (of 500)
Epoch: 25 (of 500)
Epoch: 26 (of 500)
Epoch: 27 (of 500)
Epoch: 28 (of 500)
Epoch: 29 (of 500)
Epoch: 30 (of 500)
Epoch: 31 (of 500)
Epoch: 32 (of 500)
Epoch: 33 (of 500)
Epoch: 34 (of 500)
Epoch: 35 (of 500)
Epoch: 36 (of 500)
Epoch: 37 (of 500)
Epoch: 38 (of 500)
Epoch: 39 (of 500)
Epoch: 40 (of 500)
Epoch: 41 (of 500)
Epoch: 42 (of 500)
Epoch: 43 (of 500)
Epoch: 44 (of 500)
Epoch: 45 (of 500)
Epoch: 46 (of 500)
Epoch: 47 (of 500)
Epoch: 48 (of 500)
Epoch: 49 (of 500)
Epoch: 50 (of 500)
Epoch: 51 (of 500)
Epoch: 52 (of 500)
Epoch: 53

Epoch: 423 (of 500)
Epoch: 424 (of 500)
Epoch: 425 (of 500)
Epoch: 426 (of 500)
Epoch: 427 (of 500)
Epoch: 428 (of 500)
Epoch: 429 (of 500)
Epoch: 430 (of 500)
Epoch: 431 (of 500)
Epoch: 432 (of 500)
Epoch: 433 (of 500)
Epoch: 434 (of 500)
Epoch: 435 (of 500)
Epoch: 436 (of 500)
Epoch: 437 (of 500)
Epoch: 438 (of 500)
Epoch: 439 (of 500)
Epoch: 440 (of 500)
Epoch: 441 (of 500)
Epoch: 442 (of 500)
Epoch: 443 (of 500)
Epoch: 444 (of 500)
Epoch: 445 (of 500)
Epoch: 446 (of 500)
Epoch: 447 (of 500)
Epoch: 448 (of 500)
Epoch: 449 (of 500)
Epoch: 450 (of 500)
Epoch: 451 (of 500)
Epoch: 452 (of 500)
Epoch: 453 (of 500)
Epoch: 454 (of 500)
Epoch: 455 (of 500)
Epoch: 456 (of 500)
Epoch: 457 (of 500)
Epoch: 458 (of 500)
Epoch: 459 (of 500)
Epoch: 460 (of 500)
Epoch: 461 (of 500)
Epoch: 462 (of 500)
Epoch: 463 (of 500)
Epoch: 464 (of 500)
Epoch: 465 (of 500)
Epoch: 466 (of 500)
Epoch: 467 (of 500)
Epoch: 468 (of 500)
Epoch: 469 (of 500)
Epoch: 470 (of 500)
Epoch: 471 (of 500)
Epoch: 472 (of 500)


In [130]:
basefontlabels

[[0.09, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
 [0.01, 0.09, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
 [0.01, 0.01, 0.09, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
 [0.01, 0.01, 0.01, 0.09, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
 [0.01, 0.01, 0.01, 0.01, 0.09, 0.01, 0.01, 0.01, 0.01, 0.01],
 [0.01, 0.01, 0.01, 0.01, 0.01, 0.09, 0.01, 0.01, 0.01, 0.01],
 [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.09, 0.01, 0.01, 0.01],
 [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.09, 0.01, 0.01],
 [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.09, 0.01],
 [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.09]]

In [131]:
def uni_2_png_stream(txt: str, font: str, img_size=28):
    """将字形转化为图片流

    Args:
        txt (str): 图片标志信息, 从 TTFont.getBestCmap() 获得
        font (str): 字体文件名
        img_size (int, optional): [description]. Defaults to 512.

    Returns:
        一个 pillow 图片对象.
    """
    img = Image.new('1', (img_size, img_size), 255)  # (1)
    draw = ImageDraw.Draw(img)   #  (2)
    font = ImageFont.truetype(font, int(img_size * 0.7))   #  (3)

    txt = chr(txt)
    x, y = draw.textsize(txt, font=font)  #  (4)
    draw.text(((img_size - x) // 2, (img_size - y) // 2), txt, font=font, fill=0)   # (5)
    return img   # 如果需要将图片存储到本地, 那么直接调用 img 实例的 save( path:str ) 方法即可


filename = 'basefonts.woff'  # 字体文件的路径信息
f = TTFont(filename)
numbers = []
for i,x in enumerate(f.getBestCmap()):
    # 每个pil对应字体文件中的一个字形
    image = uni_2_png_stream(x, filename)
    matrix_form = np.array(image)
    weighted_predictions = np.ndarray.flatten(neural_network.run(matrix_form))
    most_possible = np.argmax(weighted_predictions)
    print(most_possible)


2 extra bytes in post.stringData array


8
3
6
8
2
7
5
4
9
1
0


In [44]:
# Map contours to numbers
baseFonts = TTFont('basefonts.woff')
# The follwing Code is for fuzzy matching, which we fortunately don't need to implement.
# conStrToInt = {}
# print(len(unicodeToInt))
# for j in unicodeToInt:
#     currGlyph = baseFonts['glyf'][j]
#     writer = XMLWriter('curr_glyph.xml')
#     currGlyph.toXML(writer, baseFonts)
#     with open('curr_glyph.xml', 'r') as f:
#         currXML = f.read()
#     conStrToInt[trimXML(currXML)] = unicodeToInt[j]
currentFonts = TTFont('fonts.woff')
hexToInt = {}

for j in currentFonts.getGlyphNames()[1:-1]:
    currGlyph = currentFonts['glyf'][j]
    writer = XMLWriter('curr_glyph.xml')
    currGlyph.toXML(writer, currentFonts)
    with open('curr_glyph.xml', 'r') as f:
        currXML = f.read()
        
    for i in unicodeToInt:
        baseGlyph = baseFonts['glyf'][i]
        if baseGlyph == currGlyph:
            hexToInt[uniToHex(j)] = unicodeToInt[i]
            break

hexToInt

2 extra bytes in post.stringData array
2 extra bytes in post.stringData array


{}

In [None]:
unitLookup = {'百': 100, '千': 1000, '万': 10000, '亿': 1*10**8}

#converts the weird character to a float
def convertToFloat(string):
    spCharLst = string.split(';')
    result = ''
    for i in spCharLst:
        if len(i) > 7: #has a dot in front
            result += '.' + str(hexToInt[i[1:]])
        elif len(i) == 7: #in case of bad parsing
            result += str(hexToInt[i])
    return float(result)

#helper function for converting the entire block to a single int
def convertDictToInt(dictionary):
    return int(convertToFloat(dictionary['num']) * unitLookup[dictionary['unit']])


In [None]:
df['boxSplitUnit'] = df['boxSplitUnit'].apply(convertDictToInt)
df['splitBoxSplitUnit'] = df['splitBoxSplitUnit'].apply(convertDictToInt)
df['movieInfo'] = df['movieInfo'].apply(lambda x : x['movieName'])
df

In [None]:
#大盘
body0['movieList']['nationBoxInfo']