In [1]:
import os
import re
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.proxy import Proxy, ProxyType
import simplejson as json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')



# Stage 1: Maoyan

## Get website response and font data

In [2]:
browserOptions = Options()
browserOptions.add_argument('--ignore-ssl-errors=yes')
browserOptions.add_argument('--ignore-certificate-errors')
#browserOptions.add_argument("--headless")

res = json.loads(requests.get('https://www.proxyscan.io/api/proxy?type=https').text)
prox = Proxy()
prox.proxy_type = ProxyType.MANUAL
prox.http_proxy = str(res[0]['Ip']) + ":" + str(res[0]['Port'])

capa = DesiredCapabilities.CHROME
capa["pageLoadStrategy"] = "none"
capa["goog:loggingPrefs"] = {"performance": "ALL"}
prox.add_to_capabilities(capa)
driver = webdriver.Chrome(desired_capabilities=capa, chrome_options=browserOptions)
wait = WebDriverWait(driver, 20)

#create snapshot of the entire page to prevent it from constantly changing
driver.get("https://piaofang.maoyan.com/dashboard/movie")
test = None
while not test:
    try:
        test = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'moviename-td')))
    except:
        driver.refresh();

now = datetime.now().strftime("%d-%m-%Y_%H:%M:%S") # get exact datetime at the time of scrape
os.mkdir("logs/" + now)

driver.get_screenshot_as_file("logs/" + now + "/screenshot.png") # save screenshot to sanity check later

logs_raw = driver.get_log("performance")
logs = [json.loads(lr["message"])["message"] for lr in logs_raw]

def log_filter(log_):
    return (
        # is an actual response
        log_["method"] == "Network.responseReceived"
        # and json
        and "json" in log_["params"]["response"]["mimeType"]
    )

responses = []

for log in filter(log_filter, logs):
    request_id = log["params"]["requestId"]
    resp_url = log["params"]["response"]["url"]
    print(f"Caught {resp_url}")
    response = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id})
    responses.append(response)

Caught https://piaofang.maoyan.com/dashboard-ajax/movie?orderType=0&uuid=aee59939-cc8e-431f-a75c-834e66c97d2b&timeStamp=1647049713134&User-Agent=TW96aWxsYS81LjAgKE1hY2ludG9zaDsgSW50ZWwgTWFjIE9TIFggMTBfMTVfNykgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzk3LjAuNDY5Mi45OSBTYWZhcmkvNTM3LjM2&index=947&channelId=40009&sVersion=2&signKey=548496fc3e227393218eefd9e6cd9cb8


In [3]:
# Get this instance's font file from backend server
body0 = json.loads(responses[0]['body'])
movieList = body0['movieList']['list']
date = body0['calendar']['today']
font_url = body0['fontStyle'].split('"')[-2]

# Get reference fonts from the file tree
from fontTools.ttLib import TTFont
headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) "
              "Chrome/66.0.3359.139 Safari/537.36 "
    }

woff_url = 'http:' + font_url
response_woff = requests.get(woff_url, headers=headers).content

print("Woff retrieval succuessful: " + str(len(response_woff) > 0))

with open('temp/fonts.woff', 'wb') as f:
    f.write(response_woff)

Woff retrieval succuessful: True


In [4]:
driver.close() # we don't need the driver anymore from this point forward

## Getting digits from the font data using pytesseract

In [5]:
from fontTools.ttLib import TTFont
from PIL import ImageFont, Image, ImageDraw, ImageOps
import pytesseract
import cv2
import numpy as np
import random

def uniToHex(uni):
    return "&#x" + uni[3:].lower()

def uni_2_png_stream(txt: str, font: str, img_size=512, font_size=0.7, invert=False):
    img = Image.new('1', (img_size, img_size), 255) 
    draw = ImageDraw.Draw(img)
    font = ImageFont.truetype(font, int(img_size * font_size))
    
    txt = chr(txt)
    x, y = draw.textsize(txt, font=font) 
    draw.text(((img_size - x) // 2, (img_size - y) // 2), txt, font=font, fill=0)
    if invert:
        img = img.convert('L')
        img = ImageOps.invert(img)
        img = img.convert('1')
    #img.save(txt + '.png')
    return img 

def predict_neural(unicode, fontFile):
    image = uni_2_png_stream(int(unicode[3:], 16), fontFile, img_size=28, font_size=0.5, invert=True)
    image.save(str(unicodeToInt[unicode]) + '_neuro.png')
    matrix_form = np.array(image)
    weighted_predictions = np.ndarray.flatten(neural_network.run(matrix_form))
    most_possible = np.argmax(weighted_predictions)
    return most_possible

def predict_tesseract(unicode, fontFile, fontSize=0.5):
    image = uni_2_png_stream(int(unicode[3:], 16), fontFile, img_size=1024, font_size=fontSize)
    image.save('logs/' + str(now) + '/' + str(unicode) + '.png')
    text = pytesseract.image_to_string(image, lang="eng", config="--psm 10 outputbase digits -c tessedit_char_whitelist=0123456789")
    return text

def predict_tesseract_definite(unicode, fontFile):
    result, size = '', 1
    while not result and size >= 0:
        result = predict_tesseract(x, filename, fontSize=size)
        size -= 0.01
    return result

In [6]:
# Map contours to numbers - the prediction phase may be very slow
filename = 'temp/fonts.woff'
f = TTFont(filename)
hexToInt = {}
for x in f.getGlyphNames()[1:-1]:
    predict = predict_tesseract_definite(x, filename)
    hexToInt[uniToHex(x)] = int(predict)

hexToInt

2 extra bytes in post.stringData array


{'&#xe06a': 3,
 '&#xe15e': 6,
 '&#xe2eb': 7,
 '&#xe63e': 0,
 '&#xe724': 2,
 '&#xe88e': 8,
 '&#xea78': 1,
 '&#xec70': 5,
 '&#xf5fb': 9,
 '&#xf679': 4}

## Parsing the data into pandas dataframe

In [7]:
import pandas as pd
df = pd.DataFrame.from_records(movieList)

In [8]:
unitLookup = {'百': 100, '千': 1000, '万': 10000, '亿': 1*10**8}

#converts the weird character to a float
def convertToFloat(string):
    spCharLst = string.split(';')
    result = ''
    for i in spCharLst:
        if len(i) > 7: #has a dot in front
            result += '.' + str(hexToInt[i[1:]])
        elif len(i) == 7: #in case of bad parsing
            result += str(hexToInt[i])
    return float(result)

#helper function for converting the entire block to a single int
def convertDictToInt(dictionary):
    return int(convertToFloat(dictionary['num']) * unitLookup[dictionary['unit']])


In [9]:
df['boxSplitUnit'] = df['boxSplitUnit'].apply(convertDictToInt)
df['splitBoxSplitUnit'] = df['splitBoxSplitUnit'].apply(convertDictToInt)
df['movieInfo'] = df['movieInfo'].apply(lambda x : x['movieName'])
df.to_csv("logs/" + now + "/maoyan_data.csv", encoding='utf_8_sig')
df

Unnamed: 0,avgSeatView,avgShowView,boxRate,boxSplitUnit,movieInfo,showCount,showCountRate,splitBoxRate,splitBoxSplitUnit,sumBoxDesc,sumSplitBoxDesc
0,12.7%,27,26.5%,2610000,神秘海域,2558,0.7%,26.2%,2351600,263.1万,237.3万
1,0.6%,2,21.7%,2140400,长津湖之水门桥,50627,15.8%,22.4%,2009400,39.80亿,36.76亿
2,0.6%,1,7.8%,771300,可不可以你也刚好喜欢我,35859,11.1%,7.6%,686100,542.2万,481.6万
3,0.2%,1,7.4%,728600,这个杀手不太冷静,51246,16.0%,7.3%,661500,25.70亿,23.51亿
4,1.0%,2,6.7%,659700,花束般的恋爱,17440,5.4%,6.4%,578200,7449.6万,6561.4万
5,0.4%,1,6.5%,644500,奇迹·笨小孩,36006,11.2%,6.5%,584300,13.49亿,12.34亿
6,0.7%,1,5.6%,555800,熊出没·重返地球,18640,5.8%,5.6%,505500,9.45亿,8.67亿
7,0.2%,1,4.7%,468600,如果有一天我将会离开你,37426,11.6%,4.7%,427500,189.9万,174.2万
8,0.8%,1,2.7%,268600,我们的冬奥,8510,2.6%,2.7%,243900,5953.2万,5461.0万
9,0.2%,1,2.1%,215500,狙击手,21309,6.6%,2.2%,197700,5.90亿,5.41亿


In [10]:
#大盘
dapan = pd.DataFrame.from_records(body0['movieList']['nationBoxInfo'])
dapan['nationBoxSplitUnit'][0] = convertDictToInt(body0['movieList']['nationBoxInfo']['nationBoxSplitUnit'])
dapan['nationSplitBoxSplitUnit'][0] = convertDictToInt(body0['movieList']['nationBoxInfo']['nationSplitBoxSplitUnit'])
dapan.drop(labels=['unit'], axis=0, inplace=True)
dapan.to_csv("logs/" + now + "/dapan.csv", encoding='utf_8_sig')

In [11]:
# Comparing with the screenshot earlier
#from IPython.display import Image as displayImage
#displayImage(filename="logs/" + now + "/screenshot.png") 

# Stage 2: Douban
## Search for movie names and get page sources

In [12]:
browserOptions = Options()
#browserOptions.add_argument("--headless")

capa = DesiredCapabilities.CHROME
capa["pageLoadStrategy"] = "none"
capa["goog:loggingPrefs"] = {"performance": "ALL"}
driver = webdriver.Chrome(desired_capabilities=capa, chrome_options=browserOptions)
wait = WebDriverWait(driver, 20)

driver.get("https://movie.douban.com/")

test = None
while not test:
    try:
        test = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'nav')))
    except:
        pass
        #driver.refresh();

jsonLst = []    
soupLst = []
percent1star, percent2star, percent3star, percent4star, percent5star = [], [], [], [], []
betterThan = []
shortReview, reviewRating, helpful, totalReviews = [], [], [], []
imdb = []
playSource = []

def search(name):
    inputElement = driver.find_element_by_id('inp-query')
    inputElement.send_keys(name)
    inputElement.send_keys(Keys.ENTER)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'cover-link')))
    firstMovie = driver.find_elements_by_class_name('cover-link')[0]
    firstMovie.click()
    res = requests.get(driver.current_url, headers=headers)
    soup = bs(res.text, 'lxml')
    soupLst.append(soup)
    
    if len(soup.select('span[class^="rating_per"]')) == 5:
        for i, x in enumerate(soup.select('span[class^="rating_per"]')):
            try:
                globals()['percent' + str(i + 1) + 'star'].append(x.text)
            except:
                globals()['percent' + str(i + 1) + 'star'].append(None)
    else: 
        for i in range(1, 6):
            globals()['percent' + str(i) + 'star'].append(None)
            
            
    try:
        playSource.append([x.text.strip() for x in soup.select('a[class^="playBtn"]')])
    except:
        playSource.append([])
            
    try:
        betterThan.append([x.text for x in soup.select('a[href^="/typerank?type_name="]')])
    except: 
        betterThan.append([])
        
    try: 
        shortReview.append([x.text for x in soup.select('span[class^="short"]')])
    except: 
        shortReview.append([])
        
    try:
        reviewRating.append([x.text for x in soup.select('span[class^="votes vote-count"]')])
    except: 
        reviewRating.append([])
        
    try:
        helpful.append([x['class'][0][-2:-1] for x in soup.select('span[class$="0 rating"]')])
    except:
        helpful.append([])
    
    try:
        totalReviews.append(soup.select_one('a[href$="comments?status=P"]').text.strip())
    except:
        totalReviews.append(None)
    
    try:
        imdb.append(re.search('IMDb:</span>(.*)<br/>', str(soup.select_one('div[id^="info"]')), re.IGNORECASE).group(1).strip())
    except:
        imdb.append(None)
        
    sj = json.loads(soup.select_one('script[type^="application/ld+json"]').text, strict=False)
    jsonLst.append(sj)

for i in df['movieInfo']:
    search(i)
    
df_douban = pd.DataFrame.from_records(jsonLst)
df_douban.to_csv("logs/" + now + "/douban_data_raw.csv", encoding='utf_8_sig')

## Parse collected data and put into combined dataframe

In [14]:
def parsePeopleLst(lst):
    result = []
    for i in lst: 
        result.append(i['name'])
    return result

def parseRatingLst(lst):
    return (lst['ratingValue'], lst['ratingCount'], lst['bestRating'], lst['worstRating'])

df_combined = df
df_combined['imdb'] = imdb
df_combined['duration'] = df_douban['duration']
df_combined['datePublished'] = df_douban['datePublished']
df_combined['genre'] = df_douban['genre']
df_combined['ratingValue'], df_combined['ratingCount'], df_combined['bestRating'], df_combined['worstRating'] = zip(*df_douban['aggregateRating'].apply(parseRatingLst))

for i in range(1, 6):
    df_combined['ratingPercentage' + str(i) + 'Star'] = globals()['percent' + str(i) + 'star']
    
df_combined['betterThan'] = betterThan
df_combined['shortReview'], df_combined['reviewRating'], df_combined['helpful'], df_combined['totalReviews'] = shortReview, reviewRating, helpful, totalReviews

df_combined['playSources'] = playSource

df_combined['director'] = df_douban['director'].apply(parsePeopleLst)
df_combined['author'] = df_douban['author'].apply(parsePeopleLst)
df_combined['actors'] = df_douban['actor'].apply(parsePeopleLst)
df_combined['description'] = df_douban['description']
df_combined['url'] = df_douban['url']
df_combined['doubanDataRaw'] = soupLst

df_combined.to_csv("logs/" + now + "/combined.csv", encoding='utf_8_sig')

In [15]:
df_combined

Unnamed: 0,avgSeatView,avgShowView,boxRate,boxSplitUnit,movieInfo,showCount,showCountRate,splitBoxRate,splitBoxSplitUnit,sumBoxDesc,...,reviewRating,helpful,totalReviews,playSources,director,author,actors,description,url,doubanDataRaw
0,12.7%,27,26.5%,2610000,神秘海域,2558,0.7%,26.2%,2351600,263.1万,...,"[106, 23, 7, 6, 11]","[2, 3, 5, 3, 3]",全部 887 条,[],[鲁本·弗雷斯彻 Ruben Fleischer],"[雷夫·贾金斯 Rafe Judkins, 阿特·马库姆 Art Marcum, 马特·霍洛...","[汤姆·赫兰德 Tom Holland, 马克·沃尔伯格 Mark Wahlberg, 索菲...",足智多谋的内森·德雷克（汤姆·赫兰德 饰）和经验丰富的寻宝者维克多·苏利文（马克·沃尔伯格 ...,/subject/3822687/,"<!DOCTYPE html> <html class=""ua-mac ua-webkit""..."
1,0.6%,2,21.7%,2140400,长津湖之水门桥,50627,15.8%,22.4%,2009400,39.80亿,...,"[6331, 3299, 2152, 2347, 830]","[4, 4, 4, 4, 1]",全部 86251 条,[],[徐克 Hark Tsui],"[兰晓龙 Xiaolong Lan, 黄建新 Jianxin Huang]","[吴京 Jing Wu, 易烊千玺 Jackson Yee, 朱亚文 Yawen Zhu, ...",电影以抗美援朝战争第二次战役中的长津湖战役为背景，讲述了在结束了新兴里和下碣隅里的战斗之后，...,/subject/35613853/,"<!DOCTYPE html> <html class=""ua-mac ua-webkit""..."
2,0.6%,1,7.8%,771300,可不可以你也刚好喜欢我,35859,11.1%,7.6%,686100,542.2万,...,"[24, 28, 6, 1, 3]","[2, 1, 2, 1, 4]",全部 1235 条,[],[简学彬 Shiue Bin Jian],[肆一 ],"[曹佑宁 Yu-ning Tsao, 陈妤 Yu Chen, 林映唯 Patricia Li...",影片讲述了暗中喜欢着青梅竹马李助豪的女生田筱湘，借由塔罗牌算出了告白的最佳时机，但没想到要告...,/subject/34858501/,"<!DOCTYPE html> <html class=""ua-mac ua-webkit""..."
3,0.2%,1,7.4%,728600,这个杀手不太冷静,51246,16.0%,7.3%,661500,25.70亿,...,"[2143, 1274, 1020, 1301, 692]","[3, 3, 4, 2, 3]",全部 140495 条,[],[邢文雄 Wenxiong Xing],[邢文雄 Wenxiong Xing],"[马丽 Li Ma, 魏翔 Xiang Wei, 陈明昊 Minghao Chen, 周大勇...",魏成功（魏翔 饰）非常热爱表演，然而其貌不扬的他往往只能够在电影里觅得一个跑龙套的角色，可即...,/subject/35505100/,"<!DOCTYPE html> <html class=""ua-mac ua-webkit""..."
4,1.0%,2,6.7%,659700,花束般的恋爱,17440,5.4%,6.4%,578200,7449.6万,...,"[13637, 9122, 6199, 5303, 3802]","[4, 4, 3, 4, 5]",全部 145784 条,[],[土井裕泰 Nobuhiro Doi],[坂元裕二 Yûji Sakamoto],"[菅田将晖 Masaki Suda, 有村架纯 Kasumi Arimura, 细田佳央太 ...",山音麦 (菅田将晖 饰) 和八谷绢 (有村架纯 饰) 因错过尾班车而相遇，在深夜咖啡馆聊起文...,/subject/34874432/,"<!DOCTYPE html> <html class=""ua-mac ua-webkit""..."
5,0.4%,1,6.5%,644500,奇迹·笨小孩,36006,11.2%,6.5%,584300,13.49亿,...,"[8999, 11418, 10592, 2217, 3844]","[3, 5, 5, 2, 4]",全部 139111 条,[],[文牧野 Muye Wen],"[周楚岑 Chucen Zhou, 修梦迪 Mengdi Xiu, 文牧野 Muye Wen...","[易烊千玺 Jackson Yee, 田雨 Yu Tian, 陈哈琳 Halin Chen,...",二十岁的景浩（易烊千玺 饰）独自带着年幼的妹妹来到深圳生活，兄妹俩生活温馨却拮据。为了妹妹高...,/subject/35312437/,"<!DOCTYPE html> <html class=""ua-mac ua-webkit""..."
6,0.7%,1,5.6%,555800,熊出没·重返地球,18640,5.8%,5.6%,505500,9.45亿,...,"[1084, 447, 189, 178, 49]","[3, 5, 3, 3, 2]",全部 9772 条,[],[林汇达 Huida Lin],"[万秦 Qin Wan, 徐芸 Yun Xu, 蒋琳 Lin Jiang]","[张秉君 Bingjun Zhang, 张伟 Wei Zhang, 谭笑 Xiao Tan,...",有点懒又有点馋的熊二虽然总是各种失误犯错，内心却一直梦想成为一位英雄，以此获得大家特别是哥哥...,/subject/35377026/,"<!DOCTYPE html> <html class=""ua-mac ua-webkit""..."
7,0.2%,1,4.7%,468600,如果有一天我将会离开你,37426,11.6%,4.7%,427500,189.9万,...,"[74, 40, 39, 37, 9]","[4, 3, 4, 4, 3]",全部 2652 条,[],[李亘 Gen Li],[李亘 Gen Li],"[齐溪 Xi Qi, 谢承泽 Chengze Xie, 牛超 Chao Niu, 邱天 Ti...",交换生李小李（谢承泽 饰）来到日本东京偏僻的“渊野边”留学一年。呆板木讷的他一心想要打工却屡...,/subject/35148918/,"<!DOCTYPE html> <html class=""ua-mac ua-webkit""..."
8,0.8%,1,2.7%,268600,我们的冬奥,8510,2.6%,2.7%,243900,5953.2万,...,"[73, 37, 69, 62, 90]","[3, 3, 3, 3, 2]",全部 2099 条,[],"[林永长 Yongchang Lin, 李豪凌 Haoling Li, 庄昊 Hao Zhu...","[徐芸 Yun Xu, 王刚 Gang Wang, 张杰 Jie Zhang, 徐碧君 Bi...","[申世佑 Shiyou Shen, 刘思奇 Siqi Liu, 张茗 Ming Zhang,...",冰墩墩和雪容融在去参加奥运村的开村典礼时发生了一系列奇妙故事。小光头强与小熊大熊二一起帮助大...,/subject/35517371/,"<!DOCTYPE html> <html class=""ua-mac ua-webkit""..."
9,0.2%,1,2.1%,215500,狙击手,21309,6.6%,2.2%,197700,5.90亿,...,"[6221, 3580, 2899, 1594, 1733]","[4, 3, 4, 3, 4]",全部 72780 条,[],"[张艺谋 Yimou Zhang, 张末 Mo Zhang]",[陈宇 Yu Chen],"[陈永胜 Yongsheng Chen, 章宇 Yu Zhang, 张译 Yi Zhang,...",影片根据抗美援朝战争“冷枪冷炮”运动中神枪手群体事迹改编。1952年冬至1953年初，中国人...,/subject/35215390/,"<!DOCTYPE html> <html class=""ua-mac ua-webkit""..."


In [None]:
PATH_OF_GIT_REPO = r'.git'
from git import Repo
def git_push():
    repo = Repo(PATH_OF_GIT_REPO)
    repo.git.add(update=True)
    repo.index.commit("ip pooling capabilities")
    origin = repo.remote(name='remote')
    origin.push()   

git_push()