### Overview

Use Selenium and `requests` to scrape movie information from the Douban Top 250 list, fetch detailed movie data from a separate API, and dump the results to disk.

The processed data is then stored in MongoDB so that it is persisted in a database.

In [48]:
import re
from bs4 import BeautifulSoup
import requests
import time
import os
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import json
import warnings

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.simplefilter("ignore")


class DoubanSplider:
    """Scrape the Douban Top 250 listing and fetch per-movie data from an API.

    Listing pages are loaded through a Selenium-driven Chrome instance; plain
    HTTP calls go through ``requests``. The instance owns a browser process,
    so call :meth:`close` when done, or use the object as a context manager::

        with DoubanSplider() as splider:
            links = splider.get_top250_links()
    """

    # Seconds to wait for a plain HTTP response before giving up, so that a
    # stalled connection cannot hang the notebook indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self) -> None:
        # `{}` is filled with the zero-based offset of the first movie on the page.
        self.base_top_url = "https://movie.douban.com/top250?start={}&filter="
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.driver = webdriver.Chrome(ChromeDriverManager().install())

    def close(self):
        """Shut down the Chrome driver if it is still running (safe to call twice)."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()
            self.driver = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def get_response(self, url):
        """Fetch ``url`` with ``requests`` and return the parsed page.

        A non-200 status is reported on stdout but the body is still parsed,
        so callers always receive a ``BeautifulSoup`` object.

        Raises:
            requests.exceptions.RequestException: on connection errors or when
                no response arrives within ``REQUEST_TIMEOUT`` seconds.
        """
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)

        if response.status_code != 200:
            print("Get some error from request! As error code is {}".format(response.status_code))

        bs = BeautifulSoup(response.content, 'html.parser')
        # Short pause between consecutive requests.
        time.sleep(.2)
        return bs

    def get_response_with_driver(self, url):
        """Load ``url`` in the Selenium browser and return the parsed page source."""
        self.driver.get(url)

        bs = BeautifulSoup(self.driver.page_source, 'html.parser')
        # Short pause between consecutive page loads.
        time.sleep(.1)
        return bs

    def get_top250_links(self, full_n=250, each_page=25):
        """Walk the paginated Top list and collect one record per movie.

        Args:
            full_n: total number of movies to cover (default 250).
            each_page: number of movies shown per listing page (default 25).

        Returns:
            A list of ``(title, score, n_users, url)`` tuples in page order.
        """
        res_tuple = []

        for i, start in enumerate(range(0, full_n, each_page)):
            print("Now if page: {}".format(i))
            bs = self.get_response_with_driver(self.base_top_url.format(start))

            # find each page items
            for item in bs.find_all('div', class_='item'):
                res_tuple.append(self._get_top_content(item))

        return res_tuple

    @staticmethod
    def save_list_into_file(obj_list, file_name, file_path=None):
        """Write ``obj_list`` to a UTF-8 text file, one entry per line.

        List/tuple entries are written comma-joined; any other entry is
        written as-is. Values are converted with ``str`` first, so numeric
        fields are accepted as well.

        Args:
            obj_list: iterable of entries to write.
            file_name: target file name; ``.txt`` is appended when the name
                contains no dot.
            file_path: target directory, defaults to ``tmp_data``. It is
                created when it does not exist yet.
        """
        if not file_path:
            file_path = 'tmp_data'

        if '.' not in file_name:
            file_name += '.txt'

        # A fresh checkout has no output directory yet; create it up front
        # instead of failing inside open().
        os.makedirs(file_path, exist_ok=True)

        with open(os.path.join(file_path, file_name), 'w', encoding='utf-8') as f:
            for obj in obj_list:
                if isinstance(obj, (list, tuple)):
                    f.write(','.join(str(field) for field in obj) + '\n')
                else:
                    f.write(str(obj) + '\n')

    @staticmethod
    def _get_top_content(item):
        """Extract ``(title, score, n_users, url)`` from one listing ``item`` element.

        ``n_users`` is the text of the second-to-last ``<span>`` in the item
        (presumably the rating count; verify against the live page markup).
        """
        header = item.find(class_='hd')
        title = header.find(class_='title').get_text()
        url = header.find('a').get('href')
        score = item.find_all("span", class_='rating_num')[0].get_text()
        n_users = item.find_all("span")[-2].get_text()
        return (title, score, n_users, url)

    @staticmethod
    def _parse_link_line(line):
        """Split one saved listing line into ``(movie_name, movie_id)``.

        The movie name is the first comma-separated field and the id is the
        last path segment of the final field (the movie link), with or
        without a trailing slash. Returns ``None`` for blank lines.
        """
        stripped = line.strip()
        if not stripped:
            return None

        fields = stripped.split(',')
        movie_id = fields[-1].rstrip('/').rsplit('/', 1)[-1]
        return fields[0], movie_id

    def get_movie_base_info(self):
        """Query the open movie-data API for every movie saved in ``tmp_data/top250_link.txt``.

        Returns:
            dict mapping movie name to the raw response text of the API call.

        Raises:
            requests.exceptions.RequestException: on connection errors or when
                a call exceeds ``REQUEST_TIMEOUT`` seconds.
        """
        base_api_url = "https://movie.querydata.org/api?id={}"

        # get full ids with movie name
        with open(os.path.join('tmp_data', 'top250_link.txt'), 'r', encoding='utf-8') as f:
            data_line = f.readlines()

        movie_info_dict = {}
        for data in data_line:
            parsed = self._parse_link_line(data)
            if parsed is None:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            movie_name, movie_id = parsed
            response = requests.get(base_api_url.format(movie_id), timeout=self.REQUEST_TIMEOUT).text
            movie_info_dict[movie_name] = response

        return movie_info_dict


In [3]:
# Instantiate the scraper; the constructor installs/locates chromedriver and opens Chrome.
splider = DoubanSplider()

[WDM] - 

[WDM] - Current google-chrome version is 94.0.4606
[WDM] - Get LATEST driver version for 94.0.4606
[WDM] - Driver [C:\Users\guangqiang.lu\.wdm\drivers\chromedriver\win32\94.0.4606.61\chromedriver.exe] found in cache
  from ipykernel import kernelapp as app


In [5]:
# URL template of the movie-detail API; `{}` is filled with a movie id.
base_api_url = "https://movie.querydata.org/api?id={}"

# Load the saved Top 250 listing, one record per line (line endings kept).
with open(os.path.join('tmp_data', 'top250_link.txt'), encoding='utf-8') as f:
    data_line = list(f)

In [67]:
# Read a previously saved data file back into memory.
def get_file_content(file_name, file_path='tmp_data', data_type='json'):
    """Load ``file_name`` from ``file_path`` (UTF-8).

    With ``data_type == 'json'`` the file is parsed as JSON and the decoded
    object is returned; for any other ``data_type`` the list of raw lines
    (line endings included) is returned.
    """
    target = os.path.join(file_path, file_name)
    with open(target, 'r', encoding='utf-8') as handle:
        if data_type != 'json':
            return handle.readlines()
        return json.load(handle)


In [70]:
# Cached API responses saved by an earlier run, parsed from JSON.
already_processed_data = get_file_content("already_processed.txt")

# Number of entries in the cache.
len(already_processed_data)

250

In [None]:
# Fetch every movie that is not cached yet through the API; this takes a while!
base_api_url = "https://movie.querydata.org/api?id={}"

# Each line holds comma-separated fields: movie name first, movie link last.
with open(os.path.join('tmp_data', 'top250_link.txt'), 'r', encoding='utf-8') as f:
    data_line = f.readlines()

# Names whose response is already on disk. The original cell read an
# undefined `already_get_movies`; derive it from the cache loaded above.
already_get_movies = set(already_processed_data)

movie_info_dict = {}
for data in data_line:
    if not data.strip():
        # Skip blank lines such as a trailing newline at end of file.
        continue

    fields = data.strip().split(',')
    movie_name = fields[0]
    if movie_name in already_get_movies:
        continue

    # The id is the last path segment of the link, with or without a trailing "/".
    movie_id = fields[-1].rstrip('/').rsplit('/', 1)[-1]
    # A timeout keeps one stalled call from blocking the whole loop.
    response = requests.get(base_api_url.format(movie_id), timeout=10).text
    movie_info_dict[movie_name] = response

In [82]:
# Extract the fields we need from one raw API record:
# movie name, alias, rating, credits and short descriptions.

def get_dict_for_each_dict_sample(sample_dict):
    """Flatten one decoded API record into the subset of fields we keep.

    Args:
        sample_dict: decoded JSON object for a single movie.

    Returns:
        dict with:
        - ``originalName``, ``doubanRating``, ``year``, ``duration`` copied
          as-is (``None`` when absent);
        - ``alias`` as a list of stripped names, only when the record has a
          non-empty ``alias`` string (split on ``/``);
        - ``<role>_cn`` / ``<role>_en`` name lists for ``director``,
          ``actor`` and ``writer``, only when at least one name was found;
        - ``short_disc_cn`` / ``short_disc_en`` taken from the record's
          ``data`` entries (the last entry per language wins).

    Missing or ``None`` list fields are treated as empty instead of raising.
    """
    needed_keys = ['originalName', 'doubanRating', 'year', 'duration']
    extracted_list_keys = ['director', 'actor', 'writer']

    out_dict = {k: sample_dict.get(k) for k in needed_keys}

    # change alias to list
    alias = sample_dict.get("alias")
    if alias:
        out_dict['alias'] = [t.strip() for t in alias.split("/")]

    def _extract_list_data(sample_dict, key, out_key):
        """Collect Cn/En names for one of 'director', 'actor', 'writer'."""
        ch_info_list = []
        en_info_list = []

        # `or []` covers both a missing key and an explicit null.
        for res in sample_dict.get(key) or []:
            # Names live in the entry's "data" list; an entry without it
            # contributes nothing (previously this crashed while iterating
            # the dict's keys).
            for data in res.get("data") or []:
                lang = data.get('lang')
                if lang == 'Cn':
                    ch_info_list.append(data.get("name"))
                elif lang == 'En':
                    en_info_list.append(data.get("name"))

        key_dict = {}
        if ch_info_list:
            key_dict[out_key + "_cn"] = ch_info_list
        if en_info_list:
            key_dict[out_key + "_en"] = en_info_list

        return key_dict

    # add the per-role name lists
    for k in extracted_list_keys:
        out_dict.update(_extract_list_data(sample_dict, k, k))

    # add the short descriptions
    for short in sample_dict.get("data") or []:
        lang = short.get("lang")
        if lang == 'Cn':
            out_dict["short_disc_cn"] = short.get("description")
        elif lang == 'En':
            out_dict["short_disc_en"] = short.get("description")

    return out_dict

In [92]:
# Decode each cached JSON string and reduce it to the fields we keep,
# keyed by the same name as in the cache.
final_out_dict = {
    name: get_dict_for_each_dict_sample(json.loads(raw_json))
    for name, raw_json in already_processed_data.items()
}

In [93]:
# Persist the extracted records as JSON under tmp_data/data_export.json.
export_path = os.path.join("tmp_data", "data_export.json")
with open(export_path, 'w', encoding='utf-8') as f:
    json.dump(final_out_dict, f)

In [99]:
# Show the extracted record of the first movie as a sanity check.
final_out_dict.get(list(final_out_dict)[0])

{'originalName': 'The Shawshank Redemption',
 'doubanRating': '9.7',
 'year': '1994',
 'duration': 8520,
 'alias': ['月黑高飞(港)', '刺激1995(台)', '地狱诺言', '铁窗岁月', '消香克的救赎'],
 'director_cn': ['弗兰克·德拉邦特'],
 'director_en': ['Frank Darabont'],
 'actor_cn': ['蒂姆·罗宾斯',
  '摩根·弗里曼',
  '鲍勃·冈顿',
  '威廉姆·赛德勒',
  '克兰西·布朗',
  '吉尔·贝罗斯',
  '马克·罗斯顿',
  '詹姆斯·惠特摩',
  '杰弗里·德曼',
  '拉里·布兰登伯格',
  '尼尔·吉恩托利',
  '布赖恩·利比',
  '大卫·普罗瓦尔',
  '约瑟夫·劳格诺',
  '祖德·塞克利拉',
  '保罗·麦克兰尼',
  '芮妮·布莱恩',
  '阿方索·弗里曼',
  'V·J·福斯特',
  '弗兰克·梅德拉诺',
  '马克·迈尔斯',
  '尼尔·萨默斯',
  '耐德·巴拉米',
  '布赖恩·戴拉特',
  '唐·麦克马纳斯'],
 'actor_en': ['Tim Robbins',
  'Morgan Freeman',
  'Bob Gunton',
  'William Sadler',
  'Clancy Brown',
  'Gil Bellows',
  'Mark Rolston',
  'James Whitmore',
  'Jeffrey DeMunn',
  'Larry Brandenburg',
  'Neil Giuntoli',
  'Brian Libby',
  'David Proval',
  'Joseph Ragno',
  'Jude Ciccolella',
  'Paul McCrane',
  'Renee Blaine',
  'Alfonso Freeman',
  'V.J. Foster',
  'Frank Medrano',
  'Mack Miles',
  'Neil Summers',
  'Ned Bellamy',
  

#### Dump data into MongoDB

In [5]:
from pymongo import MongoClient

# Connect to the MongoDB server on this machine.
client = MongoClient(host="localhost", port=27017)

# Database `movie_info`, collection `top_movies`.
db = client["movie_info"]
collection = db["top_movies"]

In [101]:
# One document per movie: the name plus its extracted info dict.
mongo_res = [
    {"movie_name": name, "movie_info": item}
    for name, item in final_out_dict.items()
]
res = collection.insert_many(mongo_res)

In [110]:
# Read one document back from MongoDB to verify the insert:
# match on the nested `doubanRating` field inside `movie_info`.
collection.find_one({"movie_info.doubanRating":"9.7"})

{'_id': ObjectId('616ea2d5378b1533601d8e31'),
 'movie_name': '肖申克的救赎',
 'movie_info': {'originalName': 'The Shawshank Redemption',
  'doubanRating': '9.7',
  'year': '1994',
  'duration': 8520,
  'alias': ['月黑高飞(港)', '刺激1995(台)', '地狱诺言', '铁窗岁月', '消香克的救赎'],
  'director_cn': ['弗兰克·德拉邦特'],
  'director_en': ['Frank Darabont'],
  'actor_cn': ['蒂姆·罗宾斯',
   '摩根·弗里曼',
   '鲍勃·冈顿',
   '威廉姆·赛德勒',
   '克兰西·布朗',
   '吉尔·贝罗斯',
   '马克·罗斯顿',
   '詹姆斯·惠特摩',
   '杰弗里·德曼',
   '拉里·布兰登伯格',
   '尼尔·吉恩托利',
   '布赖恩·利比',
   '大卫·普罗瓦尔',
   '约瑟夫·劳格诺',
   '祖德·塞克利拉',
   '保罗·麦克兰尼',
   '芮妮·布莱恩',
   '阿方索·弗里曼',
   'V·J·福斯特',
   '弗兰克·梅德拉诺',
   '马克·迈尔斯',
   '尼尔·萨默斯',
   '耐德·巴拉米',
   '布赖恩·戴拉特',
   '唐·麦克马纳斯'],
  'actor_en': ['Tim Robbins',
   'Morgan Freeman',
   'Bob Gunton',
   'William Sadler',
   'Clancy Brown',
   'Gil Bellows',
   'Mark Rolston',
   'James Whitmore',
   'Jeffrey DeMunn',
   'Larry Brandenburg',
   'Neil Giuntoli',
   'Brian Libby',
   'David Proval',
   'Joseph Ragno',
   'Jude Ciccolella',
   'Paul McCra

#### Get data from mongoDB

In [6]:
from pymongo import MongoClient

# Connect to the MongoDB server on this machine.
client = MongoClient(host="localhost", port=27017)

# Database `movie_info`, collection `top_movies`.
db = client["movie_info"]
collection = db["top_movies"]

In [None]:
# Pull every document of the collection into memory. Iterating the cursor
# ends cleanly when it is exhausted, so no bare `except` is needed and real
# database errors are no longer silently swallowed.
res = collection.find()
mongo_out = list(res)

# Drop Mongo's internal `_id` from each document; tolerate documents that
# were fetched without one.
for doc in mongo_out:
    doc.pop("_id", None)

print(len(mongo_out))

250


In [1]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node at localhost:9200.
es_client = Elasticsearch("localhost:9200")


In [42]:
# Index every movie document into the Elasticsearch index used for Douban
# movies; the list position serves as the document id.
index_name = "es_movie"

for i, d in enumerate(mongo_out):
    # `document=` replaces the deprecated `body=` argument and matches the
    # keyword-style API already used by `es_client.search(query=...)` in
    # this notebook.
    res = es_client.index(index=index_name, document=d, id=i)

  """


In [49]:
# Run a match-all query against the index and report the total hit count.
match_all_query = {"match_all": {}}
res = es_client.search(index=index_name, query=match_all_query)
total_hits = res['hits']['total']['value']
print("GET {} result from ES.".format(total_hits))

GET 250 result from ES.


In [52]:
# Print "name | year" for each hit returned by the search.
for hit in res['hits']['hits']:
    source = hit['_source']
    print("{} | {}".format(source['movie_name'], source['movie_info']['year']))

肖申克的救赎 | 1994
霸王别姬 | 1993
阿甘正传 | 1994
这个杀手不太冷 | 1994
泰坦尼克号 | 1997
美丽人生 | 1997
千与千寻 | 2001
辛德勒的名单 | 1993
盗梦空间 | 2010
忠犬八公的故事 | 2009


In [53]:
# try to get result with DSL
from elasticsearch_dsl import Search

In [56]:
# Same index, queried through elasticsearch_dsl: filter on `movie_name`.
# NOTE(review): the recorded output lists titles other than the queried one;
# if an exact title lookup is intended, a phrase or keyword query may be
# needed. Confirm against the index mapping.
title_search = Search(using=es_client, index=index_name)
title_search = title_search.filter("match", movie_name="肖申克的救赎")
response = title_search.execute()

for hit in response:
    print(hit.movie_name)

肖申克的救赎
泰坦尼克号
辛德勒的名单
忠犬八公的故事
楚门的世界
放牛班的春天
哈尔的移动城堡
少年派的奇幻漂流
穿条纹睡衣的男孩
西西里的美丽传说
