In [2]:
from bs4 import BeautifulSoup
from requests import TooManyRedirects
import re
import requests
from datetime import datetime
import os
import pandas as pd

In [3]:
def get_critic_page_init(movie_name):
    movie_name = movie_name.lower()
    response = requests.get(f'https://www.rottentomatoes.com/m/{movie_name.replace(' ','_')}/reviews')
    soup = BeautifulSoup(response.content, 'html.parser')
    critics = soup.find_all('div',class_ = 'review-row')

    # find movie id
    rtid = soup.find('script',{"id":"mps-page-integration"}).contents[0].replace('|','').strip()
    rtid = [x for x in rtid.split(',') if 'rtid' in x][0].strip().split(':')[-1][1:-1]
    return critics,rtid

In [4]:
def get_critic_page_follow(movie_id,start_token = None):
    url = f'https://www.rottentomatoes.com/napi/movie/{movie_id}/reviews/all?after={start_token}%3D%3D&pageCount=20'
    print(f'Using url:{url}')
    try:
        response = requests.get(url)
        if response.status_code != 200:
            print('Aborted as response code is not 200')
            return None
    except error as e:
        print(f'[request {url} failed] : {e}')
    return json.loads(response.content)

In [5]:
def get_review_data_soup(critics):
    critics_reviewer = critics.find_all('div',class_ = 'reviewer-name-and-publication')
    disp_name = critics_reviewer[0].find( class_ = "display-name").contents[0].strip()
    # find review_date
    critics_score_content = [x for x in critics.find('p',class_ = 'original-score-and-url').contents if hasattr(x,'contents')]
    def try_dt_map(x):
        try:
            dt = datetime.strptime(x.contents[0], '%b %d, %Y')
            return dt
        except:
            return None
    review_date = [dt for dt in list(map(try_dt_map,critics_score_content)) if bool(dt)]
    if review_date:
        review_date = review_date[0]
    else:
        review_date = 'Not found'
        
    # find score
    critics_score_str = [x for x in critics.find('p',class_ = 'original-score-and-url').contents if not hasattr(x,'contents')]
    def try_score_map(x):
        try:
            score = x.replace('|','').strip()
            return score
        except:
            return None
    review_score = [score for score in list(map(try_score_map,critics_score_str)) if bool(score)]
    if review_score:
        review_score = review_score[0].split()[-1].strip()
    else:
        review_score = 'Not found'

    # find sentiment
    review_sentiment = critics.find('score-icon-critics').attrs['sentiment']
    # find review text
    review_text = critics.find('p', class_ = 'review-text').contents[0]
    if review_text:
        review_text = review_text
    else:
        review_text = 'Not Found'
        
    return [disp_name,review_date,review_score,review_sentiment,review_text,1]
                      

In [37]:
def get_review_data_json(json_rvw_data, page = None):
    def get_json_info(jsn,key):
        if key in jsn.keys():
            return jsn[key]
        else:
            return None
        
    disp_name = get_json_info(json_rvw_data,'criticName')
    review_date = get_json_info(json_rvw_data, 'creationDate')
    review_date = datetime.strptime(json_rvw_data['creationDate'], '%b %d, %Y')
    review_score = get_json_info(json_rvw_data, 'originalScore')
    review_sentiment = get_json_info(json_rvw_data, 'scoreSentiment')
    review_text = get_json_info(json_rvw_data, 'quote')
    
    # review_date = datetime.strptime(json_rvw_data['creationDate'], '%b %d, %Y')
    # review_score = json_rvw_data['originalScore']
    # review_sentiment = json_rvw_data['scoreSentiment']
    # review_text = json_rvw_data['quote']
    return [disp_name,review_date,review_score,review_sentiment,review_text,page]

In [41]:
def get_movie_reviews(movie_title):
    # create dataframe
    df = pd.DataFrame({'movie_title':[],
                       'movie_id':[],
                       'reviewer_name':[],
                       'review_date':[],
                       'review_score':[],
                       'review_sentiment':[],
                       'review_text':[],
                       'on_page':[]
                        })
    def append_review_df(df, movie_title, movie_id, reviewer_name, review_date, review_score, review_sentiment, review_text, on_page):
        df_to_append = pd.DataFrame({'movie_title':[movie_title],
                                     'movie_id':[movie_id],
                                     'reviewer_name':[reviewer_name],
                                     'review_date':[review_date],
                                     'review_score':[review_score],
                                     'review_sentiment':[review_sentiment],
                                     'review_text':[review_text],
                                     'on_page':[on_page]
                                    })
        return df._append(df_to_append, ignore_index = True)
    # page1 init
    init_pg_critics, rtid = get_critic_page_init('hocus_pocus')
    # ingest page1 data
    print(f'Scraping page{1} of {movie_title}')
    for row in range(len(init_pg_critics)):
        review_row = init_pg_critics[row]
        rtid = get_critic_page_init('hocus_pocus')[1]
        review_row_data = get_review_data_soup(review_row)
        df = append_review_df(df, movie_title, rtid, *review_row_data)
    # other page
    is_first_loop = True
    while True:
        if is_first_loop:
            start_token = 'MQ'
        res = get_critic_page_follow(movie_id = rtid,start_token = start_token)
        if not res: # the movie has no more review page
            break
        pg = 2 # page count
        print(f'Scraping page{pg} of {movie_title} using start key = {start_token}')
        for row in res['reviews']:
            # print(row)
            # print('-'*10)
            review_row_data = get_review_data_json(row, page = pg)
            df = append_review_df(df, movie_title, rtid, *review_row_data)
        is_first_loop = False # Disable init var
        # check if has next page
        print(res['pageInfo']['hasNextPage'])
        if not res['pageInfo']['hasNextPage']:
            return df
            
        start_token = res['pageInfo']['endCursor'].replace('==','')
        print(f'Found next page token {start_token}')    
    
        
        
    return df

In [43]:
x = get_movie_reviews('Hocus Pocus')

x

Scraping page1 of Hocus Pocus


  return df._append(df_to_append, ignore_index = True)


Using url:https://www.rottentomatoes.com/napi/movie/2876fc94-98a0-3009-bd09-629094db068e/reviews/all?after=MQ%3D%3D&pageCount=20
Scraping page2 of Hocus Pocus using start key = MQ
True
Found next page token Mg
Using url:https://www.rottentomatoes.com/napi/movie/2876fc94-98a0-3009-bd09-629094db068e/reviews/all?after=Mg%3D%3D&pageCount=20
Scraping page2 of Hocus Pocus using start key = Mg
True
Found next page token Mw
Using url:https://www.rottentomatoes.com/napi/movie/2876fc94-98a0-3009-bd09-629094db068e/reviews/all?after=Mw%3D%3D&pageCount=20
Scraping page2 of Hocus Pocus using start key = Mw
False


Unnamed: 0,movie_title,movie_id,reviewer_name,review_date,review_score,review_sentiment,review_text,on_page
0,Hocus Pocus,2876fc94-98a0-3009-bd09-629094db068e,Adrienne Tyler,2024-10-25,6/10,POSITIVE,Hocus Pocus brings together comedy and spookin...,1.0
1,Hocus Pocus,2876fc94-98a0-3009-bd09-629094db068e,Peter Bradshaw,2023-09-27,3/5,POSITIVE,It is an enjoyable watch and a decent bit of e...,1.0
2,Hocus Pocus,2876fc94-98a0-3009-bd09-629094db068e,Eve Tushnet,2023-08-21,Not found,POSITIVE,"Hocus Pocus walks a fine line, where we get to...",1.0
3,Hocus Pocus,2876fc94-98a0-3009-bd09-629094db068e,Mike Massie,2023-03-31,3/10,NEGATIVE,The main premise opens the door to all manner ...,1.0
4,Hocus Pocus,2876fc94-98a0-3009-bd09-629094db068e,Daniel Barnes,2022-10-18,1/5,NEGATIVE,"This sloppily written, egregiously unfunny Hal...",1.0
...,...,...,...,...,...,...,...,...
59,Hocus Pocus,2876fc94-98a0-3009-bd09-629094db068e,David N. Butterworth,2000-01-01,2/4,NEGATIVE,Strictly for kids.,2.0
60,Hocus Pocus,2876fc94-98a0-3009-bd09-629094db068e,Hollis Chacona,2000-01-01,2/5,NEGATIVE,"Chock-full of visual tricks and treats, Hocus ...",2.0
61,Hocus Pocus,2876fc94-98a0-3009-bd09-629094db068e,Roger Ebert,2000-01-01,1/4,NEGATIVE,Watching the movie is like attending a party y...,2.0
62,Hocus Pocus,2876fc94-98a0-3009-bd09-629094db068e,Chris Hicks,2000-01-01,,NEGATIVE,"Unfortunately, all their hammy mugging makes t...",2.0


In [None]:
for i in range(len(get_critic_page_init('hocus_pocus')[0])):
    c = get_critic_page_init('hocus_pocus')[0][i]
    rtid = get_critic_page_init('hocus_pocus')[1]
    print(get_review_data_soup(c))

In [None]:
for i in get_critic_page_follow(movie_id = rtid,start_token = 'MQ')['reviews']:
    print(get_review_data_json(get_critic_page_follow(movie_id = rtid,start_token = 'MQ')['reviews'][i]))

In [None]:
rr = get_critic_page_follow(movie_id = rtid,start_token = 'Mg')
#soup = BeautifulSoup(get_critic_page_follow(movie_id = rtid,start_token = 'MQ'), 'html.parser')

In [None]:
rr['reviews'][0]

In [None]:
if bool(rr['pageInfo']['hasNextPage']):
    print(666)
else:
    print(888)

In [None]:
rr['reviews'][0]

In [None]:
get_review_data_json(rr['reviews'][0])

In [None]:
datetime.strptime(rr['reviews'][0]['creationDate'], '%b %d, %Y')