In [2]:
from bs4 import BeautifulSoup
from requests import TooManyRedirects
import re
import requests
from datetime import datetime
import os
import pandas as pd
import regex as re

In [3]:
def get_critic_page_init(movie_name):
    movie_name = re.sub(r"[^a-zA-Z]{1,}", "_", movie_name).lower()
    if movie_name[-1] == '_':
        movie_name = movie_name[:-1]
    print(f'Try to get response from https://www.rottentomatoes.com/m/{movie_name}/reviews')
    response = requests.get(f'https://www.rottentomatoes.com/m/{movie_name}/reviews')
    soup = BeautifulSoup(response.content, 'html.parser')
    critics = soup.find_all('div',class_ = 'review-row')

    # find movie id
    rtid = soup.find('script',{"id":"mps-page-integration"}).contents[0].replace('|','').strip()
    rtid = [x for x in rtid.split(',') if 'rtid' in x][0].strip().split(':')[-1][1:-1]

    load_btn = soup.find_all('rt-button',{'data-loadmoremanager':"btnLoadMore:click"})
    if load_btn:
        hasNextPage = True
    else:
        hasNextPage = False
    return critics,rtid, hasNextPage

In [4]:
def get_critic_page_follow(movie_id,start_token = None):
    url = f'https://www.rottentomatoes.com/napi/movie/{movie_id}/reviews/all?after={start_token}%3D%3D&pageCount=20'
    print(f'Using url:{url}')
    try:
        response = requests.get(url)
        if response.status_code != 200:
            print('Aborted as response code is not 200')
            return None
    except error as e:
        print(f'[request {url} failed] : {e}')
    return json.loads(response.content)

In [5]:
def get_review_data_soup(critics):
    critics_reviewer = critics.find_all('div',class_ = 'reviewer-name-and-publication')
    disp_name = critics_reviewer[0].find( class_ = "display-name").contents[0].strip()
    # find review_date
    critics_score_content = [x for x in critics.find('p',class_ = 'original-score-and-url').contents if hasattr(x,'contents')]
    def try_dt_map(x):
        try:
            dt = datetime.strptime(x.contents[0], '%b %d, %Y')
            return dt
        except:
            return None
    review_date = [dt for dt in list(map(try_dt_map,critics_score_content)) if bool(dt)]
    if review_date:
        review_date = review_date[0]
    else:
        review_date = 'Not found'
        
    # find score
    critics_score_str = [x for x in critics.find('p',class_ = 'original-score-and-url').contents if not hasattr(x,'contents')]
    def try_score_map(x):
        try:
            score = x.replace('|','').strip()
            return score
        except:
            return None
    review_score = [score for score in list(map(try_score_map,critics_score_str)) if bool(score)]
    if review_score:
        review_score = review_score[0].split()[-1].strip()
    else:
        review_score = 'Not found'

    # find sentiment
    review_sentiment = critics.find('score-icon-critics').attrs['sentiment']
    # find review text
    review_text = critics.find('p', class_ = 'review-text').contents[0]
    if review_text:
        review_text = review_text
    else:
        review_text = 'Not Found'
        
    return [disp_name,review_date,review_score,review_sentiment,review_text,1]
                      

In [6]:
def get_review_data_json(json_rvw_data, page = None):
    def get_json_info(jsn,key):
        if key in jsn.keys():
            return jsn[key]
        else:
            return None
    disp_name = get_json_info(json_rvw_data,'criticName')
    review_date = get_json_info(json_rvw_data, 'creationDate')
    review_date = datetime.strptime(json_rvw_data['creationDate'], '%b %d, %Y')
    review_score = get_json_info(json_rvw_data, 'originalScore')
    review_sentiment = get_json_info(json_rvw_data, 'scoreSentiment')
    review_text = get_json_info(json_rvw_data, 'quote')

    return [disp_name,review_date,review_score,review_sentiment,review_text,page]

In [7]:
def get_movie_reviews(movie_title):
    # create dataframe
    df = pd.DataFrame({'movie_title':[],
                       'movie_id':[],
                       'reviewer_name':[],
                       'review_date':[],
                       'review_score':[],
                       'review_sentiment':[],
                       'review_text':[],
                       'on_page':[]
                        })
    def append_review_df(df, movie_title, movie_id, reviewer_name, review_date, review_score, review_sentiment, review_text, on_page):
        df_to_append = pd.DataFrame({'movie_title':[movie_title],
                                     'movie_id':[movie_id],
                                     'reviewer_name':[reviewer_name],
                                     'review_date':[review_date],
                                     'review_score':[review_score],
                                     'review_sentiment':[review_sentiment],
                                     'review_text':[review_text],
                                     'on_page':[on_page]
                                    })
        return df._append(df_to_append, ignore_index = True)
    # page1 init
    init_pg_critics, rtid, hasNextPage = get_critic_page_init(movie_title)
    # ingest page1 data
    print(f'Scraping page{1} of {movie_title}')
    for row in range(len(init_pg_critics)):
        review_row = init_pg_critics[row]
        rtid = get_critic_page_init(movie_title)[1]
        review_row_data = get_review_data_soup(review_row)
        df = append_review_df(df, movie_title, rtid, *review_row_data)
    # end if no load more btn found
    if not hasNextPage:
        return df
    
    # other page
    is_first_loop = True
    while True:   
        if is_first_loop:
            start_token = 'MQ'
        res = get_critic_page_follow(movie_id = rtid,start_token = start_token)
        if not res: # the movie has no more review page
            break
        pg = 2 # page count
        print(f'Scraping page{pg} of {movie_title} using start key = {start_token}')
        for row in res['reviews']:)
            review_row_data = get_review_data_json(row, page = pg)
            df = append_review_df(df, movie_title, rtid, *review_row_data)
        is_first_loop = False # Disable init var
        # check if has next page
        print(res['pageInfo']['hasNextPage'])
        if not res['pageInfo']['hasNextPage']:
            return df      
        start_token = res['pageInfo']['endCursor'].replace('==','')
        print(f'Found next page token {start_token}')    

    return df

In [8]:
x = get_movie_reviews('Venom: The Last Dance')

x

Try to get response from https://www.rottentomatoes.com/m/venom_the_last_dance/reviews
Scraping page1 of Venom: The Last Dance
Try to get response from https://www.rottentomatoes.com/m/venom_the_last_dance/reviews
Try to get response from https://www.rottentomatoes.com/m/venom_the_last_dance/reviews
Try to get response from https://www.rottentomatoes.com/m/venom_the_last_dance/reviews
Try to get response from https://www.rottentomatoes.com/m/venom_the_last_dance/reviews


  return df._append(df_to_append, ignore_index = True)


Try to get response from https://www.rottentomatoes.com/m/venom_the_last_dance/reviews
Try to get response from https://www.rottentomatoes.com/m/venom_the_last_dance/reviews
Try to get response from https://www.rottentomatoes.com/m/venom_the_last_dance/reviews
Try to get response from https://www.rottentomatoes.com/m/venom_the_last_dance/reviews
Try to get response from https://www.rottentomatoes.com/m/venom_the_last_dance/reviews
Try to get response from https://www.rottentomatoes.com/m/venom_the_last_dance/reviews
Try to get response from https://www.rottentomatoes.com/m/venom_the_last_dance/reviews
Try to get response from https://www.rottentomatoes.com/m/venom_the_last_dance/reviews
Try to get response from https://www.rottentomatoes.com/m/venom_the_last_dance/reviews
Try to get response from https://www.rottentomatoes.com/m/venom_the_last_dance/reviews
Try to get response from https://www.rottentomatoes.com/m/venom_the_last_dance/reviews
Try to get response from https://www.rotten

Unnamed: 0,movie_title,movie_id,reviewer_name,review_date,review_score,review_sentiment,review_text,on_page
0,Venom: The Last Dance,0f101f8c-ec09-39c4-9be0-2f9cc464d332,Troy Ribeiro,2024-11-03,3/5,NEGATIVE,"Sadly, the film stumbles through a series of m...",1.0
1,Venom: The Last Dance,0f101f8c-ec09-39c4-9be0-2f9cc464d332,Laura Stott,2024-11-02,3/5,POSITIVE,There's plenty of satisfying set pieces here a...,1.0
2,Venom: The Last Dance,0f101f8c-ec09-39c4-9be0-2f9cc464d332,Ganesh Aaglave,2024-11-02,3/5,POSITIVE,It’s Tom Hardy show and as expected he hits th...,1.0
3,Venom: The Last Dance,0f101f8c-ec09-39c4-9be0-2f9cc464d332,Robert Roten,2024-11-02,C+,POSITIVE,"This is a flawed, but enjoyable movie despite ...",1.0
4,Venom: The Last Dance,0f101f8c-ec09-39c4-9be0-2f9cc464d332,Nicola Austin,2024-11-02,2/5,NEGATIVE,Despite featuring a Venom horse and Venom danc...,1.0
...,...,...,...,...,...,...,...,...
168,Venom: The Last Dance,0f101f8c-ec09-39c4-9be0-2f9cc464d332,Casey Chong,2024-10-23,1.5/5,NEGATIVE,A futile third Venom movie that fails to give ...,2.0
169,Venom: The Last Dance,0f101f8c-ec09-39c4-9be0-2f9cc464d332,Jake Cole,2024-10-23,3/4,POSITIVE,As the film progresses&#44; it consistently es...,2.0
170,Venom: The Last Dance,0f101f8c-ec09-39c4-9be0-2f9cc464d332,Elliott Collins,2024-10-23,2/5,NEGATIVE,In a whirlwind of chaotic creativity&#44; Veno...,2.0
171,Venom: The Last Dance,0f101f8c-ec09-39c4-9be0-2f9cc464d332,Soren Andersen,2024-10-23,1.5/4,NEGATIVE,“The Last Dance” brings nothing new to the ser...,2.0


In [10]:
x = get_movie_reviews('am_i_racist?')
x

Try to get response from https://www.rottentomatoes.com/m/am_i_racist/reviews
Scraping page1 of am_i_racist?
Try to get response from https://www.rottentomatoes.com/m/am_i_racist/reviews
Try to get response from https://www.rottentomatoes.com/m/am_i_racist/reviews


  return df._append(df_to_append, ignore_index = True)


Try to get response from https://www.rottentomatoes.com/m/am_i_racist/reviews
Try to get response from https://www.rottentomatoes.com/m/am_i_racist/reviews
Try to get response from https://www.rottentomatoes.com/m/am_i_racist/reviews
Try to get response from https://www.rottentomatoes.com/m/am_i_racist/reviews
Try to get response from https://www.rottentomatoes.com/m/am_i_racist/reviews
Try to get response from https://www.rottentomatoes.com/m/am_i_racist/reviews
Try to get response from https://www.rottentomatoes.com/m/am_i_racist/reviews
Try to get response from https://www.rottentomatoes.com/m/am_i_racist/reviews
Try to get response from https://www.rottentomatoes.com/m/am_i_racist/reviews
Try to get response from https://www.rottentomatoes.com/m/am_i_racist/reviews
Try to get response from https://www.rottentomatoes.com/m/am_i_racist/reviews
Using url:https://www.rottentomatoes.com/napi/movie/6451c00e-5f5b-4c7f-97a4-cabeb5f3b094/reviews/all?after=MQ%3D%3D&pageCount=20
Scraping page

Unnamed: 0,movie_title,movie_id,reviewer_name,review_date,review_score,review_sentiment,review_text,on_page
0,am_i_racist?,6451c00e-5f5b-4c7f-97a4-cabeb5f3b094,Vinson Cunningham,2024-10-10,Not found,NEGATIVE,The joke is that it's hard for a white person ...,1.0
1,am_i_racist?,6451c00e-5f5b-4c7f-97a4-cabeb5f3b094,Michael Medved,2024-10-04,3/4,POSITIVE,"It's cleverly edited, brilliantly scripted, an...",1.0
2,am_i_racist?,6451c00e-5f5b-4c7f-97a4-cabeb5f3b094,Dennis Schwartz,2024-10-02,B-,POSITIVE,A provocative Borat-like comedic documentary f...,1.0
3,am_i_racist?,6451c00e-5f5b-4c7f-97a4-cabeb5f3b094,Adam Olinger,2024-09-22,Not found,NEGATIVE,It didn't make me laugh very much.,1.0
4,am_i_racist?,6451c00e-5f5b-4c7f-97a4-cabeb5f3b094,Tyler Smith,2024-09-19,Not found,NEGATIVE,"Half-formed, at best.",1.0
5,am_i_racist?,6451c00e-5f5b-4c7f-97a4-cabeb5f3b094,Jessie Gender,2024-09-18,Not found,NEGATIVE,Despite a better target than the bigoted What ...,1.0
6,am_i_racist?,6451c00e-5f5b-4c7f-97a4-cabeb5f3b094,Neal Pollack,2024-09-18,3/5,POSITIVE,When it hones in on what is essentially a sham...,1.0
7,am_i_racist?,6451c00e-5f5b-4c7f-97a4-cabeb5f3b094,Wade Major,2024-09-17,Not found,POSITIVE,"You're not getting anywhere on the subject, bu...",1.0
8,am_i_racist?,6451c00e-5f5b-4c7f-97a4-cabeb5f3b094,Jeremy Jahns,2024-09-16,Not found,POSITIVE,It made me laugh.,1.0
9,am_i_racist?,6451c00e-5f5b-4c7f-97a4-cabeb5f3b094,Alan Ng,2024-09-11,8.5/10,POSITIVE,"Half the audience will laugh, and the other ha...",1.0
