In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import os
import unicodedata

## Audience Ratings

In [2]:
# Read all html Rotten Tomatoes 100 Best movies
movie_list =[]
folder = 'rotten_tomatoes_html'
for movie in os.listdir(folder):
    with open(os.path.join(folder, movie), encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'lxml')
        
        # extract title
        title = soup.find('title').contents[0][: -len(' - rotten tomatoes')].strip()      
        title = unicodedata.normalize('NFKD', title)
        
        # extract audience_score
        aud_score = soup.find(class_='audience-score meter').find('span').contents[0][:-1]
        
        # extract number of audience ratings
        aud_rating = soup.find(class_='audience-info hidden-xs superPageFontColor').find_all('div')[1].contents[2].strip().replace(',','')
        
        info = {'title': title, 'audience_score': int(aud_score), 'audience_rating': int(aud_rating)}
        movie_list.append(info)        

In [3]:
# Create Dataframe Audience to capture Rotten Tomatoes extra info
df_audience = pd.DataFrame(movie_list, columns=info.keys())

## Critics Rating

In [4]:
# Available data from Critics review
df_critics = pd.read_csv('rotten_tomatoes_best100movies.tsv', sep='\t')

In [5]:
for title in df_critics.title:
    if not title in df_audience.title.tolist():
        print(title)

Rashômon (1951)
Army of Shadows (L'Armée des ombres) (1969)
Tokyo Story (Tôkyô monogatari) (1953)


In [6]:
# For some reason, unicodedata.normalize did not fix all accented chars
# Sorting the dataframes by Title to join by index instead

df_critics = df_critics.sort_values(by='title').reset_index(drop=True)
df_audience = df_audience.sort_values(by='title').reset_index(drop=True)

In [7]:
# Joining data
# df = df_critics.join(df_audience, on='title')
df = df_critics.join(df_audience, rsuffix='_aud')

In [10]:
df = df.sort_values('ranking').reset_index(drop=True)

In [12]:
# Saving it locally
df.to_csv('critics_audience_ratings.csv', index=False)