In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import os
import unicodedata

## Audience Ratings

In [2]:
# Read all html Rotten Tomatoes 100 Best movies
movie_list =[]
folder = 'rotten_tomatoes_html'
for movie in os.listdir(folder):
    with open(os.path.join(folder, movie), encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'lxml')
        
        # extract title
        title = soup.find('title').contents[0][: -len(' - rotten tomatoes')].strip()      
        title = unicodedata.normalize('NFKD', title)
        
        # extract audience_score
        aud_score = soup.find(class_='audience-score meter').find('span').contents[0][:-1]
        
        # extract number of audience ratings
        aud_rating = soup.find(class_='audience-info hidden-xs superPageFontColor').find_all('div')[1].contents[2].strip().replace(',','')
        
        info = {'title': title, 'audience_score': int(aud_score), 'audience_rating': int(aud_rating)}
        movie_list.append(info)        

In [37]:
# Create Dataframe Audience to capture Rotten Tomatoes extra info
df_audience = pd.DataFrame(movie_list, columns=info.keys())

## Critics Rating

In [4]:
# Available data from Critics review
df_critics = pd.read_csv('rotten_tomatoes_best100movies.tsv', sep='\t')

In [38]:
for title in df_critics.title:
    if not title in df_audience.title.tolist():
        print(title)

Army of Shadows (L'Armée des ombres) (1969)
Rashômon (1951)
Tokyo Story (Tôkyô monogatari) (1953)


In [6]:
# For some reason, unicodedata.normalize did not fix all accented chars
# Sorting the dataframes by Title to join by index instead

df_critics = df_critics.sort_values(by='title').reset_index(drop=True)
df_audience = df_audience.sort_values(by='title').reset_index(drop=True)

In [9]:
# Joining data
# df = df_critics.join(df_audience, on='title')
df = df_critics.join(df_audience, rsuffix='_aud')

In [10]:
df.head()

Unnamed: 0,ranking,critic_score,title,number_of_critic_ratings,title_aud,audience_score,audience_rating
0,53,100,12 Angry Men (Twelve Angry Men) (1957),49,12 Angry Men (Twelve Angry Men) (1957),97,103672
1,29,96,12 Years a Slave (2013),316,12 Years a Slave (2013),90,138789
2,22,98,A Hard Day's Night (1964),104,A Hard Day's Night (1964),89,50067
3,60,98,A Streetcar Named Desire (1951),54,A Streetcar Named Desire (1951),90,54761
4,48,97,Alien (1979),104,Alien (1979),94,457186


## Capturing Movies Image from Wikipedia (MediaWiki)
We will use `wptools`, a wrapper around the MediaWiki API that makes it even easier to extract information for specific 'wiki-pages'.

In [53]:
import wptools
page = wptools.page('E.T._the_Extra-Terrestrial').get()

en.wikipedia.org (query) E.T._the_Extra-Terrestrial
en.wikipedia.org (parse) 73441
www.wikidata.org (wikidata) Q11621
www.wikidata.org (labels) Q60629803|P2518|P162|Q7341915|P1431|Q46...
www.wikidata.org (labels) P2509|Q45171911|P725|P1040|Q849124|P246...
www.wikidata.org (labels) Q1270715|Q488651|P3212|P840|P2130|Q3745...
www.wikidata.org (labels) P3143|Q1748409|Q505449|P2334|Q30|P18|P3...
en.wikipedia.org (restbase) /page/summary/E.T._the_Extra-Terrestrial
en.wikipedia.org (imageinfo) File:E t the extra terrestrial ver3....
E.T. the Extra-Terrestrial (en) data
{
  aliases: <list(2)> E.T., ET
  assessments: <dict(4)> United States, Film, Science Fiction, Lib...
  claims: <dict(95)> P1562, P57, P272, P345, P31, P161, P373, P480...
  description: <str(63)> 1982 American science fiction film direct...
  exhtml: <str(570)> <p><i><b>E.T. the Extra-Terrestrial</b></i> i...
  exrest: <str(549)> E.T. the Extra-Terrestrial is a 1982 American...
  extext: <str(1731)> _**E.T. the Extra-Terrestri

In [54]:
# Accessing Image 
page.data['image']

[{'kind': 'parse-image',
  'file': 'File:E t the extra terrestrial ver3.jpg',
  'orig': 'E t the extra terrestrial ver3.jpg',
  'timestamp': '2016-06-04T10:30:46Z',
  'size': 83073,
  'width': 253,
  'height': 394,
  'url': 'https://upload.wikimedia.org/wikipedia/en/6/66/E_t_the_extra_terrestrial_ver3.jpg',
  'descriptionurl': 'https://en.wikipedia.org/wiki/File:E_t_the_extra_terrestrial_ver3.jpg',
  'descriptionshorturl': 'https://en.wikipedia.org/w/index.php?curid=7419503',
  'title': 'File:E t the extra terrestrial ver3.jpg',
  'metadata': {'DateTime': {'value': '2016-06-04 10:30:46',
    'source': 'mediawiki-metadata',
    'hidden': ''},
   'ObjectName': {'value': 'E t the extra terrestrial ver3',
    'source': 'mediawiki-metadata',
    'hidden': ''},
   'CommonsMetadataExtension': {'value': 1.2,
    'source': 'extension',
    'hidden': ''},
   'Categories': {'value': 'All non-free media|E.T. the Extra-Terrestrial|Fair use images of movie posters|Files with no machine-readable auth