# __Data Wrangling on Best 100 Movies of All Time Rotten Tomatoes__

In [120]:
import os
import pandas as pd
from bs4 import BeautifulSoup # to parse the html files
import requests
import wptools
from PIL import Image
from io import BytesIO
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np

<font size="4">In this project I am provided by the critic rating dataset(`bestofrt.csv`) and the html files of the top 100 rated movies of all time on rotten tomatoes in `rt_html` folder and the goal is to scrap the audience rating and some other cool stuff.</font>

# __Data Gathering__

### __1. Get the Audience Rating__

In [12]:
# Parse html files and get the desired information

lst = [] # Empty list to be appended with dictionary of each movie as rows for the dataframe.
html_foldername = 'rt_html' # Folder contain the html files for each movie of rotten tomatoes top 100 list

files = os.listdir(html_foldername) # Get html file names in a list
files = [os.path.join(html_foldername, file) for file in files] 

for file in files:
    with open(file) as fp:
        soup = BeautifulSoup(fp)
        title = soup.find('title').contents[0][:-len(' - Rotten Tomatoes')]
        score = int(soup.find('div', 'audience-score meter').find('span').contents[0][:-1])
        ratings = int(soup.find('div', class_="audience-info").find_all('div')[1].contents[2].replace(',',''))

    lst.append({'title': title, 'audience_score':score, 'number_of_audience_ratings':ratings})

lst

[{'title': '12 Angry Men (Twelve Angry Men)\xa0(1957)',
  'audience_score': 97,
  'number_of_audience_ratings': 103672},
 {'title': 'The 39 Steps\xa0(1935)',
  'audience_score': 86,
  'number_of_audience_ratings': 23647},
 {'title': 'The Adventures of Robin Hood\xa0(1938)',
  'audience_score': 89,
  'number_of_audience_ratings': 33584},
 {'title': 'All About Eve\xa0(1950)',
  'audience_score': 94,
  'number_of_audience_ratings': 44564},
 {'title': 'All Quiet on the Western Front\xa0(1930)',
  'audience_score': 89,
  'number_of_audience_ratings': 17768},
 {'title': 'Casablanca\xa0(1942)',
  'audience_score': 95,
  'number_of_audience_ratings': 355952},
 {'title': 'Frankenstein\xa0(1931)',
  'audience_score': 87,
  'number_of_audience_ratings': 41140},
 {'title': 'King Kong\xa0(1933)',
  'audience_score': 86,
  'number_of_audience_ratings': 89669},
 {'title': 'Laura\xa0(1944)',
  'audience_score': 91,
  'number_of_audience_ratings': 10481},
 {'title': 'M\xa0(1931)',
  'audience_score': 9

In [20]:
# Create a dataframe from the list to be merged with the provided one.
df = pd.DataFrame(lst)
df.head()

Unnamed: 0,title,audience_score,number_of_audience_ratings
0,12 Angry Men (Twelve Angry Men) (1957),97,103672
1,The 39 Steps (1935),86,23647
2,The Adventures of Robin Hood (1938),89,33584
3,All About Eve (1950),94,44564
4,All Quiet on the Western Front (1930),89,17768


<font size="4">If you tried to print the movie title you will notice that it has `\xa0` and this is different from the provided dataset. So we have to fix this to be able to merge the two datasets.

In [22]:
df['title'][0]

'12 Angry Men (Twelve Angry Men)\xa0(1957)'

In [25]:
fixed_titles = [title.replace('\xa0', ' ') for title in df['title'].values]
df['title'] = fixed_titles

In [27]:
df['title'][0]

'12 Angry Men (Twelve Angry Men) (1957)'

<font size="5">Merging the two datasets</font>

In [33]:
df_critic = pd.read_csv('bestofrt.tsv', sep='\t')
df_critic.head()

Unnamed: 0,ranking,critic_score,title,number_of_critic_ratings
0,1,99,The Wizard of Oz (1939),110
1,2,100,Citizen Kane (1941),75
2,3,100,The Third Man (1949),77
3,4,99,Get Out (2017),282
4,5,97,Mad Max: Fury Road (2015),370


In [37]:
# Merging on the title column on the left
df_merged = df_critic.merge(df, on='title', how='left')
df_merged.head()

Unnamed: 0,ranking,critic_score,title,number_of_critic_ratings,audience_score,number_of_audience_ratings
0,1,99,The Wizard of Oz (1939),110,89.0,874425.0
1,2,100,Citizen Kane (1941),75,90.0,157274.0
2,3,100,The Third Man (1949),77,93.0,53081.0
3,4,99,Get Out (2017),282,87.0,63837.0
4,5,97,Mad Max: Fury Road (2015),370,86.0,123937.0


In [39]:
# Save the new dataset into a csv file
df_merged.to_csv('critic_audience_ratings.csv', index=False)

### __2. Get text review to create word cloud posters__

<font size="4">Here I am provided with text files links `ebert_review_urls list` where each file corresponds to moview review from `ebert` and the goal is to download these files and create a dataframe of three columns the title, the review url and the review text itself.</font>

In [46]:
ebert_review_urls = ['https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9900_1-the-wizard-of-oz-1939-film/1-the-wizard-of-oz-1939-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9901_2-citizen-kane/2-citizen-kane.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9901_3-the-third-man/3-the-third-man.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9902_4-get-out-film/4-get-out-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9902_5-mad-max-fury-road/5-mad-max-fury-road.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9902_6-the-cabinet-of-dr.-caligari/6-the-cabinet-of-dr.-caligari.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9903_7-all-about-eve/7-all-about-eve.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9903_8-inside-out-2015-film/8-inside-out-2015-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9903_9-the-godfather/9-the-godfather.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9904_10-metropolis-1927-film/10-metropolis-1927-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9904_11-e.t.-the-extra-terrestrial/11-e.t.-the-extra-terrestrial.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9904_12-modern-times-film/12-modern-times-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9904_14-singin-in-the-rain/14-singin-in-the-rain.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9905_15-boyhood-film/15-boyhood-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9905_16-casablanca-film/16-casablanca-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9905_17-moonlight-2016-film/17-moonlight-2016-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9906_18-psycho-1960-film/18-psycho-1960-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9906_19-laura-1944-film/19-laura-1944-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9906_20-nosferatu/20-nosferatu.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9907_21-snow-white-and-the-seven-dwarfs-1937-film/21-snow-white-and-the-seven-dwarfs-1937-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9907_22-a-hard-day27s-night-film/22-a-hard-day27s-night-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9907_23-la-grande-illusion/23-la-grande-illusion.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9908_25-the-battle-of-algiers/25-the-battle-of-algiers.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9908_26-dunkirk-2017-film/26-dunkirk-2017-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9908_27-the-maltese-falcon-1941-film/27-the-maltese-falcon-1941-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9909_29-12-years-a-slave-film/29-12-years-a-slave-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9909_30-gravity-2013-film/30-gravity-2013-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9909_31-sunset-boulevard-film/31-sunset-boulevard-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990a_32-king-kong-1933-film/32-king-kong-1933-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990a_33-spotlight-film/33-spotlight-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990a_34-the-adventures-of-robin-hood/34-the-adventures-of-robin-hood.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990b_35-rashomon/35-rashomon.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990b_36-rear-window/36-rear-window.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990b_37-selma-film/37-selma-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990c_38-taxi-driver/38-taxi-driver.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990c_39-toy-story-3/39-toy-story-3.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990c_40-argo-2012-film/40-argo-2012-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990d_41-toy-story-2/41-toy-story-2.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990d_42-the-big-sick/42-the-big-sick.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990d_43-bride-of-frankenstein/43-bride-of-frankenstein.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990d_44-zootopia/44-zootopia.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990e_45-m-1931-film/45-m-1931-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990e_46-wonder-woman-2017-film/46-wonder-woman-2017-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990e_48-alien-film/48-alien-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990f_49-bicycle-thieves/49-bicycle-thieves.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990f_50-seven-samurai/50-seven-samurai.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad990f_51-the-treasure-of-the-sierra-madre-film/51-the-treasure-of-the-sierra-madre-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9910_52-up-2009-film/52-up-2009-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9910_53-12-angry-men-1957-film/53-12-angry-men-1957-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9910_54-the-400-blows/54-the-400-blows.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9911_55-logan-film/55-logan-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9911_57-army-of-shadows/57-army-of-shadows.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9912_58-arrival-film/58-arrival-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9912_59-baby-driver/59-baby-driver.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9913_60-a-streetcar-named-desire-1951-film/60-a-streetcar-named-desire-1951-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9913_61-the-night-of-the-hunter-film/61-the-night-of-the-hunter-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9913_62-star-wars-the-force-awakens/62-star-wars-the-force-awakens.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9913_63-manchester-by-the-sea-film/63-manchester-by-the-sea-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9914_64-dr.-strangelove/64-dr.-strangelove.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9914_66-vertigo-film/66-vertigo-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9914_67-the-dark-knight-film/67-the-dark-knight-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9915_68-touch-of-evil/68-touch-of-evil.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9915_69-the-babadook/69-the-babadook.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9915_72-rosemary27s-baby-film/72-rosemary27s-baby-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9916_73-finding-nemo/73-finding-nemo.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9916_74-brooklyn-film/74-brooklyn-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9917_75-the-wrestler-2008-film/75-the-wrestler-2008-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9917_77-l.a.-confidential-film/77-l.a.-confidential-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9918_78-gone-with-the-wind-film/78-gone-with-the-wind-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9918_79-the-good-the-bad-and-the-ugly/79-the-good-the-bad-and-the-ugly.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9918_80-skyfall/80-skyfall.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9919_82-tokyo-story/82-tokyo-story.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9919_83-hell-or-high-water-film/83-hell-or-high-water-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9919_84-pinocchio-1940-film/84-pinocchio-1940-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad9919_85-the-jungle-book-2016-film/85-the-jungle-book-2016-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991a_86-la-la-land-film/86-la-la-land-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991b_87-star-trek-film/87-star-trek-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991b_89-apocalypse-now/89-apocalypse-now.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991c_90-on-the-waterfront/90-on-the-waterfront.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991c_91-the-wages-of-fear/91-the-wages-of-fear.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991c_92-the-last-picture-show/92-the-last-picture-show.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991d_93-harry-potter-and-the-deathly-hallows-part-2/93-harry-potter-and-the-deathly-hallows-part-2.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991d_94-the-grapes-of-wrath-film/94-the-grapes-of-wrath-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991d_96-man-on-wire/96-man-on-wire.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991e_97-jaws-film/97-jaws-film.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991e_98-toy-story/98-toy-story.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991e_99-the-godfather-part-ii/99-the-godfather-part-ii.txt',
                     'https://d17h27t6h515a5.cloudfront.net/topher/2017/September/59ad991e_100-battleship-potemkin/100-battleship-potemkin.txt']

In [54]:
# First download the review files over HTTP requests and save them in ebert_reviews folder
text_reviews_foldername = 'ebert_reviews'
if text_reviews_foldername not in os.listdir():
    os.makedirs(text_reviews_foldername)

for file_url in ebert_review_urls:
    response = requests.get(file_url)
    with open(os.path.join(text_reviews_foldername, file_url.split('/')[-1]), 'wb') as fp:
        fp.write(response.content) # 'wb' because the file might contain non ascii characters
                                    # and this would raise an error


In [60]:
# The next step is to parse the text files to get the desired info for the dataframe

review_files = [os.path.join(text_reviews_foldername, file) for file in os.listdir(text_reviews_foldername)]

lst = [] # empty list for the dataframe

for file in review_files:
    with open(file, encoding='utf-8') as fp: # as the file contains non ascii characters
        title = fp.readline()[:-1] # to exclude the newline character '\n'
        review_url = fp.readline()[:-1]
        review_text = fp.read()
        lst.append({'title':title, 
                       'review_url':review_url, 
                       'review_text':review_text})
        
df = pd.DataFrame(lst)
df.head()

Unnamed: 0,title,review_url,review_text
0,The Wizard of Oz (1939),http://www.rogerebert.com/reviews/great-movie-...,As a child I simply did not notice whether a m...
1,Metropolis (1927),http://www.rogerebert.com/reviews/great-movie-...,The opening shots of the restored “Metropolis”...
2,Battleship Potemkin (1925),http://www.rogerebert.com/reviews/great-movie-...,"""The Battleship Potemkin” has been so famous f..."
3,E.T. The Extra-Terrestrial (1982),http://www.rogerebert.com/reviews/great-movie-...,Dear Raven and Emil:\n\nSunday we sat on the b...
4,Modern Times (1936),http://www.rogerebert.com/reviews/modern-times...,"A lot of movies are said to be timeless, but s..."


In [62]:
# save the dataframe 
df.to_csv('ebert_reviews.csv', index=False)

### __3. Get the movies posters for the word cloud__

<font size="4">I am gonna get them from wiki using wiki api. There two choices here whether to access the api directly or to use an access library like `wptools` which I gonna use here.
The thing is I am gonna call the api and retrieve the wiki page of each movie in `JSON` format. Then I am gonna parse them to get the urls for the movie posters, then get them over http requests and store them in `bestofrt_posters` folder.
Here I am provided with list of wiki pages names of each movie that passed to the api to retreive the page.

In [69]:
title_list = [
 'The_Wizard_of_Oz_(1939_film)',
 'Citizen_Kane',
 'The_Third_Man',
 'Get_Out_(film)',
 'Mad_Max:_Fury_Road',
 'The_Cabinet_of_Dr._Caligari',
 'All_About_Eve',
 'Inside_Out_(2015_film)',
 'The_Godfather',
 'Metropolis_(1927_film)',
 'E.T._the_Extra-Terrestrial',
 'Modern_Times_(film)',
 'It_Happened_One_Night',
 "Singin'_in_the_Rain",
 'Boyhood_(film)',
 'Casablanca_(film)',
 'Moonlight_(2016_film)',
 'Psycho_(1960_film)',
 'Laura_(1944_film)',
 'Nosferatu',
 'Snow_White_and_the_Seven_Dwarfs_(1937_film)',
 "A_Hard_Day%27s_Night_(film)",
 'La_Grande_Illusion',
 'North_by_Northwest',
 'The_Battle_of_Algiers',
 'Dunkirk_(2017_film)',
 'The_Maltese_Falcon_(1941_film)',
 'Repulsion_(film)',
 '12_Years_a_Slave_(film)',
 'Gravity_(2013_film)',
 'Sunset_Boulevard_(film)',
 'King_Kong_(1933_film)',
 'Spotlight_(film)',
 'The_Adventures_of_Robin_Hood',
 'Rashomon',
 'Rear_Window',
 'Selma_(film)',
 'Taxi_Driver',
 'Toy_Story_3',
 'Argo_(2012_film)',
 'Toy_Story_2',
 'The_Big_Sick',
 'Bride_of_Frankenstein',
 'Zootopia',
 'M_(1931_film)',
 'Wonder_Woman_(2017_film)',
 'The_Philadelphia_Story_(film)',
 'Alien_(film)',
 'Bicycle_Thieves',
 'Seven_Samurai',
 'The_Treasure_of_the_Sierra_Madre_(film)',
 'Up_(2009_film)',
 '12_Angry_Men_(1957_film)',
 'The_400_Blows',
 'Logan_(film)',
 'All_Quiet_on_the_Western_Front_(1930_film)',
 'Army_of_Shadows',
 'Arrival_(film)',
 'Baby_Driver',
 'A_Streetcar_Named_Desire_(1951_film)',
 'The_Night_of_the_Hunter_(film)',
 'Star_Wars:_The_Force_Awakens',
 'Manchester_by_the_Sea_(film)',
 'Dr._Strangelove',
 'Frankenstein_(1931_film)',
 'Vertigo_(film)',
 'The_Dark_Knight_(film)',
 'Touch_of_Evil',
 'The_Babadook',
 'The_Conformist_(film)',
 'Rebecca_(1940_film)',
 "Rosemary%27s_Baby_(film)",
 'Finding_Nemo',
 'Brooklyn_(film)',
 'The_Wrestler_(2008_film)',
 'The_39_Steps_(1935_film)',
 'L.A._Confidential_(film)',
 'Gone_with_the_Wind_(film)',
 'The_Good,_the_Bad_and_the_Ugly',
 'Skyfall',
 'Rome,_Open_City',
 'Tokyo_Story',
 'Hell_or_High_Water_(film)',
 'Pinocchio_(1940_film)',
 'The_Jungle_Book_(2016_film)',
 'La_La_Land_(film)',
 'Star_Trek_(film)',
 'High_Noon',
 'Apocalypse_Now',
 'On_the_Waterfront',
 'The_Wages_of_Fear',
 'The_Last_Picture_Show',
 'Harry_Potter_and_the_Deathly_Hallows_–_Part_2',
 'The_Grapes_of_Wrath_(film)',
 'Roman_Holiday',
 'Man_on_Wire',
 'Jaws_(film)',
 'Toy_Story',
 'The_Godfather_Part_II',
 'Battleship_Potemkin'
]

In [73]:
posters_foldername = 'bestofrt_posters'
if posters_foldername not in os.listdir():
    os.makedirs(posters_foldername)

df_list = []
image_errors = {} # to append failed requests
# Not to get 403 status code pass this header to the http request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

for rank, movie in enumerate(title_list):
    try:
        page = wptools.page(movie, silent=True).get()
        img_url = page.data['image'][0]['url']  # page.data is dictionary
        img = requests.get(img_url, headers=headers) # get the image over http

        '''
        if img.status_code != 200:
            print(f'could not get {movie}')
            continue
        '''
        i = Image.open(BytesIO(img.content))
        img_format = img_url.split('.')[-1]
        i.save(os.path.join(posters_foldername, str(rank+1)+'_'+movie+'.'+img_format))
        df_list.append({'ranking':rank+1, 
                        'title': movie, 
                        'poster_url':img_url})
    except Exception as e:
        print(str(rank+1) + "_" + movie + ": " + str(e))
        try:
            image_errors[movie] = page.data['image']
        except:
            image_errors[movie] = page.data
        

10_Metropolis_(1927_film): https://en.wikipedia.org/w/api.php?action=query&exintro&formatversion=2&inprop=url|watchers&list=random&pithumbsize=240&pllimit=500&ppprop=disambiguation|wikibase_item&prop=extracts|info|links|pageassessments|pageimages|pageprops|pageterms|redirects&redirects&rdlimit=500&rnlimit=1&rnnamespace=0&titles=Metropolis%20%281927%20film%29&plcontinue=49696|0|Mah_Nà_Mah_Nà
15_Boyhood_(film): 'image'


API error: {'code': 'invalidtitle', 'info': 'Bad title "A_Hard_Day%27s_Night_(film)".', 'docref': 'See https://en.wikipedia.org/w/api.php for API usage. Subscribe to the mediawiki-api-announce mailing list at &lt;https://lists.wikimedia.org/postorius/lists/mediawiki-api-announce.lists.wikimedia.org/&gt; for notice of API deprecations and breaking changes.'}


22_A_Hard_Day%27s_Night_(film): https://en.wikipedia.org/w/api.php?action=parse&formatversion=2&contentmodel=text&disableeditsection=&disablelimitreport=&disabletoc=&prop=text|iwlinks|parsetree|wikitext|displaytitle|properties&redirects&page=A_Hard_Day%2527s_Night_%28film%29
67_The_Dark_Knight_(film): https://en.wikipedia.org/w/api.php?action=query&exintro&formatversion=2&inprop=url|watchers&list=random&pithumbsize=240&pllimit=500&ppprop=disambiguation|wikibase_item&prop=extracts|info|links|pageassessments|pageimages|pageprops|pageterms|redirects&redirects&rdlimit=500&rnlimit=1&rnnamespace=0&titles=The%20Dark%20Knight&plcontinue=4276475|0|Timothée_Chalamet


API error: {'code': 'invalidtitle', 'info': 'Bad title "Rosemary%27s_Baby_(film)".', 'docref': 'See https://en.wikipedia.org/w/api.php for API usage. Subscribe to the mediawiki-api-announce mailing list at &lt;https://lists.wikimedia.org/postorius/lists/mediawiki-api-announce.lists.wikimedia.org/&gt; for notice of API deprecations and breaking changes.'}


72_Rosemary%27s_Baby_(film): https://en.wikipedia.org/w/api.php?action=parse&formatversion=2&contentmodel=text&disableeditsection=&disablelimitreport=&disabletoc=&prop=text|iwlinks|parsetree|wikitext|displaytitle|properties&redirects&page=Rosemary%2527s_Baby_%28film%29
87_Star_Trek_(film): cannot identify image file <_io.BytesIO object at 0x00000161B6FDEAC0>


<font size="5">Issues need to be fixed manually</font>

In [79]:
for title, _ in image_errors.items():
    if title == 'Metropolis_(1927_film)':
        url = 'https://upload.wikimedia.org/wikipedia/en/9/97/Metropolis_%28German_three-sheet_poster%29.jpg'
    if title == 'Boyhood_(film)':
        url = 'https://upload.wikimedia.org/wikipedia/en/a/a6/Boyhood_%282014%29.png'

    if title == 'A_Hard_Day%27s_Night_(film)':
        url = 'https://upload.wikimedia.org/wikipedia/en/4/47/A_Hard_Days_night_movieposter.jpg'

    if title == 'The_Dark_Knight_(film)':
        url = 'https://upload.wikimedia.org/wikipedia/en/1/1c/The_Dark_Knight_%282008_film%29.jpg'

    if title == 'Rosemary%27s_Baby_(film)':
        url = 'https://upload.wikimedia.org/wikipedia/en/e/ef/Rosemarys_baby_poster.jpg'

    if title == 'Star_Trek_(film)':
        url = 'https://upload.wikimedia.org/wikipedia/commons/4/4b/Star_trek_the_motion_picture_logo.png'
        
    
    rank = int(title_list.index(title)+1)
    df_list.append({'ranking': rank, 
                    'title': title[3:], 
                    'poster_url': url})
    r = requests.get(url, headers=headers)
    # Download movie poster image
    i = Image.open(BytesIO(r.content))
    image_file_format = url.split('.')[-1]
    i.save(posters_foldername + "/" + str(rank)+'_'+title + '.' + image_file_format)
    

In [81]:
df = pd.DataFrame(df_list)
df.head()

Unnamed: 0,ranking,title,poster_url
0,1,The_Wizard_of_Oz_(1939_film),https://upload.wikimedia.org/wikipedia/commons...
1,2,Citizen_Kane,https://upload.wikimedia.org/wikipedia/commons...
2,3,The_Third_Man,https://upload.wikimedia.org/wikipedia/commons...
3,4,Get_Out_(film),https://upload.wikimedia.org/wikipedia/en/a/a3...
4,5,Mad_Max:_Fury_Road,https://upload.wikimedia.org/wikipedia/en/6/6e...


In [83]:
df.to_csv('posters.tsv', sep='\t', index=False)

# __Assessing The Data__

In [89]:
bestofrt_df = pd.read_csv('critic_audience_ratings.csv')
bestofrt_df.isnull().sum()

ranking                       0
critic_score                  0
title                         0
number_of_critic_ratings      0
audience_score                3
number_of_audience_ratings    3
dtype: int64

In [108]:
bestofrt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ranking                     100 non-null    int64  
 1   critic_score                100 non-null    int64  
 2   title                       100 non-null    object 
 3   number_of_critic_ratings    100 non-null    int64  
 4   audience_score              97 non-null     float64
 5   number_of_audience_ratings  97 non-null     float64
dtypes: float64(2), int64(3), object(1)
memory usage: 4.8+ KB


In [91]:
bestofrt_df['ranking'].nunique()

100

In [93]:
posters_df = pd.read_csv('posters.tsv', sep='\t')
posters_df.shape

(101, 3)

In [97]:
posters_df.duplicated().sum()

1

In [99]:
posters_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ranking     101 non-null    int64 
 1   title       101 non-null    object
 2   poster_url  101 non-null    object
dtypes: int64(1), object(2)
memory usage: 2.5+ KB


In [101]:
ebert_reviews_df = pd.read_csv('ebert_reviews.csv')
ebert_reviews_df.shape

(88, 3)

In [103]:
ebert_reviews_df.isnull().sum()

title          0
review_url     0
review_text    0
dtype: int64

# __Cleaning Data__

In [106]:
posters_df.drop_duplicates(inplace=True)

In [110]:
bestofrt_posters_df = bestofrt_df.merge(posters_df, on='ranking', how='left')
bestofrt_posters_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ranking                     100 non-null    int64  
 1   critic_score                100 non-null    int64  
 2   title_x                     100 non-null    object 
 3   number_of_critic_ratings    100 non-null    int64  
 4   audience_score              97 non-null     float64
 5   number_of_audience_ratings  97 non-null     float64
 6   title_y                     100 non-null    object 
 7   poster_url                  100 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 6.4+ KB


In [112]:
bestofrt_posters_df = bestofrt_posters_df.rename(columns={'title_x':'title'})

In [116]:
bestofrt_posters_reviews_df = bestofrt_posters_df.merge(ebert_reviews_df, on='title', how='left')
bestofrt_posters_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ranking                     100 non-null    int64  
 1   critic_score                100 non-null    int64  
 2   title                       100 non-null    object 
 3   number_of_critic_ratings    100 non-null    int64  
 4   audience_score              97 non-null     float64
 5   number_of_audience_ratings  97 non-null     float64
 6   title_y                     100 non-null    object 
 7   poster_url                  100 non-null    object 
 8   review_url                  88 non-null     object 
 9   review_text                 88 non-null     object 
dtypes: float64(2), int64(3), object(5)
memory usage: 7.9+ KB


In [118]:
bestofrt_posters_reviews_df.to_csv('complete_dataset.csv', index=False)

# __The Word Cloud__

In [127]:
wordcloud_foldername = 'Wordclouds_posters'
if wordcloud_foldername not in os.listdir():
    os.makedirs(wordcloud_foldername)

df = pd.read_csv('complete_dataset.csv', usecols=['ranking', 'review_text'])
df.set_index('ranking', inplace=True)

In [129]:
df.isnull().sum()

review_text    12
dtype: int64

In [131]:
df.dropna(inplace=True)

In [133]:
df.isnull().sum()

review_text    0
dtype: int64

In [141]:
# Rename the poster file names to ease parsing
posters_foldername = 'bestofrt_posters'
masks = os.listdir(posters_foldername)
for file in masks:
    if file[:3].isdecimal():
        continue
    elif file[:2].isdecimal():
        os.rename(os.path.join(posters_foldername, file), os.path.join(posters_foldername, '0'+file))
    elif file[:1].isdecimal():
        os.rename(os.path.join(posters_foldername, file), os.path.join(posters_foldername, '00'+file))


In [143]:
def get_wordcloud(img, output_path, new_size, review_text):
    
    # Create a new image with the desired size and a white background
    new_img = Image.new("RGB", new_size, (0, 0, 0))
    mask_img = Image.new("RGB", img.size, (255, 255, 255))
    # Calculate the position to paste the original image onto the new image
    img.thumbnail(new_size)  # Resize the image while maintaining aspect ratio
    x = (new_size[0] - img.width) // 2
    y = (new_size[1] - img.height) // 2

    # Paste the original image onto the new image
    #new_img.paste(img, (x, y))
    new_img.paste(mask_img, (x, y))


    poster_mask = np.array(new_img)
    wordcloud = WordCloud(width=800, height=400, background_color='white', mask=poster_mask, contour_width=3, contour_color='black').generate(review_text)
    # Convert the WordCloud object to a PIL Image
    wordcloud_image = wordcloud.to_image()


    
    wordcloud_image.paste(img, (x, y))
    # Save the reshaped image
    wordcloud_image.save(output_path)




In [None]:
masks = os.listdir(posters_foldername)

for poster in masks:
    try:
        poster_mask = Image.open(os.path.join(posters_foldername, poster))
        rank = int(poster[:3])
        review_text = df.loc[rank]['review_text']
    
        get_wordcloud(poster_mask, os.path.join(wordcloud_foldername, poster), (400, 600), review_text)
    except:
        print(f'could not open {poster}')

# __TODO:__ Enhance Image Processing