In [None]:
import pickle
import os


# Name of the pickle file (inside the local data/ directory) that holds the movie records.
MOVIES_DATA_FILE = 'movies-with-num-rev-9267-15-01-2022.pkl'

# NOTE: pickle.load can execute arbitrary code while unpickling; only load files
# that you produced yourself or otherwise trust.
# The context manager guarantees the file handle is closed, even if unpickling fails.
with open(os.path.join('data', MOVIES_DATA_FILE), 'rb') as movies_file:
    movies = pickle.load(movies_file)
print('Number of movies', len(movies))

In [None]:
from datetime import datetime

# Parse the raw movie records into parallel lists (one entry per kept movie).
# Should probably use Pandas here instead
parsed_movies_scores = []
parsed_movies_release_dates = []
parsed_movies_titles = []
parsed_movies_number_reviews = []
parsed_movies_genres = []
# Movies that were filtered out or whose fields could not be parsed.
skipped_movies = []

# Maps the rating-count label found in a record's 'number_of_reviews' field to the
# integer used for filtering. The "Fewer than 50" labels map to 1.
DIFFERENT_REVIEW_TEXT = {
    '500+ Ratings': 500,
    '5,000+ Verified Ratings': 5000,
    '50,000+ Ratings': 50000,
    '5,000+ Ratings': 5000,
    '2,500+ Verified Ratings': 2500,
    '250,000+ Ratings': 250000,
    '50,000+ Verified Ratings': 50000,
    '25,000+ Verified Ratings': 25000,
    '100+ Verified Ratings': 100,
    '1,000+ Verified Ratings': 1000,
    '1,000+ Ratings': 1000,
    '50+ Verified Ratings': 50,
    '10,000+ Verified Ratings': 10000,
    'Fewer than 50 Ratings': 1,
    'Fewer than 50 Verified Ratings': 1,
    '500+ Verified Ratings': 500,
    '2,500+ Ratings': 2500,
    '100+ Ratings': 100,
    '10,000+ Ratings': 10000,
    '50+ Ratings': 50,
    '100,000+ Ratings': 100000,
    '250+ Ratings': 250,
    '25,000+ Ratings': 25000,
    '250+ Verified Ratings': 250,
}

# Filter thresholds: movies below either one are skipped.
MIN_NUMBER_OF_REVIEWS = 250
EARLIEST_RELEASE_DATE = datetime(2000, 1, 1)

for movie in movies:
    # Only the field parsing lives inside the try block, and only the errors a
    # missing or malformed field can raise are caught. A bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit and hide genuine programming errors.
    try:
        name = movie['page_title'].replace(' - Rotten Tomatoes', '')
        audience_score = int(movie['audience_score'])
        release_date = datetime.strptime(movie['release_date'], '%b %d, %Y')
        num_reviews = DIFFERENT_REVIEW_TEXT[movie['number_of_reviews']]
        genre = movie['genre']
    except (KeyError, ValueError, TypeError, AttributeError):
        skipped_movies.append(movie)
        continue

    # Drop movies with too few ratings or released before the cutoff date.
    if num_reviews < MIN_NUMBER_OF_REVIEWS or release_date < EARLIEST_RELEASE_DATE:
        skipped_movies.append(movie)
        continue

    parsed_movies_genres.append(genre)
    parsed_movies_release_dates.append(release_date)
    parsed_movies_scores.append(audience_score)
    parsed_movies_titles.append(name)
    parsed_movies_number_reviews.append(num_reviews)

print('skipped', len(skipped_movies))





In [None]:
from bokeh.plotting import figure, show, ColumnDataSource, output_file
from bokeh.layouts import column
from bokeh.models import CustomJS, DateRangeSlider, Slider
from datetime import date


# Columns backing the scatter plot: x/y drive the glyph positions, the remaining
# columns are only referenced by the hover tooltips.
data = dict(
    x=parsed_movies_release_dates,
    y=parsed_movies_scores,
    parsed_movies_number_reviews=parsed_movies_number_reviews,
    parsed_movies_titles=parsed_movies_titles,
    # Pre-formatted date strings for display in the tooltip.
    parsed_movies_release_dates=[
        released.strftime("%m/%d/%Y") for released in parsed_movies_release_dates
    ],
    parsed_movies_genres=parsed_movies_genres,
)
source = ColumnDataSource(data=data)
# Second source built from the same data; it is not used by the plot below.
# NOTE(review): presumably kept as an unfiltered copy for slider callbacks - confirm.
original_source = ColumnDataSource(data=data)

# "@name" reads the hovered point's value from the data source column `name`.
# "$y" would instead show the mouse cursor's y coordinate, so the score uses "@y".
tooltips = [
    ("Title", "@parsed_movies_titles"),
    ("Audience Score", "@y"),
    ("Release Date", "@parsed_movies_release_dates"),
    ("Number of ratings", "@parsed_movies_number_reviews"),
    ("Genres", "@parsed_movies_genres"),
]

# Make sure the output directory exists so writing the HTML file does not fail.
os.makedirs('graphs', exist_ok=True)
output_file(os.path.join('graphs', "movies.html"))

plot = figure(
    title="Movies by Rotten Tomatoes audience score - Year 2000 onwards",
    x_axis_label='Release Date',
    y_axis_label='Audience Score',
    x_axis_type='datetime',
    tooltips=tooltips,
)

plot.circle('x', 'y', source=source)

show(plot)



