### Import Libraries

In [87]:
#Libraries
import numpy as np
import pandas as pd
from ast import literal_eval

### Ignore Warnings

In [88]:
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. deprecations), so
# consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings("ignore")

### Import Data

In [89]:
# Load "Movies Clean.csv" (path relative to the working directory) into a DataFrame.
movies = pd.read_csv("Movies Clean.csv")

In [90]:
# Print the column names, non-null counts and dtypes of the loaded frame.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89264 entries, 0 to 89263
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movieId               89264 non-null  float64
 1   userId                89264 non-null  float64
 2   rating                89264 non-null  float64
 3   movie_title           89264 non-null  object 
 4   budget                89264 non-null  int64  
 5   genres_y              89264 non-null  object 
 6   imdb_id               89264 non-null  object 
 7   original_language     89264 non-null  object 
 8   popularity            89264 non-null  float64
 9   production_companies  89264 non-null  object 
 10  production_countries  89264 non-null  object 
 11  release_date          89264 non-null  object 
 12  revenue               89264 non-null  float64
 13  runtime               89264 non-null  float64
 14  spoken_languages      89264 non-null  object 
 15  status             

### Quality of Movies to be on Top Chart

In [91]:
# Parse the stringified genre records in 'genres_y' and keep only each
# record's 'name'; missing values are treated as an empty list.
movies['genres'] = (
    movies['genres_y']
    .fillna('[]')
    .apply(literal_eval)
    .apply(
        lambda records: [record['name'] for record in records]
        if isinstance(records, list)
        else []
    )
)

In [92]:
# Vote counts and vote averages with missing values dropped, cast to int
# (the cast truncates any fractional part of the averages).
vote_counts = movies['vote_count'].dropna().astype('int')
vote_averages = movies['vote_average'].dropna().astype('int')

# C is the mean of the integer vote averages across all rows.
C = vote_averages.mean()
C

6.175871571966302

In [93]:
# 95th percentile of the vote counts; the later cells use `m` as the
# minimum number of votes a row needs to qualify.
m = vote_counts.quantile(0.95)
m

5091.0

In [94]:
# Separate the year from the release date.
# Unparseable dates are coerced to NaT by to_datetime. NaN/NaT never compare
# equal to anything, so an `x != np.nan` guard is always True and would turn
# NaT into the literal string 'NaT'; pd.notnull detects the missing value.
release_dates = pd.to_datetime(movies['release_date'], errors='coerce')
movies['year'] = release_dates.apply(lambda ts: str(ts.year) if pd.notnull(ts) else np.nan)

In [95]:
# Rows eligible for the chart: vote_count at or above the cutoff m, with both
# vote fields present. astype with a mapping casts both columns in one step.
qualified = movies.loc[
    movies['vote_count'].ge(m)
    & movies['vote_count'].notnull()
    & movies['vote_average'].notnull(),
    ['movie_title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres', 'rating'],
].astype({'vote_count': 'int', 'vote_average': 'int'})
qualified.shape

(4483, 7)

To qualify for the chart, a movie must have at least 5091 votes (the 95th percentile of vote counts); for reference, the mean vote average across the dataset is 6.1758. With this cutoff, 4483 rows (one per user rating, so the same movie can appear several times) qualify for the top chart.

In [96]:
# Write the qualified rows to CSV, leaving out the DataFrame index.
# NOTE(review): `writer` is not used in the visible cells — confirm before removing.
from csv import writer
qualified.to_csv('Qualified Movies.csv', index=False)

In [97]:
# Read back the CSV written by the previous cell.
qualifiedMovies = pd.read_csv("Qualified Movies.csv")

In [98]:
# Show every row that exactly duplicates another row (all copies are listed).
duplicate_rows = qualifiedMovies.duplicated(keep=False)
qualifiedMovies[duplicate_rows]

Unnamed: 0,movie_title,year,vote_count,vote_average,popularity,genres,rating
0,Toy Story,1995,5415,7,21.946943,"['Animation', 'Comedy', 'Family']",3.0
1,Toy Story,1995,5415,7,21.946943,"['Animation', 'Comedy', 'Family']",4.0
2,Toy Story,1995,5415,7,21.946943,"['Animation', 'Comedy', 'Family']",5.0
3,Toy Story,1995,5415,7,21.946943,"['Animation', 'Comedy', 'Family']",2.0
4,Toy Story,1995,5415,7,21.946943,"['Animation', 'Comedy', 'Family']",3.0
...,...,...,...,...,...,...,...
4478,Beauty and the Beast,2017,5530,6,287.253654,"['Family', 'Fantasy', 'Romance']",4.0
4479,Beauty and the Beast,2017,5530,6,287.253654,"['Family', 'Fantasy', 'Romance']",5.0
4480,Beauty and the Beast,2017,5530,6,287.253654,"['Family', 'Fantasy', 'Romance']",3.5
4481,Beauty and the Beast,2017,5530,6,287.253654,"['Family', 'Fantasy', 'Romance']",2.0


In [99]:
# Remove exact duplicate rows, keeping the first occurrence, then confirm
# that no duplicates remain (the result should be empty).
qualifiedMovies = qualifiedMovies.drop_duplicates(keep='first')
qualifiedMovies[qualifiedMovies.duplicated(keep=False)]

Unnamed: 0,movie_title,year,vote_count,vote_average,popularity,genres,rating


### Top Movies based on Rating, Popularity, and Vote

In [100]:
# Order by rating, then popularity, then vote average (all descending)
# and show the first ten rows.
qualifiedMovies = qualifiedMovies.sort_values(
    by=['rating', 'popularity', 'vote_average'],
    ascending=False,
)
qualifiedMovies.head(10)

Unnamed: 0,movie_title,year,vote_count,vote_average,popularity,genres,rating
4303,Beauty and the Beast,2017,5530,6,287.253654,"['Family', 'Fantasy', 'Romance']",5.0
4119,Big Hero 6,2014,6289,7,213.849907,"['Adventure', 'Family', 'Animation', 'Action',...",5.0
3308,Avatar,2009,12114,7,185.070892,"['Action', 'Adventure', 'Fantasy', 'Science Fi...",5.0
4050,Gone Girl,2014,6023,7,154.801009,"['Mystery', 'Thriller', 'Drama']",5.0
4175,The Hunger Games: Mockingjay - Part 1,2014,5767,6,147.098006,"['Science Fiction', 'Adventure', 'Thriller']",5.0
249,Pulp Fiction,1994,8670,8,140.950236,"['Thriller', 'Crime']",5.0
1505,Fight Club,1999,9678,8,63.869599,['Drama'],5.0
4075,Guardians of the Galaxy,2014,10014,7,53.291601,"['Action', 'Science Fiction', 'Adventure']",5.0
572,Forrest Gump,1994,8147,8,48.307194,"['Comedy', 'Drama', 'Romance']",5.0
2364,Pirates of the Caribbean: The Curse of the Bla...,2003,7191,7,47.326665,"['Adventure', 'Fantasy', 'Action']",5.0


The top movie based on rating, popularity, and vote average is Beauty and the Beast (released in 2017), which has the highest popularity at 287.2537, followed by Big Hero 6 and Avatar.

### Top Movies based on Genres

In [121]:
# Separate genres: build one row per (movie row, genre).
# Series.explode emits one entry per list element and repeats the original
# index label, so the index-aligned join below duplicates each movie row once
# per genre. Rows with an empty genre list end up with a missing 'genre'.
# This avoids constructing a pd.Series for every row via DataFrame.apply.
s = movies['genres'].explode()
s.name = 'genre'
gen_md = movies.drop('genres', axis=1).join(s)

In [133]:
def build_chart(genre=None, percentile=0.85):
  """Return the rows of ``gen_md`` that meet the vote-count cutoff ``m``.

  Parameters
  ----------
  genre : optional
      Accepted for interface compatibility; it is not used for filtering.
      The result covers every genre, and callers filter on the 'genre'
      column afterwards.
  percentile : float, default 0.85
      Accepted for interface compatibility; it is not used. The cutoff
      applied is the notebook-level ``m``.

  Returns
  -------
  pandas.DataFrame
      Columns movie_title, year, vote_count, vote_average, popularity,
      genre and rating, with vote_count and vote_average cast to int.
  """
  keep = (
    (gen_md['vote_count'] >= m)
    & gen_md['vote_count'].notnull()
    & gen_md['vote_average'].notnull()
  )
  columns = ['movie_title', 'year', 'vote_count', 'vote_average', 'popularity', 'genre', 'rating']
  # astype with a mapping returns a new frame, so nothing is assigned into a slice.
  return gen_md.loc[keep, columns].astype({'vote_count': 'int', 'vote_average': 'int'})

#write to csv
from csv import writer
# Export the frame returned by build_chart. The notebook-level `qualified`
# frame carries the list-valued 'genres' column rather than 'genre', so
# exporting it would leave the later genre filter without its column.
build_chart().to_csv('Build Genre Chart.csv', index=False)

In [134]:
# Reload the exported genre chart and remove exact duplicate rows,
# keeping the first occurrence of each.
genreChart = pd.read_csv("Build Genre Chart.csv").drop_duplicates(keep='first')

In [138]:
# Order the genre chart by rating, then popularity, then vote average (all descending).
genreChart = genreChart.sort_values(
    by=['rating', 'popularity', 'vote_average'],
    ascending=False,
)

In [150]:
# First three Comedy rows of genreChart, in its current order.
genreChart.loc[genreChart['genre'] == 'Comedy'].head(3)

Unnamed: 0,movie_title,year,vote_count,vote_average,popularity,genre,rating
11887,Big Hero 6,2014,6289,7,213.849907,Comedy,5.0
1392,Forrest Gump,1994,8147,8,48.307194,Comedy,5.0
12185,Kingsman: The Secret Service,2015,6069,7,28.224212,Comedy,5.0


The top comedy movie according to our rating and popularity ordering is Big Hero 6, with a rating of 5.0 and a popularity of 213.85, followed by Forrest Gump and Kingsman: The Secret Service.