In [69]:
import ast
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

In [70]:
# Load 'tmdb_5000_movies.csv' (path is relative to the working directory)
# and preview the first three rows.
movieData = pd.read_csv(filepath_or_buffer='tmdb_5000_movies.csv')
movieData.head(n=3)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


In [71]:
# Show the column labels of the loaded frame.
movieData.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [72]:
def resolve_data(data):
    """Parse every serialized value in ``data`` into a Python object.

    Each item is expected to be a string holding a serialized list of
    dicts (e.g. ``'[{"id": 28, "name": "Action"}]'``).  The text is parsed
    as JSON first, so JSON-only tokens such as ``null``/``true``/``false``
    and JSON string escapes are decoded correctly.  Items that are not
    valid JSON (for example Python-literal text using single quotes) fall
    back to ``ast.literal_eval``, which is what this function used
    exclusively before.

    Parameters
    ----------
    data : iterable
        Iterable of serialized values (a pandas Series works).

    Returns
    -------
    list
        One parsed object per input item, in input order.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when an item is neither valid
        JSON nor a valid Python literal.
    """

    def _parse(item):
        try:
            return json.loads(item)
        except (TypeError, ValueError):
            # json.JSONDecodeError subclasses ValueError; TypeError covers
            # non-string items.  Defer to the original parser in both cases
            # so previously accepted input keeps working.
            return ast.literal_eval(item)

    return [_parse(item) for item in data]

def get_names(data):
    """Return one comma-separated string of ``'name'`` values per item.

    Every item of ``data`` is parsed with ``resolve_data`` into a sequence
    of dicts; the ``'name'`` entry of each dict is collected and the
    values are joined with ``', '``.  An item that parses to an empty
    sequence yields an empty string.
    """
    joined = []
    for entries in resolve_data(data):
        labels = [entry['name'] for entry in entries]
        joined.append(', '.join(labels))
    return joined

In [None]:
# Replace each serialized-list column with a comma-separated string of the
# 'name' values it contains.
# NOTE: the columns are overwritten in place, so re-running this cell without
# reloading movieData fails (the values are no longer parseable list text).
for key in (
    'genres',
    'keywords',
    'production_companies',
    'production_countries',
    'spoken_languages',
):
    movieData[key] = get_names(movieData[key])

movieData.head()

In [37]:
# Profit per row: revenue minus budget.
movieData['profit'] = movieData['revenue'].sub(movieData['budget'])

In [41]:
# Narrow the frame to the columns of interest and preview it.
dt = movieData.loc[:, [
    'genres',
    'title',
    'original_language',
    'release_date',
    'vote_average',
    'runtime',
    'profit',
]]
dt.head(5)

Unnamed: 0,genres,title,original_language,release_date,vote_average,runtime,profit
0,"Action, Adventure, Fantasy, Science Fiction",Avatar,en,2009-12-10,7.2,162.0,2550965087
1,"Adventure, Fantasy, Action",Pirates of the Caribbean: At World's End,en,2007-05-19,6.9,169.0,661000000
2,"Action, Adventure, Crime",Spectre,en,2015-10-26,6.3,148.0,635674609
3,"Action, Crime, Drama, Thriller",The Dark Knight Rises,en,2012-07-16,7.6,165.0,834939099
4,"Action, Adventure, Science Fiction",John Carter,en,2012-03-07,6.1,132.0,24139100


In [42]:
# (rows, columns) of the narrowed frame.
dt.shape

(4803, 7)

In [43]:
# Build one row per (movie, genre) pair.
# Split the comma-separated genres string into a list and explode it so each
# list element gets its own row; the original row label is repeated, and the
# Series keeps the name 'genres'.  This replaces `.apply(pd.Series, 1).stack()`,
# which built one Series per row and passed `1` positionally into `apply`.
s = dt['genres'].str.split(', ').explode()

# Join the per-genre rows back onto the remaining columns by index.  The
# combined column is dropped from a copy rather than deleted from `dt`, so
# `dt` is left unchanged and this cell can be re-run safely.
df = dt.drop(columns='genres').join(s)

df.head()

Unnamed: 0,title,original_language,release_date,vote_average,runtime,profit,genres
0,Avatar,en,2009-12-10,7.2,162.0,2550965087,Action
0,Avatar,en,2009-12-10,7.2,162.0,2550965087,Adventure
0,Avatar,en,2009-12-10,7.2,162.0,2550965087,Fantasy
0,Avatar,en,2009-12-10,7.2,162.0,2550965087,Science Fiction
1,Pirates of the Caribbean: At World's End,en,2007-05-19,6.9,169.0,661000000,Adventure


In [44]:
# (rows, columns) after expanding to one row per movie/genre pair.
df.shape

(12188, 7)

In [45]:
# Distinct genre labels.  An empty-string label can appear here for rows whose
# joined genres string was empty.
df['genres'].unique()

array(['Action', 'Adventure', 'Fantasy', 'Science Fiction', 'Crime',
       'Drama', 'Thriller', 'Animation', 'Family', 'Western', 'Comedy',
       'Romance', 'Horror', 'Mystery', 'History', 'War', 'Music',
       'Documentary', 'Foreign', 'TV Movie', ''], dtype=object)

In [46]:
# Number of distinct genre labels (missing values, if any, count as one label).
df['genres'].nunique(dropna=False)

21

In [66]:
# Which genres are most common among highly rated movies?
# Keep only the rows whose vote_average is at least 8.
df1 = df.loc[df['vote_average'].ge(8)]
df1.shape

(197, 7)

In [67]:
# Count the distinct titles in each genre, largest count first.
groupByGenres = (
    df1.groupby('genres')['title']
    .nunique()
    .to_frame()
    .sort_values(by='title', ascending=False)
)
groupByGenres

Unnamed: 0_level_0,title
genres,Unnamed: 1_level_1
Drama,54
Crime,17
Comedy,16
Thriller,15
Adventure,13
Action,12
Fantasy,9
Documentary,8
Family,8
Romance,8


In [68]:
# Horizontal bar chart of the per-genre distinct-title counts held in groupByGenres.
fig, ax = plt.subplots(figsize=(10, 8))
groupByGenres['title'].plot.barh(ax=ax, stacked=True, title='Genres with >= 8 ratings')

# Label both axes so the figure can be read on its own; the trailing
# semicolon suppresses the text repr of the return value.
ax.set(xlabel='Number of distinct titles', ylabel='Genre');