In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval

In [2]:
movies_metadata = pd.read_csv('./movies_metadata.csv')
ratings_small = pd.read_csv('./ratings_small.csv')
keywords = pd.read_csv('./keywords.csv')

  interactivity=interactivity, compiler=compiler, result=result)


### Data Inspection

In [3]:
movies_metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
ratings_small.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [6]:
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [7]:
print(movies_metadata.belongs_to_collection[0])
print(movies_metadata.genres[0])
print(movies_metadata.overview[0])
print(movies_metadata.tagline[1])

{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}
[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]
Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.
Roll the dice and unleash the excitement!


In [8]:
movies_metadata.genres[0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

### Clean metadata

In [9]:
metadata = movies_metadata.copy()
metadata["id"] = pd.to_numeric(metadata["id"], errors="coerce")
metadata = keywords.merge(metadata,on='id')

In [10]:
metadata.head()

Unnamed: 0,id,keywords,adult,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,tt0114709,en,Toy Story,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1...",False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,tt0113497,en,Jumanji,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,tt0113228,en,Grumpier Old Men,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':...",False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0114885,en,Waiting to Exhale,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,tt0113041,en,Father of the Bride Part II,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [11]:
# Drop Unnecessary information
metadata = metadata.drop(['budget','belongs_to_collection','homepage','overview','poster_path',
              'spoken_languages', 'status', 'tagline'],axis = 1)

In [12]:
# Many NA values are associated with animation movies, which is shown as "video".
# Clean Animation Movies
metadata = metadata[metadata['video'] == False]

In [13]:
metadata.isnull().any()

id                      False
keywords                False
adult                   False
genres                  False
imdb_id                  True
original_language        True
original_title          False
popularity              False
production_companies    False
production_countries    False
release_date             True
revenue                 False
runtime                  True
title                   False
video                   False
vote_average            False
vote_count              False
dtype: bool

In [14]:
metadata.head()

Unnamed: 0,id,keywords,adult,genres,imdb_id,original_language,original_title,popularity,production_companies,production_countries,release_date,revenue,runtime,title,video,vote_average,vote_count
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",False,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",tt0114709,en,Toy Story,21.9469,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,Toy Story,False,7.7,5415.0
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1...",False,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",tt0113497,en,Jumanji,17.0155,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,Jumanji,False,6.9,2413.0
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",False,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",tt0113228,en,Grumpier Old Men,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,Grumpier Old Men,False,6.5,92.0
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':...",False,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",tt0114885,en,Waiting to Exhale,3.85949,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,Waiting to Exhale,False,6.1,34.0
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",False,"[{'id': 35, 'name': 'Comedy'}]",tt0113041,en,Father of the Bride Part II,8.38752,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,Father of the Bride Part II,False,5.7,173.0


### Dealing with NA values

In [15]:
metadata.isnull().any()

id                      False
keywords                False
adult                   False
genres                  False
imdb_id                  True
original_language        True
original_title          False
popularity              False
production_companies    False
production_countries    False
release_date             True
revenue                 False
runtime                  True
title                   False
video                   False
vote_average            False
vote_count              False
dtype: bool

In [16]:
print(metadata.imdb_id .isnull().sum(),
      metadata.original_language.isnull().sum(),
      metadata.release_date.isnull().sum(),
      metadata.runtime.isnull().sum())

17 11 80 263


In [17]:
metadata.count()

id                      46383
keywords                46383
adult                   46383
genres                  46383
imdb_id                 46366
original_language       46372
original_title          46383
popularity              46383
production_companies    46383
production_countries    46383
release_date            46303
revenue                 46383
runtime                 46120
title                   46383
video                   46383
vote_average            46383
vote_count              46383
dtype: int64

In [18]:
# Since the number of NA values are so small, we decide to drop them
metadata.dropna(subset = ['imdb_id','original_language','release_date','runtime'],inplace = True)

In [19]:
metadata.isnull().any()

id                      False
keywords                False
adult                   False
genres                  False
imdb_id                 False
original_language       False
original_title          False
popularity              False
production_companies    False
production_countries    False
release_date            False
revenue                 False
runtime                 False
title                   False
video                   False
vote_average            False
vote_count              False
dtype: bool

### Further transformation of format

In [20]:
# Transform genres format
metadata['genres'] = metadata['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [21]:
# Transform production companies format
metadata['production_companies'] = metadata['production_companies'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [22]:
# transform production countries format
metadata['production_countries'] = metadata['production_countries'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [23]:
# transform keywords format
metadata['keywords'] = metadata['keywords'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [24]:
metadata['release_date'] = pd.to_datetime(metadata['release_date'], format='%Y-%m-%d',errors='coerce')

In [25]:
metadata['year'] = metadata['release_date'].dt.year

In [26]:
metadata = metadata.drop('release_date',axis = 1)

In [27]:
metadata

Unnamed: 0,id,keywords,adult,genres,imdb_id,original_language,original_title,popularity,production_companies,production_countries,revenue,runtime,title,video,vote_average,vote_count,year
0,862,"[jealousy, toy, boy, friendship, friends, riva...",False,"[Animation, Comedy, Family]",tt0114709,en,Toy Story,21.9469,[Pixar Animation Studios],[United States of America],373554033.0,81.0,Toy Story,False,7.7,5415.0,1995
1,8844,"[board game, disappearance, based on children'...",False,"[Adventure, Fantasy, Family]",tt0113497,en,Jumanji,17.0155,"[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],262797249.0,104.0,Jumanji,False,6.9,2413.0,1995
2,15602,"[fishing, best friend, duringcreditsstinger, o...",False,"[Romance, Comedy]",tt0113228,en,Grumpier Old Men,11.7129,"[Warner Bros., Lancaster Gate]",[United States of America],0.0,101.0,Grumpier Old Men,False,6.5,92.0,1995
3,31357,"[based on novel, interracial relationship, sin...",False,"[Comedy, Drama, Romance]",tt0114885,en,Waiting to Exhale,3.85949,[Twentieth Century Fox Film Corporation],[United States of America],81452156.0,127.0,Waiting to Exhale,False,6.1,34.0,1995
4,11862,"[baby, midlife crisis, confidence, aging, daug...",False,[Comedy],tt0113041,en,Father of the Bride Part II,8.38752,"[Sandollar Productions, Touchstone Pictures]",[United States of America],76578911.0,106.0,Father of the Bride Part II,False,5.7,173.0,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46476,30840,[],False,"[Drama, Action, Romance]",tt0102797,en,Robin Hood,5.683753,"[Westdeutscher Rundfunk (WDR), Working Title F...","[Canada, Germany, United Kingdom, United State...",0.0,104.0,Robin Hood,False,5.7,26.0,1991
46478,111109,"[artist, play, pinoy]",False,[Drama],tt2028550,tl,Siglo ng Pagluluwal,0.178241,[Sine Olivia],[Philippines],0.0,360.0,Century of Birthing,False,9.0,3.0,2011
46479,67758,[],False,"[Action, Drama, Thriller]",tt0303758,en,Betrayal,0.903007,[American World Pictures],[United States of America],0.0,90.0,Betrayal,False,3.8,6.0,2003
46480,227506,[],False,[],tt0008536,en,Satana likuyushchiy,0.003503,[Yermoliev],[Russia],0.0,87.0,Satan Triumphant,False,0.0,0.0,1917
