In [74]:
import pandas as pd
import numpy as np
import os
import ast
from joblib import Parallel, delayed
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [75]:
# Load the raw movie metadata table from the local data directory.
movies_csv_path = os.path.join('data', 'movies_metadata.csv')
movie_meta_df = pd.read_csv(movies_csv_path)

In [76]:
# Collect every distinct genre name. Each 'genres' cell is parsed with
# ast.literal_eval into a sequence of dicts that carry a 'name' key.
genres = {
    genre_entry['name']
    for raw_genres in movie_meta_df['genres']
    for genre_entry in ast.literal_eval(raw_genres)
}

In [77]:
# Inspect the column names of the raw metadata frame.
movie_meta_df.columns

Index([u'adult', u'belongs_to_collection', u'budget', u'genres', u'homepage',
       u'id', u'imdb_id', u'original_language', u'original_title', u'overview',
       u'popularity', u'poster_path', u'production_companies',
       u'production_countries', u'release_date', u'revenue', u'runtime',
       u'spoken_languages', u'status', u'tagline', u'title', u'video',
       u'vote_average', u'vote_count'],
      dtype='object')

In [78]:
# Drop the columns that are not carried into the combined dataset.
unused_movie_columns = [
    'belongs_to_collection',
    'imdb_id',
    'original_title',
    'overview',
    'poster_path',
    'release_date',
    'title',
    'video',
    'homepage',
    'tagline',
    'vote_average',
    'vote_count',
    'status',
    'popularity',
    'production_companies',
]
movie_meta_df = movie_meta_df.drop(unused_movie_columns, axis=1)

In [79]:
# Preview the first rows of the trimmed movie metadata frame.
movie_meta_df.head()

Unnamed: 0,adult,budget,genres,id,original_language,production_countries,revenue,runtime,spoken_languages
0,False,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]"
1,False,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso..."
2,False,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]"
3,False,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]"
4,False,0,"[{'id': 35, 'name': 'Comedy'}]",11862,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]"


In [81]:
# Keep only the movies whose reported revenue is strictly positive.
has_revenue = movie_meta_df['revenue'] > 0
movie_meta_df = movie_meta_df[has_revenue]

In [105]:
# Cast budget to integers and keep only movies with a positive budget.
# assign() returns a new frame, so the converted column is not written
# into the boolean-mask slice produced by the revenue filter (direct
# column assignment on such a slice raises pandas' SettingWithCopyWarning).
movie_meta_df = movie_meta_df.assign(budget=movie_meta_df['budget'].astype('int'))
movie_meta_df = movie_meta_df[movie_meta_df['budget'] > 0]

In [107]:
# Number of movies left after the revenue and budget filters.
len(movie_meta_df)

5381

In [93]:
# Load the per-movie trailer/video statistics from the local data directory.
trailers_csv_path = os.path.join('data', 'video_metadata.csv')
trailer_meta_df = pd.read_csv(trailers_csv_path)

In [95]:
def _count_trailers(raw_trailers):
    """Return the number of entries in a serialized trailer collection.

    ``raw_trailers`` is one cell of the 'trailers' column. It is parsed with
    ``ast.literal_eval`` and the length of the parsed object is returned.
    Cells that cannot be parsed (e.g. missing values) or whose parsed value
    has no length count as 0 trailers.
    """
    try:
        return len(ast.literal_eval(raw_trailers))
    except (ValueError, SyntaxError, TypeError):
        # Only data-shaped failures are treated as "no trailers"; a bare
        # except would also swallow KeyboardInterrupt and SystemExit.
        return 0


# Replace the serialized 'trailers' column with a plain per-movie count.
n_trailers = [_count_trailers(raw_trailers) for raw_trailers in trailer_meta_df['trailers']]
trailer_meta_df['n_trailers'] = n_trailers
trailer_meta_df = trailer_meta_df.drop('trailers', axis=1)

In [97]:
# Keep only the rows for which at least one trailer was counted.
has_trailer = trailer_meta_df['n_trailers'] > 0
trailer_meta_df = trailer_meta_df[has_trailer]

In [98]:
# Keep only the rows with a strictly positive view count.
has_views = trailer_meta_df['views'] > 0
trailer_meta_df = trailer_meta_df[has_views]

In [99]:
# Number of trailer rows left after the n_trailers and views filters.
len(trailer_meta_df)

22300

In [110]:
# Remove the two 'Unnamed' columns (presumably index columns written by
# earlier CSV round-trips; verify against how video_metadata.csv was built).
unnamed_columns = ['Unnamed: 0', 'Unnamed: 0.1']
trailer_meta_df = trailer_meta_df.drop(unnamed_columns, axis=1)

In [111]:
# Preview the first rows of the cleaned trailer metadata frame.
trailer_meta_df.head()

Unnamed: 0,movieId,imdbId,tmdbId,views,likes,dislikes,n_trailers
0,1,114709,862.0,8050136,6330,1965,3
1,2,113497,8844.0,83048,70,14,1
2,3,113228,15602.0,89669,74,5,1
3,4,114885,31357.0,104948,0,0,1
4,5,113041,11862.0,16281,20,1,1


In [112]:
# Check the dtype of the TMDB id column before it is used as a join key.
trailer_meta_df['tmdbId'].dtype

dtype('float64')

In [114]:
# Cast the movie id to integers so it can be matched against the trailer
# table's integer id. assign() builds a new frame instead of writing into
# the boolean-mask slice left by the budget filter, which avoids pandas'
# SettingWithCopyWarning.
movie_meta_df = movie_meta_df.assign(id=movie_meta_df['id'].astype('int'))

In [115]:
# Add an integer 'id' column derived from the float-typed tmdbId so both
# frames share a join key of the same name.
tmdb_ids_as_int = trailer_meta_df['tmdbId'].astype('int')
trailer_meta_df['id'] = tmdb_ids_as_int

In [116]:
# Inner-join movie and trailer metadata on the shared 'id' column; only
# movies present in both frames are kept.
all_df = movie_meta_df.merge(trailer_meta_df, how='inner', on='id')

In [117]:
# Number of rows produced by the movie/trailer merge.
len(all_df)

4823

In [118]:
# Preview the first rows of the merged frame.
all_df.head()

Unnamed: 0,adult,budget,genres,id,original_language,production_countries,revenue,runtime,spoken_languages,movieId,imdbId,tmdbId,views,likes,dislikes,n_trailers
0,False,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",1,114709,862.0,8050136,6330,1965,3
1,False,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",2,113497,8844.0,83048,70,14,1
2,False,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",4,114885,31357.0,104948,0,0,1
3,False,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",949,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",6,113277,949.0,32419,119,1,1
4,False,35000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",9091,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",64350171.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",9,114576,9091.0,6162,12,3,2


In [119]:
# Persist the merged table. to_csv is called with its default index
# setting, so the DataFrame index is written as the first, unnamed column.
combined_csv_path = os.path.join('data', 'combined_metadata.csv')
all_df.to_csv(combined_csv_path)