In [23]:
# import libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [24]:
url_rt3 = './data/tmdb.movies.csv.gz'
tmdb_movies = pd.read_csv(url_rt3)

In [25]:
url_rt4 = './data/tn.movie_budgets.csv.gz'
tmdb_budgets = pd.read_csv(url_rt4)

In [26]:
# add year & month 
tmdb_movies['date'] = pd.to_datetime(tmdb_movies['release_date'])
tmdb_movies['year'] = tmdb_movies['date'].dt.year
tmdb_movies['month']= tmdb_movies['date'].dt.month

tmdb_budgets['date'] = pd.to_datetime(tmdb_budgets['release_date'])
tmdb_budgets['year'] = tmdb_budgets['date'].dt.year
tmdb_budgets['month'] = tmdb_budgets['date'].dt.month

# slice tmdb_movies (year 2010-2019)
mask = (tmdb_movies['year']<=2019) & (tmdb_movies['year']>=2010)
tmdb_movies = tmdb_movies.loc[mask].copy()

# slicing dataset 2010-2019
mask = (tmdb_budgets['year']<=2019) & (tmdb_budgets['year']>=2010)
tmdb_budgets = tmdb_budgets.loc[mask].copy()

In [27]:
import string
def str_num(s):
    ''' convert s to a list of numbers
        example: 
        input: s = '[12, 14, 10751]'
        output: [12, 14, 10751]   
    '''
    # strip all punctuation from a string 
    s = s.translate(str.maketrans('', '', string.punctuation))
    # string to list 
    #s_list = [int(n) for n in s.split()]
    return s.split()

In [28]:
# 1. covert genre_ids [12, 14, 10751] in to columns
tmdb_movies['genre_ids'] = tmdb_movies['genre_ids'].apply(str_num)

In [29]:
tmdb_movies.columns

Index(['Unnamed: 0', 'genre_ids', 'id', 'original_language', 'original_title',
       'popularity', 'release_date', 'title', 'vote_average', 'vote_count',
       'date', 'year', 'month'],
      dtype='object')

In [30]:
cols = ['genre_ids','original_title','title', 'vote_average', 'vote_count','popularity','date', 'year', 'month','original_language']
tmdb_movies_genre = tmdb_movies[cols].copy()

In [31]:
cols_var = ['original_title','title', 'vote_average', 'vote_count','popularity','date', 'year', 'month','original_language']
tmdb_movies_genre = tmdb_movies_genre['genre_ids'].apply(pd.Series).merge(tmdb_movies_genre, left_index = True, right_index = True)\
.drop(["genre_ids"], axis = 1).melt(id_vars = cols_var, value_name = "genre")

In [35]:
# drop col variable 
tmdb_movies_genre.drop(columns=['variable'], inplace = True)    

In [37]:
# drop rows with genre NaN 
tmdb_movies_genre.dropna(subset = ['genre'], inplace = True)

In [40]:
tmdb_genres=[{'id': 28, 'name': 'Action'},
            {'id': 12, 'name': 'Adventure'},
            {'id': 16, 'name': 'Animation'},
            {'id': 35, 'name': 'Comedy'},
            {'id': 80, 'name': 'Crime'},
            {'id': 99, 'name': 'Documentary'},
            {'id': 18, 'name': 'Drama'},
            {'id': 10751, 'name': 'Family'},
            {'id': 14, 'name': 'Fantasy'},
            {'id': 36, 'name': 'History'},
            {'id': 27, 'name': 'Horror'},
            {'id': 10402, 'name': 'Music'},
            {'id': 9648, 'name': 'Mystery'},
            {'id': 10749, 'name': 'Romance'},
            {'id': 878, 'name': 'Science Fiction'},
            {'id': 10770, 'name': 'TV Movie'},
            {'id': 53, 'name': 'Thriller'},
            {'id': 10752, 'name': 'War'},
            {'id': 37, 'name': 'Western'}]

In [43]:
# mapping genres_id to genre

# 1. create a dict with {id_value1:genre1, id_value2:genre2} from tmdb_genres
genre_dict = {}
for pair in tmdb_genres:
    genre_dict[pair['id']] = pair['name']
genre_dict    

{28: 'Action',
 12: 'Adventure',
 16: 'Animation',
 35: 'Comedy',
 80: 'Crime',
 99: 'Documentary',
 18: 'Drama',
 10751: 'Family',
 14: 'Fantasy',
 36: 'History',
 27: 'Horror',
 10402: 'Music',
 9648: 'Mystery',
 10749: 'Romance',
 878: 'Science Fiction',
 10770: 'TV Movie',
 53: 'Thriller',
 10752: 'War',
 37: 'Western'}

In [48]:
# 'genre' col has dtype (O), convert to int 
tmdb_movies_genre['genre'] = tmdb_movies_genre['genre'].astype(int)

In [49]:
# check 
tmdb_movies_genre['genre'].dtype

dtype('int64')

In [50]:
# map genre_id to genre 
tmdb_movies_genre['genre'] = tmdb_movies_genre['genre'].map(genre_dict)

- Check target result 

In [38]:
# Check target result

# check and compare with movie '"BLESSED"'
mask = tmdb_movies['title'] =='"BLESSED"'
tmdb_movies[mask]

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,date,year,month
26355,26355,"[99, 12]",564096,en,"""BLESSED""",0.6,2018-11-23,"""BLESSED""",7.0,1,2018-11-23,2018,11


In [51]:
# Check target result

tmdb_movies_genre.sort_values('title', axis=0)

Unnamed: 0,original_title,title,vote_average,vote_count,popularity,date,year,month,original_language,genre
26129,"""BLESSED""","""BLESSED""",7.0,1,0.600,2018-11-23,2018,11,en,Documentary
52419,"""BLESSED""","""BLESSED""",7.0,1,0.600,2018-11-23,2018,11,en,Adventure
33992,"""Legitimate Rape"" Pharmaceutical Ad","""Legitimate Rape"" Pharmaceutical Ad",2.3,2,0.600,2012-08-28,2012,8,en,Crime
7702,"""Legitimate Rape"" Pharmaceutical Ad","""Legitimate Rape"" Pharmaceutical Ad",2.3,2,0.600,2012-08-28,2012,8,en,Comedy
12661,"""cherry"" - Supreme","""cherry"" - Supreme",10.0,1,0.893,2014-03-20,2014,3,en,Documentary
549,#1 Cheerleader Camp,#1 Cheerleader Camp,3.6,34,3.277,2010-07-27,2010,7,en,Comedy
26839,#1 Cheerleader Camp,#1 Cheerleader Camp,3.6,34,3.277,2010-07-27,2010,7,en,Drama
15871,#ALLMYMOVIES,#ALLMYMOVIES,9.0,4,0.840,2015-11-12,2015,11,en,Documentary
21955,#Captured,#Captured,3.2,6,1.418,2017-10-11,2017,10,en,Horror
18541,#FollowFriday,#FollowFriday,3.2,7,1.646,2016-07-01,2016,7,en,Thriller


## tmdb_genre_stats table

In [64]:
tmdb_genre_vc = tmdb_movies_genre.groupby('genre')['vote_count'].describe()
tmdb_genre_vc

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Action,2567.0,712.945462,2098.31628,1.0,4.0,24.0,175.0,22186.0
Adventure,1368.0,1168.625731,2853.210887,1.0,4.0,29.5,420.5,22186.0
Animation,1452.0,304.738292,1163.166918,1.0,2.0,9.0,64.25,12691.0
Comedy,5597.0,238.69305,948.216733,1.0,2.0,9.0,59.0,20175.0
Crime,1484.0,410.179245,1237.899755,1.0,4.0,19.5,176.0,13933.0
Documentary,4945.0,12.559151,40.727434,1.0,1.0,3.0,8.0,713.0
Drama,8207.0,242.684903,953.222677,1.0,3.0,11.0,67.0,18597.0
Family,1538.0,378.99935,1376.421407,1.0,3.0,12.0,61.75,12691.0
Fantasy,1113.0,765.440252,2160.438852,1.0,3.0,20.0,168.0,14587.0
History,611.0,295.702128,933.574584,1.0,2.0,12.0,97.0,10396.0


In [65]:
# reset index
tmdb_genre_vc.reset_index(inplace = True)

In [68]:
mapper = {'count':'movie_count',
         'mean':'vote_count_avg',
         'std':'vote_count_std',}
tmdb_genre_vc.rename(columns = mapper)

Unnamed: 0,genre,movie_count,vote_count_avg,vote_count_std,min,25%,50%,75%,max
0,Action,2567.0,712.945462,2098.31628,1.0,4.0,24.0,175.0,22186.0
1,Adventure,1368.0,1168.625731,2853.210887,1.0,4.0,29.5,420.5,22186.0
2,Animation,1452.0,304.738292,1163.166918,1.0,2.0,9.0,64.25,12691.0
3,Comedy,5597.0,238.69305,948.216733,1.0,2.0,9.0,59.0,20175.0
4,Crime,1484.0,410.179245,1237.899755,1.0,4.0,19.5,176.0,13933.0
5,Documentary,4945.0,12.559151,40.727434,1.0,1.0,3.0,8.0,713.0
6,Drama,8207.0,242.684903,953.222677,1.0,3.0,11.0,67.0,18597.0
7,Family,1538.0,378.99935,1376.421407,1.0,3.0,12.0,61.75,12691.0
8,Fantasy,1113.0,765.440252,2160.438852,1.0,3.0,20.0,168.0,14587.0
9,History,611.0,295.702128,933.574584,1.0,2.0,12.0,97.0,10396.0


In [66]:
# check 
tmdb_genre_vc

Unnamed: 0,genre,count,mean,std,min,25%,50%,75%,max
0,Action,2567.0,712.945462,2098.31628,1.0,4.0,24.0,175.0,22186.0
1,Adventure,1368.0,1168.625731,2853.210887,1.0,4.0,29.5,420.5,22186.0
2,Animation,1452.0,304.738292,1163.166918,1.0,2.0,9.0,64.25,12691.0
3,Comedy,5597.0,238.69305,948.216733,1.0,2.0,9.0,59.0,20175.0
4,Crime,1484.0,410.179245,1237.899755,1.0,4.0,19.5,176.0,13933.0
5,Documentary,4945.0,12.559151,40.727434,1.0,1.0,3.0,8.0,713.0
6,Drama,8207.0,242.684903,953.222677,1.0,3.0,11.0,67.0,18597.0
7,Family,1538.0,378.99935,1376.421407,1.0,3.0,12.0,61.75,12691.0
8,Fantasy,1113.0,765.440252,2160.438852,1.0,3.0,20.0,168.0,14587.0
9,History,611.0,295.702128,933.574584,1.0,2.0,12.0,97.0,10396.0


In [70]:
#pd.DataFrame(tmdb_genre_vc['movie_count'].sort_values()).reset_index()

In [71]:
tmdb_movies.shape

(26290, 13)