# Using TMDB API to get the id names for genres.

In [2]:
import json
import requests
import pandas as pd
import ast

In [3]:
# Load the TMDB credentials JSON into `tmdb` (a dict, per the cell output below).
# The path is absolute and points into the author's home directory, so this
# cell only runs on a machine that has that file.
# NOTE(review): the same file is read again further down through get_keys();
# `tmdb` itself appears to be used only for the type() check - confirm before removing.
with open("/Users/meganlyons/.secret/tmdbapi.json") as f:
    tmdb = json.load(f)

In [4]:
type(tmdb)

dict

**Calling in my API key from a secret folder**

In [5]:
import json

def get_keys(path):
    """Return the parsed JSON content of the file at ``path``.

    The file is opened in text mode with the platform default encoding,
    read in full, and closed before the text is decoded as JSON.
    """
    with open(path) as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [6]:
keys = get_keys("/Users/meganlyons/.secret/tmdbapi.json")
api_key = keys['api_key']

**Using the TMDB API to get id names for genres**

In [7]:
# Fetch the movie genre list from TMDB.
# - Query values are passed through `params` so requests URL-encodes them
#   instead of relying on a hand-built f-string.
# - `timeout` keeps the cell from hanging indefinitely on a stalled connection.
# - `raise_for_status()` fails here on an HTTP error (e.g. a rejected API key)
#   rather than later with a KeyError on response['genres'].
r = requests.get(
    "https://api.themoviedb.org/3/genre/movie/list",
    params={"api_key": api_key, "language": "en-US"},
    timeout=10,
)
r.raise_for_status()

In [8]:
## This is a byte string
r.content

b'{"genres":[{"id":28,"name":"Action"},{"id":12,"name":"Adventure"},{"id":16,"name":"Animation"},{"id":35,"name":"Comedy"},{"id":80,"name":"Crime"},{"id":99,"name":"Documentary"},{"id":18,"name":"Drama"},{"id":10751,"name":"Family"},{"id":14,"name":"Fantasy"},{"id":36,"name":"History"},{"id":27,"name":"Horror"},{"id":10402,"name":"Music"},{"id":9648,"name":"Mystery"},{"id":10749,"name":"Romance"},{"id":878,"name":"Science Fiction"},{"id":10770,"name":"TV Movie"},{"id":53,"name":"Thriller"},{"id":10752,"name":"War"},{"id":37,"name":"Western"}]}'

In [9]:
# Converting to a Dictionary
json.loads(r.content)

{'genres': [{'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'},
  {'id': 16, 'name': 'Animation'},
  {'id': 35, 'name': 'Comedy'},
  {'id': 80, 'name': 'Crime'},
  {'id': 99, 'name': 'Documentary'},
  {'id': 18, 'name': 'Drama'},
  {'id': 10751, 'name': 'Family'},
  {'id': 14, 'name': 'Fantasy'},
  {'id': 36, 'name': 'History'},
  {'id': 27, 'name': 'Horror'},
  {'id': 10402, 'name': 'Music'},
  {'id': 9648, 'name': 'Mystery'},
  {'id': 10749, 'name': 'Romance'},
  {'id': 878, 'name': 'Science Fiction'},
  {'id': 10770, 'name': 'TV Movie'},
  {'id': 53, 'name': 'Thriller'},
  {'id': 10752, 'name': 'War'},
  {'id': 37, 'name': 'Western'}]}

In [82]:
response = r.json()

In [83]:
response.keys()

dict_keys(['genres'])

**Putting the genres in a dataframe**

In [84]:
d = pd.DataFrame(response['genres'])

In [85]:
genres = response['genres']

In [86]:
genres

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 80, 'name': 'Crime'},
 {'id': 99, 'name': 'Documentary'},
 {'id': 18, 'name': 'Drama'},
 {'id': 10751, 'name': 'Family'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 36, 'name': 'History'},
 {'id': 27, 'name': 'Horror'},
 {'id': 10402, 'name': 'Music'},
 {'id': 9648, 'name': 'Mystery'},
 {'id': 10749, 'name': 'Romance'},
 {'id': 878, 'name': 'Science Fiction'},
 {'id': 10770, 'name': 'TV Movie'},
 {'id': 53, 'name': 'Thriller'},
 {'id': 10752, 'name': 'War'},
 {'id': 37, 'name': 'Western'}]

In [87]:
genres[0]['id']

28

**Creating a Function to convert genre ids to genre names**

In [90]:
genres
def genrefunction(x, genre_lookup=None):
    """Translate a list of genre ids into the matching genre names.

    Parameters
    ----------
    x : sequence
        Genre ids for one movie, e.g. ``[12, 28, 878]``.
    genre_lookup : list of dict, optional
        Records with ``'id'`` and ``'name'`` keys. When omitted, the
        notebook-level ``genres`` list is used, so existing one-argument
        calls (including ``Series.apply(genrefunction)``) behave as before.

    Returns
    -------
    list
        Genre names in the order the genres appear in ``genre_lookup``
        (not the order of ``x``). Ids with no match are dropped, and an id
        repeated in ``x`` yields its name once per occurrence.
    """
    if genre_lookup is None:
        # Fall back to the global list built from the API response.
        genre_lookup = genres
    newlist = []
    # Outer loop over the lookup keeps the output in lookup order.
    for genre in genre_lookup:
        for genre_id in x:
            if genre_id == genre['id']:
                newlist.append(genre['name'])
    return newlist


In [89]:
sampleid = [12, 28, 878]

In [91]:
genrefunction(sampleid)

['Action', 'Adventure', 'Science Fiction']

Calling in the data I want to collate with genres

In [92]:
df = pd.read_csv('zippedData/tmdb.movies.csv',index_col = 0)

In [93]:
df.head()

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [94]:
# literal eval reads in a string and parses it into a python object
df['genre_id_list'] = df['genre_ids'].apply(ast.literal_eval)

In [95]:
df['new_genres'] = df['genre_id_list'].apply(genrefunction)

In [96]:
df.head()

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,genre_id_list,new_genres
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,"[12, 14, 10751]","[Adventure, Family, Fantasy]"
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,"[14, 12, 16, 10751]","[Adventure, Animation, Family, Fantasy]"
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,"[12, 28, 878]","[Action, Adventure, Science Fiction]"
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,"[16, 35, 10751]","[Animation, Comedy, Family]"
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186,"[28, 878, 12]","[Action, Adventure, Science Fiction]"


In [97]:
df.to_csv('cleandata/genredf.csv')

In [98]:
budgetdf = pd.read_csv('cleandata/movierevenue.csv', index_col = 0)
budgetdf.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,year_released,age,decades,day,month,markup,profitmargin,netprofit
0,1,2009-12-18,Avatar,425000000,760507625,2776345279,2009,11,1,Friday,12,5.532577,0.846921,2351345279
1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,2011,9,0,Friday,5,1.546673,0.607331,635063875
2,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350,2019,1,0,Friday,6,-0.572108,-1.337036,-200237650
3,4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963,2015,5,0,Friday,5,3.243841,0.764364,1072413963
4,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,2017,3,0,Friday,12,3.153696,0.759251,999721747


# Creating a function to merge genres

In [99]:
def get_genres(movie_title, movies=None):
    """Look up the genre-name list for a movie by exact title match.

    Parameters
    ----------
    movie_title : str
        Title compared with ``==`` against the ``'title'`` column.
    movies : pandas.DataFrame, optional
        Frame with ``'title'`` and ``'new_genres'`` columns. When omitted,
        the notebook-level ``df`` is used, so existing one-argument calls
        (including ``Series.apply(get_genres)``) behave as before.

    Returns
    -------
    object or None
        The ``'new_genres'`` value of the first matching row, or ``None``
        when no row has that exact title.

    Raises
    ------
    KeyError
        If a required column is missing. The previous bare ``except``
        silently turned this (and every other error) into ``None``.
    """
    if movies is None:
        movies = df
    matches = movies.loc[movies['title'] == movie_title, 'new_genres']
    # An empty selection is the only expected "not found" case.
    if matches.empty:
        return None
    # First match wins when several rows share a title.
    return matches.iloc[0]

In [100]:
get_genres(budgetdf['movie'][0])

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [101]:
budgetdf['genres'] = budgetdf['movie'].apply(get_genres)

In [102]:
budgetdf.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,year_released,age,decades,day,month,markup,profitmargin,netprofit,genres
0,1,2009-12-18,Avatar,425000000,760507625,2776345279,2009,11,1,Friday,12,5.532577,0.846921,2351345279,"[Action, Adventure, Fantasy, Science Fiction]"
1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,2011,9,0,Friday,5,1.546673,0.607331,635063875,"[Action, Adventure, Fantasy]"
2,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350,2019,1,0,Friday,6,-0.572108,-1.337036,-200237650,
3,4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963,2015,5,0,Friday,5,3.243841,0.764364,1072413963,"[Action, Adventure, Science Fiction]"
4,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,2017,3,0,Friday,12,3.153696,0.759251,999721747,


In [110]:
budgetdf.isna().sum()

id                      0
release_date            0
movie                   0
production_budget       0
domestic_gross          0
worldwide_gross         0
year_released           0
age                     0
decades                 0
day                     0
month                   0
markup                  0
profitmargin            0
netprofit               0
genres               3805
dtype: int64

In [109]:
budgetdf.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,year_released,age,decades,day,month,markup,profitmargin,netprofit,genres
0,1,2009-12-18,Avatar,425000000,760507625,2776345279,2009,11,1,Friday,12,5.532577,0.846921,2351345279,"[Action, Adventure, Fantasy, Science Fiction]"
1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,2011,9,0,Friday,5,1.546673,0.607331,635063875,"[Action, Adventure, Fantasy]"
2,3,2019-06-07,Dark Phoenix,350000000,42762350,149762350,2019,1,0,Friday,6,-0.572108,-1.337036,-200237650,
3,4,2015-05-01,Avengers: Age of Ultron,330600000,459005868,1403013963,2015,5,0,Friday,5,3.243841,0.764364,1072413963,"[Action, Adventure, Science Fiction]"
4,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,2017,3,0,Friday,12,3.153696,0.759251,999721747,


In [10]:
# Exploring missing genres - interesting that some Star Wars films have genre names and others do not

In [111]:
budgetdf.loc[budgetdf['movie'].str.contains('Star Wars')]

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,year_released,age,decades,day,month,markup,profitmargin,netprofit,genres
4,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,2017,3,0,Friday,12,3.153696,0.759251,999721747,
5,6,2015-12-18,Star Wars Ep. VII: The Force Awakens,306000000,936662225,2053311220,2015,5,0,Friday,12,5.710167,0.850972,1747311220,
11,12,2018-05-25,Solo: A Star Wars Story,275000000,213767512,393151347,2018,2,0,Friday,5,0.429641,0.300524,118151347,"[Action, Adventure, Science Fiction]"
44,45,2016-12-16,Rogue One: A Star Wars Story,200000000,532177324,1049102856,2016,4,0,Friday,12,4.245514,0.809361,849102856,"[Action, Adventure, Science Fiction]"
303,4,1999-05-19,Star Wars Ep. I: The Phantom Menace,115000000,474544677,1027044677,1999,21,2,Wednesday,5,7.930823,0.888028,912044677,
304,5,2005-05-19,Star Wars Ep. III: Revenge of the Sith,115000000,380270577,848998877,2005,15,1,Thursday,5,6.382599,0.864546,733998877,
305,6,2002-05-16,Star Wars Ep. II: Attack of the Clones,115000000,310676740,656695615,2002,18,1,Thursday,5,4.710397,0.824881,541695615,
1783,84,1983-05-25,Star Wars Ep. VI: Return of the Jedi,32500000,309205079,572705079,1983,37,3,Wednesday,5,16.621695,0.943252,540205079,
2367,68,1980-05-21,Star Wars Ep. V: The Empire Strikes Back,23000000,290271960,534161334,1980,40,4,Wednesday,5,22.224406,0.956942,511161334,
3464,65,1977-05-25,Star Wars Ep. IV: A New Hope,11000000,460998007,786598007,1977,43,4,Wednesday,5,70.50891,0.986016,775598007,


In [103]:
budgetdf.to_csv('cleandata/Revenue_Budget_Genre.csv')