In [7]:
# NOTE: Run this in Python 2.7
import itertools
import math
import os
import urllib

import imdb
import lxml.html
import numpy as np
import pandas as pd
import requests

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
def requestResults(url):
    """Send a GET request to the TMDB API and return the decoded JSON body.

    `url` is the path relative to BASE_URL and may or may not already carry
    a query string (e.g. "genre/movie/list" or "search/movie?query=...").
    API_KEY is appended as the `api_key` query parameter.

    Raises requests.exceptions.HTTPError when the response has a 4xx/5xx
    status, and a requests timeout exception when no response arrives
    within 30 seconds.
    """
    # Start the query string with "?" when the caller did not supply one;
    # otherwise extend the existing query string with "&".
    separator = "&" if "?" in url else "?"
    r = requests.get(BASE_URL + url + separator + "api_key=" + API_KEY, timeout=30)
    # Surface HTTP errors here instead of letting callers hit a KeyError
    # when they index into an error payload.
    r.raise_for_status()
    return r.json()

# Constants
BASE_URL = "https://api.themoviedb.org/3/"
# SECURITY: an API key should not live in a shared notebook. Set the
# TMDB_API_KEY environment variable to supply your own key. The literal below
# is kept only as a fallback so existing runs keep working; it should be
# rotated and removed.
API_KEY = os.environ.get("TMDB_API_KEY", "9767d17413ec9d9729c2cca238df02da")
# Maps each TMDB genre id to its genre name, fetched once when this cell runs.
# "x=1" is a dummy query parameter so that the "&api_key=..." suffix added by
# requestResults lands in a well-formed query string.
GENRE_MAP = {
    g['id']: g['name']
    for g in requestResults("genre/movie/list?x=1")[u'genres']
}

# Download a poster image from TMDB to a local file
def downloadImageToFile(imgpath, filename):
    """Save the image at `imgpath` to the local path `filename`.

    The download URL is "https://image.tmdb.org/t/p/w500" followed by
    `imgpath`. Nothing is returned.
    """
    image_url = "https://image.tmdb.org/t/p/w500" + imgpath
    urllib.urlretrieve(image_url, filename)
    
# Get genre and poster path of one movie by title
def genreAndPosterPath(title):
    """Look up `title` on TMDB and describe the first search hit.

    Returns a tuple `(genres, poster_path)`: `genres` is a list of genre-name
    strings for the first result, and `poster_path` is that result's
    'poster_path' value exactly as TMDB returned it.

    Genre ids that have no entry in GENRE_MAP are left out instead of raising
    KeyError.

    Raises IndexError when the search returns no results (the same exception
    type as indexing the empty list, but with a message naming the title).
    """
    title_query = urllib.urlencode({'query': title})
    # TMDB's parameter is spelled include_adult (underscore).
    results = requestResults("search/movie?" + title_query + "&language=en-US&page=1&include_adult=false")[u'results']
    if not results:
        raise IndexError("no TMDB search results for title %r" % (title,))
    top_hit = results[0]
    genres = [str(GENRE_MAP[gid]) for gid in top_hit['genre_ids'] if gid in GENRE_MAP]
    return genres, top_hit['poster_path']

# Get genres from IMDB for one movie by title
def imdbGenresByTitle(title):
    """Scrape the genre link texts from the IMDB page of the first search hit.

    Searches IMDB for `title` through `imdb_access`, downloads the title page
    of the first match and returns the text of every link whose href contains
    'genre', minus the first such link.

    Returns an empty list when the search finds nothing, mirroring
    imdbOverview, which returns '' in that situation.
    """
    matches = imdb_access.search_movie(title)
    if not matches:
        return []
    # Read the id from the movie object instead of parsing it out of repr().
    imdb_id = str(matches[0].movieID)
    page = lxml.html.document_fromstring(requests.get("http://www.imdb.com/title/tt" + imdb_id).content)
    # The first matching link is discarded, as before.
    # NOTE(review): presumably it is not an actual genre entry; confirm against the page markup.
    return page.xpath("//a[contains(@href, 'genre')]/text()")[1:]


# Get the plot summary text from IMDB for one movie by title
def imdbOverview(title):
    """Scrape the summary text from the IMDB page of the first search hit.

    Searches IMDB for `title` through `imdb_access`, downloads the title page
    of the first match and returns the stripped text of the first
    <div class="summary_text"> node.

    Returns '' when the search finds nothing or the page has no such node.
    """
    # Search once and reuse the result; the previous version issued the same
    # search_movie call twice.
    matches = imdb_access.search_movie(title)
    if not matches:
        return ''
    # Read the id from the movie object instead of parsing it out of repr().
    imdb_id = str(matches[0].movieID)
    page = lxml.html.document_fromstring(requests.get("http://www.imdb.com/title/tt" + imdb_id).content)
    summary = page.xpath("//div[@class='summary_text']/text()")
    if not summary:
        return ''
    return summary[0].strip()

# Get genres from TMDB for one movie by title
def tmdbGenresByTitle(title):
    """Return the genre names TMDB lists for the first search hit of `title`.

    The genre ids of the first result are translated with _mapGidsToGenres.

    Raises IndexError when the search returns no results (the same exception
    type as indexing the empty list, but with a message naming the title).
    """
    title_query = urllib.urlencode({'query': title})
    # TMDB's parameter is spelled include_adult (underscore).
    results = requestResults("search/movie?" + title_query + "&language=en-US&page=1&include_adult=false")[u'results']
    if not results:
        raise IndexError("no TMDB search results for title %r" % (title,))
    return _mapGidsToGenres(results[0]['genre_ids'])

# Private helper function that maps genre_ids to genres
def _mapGidsToGenres(genre_ids):
    """Translate an iterable of TMDB genre ids into a list of genre-name strings.

    Ids without an entry in GENRE_MAP are skipped rather than raising
    KeyError. The recorded genre tally in this notebook contains such an id
    (10769), and idToName drops that id as well.
    """
    return [str(GENRE_MAP[gid]) for gid in genre_ids if gid in GENRE_MAP]

# IMDb client used by imdbGenresByTitle and imdbOverview. It is created after
# those definitions, which works because they only look the name up when called.
imdb_access = imdb.IMDb()

# EXAMPLES
# requestResults("discover/movie?sort_by=popularity.desc")[u'results'][0] # Get the single most popular movie (first entry of the first results page)
# downloadImageToFile('/tnmL0g604PDRJwGJ5fsUSYKFo9.jpg', 't2s.jpg') # Download an image to the file t2s.jpg

In [95]:
# Load the two TMDB movie CSV batches and stack them into a single frame.
movies_1 = pd.read_csv('./tmdb-movies-1-to-400.csv')
movies_2 = pd.read_csv('./tmdb-movies_start_601_end_801.csv')
# ignore_index=True: each file is read with its own 0..n-1 row labels, so a
# plain concat would leave duplicate labels and make label-based lookups such
# as movies_combined.loc[0] return more than one row.
movies_combined = pd.concat([movies_1, movies_2], ignore_index=True)

In [138]:
# Scratch check of the genre-id parsing used by the functions below.
# NOTE(review): `movies_601` is not defined in any cell visible here, so this
# line raises NameError on a fresh kernel unless it is defined elsewhere; confirm.
# `x` is not referenced by the visible cells either.
x = np.array(np.matrix(movies_601['genre_ids'][0]))[0]
def idToName(idArrayStr):
    """Turn a stringified genre-id list into a comma-joined string of genre names.

    `idArrayStr` is text such as '[28, 18]'. The literal '[]' yields ''.
    Every id except 10769 is looked up in GENRE_MAP; the names are joined
    with ',' in their original order.
    """
    if idArrayStr == '[]':
        return ''
    # np.matrix parses the bracketed text; row 0 of the 2-D result holds the ids.
    parsed_ids = np.array(np.matrix(idArrayStr))[0]
    # 10769 is left out. The recorded genre tally in this notebook shows that
    # id without a name, so presumably GENRE_MAP has no entry for it; confirm.
    return ','.join([GENRE_MAP[gid] for gid in parsed_ids if gid != 10769])

def idStrToArray(idArrayStr):
    """Re-serialise a stringified genre-id list as a comma-joined id string.

    `idArrayStr` is text such as '[28, 18]'. The literal '[]' yields ''.
    Id 10769 is dropped; the remaining ids are joined with ',' in their
    original order. Despite the name, the return value is a string, not an
    array.
    """
    if idArrayStr == '[]':
        return ''
    # np.matrix parses the bracketed text; row 0 of the 2-D result holds the ids.
    parsed_ids = np.array(np.matrix(idArrayStr))[0]
    kept_ids = [str(gid) for gid in parsed_ids if gid != 10769]
    return ','.join(kept_ids)

In [139]:
# Derive two comma-joined string columns from the raw 'genre_ids' text column:
# the genre names (idToName) and the ids with 10769 removed (idStrToArray).
raw_genre_ids = movies_combined['genre_ids']
movies_combined['genres'] = raw_genre_ids.apply(idToName)
movies_combined['genre_ids_cleaned'] = raw_genre_ids.apply(idStrToArray)

In [148]:
# Build the cleaned frame: same rows, without the 'adult' and 'video' columns
# and without the raw 'genre_ids' text that the derived columns replace.
dropped_columns = ['adult', 'video', 'genre_ids']
movies_combined_cleaned = movies_combined.drop(labels=dropped_columns, axis=1)

In [149]:
# Preview the first rows of the cleaned frame to eyeball the derived
# 'genres' and 'genre_ids_cleaned' columns.
movies_combined_cleaned.head()

Unnamed: 0,id,poster_path,title,overview,release_date,popularity,original_title,backdrop_path,keywords,vote_count,vote_average,original_language,genres,genre_ids_cleaned
0,321612,/tWqifoYuwLETmmasnGHO7xBjEtt.jpg,Beauty and the Beast,A live-action adaptation of Disney's version o...,3/17/17,180.45132,Beauty and the Beast,/6aUWe0GSl69wMTSWWexsorMIvwU.jpg,"france,magic,castle,fairy tale,musical,curse,c...",1246,7.1,en,"Fantasy,Music,Romance",141040210749
1,263115,/45Y1G5FEgttPAwjTYic6czC9xCn.jpg,Logan,"In the near future, a weary Logan cares for an...",2/28/17,117.369877,Logan,/5pAGnkFYSsFJ99ZxDIYnhQbQFXs.jpg,"cyborg,experiment,self-destruction,mutant,futu...",2075,7.6,en,"Action,Drama,Science Fiction",2818878
2,335797,/s9ye87pvq2IaDvjv9x4IOXVjvA7.jpg,Sing,A koala named Buster recruits his best friend ...,11/23/16,80.989984,Sing,/fxDXp8un4qNY9b1dLd7SH6CKzC.jpg,"furry,talking animal,singing,anthropomorphic a...",1007,6.7,en,"Animation,Comedy,Drama,Family,Music",1635181075110402
3,293167,/aoUyphk4nwffrwlZRaOa0eijgpr.jpg,Kong: Skull Island,Explore the mysterious and dangerous home of t...,3/8/17,61.933195,Kong: Skull Island,/pGwChWiAY1bdoxL79sXmaFBlYJH.jpg,"monster,expedition,island,prequel,king kong,ka...",876,6.1,en,"Science Fiction,Action,Adventure,Fantasy",878281214
4,135397,/jjBgi2r5cRt36xF6iNUEhzscEcb.jpg,Jurassic World,Twenty-two years after the events of Jurassic ...,6/9/15,51.976724,Jurassic World,/dkMD5qlogeRMiEixC4YNPUvax2T.jpg,"monster,dna,tyrannosaurus rex,velociraptor,isl...",6699,6.5,en,"Action,Adventure,Science Fiction,Thriller",281287853


In [109]:
# Check for movies whose genre string contains 'other' and report how many
# genres each of them lists.
# The match is computed on the whole column at once instead of extracting
# every row with iloc; .values drops the index so the mask applies by position.
other_mask = movies_combined['genres'].str.contains('other', regex=False).values
for title, genres in movies_combined.loc[other_mask, ['title', 'genres']].values:
    # One pre-formatted argument prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print('%s  num genres: %d' % (title, len(genres.split(','))))

In [110]:
# Tally how many movies carry each raw genre id, keyed by the id as parsed.
genre_counts = {}
for raw_ids in movies_combined['genre_ids']:
    # '[]' means the movie lists no genres; it contributes nothing.
    if raw_ids == '[]':
        continue
    # np.matrix parses the bracketed text; row 0 of the 2-D result holds the ids.
    for gid in np.array(np.matrix(raw_ids))[0]:
        genre_counts[gid] = genre_counts.get(gid, 0) + 1

In [111]:
# Re-key the tally by genre name. Ids that GENRE_MAP does not know keep the
# raw id as their key.
genre_count_names = {
    GENRE_MAP.get(gid, gid): count
    for gid, count in genre_counts.items()
}

In [112]:
# Display the per-genre movie counts; a key that is still a number is an id
# with no entry in GENRE_MAP.
genre_count_names

{10769: 109,
 u'Action': 2648,
 u'Adventure': 1667,
 u'Animation': 865,
 u'Comedy': 3675,
 u'Crime': 1501,
 u'Documentary': 306,
 u'Drama': 5505,
 u'Family': 1102,
 u'Fantasy': 995,
 u'History': 453,
 u'Horror': 1456,
 u'Music': 337,
 u'Mystery': 845,
 u'Romance': 1991,
 u'Science Fiction': 1220,
 u'TV Movie': 183,
 u'Thriller': 2977,
 u'War': 376,
 u'Western': 214}